Java tutorial
/* * Copyright 2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ajax4jsf.templatecompiler.utils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * A class which detects the open/close tags in an HTML document and reports * them to a listener class. * <p> * This is unfortunately necessary when using JSF with JSP, as tags in the body * of the document can need to output commands into the document at points * earlier than the tag occurred (particularly into the document HEAD section). * This can only be implemented by buffering the response and post-processing it * to find the relevant HTML tags and modifying the buffer as needed. * <p> * This class tries to do the parsing as quickly as possible; many of the * details of HTML are not relevant for the purposes this class is used for. * * @version $Revision: 1.1.2.1 $ $Date: 2006/12/20 18:56:39 $ */ public class ReducedHTMLParser { // IMPLEMENTATION NOTE: // // Many of the methods on this class are package-scope. This is intended // solely for the purpose of unit-testing. This class does not expect // other classes in this package to access its methods. private static final Log log = LogFactory.getLog(ReducedHTMLParser.class); public static final int BODY_TAG = 0; public static final int HEAD_TAG = 1; public static final int SCRIPT_TAG = 2; private static final int STATE_READY = 0; private static final int STATE_IN_COMMENT = 1; private static final int STATE_IN_TAG = 2; private static final int STATE_IN_MARKED_SECTION = 3; private static final int STATE_EXPECTING_ETAGO = 4; private int _offset; private int _lineNumber; private CharSequence _seq; private CallbackListener _listener; public static void parse(CharSequence seq, CallbackListener l) { new ReducedHTMLParser(seq, l).parse(); } /** * Constructor, package-scope for unit testing. * * @param s * is the sequence of chars to parse. * @param l * is the listener to invoke callbacks on. */ ReducedHTMLParser(CharSequence s, CallbackListener l) { this._seq = s; this._listener = l; } /** * Return true if there are no more characters to parse. */ boolean isFinished() { return this._offset >= this._seq.length(); } int getCurrentLineNumber() { return this._lineNumber; } /** * Advance the current parse position over any whitespace characters. */ void consumeWhitespace() { boolean crSeen = false; while (this._offset < this._seq.length()) { char c = this._seq.charAt(this._offset); if (!Character.isWhitespace(c)) { break; } // Track line number for error messages. if (c == '\r') { ++this._lineNumber; crSeen = true; } else if ((c == '\n') && !crSeen) { ++this._lineNumber; } else { crSeen = false; } ++this._offset; } } /** * Eat up a sequence of non-whitespace characters and return them. */ String consumeNonWhitespace() { int wordStart = this._offset; while (this._offset < this._seq.length()) { char c = this._seq.charAt(this._offset); if (Character.isWhitespace(c)) { break; } ++this._offset; } if (wordStart == this._offset) { return null; } else { return this._seq.subSequence(wordStart, this._offset).toString(); } } /** * If the next chars in the input sequence exactly match the specified * string then skip over them and return true. * <p> * If there is not a match then leave the current parse position unchanged * and return false. * * @param s * is the exact string to match. * @return true if the input contains exactly the param s */ boolean consumeMatch(String s) { if (this._offset + s.length() > this._seq.length()) { // seq isn't long enough to contain the specified string return false; } int i = 0; while (i < s.length()) { if (this._seq.charAt(this._offset + i) == s.charAt(i)) { ++i; } else { return false; } } this._offset += i; return true; } /** * Eat up a sequence of chars which form a valid XML element name. * <p> * TODO: implement this properly in compliance with spec */ String consumeElementName() { consumeWhitespace(); int nameStart = this._offset; while (!isFinished()) { boolean ok = false; char c = this._seq.charAt(this._offset); if (Character.isLetterOrDigit(this._seq.charAt(this._offset))) { ok = true; } else if (c == '_') { ok = true; } else if (c == '-') { ok = true; } else if (c == ':') { ok = true; } if (!ok) { break; } ++this._offset; } if (nameStart == this._offset) { return null; } else { return this._seq.subSequence(nameStart, this._offset).toString(); } } /** * Eat up a sequence of chars which form a valid XML attribute name. * <p> * TODO: implement this properly in compliance with spec */ String consumeAttrName() { // for now, assume elements and attributes have same rules return consumeElementName(); } /** * Eat up a string which is terminated with the specified quote character. * This means handling escaped quote chars within the string. * <p> * This method assumes that the leading quote has already been consumed. */ String consumeString(char quote) { // TODO: should we consider a string to be terminated by a newline? // that would help with runaway strings but I think that multiline // strings *are* allowed... // // TODO: detect newlines within strings and increment lineNumber. // This isn't so important, though; they aren't common and being a // few lines out in an error message isn't serious either. StringBuffer stringBuf = new StringBuffer(); boolean escaping = false; while (!isFinished()) { char c = this._seq.charAt(this._offset); ++this._offset; if (c == quote) { if (!escaping) { break; } else { stringBuf.append(c); escaping = false; } } else if (c == '\\') { if (escaping) { // append a real backslash stringBuf.append(c); escaping = false; } else { escaping = true; } } else { stringBuf.append(c); } } return stringBuf.toString(); } /** * Assuming we have already encountered "attrname=", consume the value part * of the attribute definition. Note that unlike XML, HTML doesn't have to * quote its attribute values. * * @return the attribute value. If the attr-value was quoted, the returned * value will not include the quote chars. */ String consumeAttrValue() { consumeWhitespace(); if (consumeMatch("'")) { return consumeString('\''); } else if (consumeMatch("\"")) { return consumeString('"'); } else { return consumeNonWhitespace(); } } /** * Discard all characters in the input until one in the specified string * (character-set) is found. * * @param s * is a set of characters that should not be discarded. */ void consumeExcept(String s) { boolean crSeen = false; while (this._offset < this._seq.length()) { char c = this._seq.charAt(this._offset); if (s.indexOf(c) >= 0) { // char is in the exception set return; } // Track line number for error messages. if (c == '\r') { ++this._lineNumber; crSeen = true; } else if ((c == '\n') && !crSeen) { ++this._lineNumber; } else { crSeen = false; } // Track line number for error messages. if (c == '\r') { ++this._lineNumber; crSeen = true; } else if ((c == '\n') && !crSeen) { ++this._lineNumber; } else { crSeen = false; } ++this._offset; } } /** * Process the entire input buffer, invoking callbacks on the listener * object as appropriate. */ void parse() { int state = STATE_READY; int currentTagStart = -1; String currentTagName = null; this._lineNumber = 1; this._offset = 0; int lastOffset = this._offset - 1; while (this._offset < this._seq.length()) { // Sanity check; each pass through this loop must increase the // offset. // Failure to do this means a hang situation has occurred. if (this._offset <= lastOffset) { // throw new RuntimeException("Infinite loop detected in // ReducedHTMLParser"); log.error("Infinite loop detected in ReducedHTMLParser; parsing skipped"); // return; } lastOffset = this._offset; if (state == STATE_READY) { // in this state, nothing but "<" has any significance consumeExcept("<"); if (isFinished()) { break; } if (consumeMatch("<!--")) { // Note that whitespace is *not* permitted in <!-- state = STATE_IN_COMMENT; } else if (consumeMatch("<![")) { // Start of a "marked section", eg "<![CDATA" or // "<![INCLUDE" or "<![IGNORE". These always terminate // with "]]>" log.debug("Marked section found at line " + getCurrentLineNumber()); state = STATE_IN_MARKED_SECTION; } else if (consumeMatch("<!DOCTYPE")) { log.debug("DOCTYPE found at line " + getCurrentLineNumber()); // we don't need to actually do anything here; the // tag can't contain a bare "<", so the first "<" // indicates the start of the next real tag. // // TODO: Handle case where the DOCTYPE includes an internal // DTD. In // that case there *will* be embedded < chars in the // document. However // that's very unlikely to be used in a JSF page, so this is // pretty low // priority. } else if (consumeMatch("<?")) { // xml processing instruction or <!DOCTYPE> tag // we don't need to actually do anything here; the // tag can't contain a bare "<", so the first "<" // indicates the start of the next real tag. log.debug("PI found at line " + getCurrentLineNumber()); } else if (consumeMatch("</")) { if (!processEndTag()) { // message already logged return; } // stay in state READY state = STATE_READY; } else if (consumeMatch("<")) { // We can't tell the user that the tag has closed until // after we have // processed any attributes and found the real end of the // tag. So save // the current info until the end of this tag. currentTagStart = this._offset - 1; currentTagName = consumeElementName(); if (currentTagName == null) { log.warn("Invalid HTML; bare lessthan sign found at line " + getCurrentLineNumber()); // remain in STATE_READY; this isn't really the start of // an xml element. } else { state = STATE_IN_TAG; } } else { // should never get here throw new Error("Internal error at line " + getCurrentLineNumber()); } continue; } if (state == STATE_IN_COMMENT) { // TODO: handle "-- >", which is a valid way to close a // comment according to the specs. // in this state, nothing but "--" has any significance consumeExcept("-"); if (isFinished()) { break; } if (consumeMatch("-->")) { state = STATE_READY; } else { // false call; hyphen is not end of comment consumeMatch("-"); } continue; } if (state == STATE_IN_TAG) { consumeWhitespace(); if (consumeMatch("/>")) { // ok, end of element state = STATE_READY; closedTag(currentTagStart, this._offset, currentTagName); // and reset vars just in case... currentTagStart = -1; currentTagName = null; } else if (consumeMatch(">")) { if (currentTagName.equalsIgnoreCase("script") || currentTagName.equalsIgnoreCase("style")) { // We've just started a special tag which can contain // anything except // the ETAGO marker ("</"). See // http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data state = STATE_EXPECTING_ETAGO; } else { state = STATE_READY; } // end of open tag, but not end of element openedTag(currentTagStart, this._offset, currentTagName); // and reset vars just in case... currentTagStart = -1; currentTagName = null; } else { // xml attribute String attrName = consumeAttrName(); if (attrName == null) { // Oops, we found something quite unexpected in this // tag. // The best we can do is probably to drop back to // looking // for "/>", though that does risk us misinterpreting // the // contents of an attribute's associated string value. log.warn("Invalid tag found: unexpected input while looking for attr name or '/>'" + " at line " + getCurrentLineNumber()); state = STATE_EXPECTING_ETAGO; // and consume one character ++this._offset; } else { consumeWhitespace(); // html can have "stand-alone" attributes with no // following equals sign if (consumeMatch("=")) { String attrValue = consumeAttrValue(); } } } continue; } if (state == STATE_IN_MARKED_SECTION) { // in this state, nothing but "]]>" has any significance consumeExcept("]"); if (isFinished()) { break; } if (consumeMatch("]]>")) { state = STATE_READY; } else { // false call; ] is not end of cdata section consumeMatch("]"); } continue; } if (state == STATE_EXPECTING_ETAGO) { // The term "ETAGO" is the official spec term for "</". consumeExcept("<"); if (isFinished()) { log.debug("Malformed input page; input terminated while tag not closed."); break; } if (consumeMatch("</")) { if (!processEndTag()) { return; } state = STATE_READY; } else { // false call; < does not start an ETAGO consumeMatch("<"); } continue; } } } /** * Invoked when "</" has been seen in the input, this method handles the * parsing of the end tag and the invocation of the appropriate callback * method. * * @return true if the tag was successfully parsed, and false if there was a * fatal parsing error. */ private boolean processEndTag() { int tagStart = this._offset - 2; String tagName = consumeElementName(); consumeWhitespace(); if (!consumeMatch(">")) { log.error("Malformed end tag at line " + getCurrentLineNumber() + "; skipping parsing"); return false; } // inform user that the tag has been closed closedTag(tagStart, this._offset, tagName); // We can't verify that the tag names balance because this is HTML // we are processing, not XML. return true; } /** * Invoke a callback method to inform the listener that we have found a * start tag. * * @param startOffset * @param endOffset * @param tagName */ void openedTag(int startOffset, int endOffset, String tagName) { // log.debug("Found open tag at " + startOffset + ":" + endOffset + ":" // + tagName); if ("head".equalsIgnoreCase(tagName)) { this._listener.openedStartTag(startOffset, HEAD_TAG); this._listener.closedStartTag(endOffset, HEAD_TAG); } else if ("body".equalsIgnoreCase(tagName)) { this._listener.openedStartTag(startOffset, BODY_TAG); this._listener.closedStartTag(endOffset, BODY_TAG); } else if ("script".equalsIgnoreCase(tagName)) { this._listener.openedStartTag(startOffset, SCRIPT_TAG); this._listener.closedStartTag(endOffset, SCRIPT_TAG); } } void closedTag(int startOffset, int endOffset, String tagName) { // log.debug("Found close tag at " + startOffset + ":" + endOffset + ":" // + tagName); if ("head".equalsIgnoreCase(tagName)) { this._listener.openedEndTag(startOffset, HEAD_TAG); this._listener.closedEndTag(endOffset, HEAD_TAG); } else if ("body".equalsIgnoreCase(tagName)) { this._listener.openedEndTag(startOffset, BODY_TAG); this._listener.closedEndTag(endOffset, BODY_TAG); } else if ("script".equalsIgnoreCase(tagName)) { this._listener.openedEndTag(startOffset, SCRIPT_TAG); this._listener.closedEndTag(endOffset, SCRIPT_TAG); } } }