Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.myfaces.renderkit.html.util; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * A class which detects the open/close tags in an HTML document and reports * them to a listener class. * <p> * This is unfortunately necessary when using JSF with JSP, as tags in the body * of the document can need to output commands into the document at points * earlier than the tag occurred (particularly into the document HEAD section). * This can only be implemented by buffering the response and post-processing * it to find the relevant HTML tags and modifying the buffer as needed. * <p> * This class tries to do the parsing as quickly as possible; many of the * details of HTML are not relevant for the purposes this class is used for. * * @version $Revision$ $Date$ */ public class ReducedHTMLParser { // IMPLEMENTATION NOTE: // // Many of the methods on this class are package-scope. This is intended // solely for the purpose of unit-testing. This class does not expect // other classes in this package to access its methods. private static final Log log = LogFactory.getLog(ReducedHTMLParser.class); public static final int BODY_TAG = 0; public static final int HEAD_TAG = 1; public static final int SCRIPT_TAG = 2; private static final int STATE_READY = 0; private static final int STATE_IN_COMMENT = 1; private static final int STATE_IN_TAG = 2; private static final int STATE_IN_MARKED_SECTION = 3; private static final int STATE_EXPECTING_ETAGO = 4; private int _offset; private int _lineNumber; private CharSequence _seq; private CallbackListener _listener; public static void parse(CharSequence seq, CallbackListener l) { new ReducedHTMLParser(seq, l).parse(); } /** * Constructor, package-scope for unit testing. * * @param s is the sequence of chars to parse. * @param l is the listener to invoke callbacks on. */ ReducedHTMLParser(CharSequence s, CallbackListener l) { _seq = s; _listener = l; } /** * Return true if there are no more characters to parse. */ boolean isFinished() { return _offset >= _seq.length(); } int getCurrentLineNumber() { return _lineNumber; } /** * Advance the current parse position over any whitespace characters. */ void consumeWhitespace() { boolean crSeen = false; while (_offset < _seq.length()) { char c = _seq.charAt(_offset); if (!Character.isWhitespace(c)) { break; } // Track line number for error messages. if (c == '\r') { ++_lineNumber; crSeen = true; } else if ((c == '\n') && !crSeen) { ++_lineNumber; } else { crSeen = false; } ++_offset; } } /** * Eat up a sequence of non-whitespace characters and return them. */ String consumeNonWhitespace() { int wordStart = _offset; while (_offset < _seq.length()) { char c = _seq.charAt(_offset); if (Character.isWhitespace(c)) { break; } ++_offset; } if (wordStart == _offset) { return null; } else { return _seq.subSequence(wordStart, _offset).toString(); } } /** * If the next chars in the input sequence exactly match the specified * string then skip over them and return true. * <p> * If there is not a match then leave the current parse position * unchanged and return false. * * @param s is the exact string to match. * @return true if the input contains exactly the param s */ boolean consumeMatch(String s) { if (_offset + s.length() > _seq.length()) { // seq isn't long enough to contain the specified string return false; } int i = 0; while (i < s.length()) { if (_seq.charAt(_offset + i) == s.charAt(i)) { ++i; } else { return false; } } _offset += i; return true; } /** * Eat up a sequence of chars which form a valid XML element name. * <p> * TODO: implement this properly in compliance with spec */ String consumeElementName() { consumeWhitespace(); int nameStart = _offset; while (!isFinished()) { boolean ok = false; char c = _seq.charAt(_offset); if (Character.isLetterOrDigit(_seq.charAt(_offset))) { ok = true; } else if (c == '_') { ok = true; } else if (c == '-') { ok = true; } else if (c == ':') { ok = true; } if (!ok) { break; } ++_offset; } if (nameStart == _offset) { return null; } else { return _seq.subSequence(nameStart, _offset).toString(); } } /** * Eat up a sequence of chars which form a valid XML attribute name. * <p> * TODO: implement this properly in compliance with spec */ String consumeAttrName() { // for now, assume elements and attributes have same rules return consumeElementName(); } /** * Eat up a string which is terminated with the specified quote * character. This means handling escaped quote chars within the * string. * <p> * This method assumes that the leading quote has already been * consumed. */ String consumeString(char quote) { // TODO: should we consider a string to be terminated by a newline? // that would help with runaway strings but I think that multiline // strings *are* allowed... // // TODO: detect newlines within strings and increment lineNumber. // This isn't so important, though; they aren't common and being a // few lines out in an error message isn't serious either. StringBuffer stringBuf = new StringBuffer(); boolean escaping = false; while (!isFinished()) { char c = _seq.charAt(_offset); ++_offset; if (c == quote) { if (!escaping) { break; } else { stringBuf.append(c); escaping = false; } } else if (c == '\\') { if (escaping) { // append a real backslash stringBuf.append(c); escaping = false; } else { escaping = true; } } else { if (escaping) { stringBuf.append('\\'); escaping = false; } stringBuf.append(c); } } return stringBuf.toString(); } /** * Assuming we have already encountered "attrname=", consume the * value part of the attribute definition. Note that unlike XML, * HTML doesn't have to quote its attribute values. * * @return the attribute value. If the attr-value was quoted, * the returned value will not include the quote chars. */ String consumeAttrValue() { consumeWhitespace(); if (consumeMatch("'")) { return consumeString('\''); } else if (consumeMatch("\"")) { return consumeString('"'); } else { return consumeNonWhitespace(); } } /** * Discard all characters in the input until one in the specified * string (character-set) is found. * * @param s is a set of characters that should not be discarded. */ void consumeExcept(String s) { boolean crSeen = false; while (_offset < _seq.length()) { char c = _seq.charAt(_offset); if (s.indexOf(c) >= 0) { // char is in the exception set return; } // Track line number for error messages. if (c == '\r') { ++_lineNumber; crSeen = true; } else if ((c == '\n') && !crSeen) { ++_lineNumber; } else { crSeen = false; } ++_offset; } } /** * Process the entire input buffer, invoking callbacks on the listener * object as appropriate. */ void parse() { int state = STATE_READY; int currentTagStart = -1; String currentTagName = null; _lineNumber = 1; _offset = 0; int lastOffset = _offset - 1; while (_offset < _seq.length()) { // Sanity check; each pass through this loop must increase the offset. // Failure to do this means a hang situation has occurred. if (_offset <= lastOffset) { // throw new RuntimeException("Infinite loop detected in ReducedHTMLParser"); log.error("Infinite loop detected in ReducedHTMLParser; parsing skipped." + " Surroundings: '" + getTagSurroundings() + "'."); //return; } lastOffset = _offset; if (state == STATE_READY) { // in this state, nothing but "<" has any significance consumeExcept("<"); if (isFinished()) { break; } if (consumeMatch("<!--")) { // Note that whitespace is *not* permitted in <!-- state = STATE_IN_COMMENT; } else if (consumeMatch("<![")) { // Start of a "marked section", eg "<![CDATA" or // "<![INCLUDE" or "<![IGNORE". These always terminate // with "]]>" log.debug("Marked section found at line " + getCurrentLineNumber() + ". " + "Surroundings: '" + getTagSurroundings() + "'."); state = STATE_IN_MARKED_SECTION; } else if (consumeMatch("<!DOCTYPE")) { log.debug("DOCTYPE found at line " + getCurrentLineNumber()); // we don't need to actually do anything here; the // tag can't contain a bare "<", so the first "<" // indicates the start of the next real tag. // // TODO: Handle case where the DOCTYPE includes an internal DTD. In // that case there *will* be embedded < chars in the document. However // that's very unlikely to be used in a JSF page, so this is pretty low // priority. } else if (consumeMatch("<?")) { // xml processing instruction or <!DOCTYPE> tag // we don't need to actually do anything here; the // tag can't contain a bare "<", so the first "<" // indicates the start of the next real tag. log.debug("PI found at line " + getCurrentLineNumber()); } else if (consumeMatch("</")) { if (!processEndTag()) { // message already logged return; } // stay in state READY state = STATE_READY; } else if (consumeMatch("<")) { // We can't tell the user that the tag has closed until after we have // processed any attributes and found the real end of the tag. So save // the current info until the end of this tag. currentTagStart = _offset - 1; currentTagName = consumeElementName(); if (currentTagName == null) { log.warn("Invalid HTML; bare lessthan sign found at line " + getCurrentLineNumber() + ". " + "Surroundings: '" + getTagSurroundings() + "'."); // remain in STATE_READY; this isn't really the start of // an xml element. } else { state = STATE_IN_TAG; } } else { // should never get here throw new Error("Internal error at line " + getCurrentLineNumber()); } continue; } if (state == STATE_IN_COMMENT) { // TODO: handle "-- >", which is a valid way to close a // comment according to the specs. // in this state, nothing but "--" has any significance consumeExcept("-"); if (isFinished()) { break; } if (consumeMatch("-->")) { state = STATE_READY; } else { // false call; hyphen is not end of comment consumeMatch("-"); } continue; } if (state == STATE_IN_TAG) { consumeWhitespace(); if (consumeMatch("/>")) { // ok, end of element state = STATE_READY; closedTag(currentTagStart, _offset, currentTagName); // and reset vars just in case... currentTagStart = -1; currentTagName = null; } else if (consumeMatch(">")) { if (currentTagName.equalsIgnoreCase("script") || currentTagName.equalsIgnoreCase("style")) { // We've just started a special tag which can contain anything except // the ETAGO marker ("</"). See // http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data state = STATE_EXPECTING_ETAGO; } else { state = STATE_READY; } // end of open tag, but not end of element openedTag(currentTagStart, _offset, currentTagName); // and reset vars just in case... currentTagStart = -1; currentTagName = null; } else { // xml attribute String attrName = consumeAttrName(); if (attrName == null) { // Oops, we found something quite unexpected in this tag. // The best we can do is probably to drop back to looking // for "/>", though that does risk us misinterpreting the // contents of an attribute's associated string value. log.warn("Invalid tag found: unexpected input while looking for attr name or '/>'" + " at line " + getCurrentLineNumber() + ". " + "Surroundings: '" + getTagSurroundings() + "'."); state = STATE_EXPECTING_ETAGO; // and consume one character ++_offset; } else { consumeWhitespace(); // html can have "stand-alone" attributes with no following equals sign if (consumeMatch("=")) { consumeAttrValue(); } } } continue; } if (state == STATE_IN_MARKED_SECTION) { // in this state, nothing but "]]>" has any significance consumeExcept("]"); if (isFinished()) { break; } if (consumeMatch("]]>")) { state = STATE_READY; } else { // false call; ] is not end of cdata section consumeMatch("]"); } continue; } if (state == STATE_EXPECTING_ETAGO) { // The term "ETAGO" is the official spec term for "</". consumeExcept("<"); if (isFinished()) { log.debug("Malformed input page; input terminated while tag not closed."); break; } if (consumeMatch("</")) { if (!processEndTag()) { return; } state = STATE_READY; } else { // false call; < does not start an ETAGO consumeMatch("<"); } continue; } } } /** * Get details about malformed HTML tag. * * @return Tag surroundings. */ private String getTagSurroundings() { int maxLength = 30; int end = _seq.length(); if (end - _offset > maxLength) { end = _offset + maxLength; } return _seq.subSequence(_offset, end).toString(); } /** * Invoked when "</" has been seen in the input, this method * handles the parsing of the end tag and the invocation of the * appropriate callback method. * * @return true if the tag was successfully parsed, and false * if there was a fatal parsing error. */ private boolean processEndTag() { int tagStart = _offset - 2; String tagName = consumeElementName(); consumeWhitespace(); if (!consumeMatch(">")) { // log details about malformed end tag log.error("Malformed end tag '" + tagName + "' at line " + getCurrentLineNumber() + "; skipping parsing. Surroundings: '" + getTagSurroundings() + "'."); return false; } // inform user that the tag has been closed closedTag(tagStart, _offset, tagName); // We can't verify that the tag names balance because this is HTML // we are processing, not XML. return true; } /** * Invoke a callback method to inform the listener that we have found a start tag. * * @param startOffset * @param endOffset * @param tagName */ void openedTag(int startOffset, int endOffset, String tagName) { //log.debug("Found open tag at " + startOffset + ":" + endOffset + ":" + tagName); if ("head".equalsIgnoreCase(tagName)) { _listener.openedStartTag(startOffset, HEAD_TAG); _listener.closedStartTag(endOffset, HEAD_TAG); } else if ("body".equalsIgnoreCase(tagName)) { _listener.openedStartTag(startOffset, BODY_TAG); _listener.closedStartTag(endOffset, BODY_TAG); } else if ("script".equalsIgnoreCase(tagName)) { _listener.openedStartTag(startOffset, SCRIPT_TAG); _listener.closedStartTag(endOffset, SCRIPT_TAG); } } void closedTag(int startOffset, int endOffset, String tagName) { //log.debug("Found close tag at " + startOffset + ":" + endOffset + ":" + tagName); if ("head".equalsIgnoreCase(tagName)) { _listener.openedEndTag(startOffset, HEAD_TAG); _listener.closedEndTag(endOffset, HEAD_TAG); } else if ("body".equalsIgnoreCase(tagName)) { _listener.openedEndTag(startOffset, BODY_TAG); _listener.closedEndTag(endOffset, BODY_TAG); } else if ("script".equalsIgnoreCase(tagName)) { _listener.openedEndTag(startOffset, SCRIPT_TAG); _listener.closedEndTag(endOffset, SCRIPT_TAG); } } }