org.apache.wicket.markup.parser.XmlPullParser.java Source code

Introduction

Here is the source code for org.apache.wicket.markup.parser.XmlPullParser.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.wicket.markup.parser;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.text.ParseException;
import java.util.Locale;

import org.apache.wicket.markup.parser.XmlTag.TagType;
import org.apache.wicket.markup.parser.XmlTag.TextSegment;
import org.apache.wicket.util.io.FullyBufferedReader;
import org.apache.wicket.util.io.IOUtils;
import org.apache.wicket.util.io.XmlReader;
import org.apache.wicket.util.lang.Args;
import org.apache.wicket.util.parse.metapattern.parsers.TagNameParser;
import org.apache.wicket.util.parse.metapattern.parsers.VariableAssignmentParser;
import org.apache.wicket.util.string.Strings;

/**
 * A fairly shallow markup pull parser which parses a markup string of a given type of markup (for
 * example, html, xml, vxml or wml) into ComponentTag and RawMarkup tokens.
 * 
 * @author Jonathan Locke
 * @author Juergen Donnerstag
 */
public final class XmlPullParser implements IXmlPullParser {
    /** Name of the {@code style} tag; {@link #next()} compares lower-cased tag text against it. */
    public static final String STYLE = "style";

    /** Name of the {@code script} tag; {@link #next()} compares lower-cased tag text against it. */
    public static final String SCRIPT = "script";

    /**
     * The encoding of the XML. Taken from the {@code XmlReader} when parsing an input stream and
     * reset to {@code null} when parsing a character sequence.
     */
    private String encoding;

    /**
     * A XML independent reader which loads the whole source data into memory and which provides
     * convenience methods to access the data. Remains {@code null} until one of the
     * {@code parse(...)} methods has been called.
     */
    private FullyBufferedReader input;

    /**
     * Name of the tag ({@link #SCRIPT} or {@link #STYLE}) whose closing tag has to be found before
     * regular parsing continues; {@code null} while no such raw-text body is pending.
     */
    private String skipUntilText;

    /** The last substring selected from the input */
    private CharSequence lastText;

    /** Everything in between &lt;!DOCTYPE ... &gt; */
    private CharSequence doctype;

    /** The type of what is in lastText */
    private HttpTagType lastType = HttpTagType.NOT_INITIALIZED;

    /** The last tag found */
    private XmlTag lastTag;

    /**
     * Creates a parser that has no input attached yet. One of the {@code parse(...)} methods must
     * be called before elements can be pulled.
     */
    public XmlPullParser() {
        super();
    }

    /**
     * @return the encoding determined while parsing an input stream; {@code null} if nothing has
     *         been parsed yet or the input was given as a character sequence
     */
    @Override
    public final String getEncoding() {
        return this.encoding;
    }

    /**
     * @return the text found between {@code <!DOCTYPE} and its closing bracket, or {@code null}
     *         if no DOCTYPE declaration has been encountered so far
     */
    @Override
    public final CharSequence getDoctype() {
        return this.doctype;
    }

    /**
     * Returns a slice of the input ending at {@code toPos}. The call is delegated to the reader's
     * single-argument {@code getSubstring}; judging by this method's name the slice starts at the
     * reader's position marker.
     *
     * @param toPos
     *            index at which the slice ends
     * @return the selected part of the input
     */
    @Override
    public final CharSequence getInputFromPositionMarker(final int toPos) {
        final CharSequence slice = input.getSubstring(toPos);
        return slice;
    }

    /**
     * Returns the part of the input delimited by the two given positions.
     *
     * @param fromPos
     *            start index, passed through to the reader
     * @param toPos
     *            end index, passed through to the reader
     * @return the selected part of the input
     */
    @Override
    public final CharSequence getInput(final int fromPos, final int toPos) {
        final CharSequence slice = input.getSubstring(fromPos, toPos);
        return slice;
    }

    /**
     * Consumes the raw (non-XHTML) body of the tag named by {@code skipUntilText}, e.g. the
     * content of a 'script' tag. Everything from the current position up to, but not including,
     * the matching closing tag becomes a single BODY element; the reader is left positioned on
     * the {@code "</"} of that closing tag.
     * 
     * @throws ParseException
     *             if no closing tag with the expected name (compared case-insensitively) follows,
     *             or if that closing tag has no terminating '>'
     */
    private void skipUntil() throws ParseException {
        final int bodyStart = input.getPosition();
        final int nameLength = skipUntilText.length();

        // Walk over every "</" until the characters right behind it spell the expected tag name.
        int closeTagStart = bodyStart - 1;
        int nameStart = 0;
        String candidate;
        do {
            closeTagStart = input.find("</", closeTagStart + 1);
            final boolean notFound = (closeTagStart == -1);
            if (notFound || ((closeTagStart + (nameLength + 2)) >= input.size())) {
                throw new ParseException(skipUntilText + " tag not closed" + getLineAndColumnText(), bodyStart);
            }

            nameStart = closeTagStart + 2;
            candidate = input.getSubstring(nameStart, nameStart + nameLength).toString();
        } while (!skipUntilText.equalsIgnoreCase(candidate));

        // Publish the raw body and stop right in front of the closing tag
        input.setPosition(closeTagStart);
        lastText = input.getSubstring(bodyStart, closeTagStart);
        lastType = HttpTagType.BODY;

        // The closing tag itself must be terminated somewhere behind its name
        final int closingBracket = input.find('>', nameStart + nameLength);
        if (closingBracket == -1) {
            throw new ParseException(skipUntilText + " tag not closed" + getLineAndColumnText(), bodyStart);
        }

        // Raw-text mode is finished
        skipUntilText = null;
    }

    /**
     * Builds the location suffix appended to parse error messages.
     * 
     * @return the reader's current line and column formatted as {@code " (line L, column C)"}
     */
    private String getLineAndColumnText() {
        final StringBuilder location = new StringBuilder(" (line ");
        location.append(input.getLineNumber());
        location.append(", column ");
        location.append(input.getColumnNumber());
        location.append(")");
        return location.toString();
    }

    /**
     * Moves the parser to the next element of the markup and classifies it.
     * <p>
     * Text in front of the next '&lt;' is reported as BODY. A regular tag is parsed into an
     * {@link XmlTag} (available through {@link #getElement()}) and reported as TAG. Tags starting
     * with '!' or '?' are delegated to
     * {@link #specialTagHandling(String, int, int)}, which decides on the reported type. After an
     * open 'style' tag, or an open 'script' tag that has no type attribute or a
     * 'text/javascript' type, the following call returns the complete tag body as one BODY.
     * 
     * @return the type of the element that has just been read, or
     *         {@link HttpTagType#NOT_INITIALIZED} when the end of the input has been reached
     * @throws ParseException
     *             if a tag has no closing bracket, is empty or is otherwise malformed
     */
    @Override
    public final HttpTagType next() throws ParseException {
        // Reached end of markup file?
        if (input.getPosition() >= input.size()) {
            return HttpTagType.NOT_INITIALIZED;
        }

        // The previous element opened a raw-text tag: its whole body is one BODY element
        if (skipUntilText != null) {
            skipUntil();
            return lastType;
        }

        // Any more tags in the markup?
        final int openBracketIndex = input.find('<');

        // Tag or Body?
        if (input.charAt(input.getPosition()) != '<') {
            // It's a BODY
            if (openBracketIndex == -1) {
                // There is no next matching tag.
                lastText = input.getSubstring(-1);
                input.setPosition(input.size());
                lastType = HttpTagType.BODY;
                return lastType;
            }

            lastText = input.getSubstring(openBracketIndex);
            input.setPosition(openBracketIndex);
            lastType = HttpTagType.BODY;
            return lastType;
        }

        // Determine the line number
        input.countLinesTo(openBracketIndex);

        // Get index of closing tag and advance past the tag
        int closeBracketIndex = -1;

        if (openBracketIndex != -1 && openBracketIndex < input.size() - 1) {
            char nextChar = input.charAt(openBracketIndex + 1);

            if ((nextChar == '!') || (nextChar == '?')) {
                // Comments, declarations and processing instructions: quotes are not significant
                closeBracketIndex = input.find('>', openBracketIndex);
            } else {
                // Regular tags: a '>' inside a quoted attribute value does not end the tag
                closeBracketIndex = input.findOutOfQuotes('>', openBracketIndex);
            }
        }

        if (closeBracketIndex == -1) {
            throw new ParseException("No matching close bracket at" + getLineAndColumnText(), input.getPosition());
        }

        // Get the complete tag text
        lastText = input.getSubstring(openBracketIndex, closeBracketIndex + 1);

        // Get the tagtext between open and close brackets
        String tagText = lastText.subSequence(1, lastText.length() - 1).toString();
        if (tagText.length() == 0) {
            throw new ParseException("Found empty tag: '<>' at" + getLineAndColumnText(), input.getPosition());
        }

        // Type of the tag, to be determined next
        final TagType type;

        // If the tag ends in '/', it's a "simple" tag like <foo/>
        if (tagText.endsWith("/")) {
            type = TagType.OPEN_CLOSE;
            tagText = tagText.substring(0, tagText.length() - 1);
        } else if (tagText.startsWith("/")) {
            // The tag text starts with a '/', it's a simple close tag
            type = TagType.CLOSE;
            tagText = tagText.substring(1);
        } else {
            // It must be an open tag
            type = TagType.OPEN;

            // Open 'script' and 'style' tags switch the parser into raw-text mode
            final String rawTextTagName = findRawTextTagName(tagText);
            if (rawTextTagName != null) {
                // prepare to skip everything between the open and close tag
                skipUntilText = rawTextTagName;
            }
        }

        // A tag like "</>" has nothing left once the slash is removed. Report it as a parse
        // error instead of failing on charAt(0) below.
        if (tagText.length() == 0) {
            throw new ParseException("Malformed tag" + getLineAndColumnText(), openBracketIndex);
        }

        // Handle special tags like <!-- and <![CDATA ...
        final char firstChar = tagText.charAt(0);
        if ((firstChar == '!') || (firstChar == '?')) {
            specialTagHandling(tagText, openBracketIndex, closeBracketIndex);

            input.countLinesTo(openBracketIndex);
            TextSegment text = new TextSegment(lastText, openBracketIndex, input.getLineNumber(),
                    input.getColumnNumber());
            lastTag = new XmlTag(text, type);

            return lastType;
        }

        TextSegment text = new TextSegment(lastText, openBracketIndex, input.getLineNumber(),
                input.getColumnNumber());
        XmlTag tag = new XmlTag(text, type);
        lastTag = tag;

        // Parse the tag text and populate tag attributes
        if (parseTagText(tag, tagText)) {
            // Move to position after the tag
            input.setPosition(closeBracketIndex + 1);
            lastType = HttpTagType.TAG;
            return lastType;
        } else {
            throw new ParseException("Malformed tag" + getLineAndColumnText(), openBracketIndex);
        }
    }

    /**
     * Decides whether the text of an open tag belongs to a tag whose body must be treated as raw
     * text.
     * 
     * @param tagText
     *            the non-empty text between the brackets of an open tag
     * @return {@link #SCRIPT} or {@link #STYLE} if the body has to be skipped, otherwise
     *         {@code null}
     */
    private static String findRawTextTagName(final String tagText) {
        // Cheap pre-check before lower-casing: both names start with 's' and the shorter one
        // ("style") is the minimum length, so that a bare <style> is recognized as well.
        if (tagText.length() < STYLE.length()) {
            return null;
        }
        final char first = tagText.charAt(0);
        if ((first != 's') && (first != 'S')) {
            return null;
        }

        final String lowerCase = tagText.toLowerCase(Locale.ROOT);
        if (lowerCase.startsWith(SCRIPT)) {
            final String typeAttr = "type=";
            final int idxOfType = lowerCase.indexOf(typeAttr);
            if (idxOfType <= 0) {
                // no type attribute so it is 'text/javascript'
                return SCRIPT;
            }

            // Step over the optional opening ' or " of the attribute value
            int valueStart = idxOfType + typeAttr.length();
            if (valueStart < lowerCase.length()) {
                final char quote = lowerCase.charAt(valueStart);
                if ((quote == '"') || (quote == '\'')) {
                    valueStart++;
                }
            }

            // startsWith(prefix, offset) simply yields false when the value is missing or too
            // short, so a tag like <script type=> cannot cause an index error.
            // any other type is assumed to be a template so it can contain child nodes.
            // See WICKET-5288
            return lowerCase.startsWith("text/javascript", valueStart) ? SCRIPT : null;
        }

        if (lowerCase.startsWith(STYLE)) {
            return STYLE;
        }
        return null;
    }

    /**
     * Handle special tags like &lt;!-- --&gt; or &lt;![CDATA[..]]&gt; or &lt;?xml&gt;. Sets
     * {@code lastType} (and, where the element spans more than the tag itself, {@code lastText})
     * and moves the reader position behind the consumed text.
     * 
     * @param tagText
     *            the text between the brackets; starts with '!' or '?'
     * @param openBracketIndex
     *            index of the opening '&lt;'
     * @param closeBracketIndex
     *            index of the first '&gt;' found behind {@code openBracketIndex}
     * @throws ParseException
     *             if a comment, conditional comment or CDATA section is not closed
     */
    protected void specialTagHandling(String tagText, final int openBracketIndex, int closeBracketIndex)
            throws ParseException {
        // Handle comments
        if (tagText.startsWith("!--")) {
            // downlevel-revealed conditional comments e.g.: <!--[if (gt IE9)|!(IE)]><!-->
            if (tagText.contains("![endif]--")) {
                lastType = HttpTagType.CONDITIONAL_COMMENT_ENDIF;

                // Move to position after the tag
                input.setPosition(closeBracketIndex + 1);
                return;
            }

            // Conditional comment? E.g.
            // "<!--[if IE]><a href='test.html'>my link</a><![endif]-->"
            if (tagText.startsWith("!--[if ") && tagText.endsWith("]")) {
                int pos = input.find("]-->", openBracketIndex + 1);
                if (pos == -1) {
                    throw new ParseException("Unclosed conditional comment beginning at" + getLineAndColumnText(),
                            openBracketIndex);
                }

                pos += 4;
                lastText = input.getSubstring(openBracketIndex, pos);

                // Actually it is no longer a comment. It is now
                // up to the browser to select the section appropriate.
                // Only the opening "<!--[if ...]>" is consumed, so the enclosed markup is
                // parsed by the following calls.
                input.setPosition(closeBracketIndex + 1);
                lastType = HttpTagType.CONDITIONAL_COMMENT;
            } else {
                // Normal comment section.
                // Skip ahead to "-->". Note that you can not simply test for
                // tagText.endsWith("--") as the comment might contain a '>'
                // inside.
                int pos = input.find("-->", openBracketIndex + 1);
                if (pos == -1) {
                    throw new ParseException("Unclosed comment beginning at" + getLineAndColumnText(),
                            openBracketIndex);
                }

                pos += 3;
                lastText = input.getSubstring(openBracketIndex, pos);
                lastType = HttpTagType.COMMENT;
                input.setPosition(pos);
            }
            return;
        }

        // The closing tag of a conditional comment, e.g.
        // "<!--[if IE]><a href='test.html'>my link</a><![endif]-->
        // and also <!--<![endif]-->"
        if (tagText.equals("![endif]--")) {
            lastType = HttpTagType.CONDITIONAL_COMMENT_ENDIF;
            input.setPosition(closeBracketIndex + 1);
            return;
        }

        // CDATA sections might contain "<" which is not part of an XML tag.
        // Make sure escaped "<" are treated right
        if (tagText.startsWith("![")) {
            final String startText = (tagText.length() <= 8 ? tagText : tagText.substring(0, 8));
            if (startText.toUpperCase(Locale.ROOT).equals("![CDATA[")) {
                // A CDATA section ends at the first "]]>". Its content is plain character data,
                // so quote characters inside it must not influence where the section ends
                // (e.g. "<![CDATA[ don't ]]>").
                final int endMarkerIndex = input.find("]]>", openBracketIndex);
                if (endMarkerIndex == -1) {
                    throw new ParseException("No matching close bracket at" + getLineAndColumnText(),
                            input.getPosition());
                }

                // Index of the '>' that terminates "]]>"
                closeBracketIndex = endMarkerIndex + 2;

                // Get the tagtext between open and close brackets
                tagText = input.getSubstring(openBracketIndex + 1, closeBracketIndex).toString();

                // Move to position after the tag
                input.setPosition(closeBracketIndex + 1);

                lastText = tagText;
                lastType = HttpTagType.CDATA;
                return;
            }
        }

        if (tagText.charAt(0) == '?') {
            lastType = HttpTagType.PROCESSING_INSTRUCTION;

            // Move to position after the tag
            input.setPosition(closeBracketIndex + 1);
            return;
        }

        if (tagText.startsWith("!DOCTYPE")) {
            lastType = HttpTagType.DOCTYPE;

            // Get the tagtext between open and close brackets
            doctype = input.getSubstring(openBracketIndex + 1, closeBracketIndex);

            // Move to position after the tag
            input.setPosition(closeBracketIndex + 1);
            return;
        }

        // Anything else starting with '!' is reported as a generic special tag
        // Move to position after the tag
        lastType = HttpTagType.SPECIAL_TAG;
        input.setPosition(closeBracketIndex + 1);
    }

    /**
     * @return the tag created by the most recent call to {@link #next()} that read a tag, or
     *         {@code null} if no tag has been read yet
     */
    @Override
    public final XmlTag getElement() {
        return this.lastTag;
    }

    /**
     * @return the text of the element selected most recently from the input
     */
    @Override
    public final CharSequence getString() {
        return this.lastText;
    }

    /**
     * Pulls elements until a regular tag is found. Every element of any other type (body,
     * comment, CDATA, ...) is skipped.
     * 
     * @return the next XML tag, or {@code null} if the end of the input is reached first
     * @throws ParseException
     *             if the markup in between is malformed
     */
    public final XmlTag nextTag() throws ParseException {
        for (;;) {
            if (next() == HttpTagType.NOT_INITIALIZED) {
                // end of input
                return null;
            }
            if (lastType == HttpTagType.TAG) {
                return lastTag;
            }
            // anything else is of no interest here: keep pulling
        }
    }

    /**
     * Searches the input for a character while skipping everything enclosed in ".." or '..'.
     * A quoted section ends at the next occurrence of the quote character that opened it.
     * 
     * @param ch
     *            The character to search
     * @param startIndex
     *            Index at which the search starts
     * @return the index of the first unquoted occurrence, or -1 if there is none
     */
    private int findChar(final char ch, int startIndex) {
        // 0 while outside of quotes, otherwise the quote character that has to close the section
        char openQuote = 0;
        int index = startIndex;

        while (index < input.size()) {
            final char current = input.charAt(index);
            if (openQuote == 0) {
                if ((current == '"') || (current == '\'')) {
                    openQuote = current;
                } else if (current == ch) {
                    return index;
                }
            } else if (current == openQuote) {
                openQuote = 0;
            }
            index++;
        }

        return -1;
    }

    /**
     * Makes the given character sequence the input of this parser.
     * <p>
     * Note: no xml character encoding is applied; the text is taken as it is and
     * {@link #getEncoding()} returns {@code null} afterwards.
     * 
     * @param string
     *            The markup to parse; must not be {@code null}
     * @throws IOException
     *             Error while reading the resource
     */
    @Override
    public void parse(final CharSequence string) throws IOException {
        Args.notNull(string, "string");

        final StringReader source = new StringReader(string.toString());
        this.input = new FullyBufferedReader(source);
        this.encoding = null;
    }

    /**
     * Reads markup from an input stream, passing "UTF-8" as the default encoding for the case
     * that the XML declaration does not specify one.
     * 
     * @param in
     *            The input stream to read and parse
     * @throws IOException
     *             Error while reading the stream
     * 
     * @see #parse(InputStream, String)
     */
    @Override
    public void parse(final InputStream in) throws IOException {
        // Default encoding when the XML declaration names none
        this.parse(in, "UTF-8");
    }

    /**
     * Reads markup from an input stream and makes it the input of this parser.
     * <p>
     * Note: the stream is always closed before this method returns, also in case of an error.
     * 
     * @param inputStream
     *            The input stream to read and parse; must not be {@code null}
     * @param encoding
     *            The default character encoding of the input
     * @throws IOException
     *             Error while reading the stream
     */
    @Override
    public void parse(final InputStream inputStream, final String encoding) throws IOException {
        Args.notNull(inputStream, "inputStream");

        try {
            final BufferedInputStream buffered = new BufferedInputStream(inputStream, 4000);
            final XmlReader reader = new XmlReader(buffered, encoding);
            this.input = new FullyBufferedReader(reader);
            // The reader reports the encoding that applies to this input
            this.encoding = reader.getEncoding();
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    }

    /**
     * Sets the reader's position marker to the reader's current position.
     */
    @Override
    public final void setPositionMarker() {
        final int currentPosition = input.getPosition();
        input.setPositionMarker(currentPosition);
    }

    /**
     * Sets the reader's position marker to the given position.
     *
     * @param pos
     *            the new marker position, passed through to the reader
     */
    @Override
    public final void setPositionMarker(final int pos) {
        this.input.setPositionMarker(pos);
    }

    /**
     * @return the string representation of the underlying reader, or {@code "null"} when no input
     *         has been attached yet
     */
    @Override
    public String toString() {
        // input stays null until parse(...) is called; toString() must not throw in that state
        // (it is typically invoked by loggers and debuggers).
        return String.valueOf(input);
    }

    /**
     * Parses the text between the brackets of a tag, for example "a href=foo.html", and stores
     * name, namespace and attributes in the given tag.
     * 
     * @param tag
     *            the tag to populate
     * @param tagText
     *            The text between the brackets, without a leading or trailing '/'
     * @return false if the text does not start with a valid tag name, otherwise true
     * @throws ParseException
     *             if the same attribute is found twice
     */
    private boolean parseTagText(final XmlTag tag, final String tagText) throws ParseException {
        // Without a valid tag name at the very beginning there is nothing to parse
        final TagNameParser nameParser = new TagNameParser(tagText);
        if (!nameParser.matcher().lookingAt()) {
            return false;
        }

        tag.name = nameParser.getName();
        tag.namespace = nameParser.getNamespace();

        // Name only, no attributes?
        final int endOfText = tagText.length();
        int cursor = nameParser.matcher().end(0);
        if (cursor == endOfText) {
            return true;
        }

        // Collect the attributes, one match at a time, each search starting behind the last match
        final VariableAssignmentParser assignmentParser = new VariableAssignmentParser(tagText);
        while (assignmentParser.matcher().find(cursor)) {
            // An attribute without value, like in <html xmlns:wicket>, yields null
            final String matchedValue = assignmentParser.getValue();
            String value = (matchedValue == null) ? "" : matchedValue;

            // Continue behind this attribute
            cursor = assignmentParser.matcher().end(0);

            // Remove the enclosing double or single quotes
            if (value.startsWith("\"") || value.startsWith("\'")) {
                value = value.substring(1, value.length() - 1);
            }

            // Strip surrounding whitespace, then resolve markup escapes
            value = Strings.unescapeMarkup(value.trim()).toString();

            final String key = assignmentParser.getKey();

            // A previous value for the same key means the attribute was declared twice
            final Object previous = tag.getAttributes().put(key, value);
            if (previous != null) {
                throw new ParseException("Same attribute found twice: " + key + getLineAndColumnText(),
                        input.getPosition());
            }

            // Everything consumed: done
            if (cursor == endOfText) {
                return true;
            }
        }

        // Text left over behind the last attribute is tolerated
        return true;
    }
}