org.jamwiki.parser.jflex.JFlexParserUtil.java Source code

Introduction

Here is the source code for org.jamwiki.parser.jflex.JFlexParserUtil.java
Source

/**
 * Licensed under the GNU LESSER GENERAL PUBLIC LICENSE, version 2.1, dated February 1999.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the latest version of the GNU Lesser General
 * Public License as published by the Free Software Foundation;
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program (LICENSE.txt); if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package org.jamwiki.parser.jflex;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.jamwiki.Environment;
import org.jamwiki.model.WikiReference;
import org.jamwiki.parser.ParserException;
import org.jamwiki.parser.ParserInput;
import org.jamwiki.parser.ParserOutput;
import org.jamwiki.utils.LinkUtil;
import org.jamwiki.utils.Utilities;
import org.jamwiki.utils.WikiLink;
import org.jamwiki.utils.WikiLogger;

/**
 * Utility methods used with the Mediawiki lexers.
 *
 * <p>The class is a static-only helper: it classifies HTML tag types
 * (inline, non-nesting, empty-body, text-body), parses and sanitizes HTML
 * tags and wiki links, and tokenizes template parameter strings.
 */
public class JFlexParserUtil {

    private static final WikiLogger logger = WikiLogger.getLogger(JFlexParserUtil.class.getName());
    private static final String EMPTY_BODY_TAG_REGEX = "(br|div|hr|td|th)";
    private static final String NON_NESTING_TAG_REGEX = "(dd|dl|dt|hr|li|ol|table|tbody|td|tfoot|th|thead|tr|ul)";
    private static final String NON_TEXT_BODY_TAG_REGEX = "(dl|ol|table|tr|ul)";
    private static final String NON_INLINE_TAG_REGEX = "(caption|dd|div|dl|dt|hr|li|ol|p|table|td|th|tr|ul)";
    private static final String NON_INLINE_TAG_START_REGEX = "<" + NON_INLINE_TAG_REGEX + ">.*";
    private static final String NON_INLINE_TAG_END_REGEX = ".*</" + NON_INLINE_TAG_REGEX + ">";
    private static final Pattern EMPTY_BODY_TAG_PATTERN = Pattern.compile(EMPTY_BODY_TAG_REGEX,
            Pattern.CASE_INSENSITIVE);
    /**
     * Pattern to catch script insertions of the form "onsubmit=" or insertions that use a
     * javascript url.  An event-handler attribute is recognized when its name is preceded by
     * any whitespace character, a slash or a quote (all of which browsers accept as attribute
     * separators) and is followed by optional whitespace and "=".  This is a best-effort
     * blacklist; it does not decode entities or other obfuscated forms.
     */
    private static final Pattern JAVASCRIPT_PATTERN = Pattern.compile(
            "(?:[\\s/\"']on[a-z]{3,}\\s*=)|(?:javascript\\s*:)",
            Pattern.CASE_INSENSITIVE);
    private static final Pattern NON_NESTING_TAG_PATTERN = Pattern.compile(NON_NESTING_TAG_REGEX,
            Pattern.CASE_INSENSITIVE);
    private static final Pattern NON_TEXT_BODY_TAG_PATTERN = Pattern.compile(NON_TEXT_BODY_TAG_REGEX,
            Pattern.CASE_INSENSITIVE);
    private static final Pattern NON_INLINE_TAG_PATTERN = Pattern.compile(NON_INLINE_TAG_REGEX,
            Pattern.CASE_INSENSITIVE);
    private static final Pattern NON_INLINE_TAG_START_PATTERN = Pattern.compile(NON_INLINE_TAG_START_REGEX,
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    private static final Pattern NON_INLINE_TAG_END_PATTERN = Pattern.compile(NON_INLINE_TAG_END_REGEX,
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    /**
     * Splits a tag into: (1) the opening bracket with an optional slash, (2) the tag name,
     * (3) the attribute text, and (5) the optional slash plus closing bracket.
     */
    private static final Pattern TAG_PATTERN = Pattern.compile("(<\\s*[/]?\\s*)([a-z]+)(\\s*(.*?))([/]?\\s*>)",
            Pattern.CASE_INSENSITIVE);

    /**
     * Utility class with static methods only; not meant to be instantiated.
     */
    private JFlexParserUtil() {
    }

    /**
     * An empty body tag is one that contains no content, such as "br".
     * The parser root tag is also reported as an empty body tag.
     *
     * @param tagType The tag name (without brackets) to test.
     * @return <code>true</code> if the tag is the root tag or matches the
     *  empty-body tag list (case-insensitive).
     */
    protected static boolean isEmptyBodyTag(String tagType) {
        if (isRootTag(tagType)) {
            return true;
        }
        return EMPTY_BODY_TAG_PATTERN.matcher(tagType).matches();
    }

    /**
     * An inline tag is a tag that does not affect page flow such as
     * "b" or "i".  A non-inline tag such as "div" is one that creates
     * its own display box.  The parser root tag is reported as inline.
     *
     * @param tagType The tag name (without brackets) to test.
     * @return <code>true</code> if the tag is the root tag or is not in the
     *  non-inline tag list (case-insensitive).
     */
    protected static boolean isInlineTag(String tagType) {
        if (isRootTag(tagType)) {
            return true;
        }
        return !NON_INLINE_TAG_PATTERN.matcher(tagType).matches();
    }

    /**
     * A non-nesting tag is a tag such as "li" which cannot be nested within
     * another "li" tag.
     *
     * @param tagType The tag name (without brackets) to test.
     * @return <code>true</code> if the tag is in the non-nesting tag list
     *  (case-insensitive).
     */
    protected static boolean isNonNestingTag(String tagType) {
        return NON_NESTING_TAG_PATTERN.matcher(tagType).matches();
    }

    /**
     * Determine whether the given text ends with the closing form of a
     * non-inline tag, for example "...&lt;/div&gt;".
     *
     * @param tagText The text to test; it may span multiple lines.
     * @return <code>true</code> if the entire text matches "anything followed
     *  by a non-inline closing tag".
     */
    protected static boolean isNonInlineTagEnd(String tagText) {
        return NON_INLINE_TAG_END_PATTERN.matcher(tagText).matches();
    }

    /**
     * Determine whether the given text begins with the attribute-free opening
     * form of a non-inline tag, for example "&lt;div&gt;...".
     *
     * @param tagText The text to test; it may span multiple lines.
     * @return <code>true</code> if the entire text matches "a non-inline
     *  opening tag followed by anything".
     */
    protected static boolean isNonInlineTagStart(String tagText) {
        return NON_INLINE_TAG_START_PATTERN.matcher(tagText).matches();
    }

    /**
     * Evaluate the tag to determine whether it is the parser root tag
     * that indicates the bottom of the parser tag stack.
     *
     * @param tagType The tag name to test; must not be <code>null</code>.
     * @return <code>true</code> if the tag equals {@link JFlexTagItem#ROOT_TAG}.
     */
    protected static boolean isRootTag(String tagType) {
        return tagType.equals(JFlexTagItem.ROOT_TAG);
    }

    /**
     * Determine whether the tag allows text body content.  Some tags, such
     * as "table", allow only tag content and no text content.  The parser
     * root tag is reported as allowing text.
     *
     * @param tagType The tag name (without brackets) to test.
     * @return <code>true</code> if the tag is the root tag or is not in the
     *  list of tags that forbid text content (case-insensitive).
     */
    protected static boolean isTextBodyTag(String tagType) {
        if (isRootTag(tagType)) {
            return true;
        }
        return !NON_TEXT_BODY_TAG_PATTERN.matcher(tagType).matches();
    }

    /**
     * Provide a way to run the pre-processor against a fragment of text, such
     * as an image caption.  This method should be used sparingly since it is
     * not very efficient.
     *
     * @param parserInput The parser input used to construct the parser.
     * @param raw The text fragment to parse.  Blank or <code>null</code> text
     *  is returned unchanged without invoking the parser.
     * @param mode The parser mode passed through to the parser.
     * @return The parsed fragment.
     * @throws ParserException Thrown if the underlying parser fails.
     */
    public static String parseFragment(ParserInput parserInput, String raw, int mode) throws ParserException {
        if (StringUtils.isBlank(raw)) {
            return raw;
        }
        JFlexParser parser = new JFlexParser(parserInput);
        // a throw-away output object is used; only the returned text is kept
        ParserOutput parserOutput = new ParserOutput();
        return parser.parseFragment(parserOutput, raw, mode);
    }

    /**
     * Parse a raw Wiki link of the form "[[link|text]]", and return a WikiLink
     * object representing the link.  Any text that follows the closing "]]"
     * is treated as a suffix and appended to the link text.
     *
     * @param raw The raw Wiki link text.
     * @return A WikiLink object that represents the link.  An empty WikiLink
     *  is returned if the text is blank or does not contain an opening "[["
     *  followed by a closing "]]".
     */
    protected static WikiLink parseWikiLink(String raw) {
        if (StringUtils.isBlank(raw)) {
            return new WikiLink();
        }
        raw = raw.trim();
        // for performance reasons use String methods rather than regex
        int linkStart = raw.indexOf("[[");
        int linkEnd = raw.lastIndexOf("]]");
        if (linkStart == -1 || linkEnd < linkStart + 2) {
            // brackets missing or out of order: without this guard the substring
            // calls below would throw or silently drop characters
            return new WikiLink();
        }
        // anything after the closing brackets is a suffix, as in "[[link]]s"
        String suffix = (linkEnd + 2 < raw.length()) ? raw.substring(linkEnd + 2) : null;
        raw = raw.substring(linkStart + 2, linkEnd).trim();
        boolean colon = false;
        if (raw.startsWith(":")) {
            colon = true;
            raw = raw.substring(1).trim();
        }
        String text = null;
        int pos = raw.indexOf('|');
        // a pipe in the last position carries no text, so it is left in place
        if (pos != -1 && pos != (raw.length() - 1)) {
            text = raw.substring(pos + 1).trim();
            raw = raw.substring(0, pos).trim();
        }
        WikiLink wikiLink = LinkUtil.parseWikiLink(raw);
        wikiLink.setColon(colon);
        wikiLink.setText(text);
        if (!StringUtils.isBlank(suffix)) {
            wikiLink.setText((StringUtils.isBlank(text) ? wikiLink.getDestination() : text) + suffix);
        }
        return wikiLink;
    }

    /**
     * Clean up HTML tags to make them XHTML compliant (lowercase, no
     * unnecessary spaces).  Every space character is removed, so this method
     * is only suitable for tags that carry no attributes.
     *
     * @param tag The raw tag text, for example "&lt; BR /&gt;".
     * @return The lowercased tag with spaces removed; self-closing tags are
     *  returned in the form "&lt;br /&gt;".
     */
    protected static String sanitizeHtmlTag(String tag) {
        String result = tag.trim();
        // use a fixed locale so that tag names do not depend on the JVM default
        // locale (e.g. "LI" must not become a dotless-i form under Turkish)
        result = StringUtils.remove(result, " ").toLowerCase(Locale.ENGLISH);
        if (result.endsWith("/>")) {
            // spaces were stripped, so make sure tag is of the form "<br />"
            result = result.substring(0, result.length() - 2) + " />";
        }
        return result;
    }

    /**
     * Given a tag of the form "&lt;tag&gt;content&lt;/tag&gt;", return all content between
     * the tags.  Consider the following examples:
     *
     * "&lt;tag&gt;content&lt;/tag&gt;" returns "content".
     * "&lt;tag /&gt;" returns an empty string.
     * "&lt;tag&gt;&lt;sub&gt;content&lt;/sub&gt;&lt;/tag&gt;" returns "&lt;sub&gt;content&lt;/sub&gt;".
     *
     * @param raw The raw tag content to be analyzed.
     * @return The content for the tag being analyzed.  If the text contains
     *  no "&gt;" character it is returned unchanged.
     */
    protected static String tagContent(String raw) {
        // content runs from just after the first '>' up to the last '<'
        int start = raw.indexOf('>') + 1;
        int end = raw.lastIndexOf('<');
        if (start == 0) {
            // no tags
            return raw;
        }
        if (end <= start) {
            return "";
        }
        return raw.substring(start, end);
    }

    /**
     * Given an HTML tag, split it into its tag type and tag attributes,
     * cleaning up the attributes in the process - allowing Javascript
     * action tags to be used as attributes (onmouseover, etc) is
     * a bad thing, so clean up HTML tags to remove any such attributes.
     *
     * @param tag The raw HTML tag text.
     * @return A four element array holding, in order: the lowercased tag
     *  type, the validated attributes, the opening portion of the tag
     *  ("&lt;" or "&lt;/") and the closing portion ("&gt;" or "/&gt;").  If the text
     *  cannot be matched as a tag an error is logged and all four elements
     *  are <code>null</code>.
     */
    protected static String[] parseHtmlTag(String tag) {
        Matcher m = TAG_PATTERN.matcher(tag);
        String[] result = new String[4];
        if (!m.find()) {
            logger.severe("Failure while attempting to match html tag for pattern " + tag);
            return result;
        }
        // fixed locale: tag names must not vary with the JVM default locale
        String tagType = m.group(2).toLowerCase(Locale.ENGLISH).trim();
        String tagAttributes = m.group(3).trim();
        String tagOpen = m.group(1).trim();
        String tagClose = m.group(5).trim();
        if (!StringUtils.isBlank(tagAttributes)) {
            tagAttributes = JFlexParserUtil.validateHtmlTagAttributes(tagAttributes).trim();
        }
        result[0] = tagType;
        result[1] = tagAttributes;
        result[2] = tagOpen;
        result[3] = tagClose;
        return result;
    }

    /**
     * During parsing the reference objects will be stored as a temporary array.  This method
     * parses that array and returns the reference objects.
     *
     * @param parserInput The current ParserInput object for the topic that is being parsed.
     * @return A list of reference objects (never <code>null</code>) for the current topic that
     *  is being parsed.  If no list exists yet an empty one is created and stored in the
     *  parser input's temporary parameters.
     */
    protected static List<WikiReference> retrieveReferences(ParserInput parserInput) {
        // the compiler cannot verify the element type of the stored value, so
        // the warning is suppressed for this single declaration only
        @SuppressWarnings("unchecked")
        List<WikiReference> references = (List<WikiReference>) parserInput.getTempParams()
                .get(WikiReferenceTag.REFERENCES_PARAM);
        if (references == null) {
            references = new ArrayList<WikiReference>();
            parserInput.getTempParams().put(WikiReferenceTag.REFERENCES_PARAM, references);
        }
        return references;
    }

    /**
     * Parse a template string of the form "param1|param2|param3" into tokens
     * (param1, param2, and param3 in the example), handling such cases as
     * "param1|[[foo|bar]]|param3" correctly.  Pipes that appear inside a
     * template parameter ("{{{...}}}"), template ("{{...}}"), link
     * ("[[...]]") or table ("{|...|}") do not split tokens.
     *
     * @param content The parameter string to tokenize; must not be <code>null</code>.
     * @return The list of tokens.  The list always contains at least one
     *  element; an empty string yields a single empty token.
     */
    protected static List<String> tokenizeParamString(String content) {
        List<String> tokens = new ArrayList<String>();
        StringBuilder value = new StringBuilder();
        int pos = 0;
        while (pos < content.length()) {
            // a result of -1 from findMatchingEndTag is treated as "no matching
            // end tag", in which case the current character is copied literally
            int endPos = -1;
            if (content.startsWith("{{{", pos)) {
                // template parameter
                endPos = Utilities.findMatchingEndTag(content, pos, "{{{", "}}}");
            } else if (content.startsWith("{{", pos)) {
                // template
                endPos = Utilities.findMatchingEndTag(content, pos, "{{", "}}");
            } else if (content.startsWith("[[", pos)) {
                // link
                endPos = Utilities.findMatchingEndTag(content, pos, "[[", "]]");
            } else if (content.startsWith("{|", pos)) {
                // table
                endPos = Utilities.findMatchingEndTag(content, pos, "{|", "|}");
            } else if (content.charAt(pos) == '|') {
                // new token
                tokens.add(value.toString());
                value.setLength(0);
                pos++;
                continue;
            }
            if (endPos != -1) {
                // copy the whole nested construct so that its pipes are kept
                value.append(content.substring(pos, endPos));
                pos = endPos;
            } else {
                value.append(content.charAt(pos));
                pos++;
            }
        }
        // add the last one
        tokens.add(value.toString());
        return tokens;
    }

    /**
     * Allowing Javascript action tags to be used as attributes (onmouseover, etc) is
     * a bad thing, so clean up HTML tags to remove any such attributes.  The tag is
     * rebuilt from its parsed parts in normalized form.
     *
     * @param tag The raw HTML tag text.
     * @return The rebuilt tag: "&lt;" (plus "/" for a closing tag), the lowercased tag
     *  name, a space and the validated attributes if any remain, and either " /&gt;"
     *  for a self-closing tag or "&gt;".  If the text cannot be parsed as a tag it is
     *  returned with its angle brackets escaped so that it is not emitted as markup.
     */
    protected static String validateHtmlTag(String tag) {
        String[] tagInfo = JFlexParserUtil.parseHtmlTag(tag);
        String tagKeyword = tagInfo[0];
        if (tagKeyword == null) {
            // parseHtmlTag could not match the text (and has logged the failure);
            // neutralize the brackets rather than failing with a NullPointerException
            return StringUtils.replace(StringUtils.replace(tag, "<", "&lt;"), ">", "&gt;");
        }
        String attributes = tagInfo[1];
        String tagOpen = tagInfo[2];
        String tagClose = tagInfo[3];
        StringBuilder result = new StringBuilder("<");
        if (tagOpen.indexOf('/') != -1) {
            result.append('/');
        }
        result.append(tagKeyword);
        if (!StringUtils.isBlank(attributes)) {
            result.append(' ').append(attributes);
        }
        if (tagClose.indexOf('/') != -1) {
            result.append(" />");
        } else {
            result.append(tagClose.trim());
        }
        return result.toString();
    }

    /**
     * Allowing Javascript action tags to be used as attributes (onmouseover, etc) is
     * a bad thing, so clean up HTML tags to remove any such attributes.  The check is
     * skipped when the {@link Environment#PROP_PARSER_ALLOW_JAVASCRIPT} property is
     * enabled.
     *
     * @param attributes The attribute portion of an HTML tag.
     * @return The attributes unchanged if they are blank or contain no Javascript;
     *  otherwise a warning is logged and an empty string is returned, discarding
     *  all of the attributes.
     */
    protected static String validateHtmlTagAttributes(String attributes) {
        if (StringUtils.isBlank(attributes)) {
            return attributes;
        }
        if (!Environment.getBooleanValue(Environment.PROP_PARSER_ALLOW_JAVASCRIPT)) {
            // pattern requires a separator prior to the "onFoo", so make sure one exists
            Matcher m = JAVASCRIPT_PATTERN.matcher(" " + attributes);
            if (m.find()) {
                logger.warning("Attempt to include Javascript in Wiki syntax " + attributes);
                return "";
            }
        }
        return attributes;
    }
}