org.apache.jmeter.protocol.http.parser.JTidyHTMLParser.java Source code

Introduction

Here is the source code for org.apache.jmeter.protocol.http.parser.JTidyHTMLParser.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.apache.jmeter.protocol.http.parser;

import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;

import org.apache.commons.lang3.StringUtils;
import org.apache.jmeter.protocol.http.util.ConversionUtils;
import org.apache.jorphan.logging.LoggingManager;
import org.apache.log.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import org.xml.sax.SAXException;

/**
 * HtmlParser implementation using JTidy.
 *
 */
class JTidyHTMLParser extends HTMLParser {
    private static final Logger log = LoggingManager.getLoggerForClass();

    protected JTidyHTMLParser() {
        super();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Iterator<URL> getEmbeddedResourceURLs(String userAgent, byte[] html, URL baseUrl, URLCollection urls,
            String encoding) throws HTMLParseException {
        Document dom = null;
        try {
            dom = (Document) getDOM(html, encoding);
        } catch (SAXException se) {
            throw new HTMLParseException(se);
        }

        // Now parse the DOM tree

        scanNodes(dom, urls, baseUrl);

        return urls.iterator();
    }

    /**
     * Scan nodes recursively, looking for embedded resources
     *
     * @param node -
     *            initial node
     * @param urls -
     *            container for URLs
     * @param baseUrl -
     *            used to create absolute URLs
     *
     * @return new base URL
     */
    private URL scanNodes(Node node, URLCollection urls, URL baseUrl) throws HTMLParseException {
        if (node == null) {
            return baseUrl;
        }

        String name = node.getNodeName();

        int type = node.getNodeType();

        switch (type) {

        case Node.DOCUMENT_NODE:
            scanNodes(((Document) node).getDocumentElement(), urls, baseUrl);
            break;

        case Node.ELEMENT_NODE:

            NamedNodeMap attrs = node.getAttributes();
            if (name.equalsIgnoreCase(TAG_BASE)) {
                String tmp = getValue(attrs, ATT_HREF);
                if (tmp != null) {
                    try {
                        baseUrl = ConversionUtils.makeRelativeURL(baseUrl, tmp);
                    } catch (MalformedURLException e) {
                        throw new HTMLParseException(e);
                    }
                }
                break;
            }

            if (name.equalsIgnoreCase(TAG_IMAGE) || name.equalsIgnoreCase(TAG_EMBED)) {
                urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
                break;
            }

            if (name.equalsIgnoreCase(TAG_APPLET)) {
                urls.addURL(getValue(attrs, "code"), baseUrl);
                break;
            }

            if (name.equalsIgnoreCase(TAG_OBJECT)) {
                String data = getValue(attrs, "codebase");
                if (!StringUtils.isEmpty(data)) {
                    urls.addURL(data, baseUrl);
                }

                data = getValue(attrs, "data");
                if (!StringUtils.isEmpty(data)) {
                    urls.addURL(data, baseUrl);
                }
                break;
            }

            if (name.equalsIgnoreCase(TAG_INPUT)) {
                String src = getValue(attrs, ATT_SRC);
                String typ = getValue(attrs, ATT_TYPE);
                if ((src != null) && (typ.equalsIgnoreCase(ATT_IS_IMAGE))) {
                    urls.addURL(src, baseUrl);
                }
                break;
            }
            if (name.equalsIgnoreCase(TAG_LINK) && getValue(attrs, ATT_REL).equalsIgnoreCase(STYLESHEET)) {
                urls.addURL(getValue(attrs, ATT_HREF), baseUrl);
                break;
            }
            if (name.equalsIgnoreCase(TAG_SCRIPT)) {
                urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
                break;
            }
            if (name.equalsIgnoreCase(TAG_FRAME)) {
                urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
                break;
            }
            if (name.equalsIgnoreCase(TAG_IFRAME)) {
                urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
                break;
            }
            String back = getValue(attrs, ATT_BACKGROUND);
            if (back != null) {
                urls.addURL(back, baseUrl);
            }
            if (name.equalsIgnoreCase(TAG_BGSOUND)) {
                urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
                break;
            }

            String style = getValue(attrs, ATT_STYLE);
            if (style != null) {
                HtmlParsingUtils.extractStyleURLs(baseUrl, urls, style);
            }

            NodeList children = node.getChildNodes();
            if (children != null) {
                int len = children.getLength();
                for (int i = 0; i < len; i++) {
                    baseUrl = scanNodes(children.item(i), urls, baseUrl);
                }
            }

            break;

        // case Node.TEXT_NODE:
        // break;

        default:
            // ignored
            break;
        }

        return baseUrl;

    }

    /*
     * Helper method to get an attribute value, if it exists @param attrs list
     * of attributs @param attname attribute name @return
     */
    private String getValue(NamedNodeMap attrs, String attname) {
        String v = null;
        Node n = attrs.getNamedItem(attname);
        if (n != null) {
            v = n.getNodeValue();
        }
        return v;
    }

    /**
     * Returns <code>tidy</code> as HTML parser.
     *
     * @return a <code>tidy</code> HTML parser
     */
    private static Tidy getTidyParser(String encoding) {
        log.debug("Start : getParser");
        Tidy tidy = new Tidy();
        tidy.setInputEncoding(encoding);
        tidy.setOutputEncoding("UTF8");
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        if (log.isDebugEnabled()) {
            log.debug("getParser : tidy parser created - " + tidy);
        }
        log.debug("End   : getParser");
        return tidy;
    }

    /**
     * Returns a node representing a whole xml given an xml document.
     *
     * @param text
     *            an xml document (as a byte array)
     * @return a node representing a whole xml
     *
     * @throws SAXException
     *             indicates an error parsing the xml document
     */
    private static Node getDOM(byte[] text, String encoding) throws SAXException {
        log.debug("Start : getDOM");
        Node node = getTidyParser(encoding).parseDOM(new ByteArrayInputStream(text), null);
        if (log.isDebugEnabled()) {
            log.debug("node : " + node);
        }
        log.debug("End   : getDOM");
        return node;
    }
}