org.archive.wayback.replay.html.transformer.JSStringTransformer.java Source code

Introduction

Here is the source code for org.archive.wayback.replay.html.transformer.JSStringTransformer.java
Source

/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.replay.html.transformer;

import java.lang.reflect.Method;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import org.archive.wayback.replay.html.ReplayParseContext;
import org.archive.wayback.replay.html.StringTransformer;

/**
 * Translates absolute URLs found in JavaScript code block.
 * <p>Looks for http/https absolute URLs in JavaScript code and translates
 * them with {@link ReplayParseContext#contextualizeUrl(String)}.</p>
 * <p>You can customize the pattern for finding URLs with {@code regex} property.
 * Regular expression must have at least one <em>capturing group</em>, and the first
 * capturing group is assumed to enclose URL to be rewritten.
 * (new feature 2014-04-22) Any matching text preceding and
 * following the first group will be preserved in the output.</p>
 * <p>For example: if you want to replace protocol-relative URL in addition to
 * regular full URL in JavaScript, you could use conservative regex like:
 * <pre>
 * "[\"']((?:https?:)?//(?:[^/]+@)?[^@:/]+(?:\\.[^@:/]+)+(?:[0-9]+)?)"
 * </pre>
 * Note single/double quote preceding URL is preserved in 2014-04-22 version and on.</p>
 * <p>TODO: org.archive.wayback.archivalurl.ArchivalUrlJSReplayRenderer has
 * similar code.  can be consolidated, like ArchivalURLJSStringTransformerReplayRenderer?</p>
 * <p>May 1, 2014: slight design change:
 * Now JSStringTransformer does not run its own should-rewrite check and sends all matching
 * text to {@link ReplayParseContext#contextualizeUrl(String)}. More specifically, it is no longer
 * affected by the {@code rewriteHttpsOnly} flag. This is a design choice to keep
 * {@code StringTransformer} detached from replay mode knowledge and focused on finding and
 * replacing URLs.
 * </p>
 * @author brad
 *
 */
public class JSStringTransformer implements StringTransformer {
    private static final Logger LOGGER = Logger.getLogger(JSStringTransformer.class.getName());

    private final static Pattern defaultHttpPattern = Pattern.compile("(https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)");

    private Pattern pattern = defaultHttpPattern;
    private String escaping;
    // we could expose SourceEscaping interface and sourceEscaping for higher
    // degree of customization
    private SourceEscaping sourceEscaping;

    public interface SourceEscaping {
        public String unescape(String text);

        public String escape(String text);
    }

    /**
     * SourceEscaping implemented with commons-lang {@link StringEscapeUtils}
     */
    public class CommonsLangEscaping implements SourceEscaping {
        private Method escapeMethod;
        private Method unescapeMethod;

        /**
         * Initialize with escaping scheme name.
         * See {@code escape}... methods of {@link StringEscapeUtils} for supported
         * escaping scheme names. {@code null} or empty string are valid, and will
         * make a no-op escaping.
         * @param name escaping scheme name, ex. "JavaScript"
         */
        public CommonsLangEscaping(String name) {
            if (name == null || name.isEmpty()) {
                escapeMethod = unescapeMethod = null;
            } else {
                if (name.equals("javascript")) {
                    name = "JavaScript";
                } else {
                    if (Character.isLowerCase(name.charAt(0))) {
                        name = name.substring(0, 1).toUpperCase() + name.substring(1);
                    }
                }
                try {
                    escapeMethod = StringEscapeUtils.class.getMethod("escape" + name, String.class);
                } catch (NoSuchMethodException ex) {
                    throw new IllegalArgumentException("StringEscapeUtils.escape" + name, ex);
                }
                try {
                    unescapeMethod = StringEscapeUtils.class.getMethod("unescape" + name, String.class);
                } catch (NoSuchMethodException ex) {
                    throw new IllegalArgumentException("StringEscapeUtils.unescape" + name, ex);
                }
            }
        }

        public String unescape(String text) {
            if (unescapeMethod != null) {
                try {
                    return (String) unescapeMethod.invoke(null, text);
                } catch (Exception ex) {
                    LOGGER.warn("Error unescaping text \"" + text + "\" with " + unescapeMethod, ex);
                }
            }
            return text;
        }

        public String escape(String text) {
            if (escapeMethod != null) {
                try {
                    return (String) escapeMethod.invoke(null, text);
                } catch (Exception ex) {
                    LOGGER.warn("Error escaping text \"" + text + "\" with " + escapeMethod, ex);
                }
            }
            return text;
        }
    }

    /**
     * a regular expression for searching URLs in the target resource.
     * @param regex
     */
    public void setRegex(String regex) {
        pattern = Pattern.compile(regex);
    }

    public String getRegex() {
        return pattern.pattern();
    }

    /**
     * Naming of escaping scheme applied to extracted text.
     * @param escaping escaping scheme name, such as {@code JavaScript},
     * {@code Xml}
     * @see StringEscapeUtils
     */
    public void setEscaping(String escaping) {
        this.sourceEscaping = new CommonsLangEscaping(escaping);
        this.escaping = escaping;
    }

    public String getEscaping() {
        return escaping;
    }

    public String transform(ReplayParseContext context, String input) {
        StringBuffer replaced = new StringBuffer(input.length());
        Matcher m = pattern.matcher(input);
        while (m.find()) {
            String rawUrl = m.group(1);
            String pre = input.substring(m.start(), m.start(1));
            String post = input.substring(m.end(1), m.end());

            String origUrl = sourceEscaping != null ? sourceEscaping.unescape(rawUrl) : rawUrl;
            String url = context.contextualizeUrl(origUrl);

            if (url != origUrl) {
                // reverse some changes made to url by contextualizeUrl method, that
                // may break assumptions in subsequent JavaScript processing.
                // eg. "http://example.org" -> "/20140101012345/http://example.org/"
                // eg. "https://domain" + ".example.org" -> "http://domain/" + ".example.org"
                // eg. "https://domain." + "example.org" -> "http://domain" + "example.org"

                // remove trailing "/" if origUrl doesn't have it.  As Wayback does not need
                // trailing slash, it may make sense to this everywhere.  Just doing this fix
                // in JavaScript for now.
                if (url.endsWith("/") && !origUrl.endsWith("/")) {
                    url = url.substring(0, url.length() - 1);
                }

                // add trailing "." (removed by canonicalizer) back, if origUrl has it.
                if (origUrl.endsWith(".") && !url.endsWith(".")) {
                    url = url + ".";
                }

                if (sourceEscaping != null) {
                    url = sourceEscaping.escape(url);
                }
            } else {
                // use the original rawUrl
                url = rawUrl;
            }
            m.appendReplacement(replaced, Matcher.quoteReplacement(pre + url + post));
        }
        m.appendTail(replaced);
        return replaced.toString();
    }
}