org.archive.wayback.webapp.ServerRelativeArchivalRedirect.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.wayback.webapp.ServerRelativeArchivalRedirect.java

Source

/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.webapp;

import java.io.IOException;
import java.util.logging.Logger;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.httpclient.URIException;
import org.archive.url.UsableURI;
import org.archive.url.UsableURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.wayback.util.Timestamp;
import org.archive.wayback.util.url.UrlOperations;
import org.archive.wayback.util.webapp.AbstractRequestHandler;
import org.archive.wayback.util.webapp.RequestHandler;

/**
 * {@code ServerRelativeArchivalRedirect} is a {@link RequestHandler}
 * that redirects <i>leaked</i> server-relative URL back to replay request
 * URL.
 * <p>For example, assuming {@code Referer} is
 * {@code http://web.archive.org/web/20010203040506/http://example.com/index.html},
 * it redirects request {@code http://web.archive.org/js/foo.js}
 * to {@code http://web.archive.org/web/20010203040506/http://example.com/js/foo.js}.</p>
 * <p>It is typically set up as catch-all {@code RequestHandler}</p>
 * <p>
 * Refactoring Thoughts: parsing and construction of Archival-URL in this
 * class must match that of Archival-URL access point. Consider delegating
 * those to AccessPoint for easier configuration.
 * </p>
 * @author brad
 */
public class ServerRelativeArchivalRedirect extends AbstractRequestHandler {
    private static final Logger LOGGER = Logger.getLogger(ServerRelativeArchivalRedirect.class.getName());

    boolean useCollection = false;
    private String matchHost = null;
    private int matchPort = -1;
    private String replayPrefix;

    /**
     * A little helper class for passing parsed Archival-URL.
     * This may have wider utility... like ArchivalUrl? (we
     * have ArchivalUrl class, but it's tied to WaybackRequest.)
     */
    public static final class ArchivalUrlRef {
        public String root;
        public String collection;
        public String datespec;
        public String url;

        public ArchivalUrlRef(String root, String collection, String datespec, String url) {
            this.root = root;
            this.collection = collection;
            this.datespec = datespec;
            this.url = url;
        }
    }

    /**
     * Return the Archival-URL projection of request origin, that
     * should give a context necessary to redirect the leaked request
     * back to Archival-URL space.
     * <p>
     * Default implementation parses {@code Referer} header.
     * Sub class may override this method to use alternative method
     * of obtaining equivalent information. Typically you want to
     * call super method first, then resort to alternative method
     * if super method returns {@code null} or incomplete info.
     * </p>
     * @param httpRequest request object
     * @return ArchivalUrlCaptureRef object or {@code null}
     * if valid information cannot be found.
     */
    protected ArchivalUrlRef getOrigin(HttpServletRequest httpRequest) {
        // TODO: check if this works with non-empty context path.
        // I believe requestURI starts with context path. So non-empty
        // context path will break collection and timestamp extraction
        // code below. It'd be desirable for this class to get a pattern
        // of replay URL from somewhere - replayPreifx?

        // hope that it's a server relative request, with a valid referrer:
        String referer = httpRequest.getHeader("Referer");
        if (referer == null)
            return null;

        final UsableURI refuri;
        final String host;
        final String authority;
        String path;
        try {
            refuri = UsableURIFactory.getInstance(referer);
            host = refuri.getHost();
            authority = refuri.getAuthority();
            path = refuri.getPath();
        } catch (URIException ex) {
            LOGGER.info("Ignoring unparsable Referer: " + referer);
            return null;
        }

        // Check that the Referer is our current wayback path
        // before attempting to use referer as base archival url

        if ((matchHost != null && !matchHost.equals(host))
                || (matchPort != -1 && refuri.getPort() != -1 && matchPort != refuri.getPort())) {
            LOGGER.info("Server-Relative-Redirect: Skipping, Referer " + host + ":" + refuri.getPort()
                    + " not from matching wayback host:port\t");
            return null;
        }

        String collection = null;
        if (useCollection) {
            int colSlash = path.indexOf('/', 1);
            if (colSlash == -1)
                return null;

            // !! collection has leading '/'. It's bad, but existing
            // sub-class may break if we change it.
            collection = modifyCollection(path.substring(0, colSlash));
            path = path.substring(colSlash + 1);
        } else {
            // next line expects path does not start with "/"
            path = path.substring(1);
        }

        int tsSlash = path.indexOf('/');
        if (tsSlash == -1)
            return null;

        String datespec = path.substring(0, tsSlash);
        if (!datespec.isEmpty() && !Character.isDigit(datespec.charAt(0))) {
            datespec = null;
        }

        String url = path.substring(tsSlash + 1);
        url = UrlOperations.fixupScheme(url);
        url = ArchiveUtils.addImpliedHttpIfNecessary(url);

        final String root = refuri.getScheme() + "://" + authority;
        return new ArchivalUrlRef(root, collection, datespec, url);
    }

    private String handleRequestWithCollection(HttpServletRequest httpRequest, HttpServletResponse httpResponse)
            throws ServletException, IOException {
        ArchivalUrlRef ref = getOrigin(httpRequest);
        if (ref == null)
            return null;

        String thisPath = httpRequest.getRequestURI();
        String queryString = httpRequest.getQueryString();
        if (queryString != null) {
            thisPath += "?" + queryString;
        }
        String resolved = UrlOperations.resolveUrl(ref.url, thisPath);

        String contextPath = httpRequest.getContextPath();
        StringBuilder sb = new StringBuilder(ref.root);
        sb.append(contextPath);
        sb.append(ref.collection);
        sb.append("/");
        if (ref.datespec != null) {
            sb.append(ref.datespec);
            sb.append("/");
        }
        sb.append(resolved);

        return sb.toString();
    }

    /**
     * modify collection if necessary.
     * <p>default implementation simply return {@code collection}.</p>
     * @param collection (the first path component of Referer URL.)
     * note value has leading slash, which must be retained.
     * @return possibly modified collection.
     */
    protected String modifyCollection(String collection) {
        return collection;
    }

    private String handleRequestWithoutCollection(HttpServletRequest httpRequest, HttpServletResponse httpResponse)
            throws ServletException, IOException {

        ArchivalUrlRef ref = getOrigin(httpRequest);
        if (ref == null)
            return null;

        String thisPath = httpRequest.getRequestURI();
        String queryString = httpRequest.getQueryString();
        if (queryString != null) {
            thisPath += "?" + queryString;
        }

        String resolved = UrlOperations.resolveUrl(ref.url, thisPath);
        String contextPath = httpRequest.getContextPath();
        String finalUrl = ref.root + //uri.getScheme() + "://" + uri.getAuthority() +
                contextPath + "/" + ref.datespec + "/" + resolved;

        return finalUrl;
    }

    @Override
    public boolean handleRequest(HttpServletRequest httpRequest, HttpServletResponse httpResponse)
            throws ServletException, IOException {
        if (matchHost != null) {
            if (!matchHost.equals(httpRequest.getServerName())) {
                LOGGER.fine("Wrong host for ServerRelativeRed(" + httpRequest.getServerName() + ")");
                return false;
            }
        }
        if (matchPort != -1) {
            if (matchPort != httpRequest.getLocalPort()) {
                LOGGER.fine("Wrong port for ServerRealtiveRed(" + httpRequest.getServerName() + ")("
                        + httpRequest.getLocalPort() + ") :" + httpRequest.getRequestURI());
                return false;
            }
        }
        String replayUrl = (useCollection ? handleRequestWithCollection(httpRequest, httpResponse)
                : handleRequestWithoutCollection(httpRequest, httpResponse));
        if (replayUrl == null && replayPrefix != null) {
            String thisPath = httpRequest.getRequestURI();
            String queryString = httpRequest.getQueryString();
            if (queryString != null) {
                thisPath += "?" + queryString;
            }
            // TODO: rethink this fallback, for now adding https support as
            // well
            if (thisPath.startsWith("/http://") || thisPath.startsWith("/https://")) {
                // assume a replay request:
                StringBuilder sb = new StringBuilder(thisPath.length() + replayPrefix.length() + 16);
                sb.append(replayPrefix);
                sb.append(Timestamp.currentTimestamp().getDateStr());
                sb.append(thisPath);
                replayUrl = sb.toString();
            }
        }
        if (replayUrl != null) {
            // Gotta make sure this is properly cached, or
            // weird things happen:
            httpResponse.addHeader("Vary", "Referer");
            httpResponse.sendRedirect(replayUrl);
            return true;
        }
        return false;
    }

    /**
     * @return the useCollection
     */
    public boolean isUseCollection() {
        return useCollection;
    }

    /**
     * whether replay URL has <i>collection</i> part.
     * <p>set this to {@code true} if replay URL has <i>collection</i>
     * part, path component between context path and timestamp (although
     * it's called <i>collection</i> based on the common usage of this part,
     * there's no need to have particular semantics.)</p>
     * <p>collection part will be passed to {@link #modifyCollection(String)}
     * before constructing final replay URL to redirect to.<p>
     * @param useCollection the useCollection to set
     */
    public void setUseCollection(boolean useCollection) {
        this.useCollection = useCollection;
    }

    /**
     * @return the matchHost
     */
    public String getMatchHost() {
        return matchHost;
    }

    /**
     * optional host name {@code Referer} URL should match.
     * @param matchHost the matchHost to set
     */
    public void setMatchHost(String matchHost) {
        this.matchHost = matchHost;
    }

    /**
     * @return the matchPort
     */
    public int getMatchPort() {
        return matchPort;
    }

    /**
     * optional port number {@code Referer} URL should match.
     * @param matchPort the matchPort to set
     */
    public void setMatchPort(int matchPort) {
        this.matchPort = matchPort;
    }

    /**
     * @return the replayPrefix
     */
    public String getReplayPrefix() {
        return replayPrefix;
    }

    /**
     * optional replay URL prefix used by fallback method.
     * @param replayPrefix the replayPrefix to set
     */
    public void setReplayPrefix(String replayPrefix) {
        this.replayPrefix = replayPrefix;
    }
}