org.archive.wayback.core.Resource.java Source code

Introduction

Here is the source code for org.archive.wayback.core.Resource.java
Source

/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.core;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.httpclient.ChunkedInputStream;

/**
 * Abstraction on top of a document stored in a WaybackCollection.
 *
 * TODO: This implementation needs some pretty drastic refactoring. May have to wait
 * for 2.0. This should be a byte-oriented record, and allow wrapping the
 * interior byte-stream in one of the more full-featured HTTP libraries
 * (jetty/apache-http-client/w3c-http-reference).
 *
 * For now, it is a system-wide assumption that all resources are HTTP based.
 *
 * TODO: Some code downcasts Resource to its sub-classes to gain access to
 * methods only available in specific implementations.  Consider adding more methods
 * to make downcasting unnecessary.  More sub-classes are expected, for encapsulating
 * a revisit-original pair of Resources as a single Resource, for example.
 *
 * @see org.archive.wayback.ResourceStore#retrieveResource(CaptureSearchResult)
 *
 * @author Brad Tofel
 */
public abstract class Resource extends InputStream {

    /** Upper bound on the number of bytes examined while sniffing for a chunk-size line. */
    private static final int CHUNK_PEEK_LIMIT = 50;

    /** ASCII carriage return. */
    private static final int CR = 13;

    /** ASCII line feed. */
    private static final int LF = 10;

    /**
     * Stream that every read-side {@link InputStream} method delegates to.
     * {@code null} until {@link #setInputStream(InputStream)} has been called.
     */
    private InputStream is;

    /**
     * Releases the resources held by this record. Re-declared abstract so
     * that every sub-class has to provide its own implementation.
     * @throws IOException if closing fails
     */
    @Override
    public abstract void close() throws IOException;

    /**
     * Assumes an HTTP resource - return the HTTP response code
     * @return the HTTP response code from the HTTP message
     */
    public abstract int getStatusCode();

    /**
     * @return the size in bytes of the record payload, including HTTP header
     */
    public abstract long getRecordLength();

    /**
     * Assumes an HTTP response - return the HTTP headers, not including the
     * HTTP Message header
     * @return key-value Map of HTTP headers
     */
    public abstract Map<String, String> getHttpHeaders();

    // URL-Agnostic Revisit Support

    /**
     * return {@code WARC-Refer-To-Target-URI} WARC record header value or
     * equivalent.
     * Default implementation returns {@code null}.
     * @return header value (URI)
     */
    public String getRefersToTargetURI() {
        return null;
    }

    /**
     * return {@code WARC-Refers-To-Date} WARC record header value or
     * equivalent.
     * Default implementation returns {@code null}
     * @return 14-digit timestamp string ({@code yyyyMMddHHmmss})
     */
    public String getRefersToDate() {
        return null;
    }

    /**
     * Hook for parsing the record's headers. The default implementation does
     * nothing; sub-classes (warc/arc readers) are expected to override it.
     * @throws IOException if the headers cannot be read
     */
    public void parseHeaders() throws IOException {
        //Implemented in warc/arc reader
    }

    /**
     * Looks up an HTTP header value by name, ignoring case.
     * <p>
     * The comparison uses {@link String#equalsIgnoreCase(String)}, which does
     * not depend on the default locale (a locale-sensitive
     * {@code toUpperCase()} comparison fails for names containing {@code i}
     * under e.g. a Turkish locale).
     *
     * @param headerName name of the header to look up
     * @return value of the first header (in map iteration order) whose name
     *         matches {@code headerName}, or {@code null} if there is no
     *         match, if {@link #getHttpHeaders()} returns {@code null}, or if
     *         {@code headerName} is {@code null}
     */
    public String getHeader(String headerName) {
        Map<String, String> httpHeaders = getHttpHeaders();

        if (httpHeaders == null || headerName == null) {
            return null;
        }

        for (Map.Entry<String, String> header : httpHeaders.entrySet()) {
            // called on headerName so that a null map key simply fails to match
            if (headerName.equalsIgnoreCase(header.getKey())) {
                return header.getValue();
            }
        }

        return null;
    }

    /**
     * Ensures an underlying stream has been set.
     * @throws IOException if {@link #setInputStream(InputStream)} has not
     *         been called yet
     */
    private void validate() throws IOException {
        if (is == null) {
            throw new IOException("No InputStream");
        }
    }

    /**
     * Sets the stream this Resource reads from. A stream that does not
     * support mark/reset is wrapped in a {@link BufferedInputStream}, so the
     * stored stream always supports the peek-ahead done by
     * {@link #setChunkedEncoding()}.
     * @param is stream to read from; must not be {@code null}
     */
    protected void setInputStream(InputStream is) {
        if (is.markSupported()) {
            this.is = is;
        } else {
            this.is = new BufferedInputStream(is);
        }
    }

    /**
     * indicate that there is a {@code Transfer-Encoding: chunked} header, so the input
     *   data should be dechunked as it is read. This method actually peeks
     *   ahead to verify that there is a hex-encoded chunk length before
     *   assuming the data is chunked.
     * <p>
     * The peek reads at most {@code CHUNK_PEEK_LIMIT + 1} bytes, which stays
     * within the read limit passed to {@code mark}, and the stream is always
     * reset to its original position before returning.
     * @throws IOException for usual reasons
     */
    public void setChunkedEncoding() throws IOException {
        validate();
        // peek ahead and make sure we have a line with hex numbers:
        is.mark(CHUNK_PEEK_LIMIT + 2);
        int bytesRead = 0;
        int hexFound = 0;
        boolean isChunked = false;
        while (bytesRead < CHUNK_PEEK_LIMIT) {
            int nextC = is.read();
            // allow CRLF and plain ole LF:
            if ((nextC == CR) || (nextC == LF)) {
                // must have read at least 1 hex char:
                if (hexFound > 0) {
                    if (nextC == LF) {
                        isChunked = true;
                        break;
                    }
                    nextC = is.read();
                    // count this second read too, otherwise the loop could
                    // consume more bytes than the mark's read limit allows
                    // and make the reset() below fail.
                    bytesRead++;
                    if (nextC == LF) {
                        isChunked = true;
                        break;
                    }
                    // NOTE(review): a byte following CR that is not LF is
                    // skipped without being checked for hex/space - kept
                    // as-is from the previous behavior; confirm intent.
                }
                // keep looking to allow some blank lines.
            } else {
                // better be a hex character:
                if (isHex(nextC)) {
                    hexFound++;
                } else if (nextC != ' ') {
                    // allow whitespace before or after chunk...
                    // not a hex digit (or end of stream): not a chunked stream.
                    break;
                }
            }
            bytesRead++;
        }
        is.reset();
        if (isChunked) {
            setInputStream(new ChunkedInputStream(is));
        }
    }

    /**
     * @param c byte value as returned by {@link InputStream#read()}
     * @return {@code true} if {@code c} is an ASCII hexadecimal digit
     *         ({@code 0-9}, {@code a-f} or {@code A-F})
     */
    private boolean isHex(int c) {
        if ((c >= '0') && (c <= '9')) {
            return true;
        }
        if ((c >= 'a') && (c <= 'f')) {
            return true;
        }
        if ((c >= 'A') && (c <= 'F')) {
            return true;
        }
        return false;
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no stream has been set, or the delegate fails
     */
    @Override
    public int available() throws IOException {
        validate();
        return is.available();
    }

    /**
     * Delegates to the underlying stream; silently does nothing if no stream
     * has been set (this method cannot throw a checked exception).
     */
    @Override
    public void mark(int readlimit) {
        if (is != null) {
            is.mark(readlimit);
        }
    }

    /**
     * @return {@code false} if no stream has been set, otherwise whatever
     *         the underlying stream reports
     */
    @Override
    public boolean markSupported() {
        if (is == null) {
            return false;
        }
        return is.markSupported();
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no stream has been set, or the delegate fails
     */
    @Override
    public int read() throws IOException {
        validate();
        return is.read();
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no stream has been set, or the delegate fails
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        validate();
        return is.read(b, off, len);
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no stream has been set, or the delegate fails
     */
    @Override
    public int read(byte[] b) throws IOException {
        validate();
        return is.read(b);
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no stream has been set, or the delegate fails
     */
    @Override
    public void reset() throws IOException {
        validate();
        is.reset();
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no stream has been set, or the delegate fails
     */
    @Override
    public long skip(long n) throws IOException {
        validate();
        return is.skip(n);
    }
}