dk.netarkivet.viewerproxy.ARCArchiveAccess.java Source code

Introduction

Here is the source code for dk.netarkivet.viewerproxy.ARCArchiveAccess.java
Source

/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.viewerproxy;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.distribute.arcrepository.ARCLookup;
import dk.netarkivet.common.distribute.arcrepository.ResultStream;
import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;

/**
 * The ARCArchiveAccess class implements reading of ARC indexes and files. It builds on the Java ARC utils and Lucene
 * indexes, and handles using these in an HTTP context.
 */
public class ARCArchiveAccess implements URIResolver {
    // Class constants
    /** Transfer encoding header. */
    private static final String TRANSFER_ENCODING_HTTP_HEADER = "Transfer-encoding";

    /** HTTP status code for page not found. */
    private static final int HTTP_NOTFOUND_VALUE = 404;
    /** HTTP header for page not found. */
    private static final String NOTFOUND_HEADER = "HTTP/1.1 404 Not found";
    /** Content-type header used for page not found. */
    private static final String CONTENT_TYPE_STRING = "Content-type: text/html";
    /** Inserted before page not found response. */
    private static final String HTML_HEADER = "<html><head><title>" + "Not found</title></head><body>";
    /** Inserted after page not found response. */
    private static final String HTML_FOOTER = "</body></html>";

    /**
     * Matches HTTP header lines like HTTP/1.1 404 Page has gone south Groups: 111 2222222222222222222.
     */
    private static final Pattern HTTP_HEADER_PATTERN = Pattern.compile("^HTTP/1\\.[01] (\\d+) (.*)$");

    /** The underlying ARC record lookup object. */
    private ARCLookup lookup;

    /** Logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(ARCArchiveAccess.class);

    /**
     * If the value is true, we will try to lookup w/ ftp instead of http, if we don't get a hit in the index.
     */
    private static final boolean tryToLookupUriAsFtp = Settings.getBoolean(HarvesterSettings.TRY_LOOKUP_URI_AS_FTP);

    /**
     * Initialise new ARCArchiveAccess with no index file.
     *
     * @param arcRepositoryClient The arcRepositoryClient to use when retrieving
     * @throws ArgumentNotValid if arcRepositoryClient is null.
     */
    public ARCArchiveAccess(ViewerArcRepositoryClient arcRepositoryClient) {
        ArgumentNotValid.checkNotNull(arcRepositoryClient, "ArcRepositoryClient arcRepositoryClient");
        lookup = new ARCLookup(arcRepositoryClient);
        lookup.setTryToLookupUriAsFtp(tryToLookupUriAsFtp);
        log.info("Constructed instance of ARCArchiveAccess with TryToLookupUriAsFtp: {}", tryToLookupUriAsFtp);
    }

    /**
     * This method resets the Lucene index this object works on, and replaces it with the given index.
     *
     * @param index The new index file, a directory containing Lucene files.
     * @throws ArgumentNotValid If argument is null
     * @throws IOFailure if the file cannot be read
     */
    public void setIndex(File index) {
        lookup.setIndex(index);
        log.info("ARCArchiveAccess instance now uses indexfile {}", index);
    }

    /**
     * Look up a given URI and add its contents to the Response given.
     *
     * @param request The request to look up record for
     * @param response The response to return to the browser
     * @return The response code for this page if found, or URIResolver.NOT_FOUND otherwise.
     * @throws IOFailure on trouble looking up the request (timeout, i/o, etc.)
     * @see URIResolver#lookup(Request, Response)
     */
    public int lookup(Request request, Response response) {
        ArgumentNotValid.checkNotNull(request, "Request request");
        ArgumentNotValid.checkNotNull(response, "Response response");
        URI uri = request.getURI();
        ResultStream content = null;
        InputStream contentStream = null;
        log.debug("Doing Lookup of URI '{}'", uri);
        try {
            content = lookup.lookup(uri);
            if (content == null) {
                // If the object wasn't found, return an appropriate message.
                log.debug("Missing URL '{}'", uri);
                createNotFoundResponse(uri, response);
                return URIResolver.NOT_FOUND;
            }
            contentStream = content.getInputStream();
            // First write the original header.
            if (content.containsHeader()) {
                log.debug("Write first the original header");
                writeHeader(contentStream, response);
            }
            // Now flush the content to the browser.
            readPage(contentStream, response.getOutputStream());
        } finally {
            if (contentStream != null) {
                try {
                    contentStream.close();
                } catch (IOException e) {
                    log.debug("Error writing response to browser for '{}'. Giving up!", uri, e);
                }
            }
        }
        return response.getStatus();
    }

    /**
     * Generate an appropriate response when a URI is not found. If this fails, it is logged, but otherwise ignored.
     *
     * @param uri The URI attempted read that could not be found
     * @param response The Response object to write the error response into.
     */
    protected void createNotFoundResponse(URI uri, Response response) {
        try {
            // first write a header telling the browser to expect text/html
            response.setStatus(HTTP_NOTFOUND_VALUE);
            writeHeader(new ByteArrayInputStream((NOTFOUND_HEADER + '\n' + CONTENT_TYPE_STRING).getBytes()),
                    response);
            // Now flush an error screen to the browser
            OutputStream browserOut = response.getOutputStream();
            browserOut.write((HTML_HEADER + "Can't find URL: " + uri + HTML_FOOTER).getBytes());
            browserOut.flush();
        } catch (IOFailure e) {
            log.debug("Error writing error response to browser " + "for '" + uri + "'. Giving up!", e);
        } catch (IOException e) {
            log.debug("Error writing error response to browser " + "for '" + uri + "'. Giving up!", e);
        }
        // Do not close stream! That is left to the servlet.
    }

    /**
     * Apply filters to HTTP headers. Can be overridden in subclasses. Currently only removes Transfer-encoding headers.
     *
     * @param headername The name of the header field, e.g. Content-Type Remember that this is not case sensitive
     * @param headercontents The contents of the header field, e.g. text/html
     * @return A (possibly modified) header contents string, or null if the header should be skipped.
     */
    protected String filterHeader(String headername, String headercontents) {
        // Cannot get chunked output to work, so we must remove
        // any chunked encoding lines
        if (headername.equalsIgnoreCase(TRANSFER_ENCODING_HTTP_HEADER)) {
            log.debug("Ignoring headerline: '{}','{}'", headername, headercontents);
            return null;
        }
        return headercontents;
    }

    /**
     * Write HTTP header, including status and status reason.
     *
     * @param is A stream to read the header from.
     * @param response A Response to write the header, status and reason to.
     * @throws IOFailure If the underlying reads or writes fail.
     */
    private void writeHeader(InputStream is, Response response) {
        // Reads until the end of the header (indicated by an empty line)
        try {
            for (String line = readLine(is); (line != null) && (line.length() > 0); line = readLine(is)) {
                // Try to match lines like "HTTP/1.0 200 OK"
                Matcher m = HTTP_HEADER_PATTERN.matcher(line);
                if (m.matches()) {
                    String responsecode = m.group(1);
                    String responsetext = m.group(2);
                    // Note: Always parsable int, due to the regexp, so no reason
                    // to check for parse errors
                    log.debug("SetStatus '{}':'{}", responsecode, responsetext);
                    response.setStatus(Integer.parseInt(responsecode), responsetext);
                } else {
                    // try to match header-lines containing colon,
                    // like "Content-Type: text/html"
                    String[] parts = line.split(":", 2);
                    if (parts.length != 2) {
                        log.debug("Malformed header line '" + line + "'");
                    } else {
                        String name = parts[0];
                        String contents = filterHeader(name, parts[1].trim());
                        if (contents != null) {
                            // filter out unwanted headers
                            log.debug("Added header-field '{}' with contents '{}'", name, contents);
                            response.addHeaderField(name, contents);
                        }
                    }
                }
            }
        } catch (IOException e) {
            throw new IOFailure("Trouble reading from input stream or writing" + " to output stream", e);
        }
    }

    /**
     * Read an entire page body into some stream.
     *
     * @param content The stream to read the page from. Not closed afterwards.
     * @param out The stream to write the results to. Not closed afterwards.
     * @throws IOFailure If the underlying reads or writes fail
     */
    private void readPage(InputStream content, OutputStream out) {
        BufferedInputStream page = new BufferedInputStream(content);
        BufferedOutputStream responseOut = new BufferedOutputStream(out);
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try {
            byte[] buffer = new byte[Constants.IO_BUFFER_SIZE];
            int bytesRead;
            while ((bytesRead = page.read(buffer)) != -1) {
                baos.write(buffer, 0, bytesRead);
                responseOut.write(buffer, 0, bytesRead);
            }
            responseOut.flush();
            log.debug("pagecontents: ", new String(baos.toByteArray(), "UTF-8"));
        } catch (IOException e) {
            throw new IOFailure("Could not read or write data", e);
        }
    }

    /**
     * Read a line of bytes from an InputStream. Useful when an InputStream may contain both text and binary data.
     *
     * @param inputStream A source of data
     * @return A line of text read from inputStream, with terminating \r\n or \n removed, or null if no data is
     * available.
     * @throws IOException on trouble reading from input stream
     */
    private String readLine(InputStream inputStream) throws IOException {
        byte[] rawdata = readRawLine(inputStream);
        if (rawdata == null) {
            return null;
        }
        int len = rawdata.length;
        if (len > 0) {
            if (rawdata[len - 1] == '\n') {
                len--;
                if (len > 0) {
                    if (rawdata[len - 1] == '\r') {
                        len--;
                    }
                }
            }
        }
        return new String(rawdata, 0, len);
    }

    /**
     * Reads a raw line from an InputStream, up till \n. Since HTTP allows \r\n and \n as terminators, this gets the
     * whole line. This code is adapted from org.apache.commons.httpclient.HttpParser
     *
     * @param inputStream A stream to read from.
     * @return Array of bytes read or null if none are available.
     * @throws IOException if the underlying reads fail
     */
    private static byte[] readRawLine(InputStream inputStream) throws IOException {
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        int ch;
        while ((ch = inputStream.read()) >= 0) {
            buf.write(ch);
            if (ch == '\n') {
                break;
            }
        }
        if (buf.size() == 0) {
            return null;
        }
        return buf.toByteArray();
    }
}