dk.netarkivet.wayback.NetarchiveResourceStore.java Source code

Java tutorial

Introduction

Here is the source code for dk.netarkivet.wayback.NetarchiveResourceStore.java

Source

/*
 * #%L
 * Netarchivesuite - wayback
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.wayback;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.wayback.ResourceStore;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.Resource;
import org.archive.wayback.exception.ResourceNotAvailableException;
import org.archive.wayback.resourcestore.resourcefile.ArcResource;

import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord;
import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
import dk.netarkivet.common.utils.InputStreamUtils;

/**
 * This is the connector between netarchivesuite and wayback. And is based on PrototypeNetarchiveResourceStore.java
 * which was made as a prototype connector.
 */
public class NetarchiveResourceStore implements ResourceStore {

    /** JMS ArcRepositoryClient. */
    protected ViewerArcRepositoryClient client;

    /** Pattern for matching http version header. */
    private static final Pattern HTTP_HEADER_PATTERN = Pattern.compile("^HTTP/1\\.[01] (\\d+) (.*)$");

    /** Logger. */
    private Log logger = LogFactory.getLog(getClass().getName());

    /**
     * Constructor.
     */
    public NetarchiveResourceStore() {
        client = ArcRepositoryClientFactory.getViewerInstance();
    }

    /**
     * Transforms search result into a resource, according to the ResourceStore interface.
     *
     * @param captureSearchResult the search result.
     * @return a valid resource containing metadata and a link to the ARC record.
     * @throws ResourceNotAvailableException if something went wrong fetching record.
     */
    public Resource retrieveResource(CaptureSearchResult captureSearchResult) throws ResourceNotAvailableException {
        long offset;
        String responseCode = null;
        Map<String, Object> metadata = new HashMap<String, Object>();
        ARCRecord arcRecord;
        ArchiveRecordHeader header;

        String arcfile = captureSearchResult.getFile();
        try {
            offset = captureSearchResult.getOffset();
        } catch (NumberFormatException e) {
            logger.error("Error looking for non existing resource", e);
            throw new ResourceNotAvailableException(
                    "NetarchiveResourceStore " + "thows NumberFormatException when reading offset.");
        } catch (NullPointerException e) {
            logger.error("Error looking for non existing resource", e);
            throw new ResourceNotAvailableException("NetarchiveResourceStore "
                    + "throws NullPointerException when accessing " + "CaptureResult given from Wayback.");
        }
        logger.info("Received request for resource from file '" + arcfile + "' at offset '" + offset + "'");
        BitarchiveRecord bitarchiveRecord = client.get(arcfile, offset);
        if (bitarchiveRecord == null) {
            throw new ResourceNotAvailableException(
                    "NetarchiveResourceStore: " + "Bitarchive didn't return the requested record.");
        }
        logger.info("Retrieved resource from file '" + arcfile + "' at offset '" + offset + "'");

        InputStream is = bitarchiveRecord.getData();
        // Match header-lines (until empty line).
        try {
            for (String line = InputStreamUtils.readLine(is); line != null
                    && line.length() > 0; line = InputStreamUtils.readLine(is)) {
                Matcher m = HTTP_HEADER_PATTERN.matcher(line);
                if (m.matches()) {
                    responseCode = m.group(1);
                    logger.debug("Setting response code '" + responseCode + "'");

                } else {
                    String[] parts = line.split(":", 2);
                    if (parts.length != 2) {
                        logger.debug("Malformed header line '" + line + "'");
                    } else {
                        String name = parts[0];
                        String contents = parts[1].trim();
                        if (contents != null) {
                            if (name.equals("Content-Length")) {
                                logger.info("Setting length header to '" + contents + "'");
                                metadata.put(ARCRecordMetaData.LENGTH_FIELD_KEY, contents);
                            } else if (name.equals("Content-Type")) {
                                logger.info("Setting Content-Type header to '" + contents + "'");
                                metadata.put(ARCRecordMetaData.MIMETYPE_FIELD_KEY, contents);
                            } else if (name.equals("Location")) {
                                logger.info("Setting redirect Location header to '" + contents + "'");
                                metadata.put("Location", contents);
                            }
                        }
                    }
                }
            }
        } catch (IOException e) {
            logger.error("Error looking for empty line", e);
            throw new ResourceNotAvailableException(e.getMessage());
        }
        // fill metadata for ARC record.
        metadata.put(ARCRecordMetaData.URL_FIELD_KEY, captureSearchResult.getUrlKey());
        // TODO the following is the correct way to set the URL. If we do
        // things this way then we should be able to get arcrecord to parse
        // the headers for us.
        /*
         * metadata.put(ARCRecordMetaData.URL_FIELD_KEY, captureSearchResult.getOriginalUrl());
         */
        try {
            metadata.put(ARCRecordMetaData.IP_HEADER_FIELD_KEY, captureSearchResult.getOriginalHost());
        } catch (NullPointerException ex) {
            metadata.put(ARCRecordMetaData.IP_HEADER_FIELD_KEY, "");
        }
        metadata.put(ARCRecordMetaData.DATE_FIELD_KEY, captureSearchResult.getCaptureDate().toString());
        metadata.put(ARCRecordMetaData.MIMETYPE_FIELD_KEY, captureSearchResult.getMimeType());
        metadata.put(ARCRecordMetaData.VERSION_FIELD_KEY, captureSearchResult.getHttpCode());
        metadata.put(ARCRecordMetaData.ABSOLUTE_OFFSET_KEY, "" + offset);
        metadata.put(ARCRecordMetaData.LENGTH_FIELD_KEY, "" + bitarchiveRecord.getLength());
        if (responseCode != null) {
            metadata.put(ARCRecordMetaData.STATUSCODE_FIELD_KEY, responseCode);
        }

        // create header.
        try {
            header = new ARCRecordMetaData(arcfile, metadata);
        } catch (IOException e) {
            logger.error("Could not create header", e);
            throw new ResourceNotAvailableException(e.getMessage());
        }

        // create ARCRecord.
        try {
            arcRecord = new ARCRecord(is, header, 0, false, false, true);
            int code = arcRecord.getStatusCode();
            logger.debug("ARCRecord created with code '" + code + "'");
            arcRecord.skipHttpHeader();
        } catch (NullPointerException e) {
            logger.error("Could not create ARCRecord", e);
            throw new ResourceNotAvailableException("ARC record doesn't contain" + " valid http URL");
        } catch (IOException e) {
            logger.error("Could not create ARCRecord", e);
            throw new ResourceNotAvailableException(e.getMessage());
        }
        final String statusCode = responseCode;
        final Map<String, Object> metadataF = metadata;
        // TODO This the sleaziest thing in this class. Why does the
        // ARCRecord give the wrong status code if we don't override this method?
        Resource resource = new ArcResource(arcRecord, (ArchiveReader) null) {
            public int getStatusCode() {
                return Integer.parseInt(statusCode);
            }
            // FIXME incompatible, needed?
            /*
            @Override
            public Map<String, String> getHttpHeaders() {
            return metadataF;
            }
            */
        };
        logger.info("Returning resource '" + resource + "'");
        return resource;
    }

    /**
     * Shuts down this resource store, closing the arcrepository client.
     *
     * @throws IOException if an exception occurred while closing the client.
     */
    public void shutdown() throws IOException {
        // Close JMS connection.
        client.close();
    }
}