dk.netarkivet.common.utils.arc.ARCUtils.java Source code

Java tutorial

Introduction

Here is the source code for dk.netarkivet.common.utils.arc.ARCUtils.java

Source

/* File:        $Id$
 * Revision:    $Revision$
 * Author:      $Author$
 * Date:        $Date$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

package dk.netarkivet.common.utils.arc;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.io.arc.ARCWriter;
import org.archive.util.ArchiveUtils;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.InputStreamUtils;
import dk.netarkivet.common.utils.SystemUtils;

/**
 * Various utilities that do stuff that ARCWriter does not provide.
 * Also includes method for converting an ARCRecord to a byte array.
 */
public final class ARCUtils {
    /** Private constructor: this class only has static members and is never instantiated. */
    private ARCUtils() {
    }

    /** The log used by the static methods of this class. */
    private static final Log log = LogFactory.getLog(ARCUtils.class.getName());

    /**
     * Matches HTTP status lines like
     * <pre>
     * HTTP/1.1 404 Page has gone south
     * </pre>
     * Group 1 is the numeric status code ({@code 404} above), group 2 is the
     * remainder of the line ({@code Page has gone south} above).
     */
    private static final Pattern HTTP_HEADER_PATTERN = Pattern.compile("^HTTP/1\\.[01] (\\d+) (.*)$");

    /**
     * Extra ARC Record metadata: the key under which getHeadersFromARCFile
     * stores the complete HTTP status line of a record.
     */
    public static final String RESPONSETEXT = "RESPONSETEXT";

    /**
     * Insert the contents of an ARC file (skipping the initial
     * filedesc: header record) in another ARC file.
     *
     * The reader opened on arcFile is always closed before this method
     * returns, whether copying succeeded or not.
     *
     * @param arcFile An ARC file to read.
     * @param aw A place to write the arc records
     * @throws ArgumentNotValid if arcFile or aw is null.
     * @throws IOFailure if there are problems reading the file or
     * writing one of its records.
     */
    public static void insertARCFile(File arcFile, ARCWriter aw) {
        ArgumentNotValid.checkNotNull(aw, "ARCWriter aw");
        ArgumentNotValid.checkNotNull(arcFile, "File arcFile");
        ARCReader r;

        try {
            r = ARCReaderFactory.get(arcFile);
        } catch (IOException e) {
            String message = "Error while copying ARC records from " + arcFile;
            log.warn(message, e);
            throw new IOFailure(message, e);
        }
        try {
            Iterator<ArchiveRecord> it = r.iterator();
            it.next(); //Skip ARC file header
            // ARCReaderFactory guarantees the first record exists and is a
            // filedesc, or it would throw exception
            while (it.hasNext()) {
                copySingleRecord(aw, (ARCRecord) it.next());
            }
        } finally {
            // Release the underlying file handle. A failure to close is only
            // logged so that it cannot mask an exception from the copy loop.
            try {
                r.close();
            } catch (IOException e) {
                log.warn("Error closing ARC reader for " + arcFile, e);
            }
        }
    }

    /**
     * Copies one ARCRecord onto the given ARCWriter.
     *
     * ARCWriter.write wants the individual metadata fields rather than an
     * ARCRecordMetaData object, so the fields are unpacked here. The record's
     * ARC-style date string is turned into a millisecond timestamp through
     * ArchiveUtils.getDate.
     *
     * @see ArchiveUtils#getDate(java.lang.String)
     * @param aw
     *            The ARCWriter to output the record on.
     * @param record
     *            The record to output
     * @throws IOFailure if any exception occurs while reading the metadata
     *             or writing the record.
     */
    private static void copySingleRecord(ARCWriter aw, ARCRecord record) {
        try {
            final ARCRecordMetaData header = record.getMetaData();
            final String recordUrl = header.getUrl();
            final String contentType = header.getMimetype();
            final String hostIp = header.getIp();
            final long fetchTime = ArchiveUtils.getDate(header.getDate()).getTime();
            // The record itself is the InputStream holding the payload.
            aw.write(recordUrl, contentType, hostIp, fetchTime, header.getLength(), record);
        } catch (Exception e) {
            throw new IOFailure("Error occurred while writing an ARC record" + record, e);
        }
    }

    /**
     * Create new ARCWriter, writing to arcfile newFile.
     * The writer is created uncompressed, stamped with the current time, and
     * without any extra file metadata.
     *
     * @param newFile the ARCfile, that the ARCWriter writes to.
     * @return new ARCWriter, writing to arcfile newFile.
     * @throws ArgumentNotValid if newFile is null.
     * @throws IOFailure if the file cannot be opened for writing or the
     * ARCWriter cannot be created.
     */
    public static ARCWriter createARCWriter(File newFile) {
        ArgumentNotValid.checkNotNull(newFile, "File newFile");
        ARCWriter aw;
        PrintStream ps = null;
        try {
            ps = new PrintStream(new FileOutputStream(newFile));
            aw = new ARCWriter(new AtomicInteger(), ps,
                    //This name is used for the first (file metadata) record
                    newFile, false, //Don't compress
                    //Use current time
                    ArchiveUtils.get14DigitDate(System.currentTimeMillis()), null //No particular file metadata to add
            );
        } catch (IOException e) {
            // Do not leave the output file open when the writer could not be built.
            if (ps != null) {
                ps.close();
            }
            String message = "Could not create ARCWriter to file '" + newFile + "'.\n";
            // Log the cause as well, like the other methods of this class do.
            log.warn(message, e);
            throw new IOFailure(message, e);
        }
        return aw;
    }

    /**
     * Append the contents of a file as one record to an ARC file, using an
     * already existing ARCWriter.
     * The record gets the local IP as its IP, the file's lastModified value
     * as its date, and the file's length as its length.
     *
     * @param aw The ARCWriter doing the writing
     * @param file The file we want to write to the ARC file
     * @param uri The uri for the ARCRecord being written
     * @param mime The mimetype for the ARCRecord being written
     * @throws ArgumentNotValid if any arguments aw and file are null
     *  and arguments uri and mime are null or empty.
     * @throws IOFailure if reading the file or writing the record fails.
     */
    public static void writeFileToARC(ARCWriter aw, File file, String uri, String mime) {
        ArgumentNotValid.checkNotNull(aw, "ARCWriter aw");
        ArgumentNotValid.checkNotNull(file, "File file");
        ArgumentNotValid.checkNotNullOrEmpty(uri, "String uri");
        ArgumentNotValid.checkNotNullOrEmpty(mime, "String mime");

        try {
            // Collect the record metadata before opening the file.
            final String localIp = SystemUtils.getLocalIP();
            final long modified = file.lastModified();
            final long size = file.length();
            final InputStream content = new FileInputStream(file);
            try {
                aw.write(uri, mime, localIp, modified, size, content);
            } finally {
                content.close();
            }
        } catch (IOException e) {
            final String msg = "Error writing '" + file + "' to " + aw + " as " + uri;
            log.warn(msg, e);
            throw new IOFailure(msg, e);
        }
    }

    /**
     * Build the ARCWriter used by the tools ArcMerge and ArcWrap.
     * The writer is uncompressed, stamped with the current time, and has no
     * extra file metadata.
     *
     * @param stream the given PrintStream.
     * @param destinationArcfile the given destination ARC file.
     * @return ARCWriter to be used by tools ArcMerge and ArcWrap
     * @throws IOException passed on from the ARCWriter constructor
     */
    public static ARCWriter getToolsARCWriter(PrintStream stream, File destinationArcfile) throws IOException {
        final AtomicInteger serialNo = new AtomicInteger();
        final boolean compress = false;
        final String creationDate = ArchiveUtils.get14DigitDate(System.currentTimeMillis());
        // The trailing null means: no particular file metadata to add.
        return new ARCWriter(serialNo, stream, destinationArcfile, compress, creationDate, null);
    }

    /**
     * Read the contents of an ARC record into a byte array.
     *
     * @param in An ARC record to read from.  After reading, the ARC Record
     * will no longer have its own data available for reading.
     * @return A byte array containing the contents of the ARC record.  Note
     * that the size of this may be smaller than the size given in the
     * ARC record metadata; it is never larger.
     * @throws ArgumentNotValid If in is null.
     * @throws IOException If there is an error reading the data.
     * @throws IOFailure If the record is longer than Integer.MAX_VALUE
     * (since we can't make bigger arrays).
     */
    public static byte[] readARCRecord(ARCRecord in) throws IOException {
        ArgumentNotValid.checkNotNull(in, "ARCRecord in");
        final long declaredLength = in.getMetaData().getLength();
        if (declaredLength > Integer.MAX_VALUE) {
            throw new IOFailure("ARC Record too long to fit in array: " + declaredLength + " > "
                    + Integer.MAX_VALUE);
        }
        // read from stream
        // The arcreader has a number of "features" that complicates the read
        //  1) the record at offset 0, returns too large a length
        //  2) readfully does not work
        //  3) ARCRecord.read(buf, offset, length) is broken.
        // TODO verify if these "features" are still around: See bugs #903, #904,
        // #905
        int dataLength = (int) declaredLength;
        byte[] tmpbuffer = new byte[dataLength];
        byte[] buffer = new byte[Constants.IO_BUFFER_SIZE];
        int bytesRead;
        int totalBytes = 0;
        while ((totalBytes < dataLength) && ((bytesRead = in.read(buffer)) != -1)) {
            // Never copy more than what is left in the result array, so a read
            // that returns more than the declared length cannot overflow it.
            int toCopy = Math.min(bytesRead, dataLength - totalBytes);
            System.arraycopy(buffer, 0, tmpbuffer, totalBytes, toCopy);
            totalBytes += toCopy;
        }
        // Check if the number of bytes read (totalBytes) matches the
        // size of the result array.
        if (tmpbuffer.length != totalBytes) {
            // make sure we only return an array with bytes we actually read
            byte[] truncateBuffer = new byte[totalBytes];
            System.arraycopy(tmpbuffer, 0, truncateBuffer, 0, totalBytes);
            return truncateBuffer;
        } else {
            return tmpbuffer;
        }
    }

    /**
     * Parse the headers of a single ARC record from a stream.
     *
     * The first line must consist of exactly five space-separated fields,
     * stored as URL, IP, date, mimetype and length. If the second line is an
     * HTTP status line, its status code and the full line are stored as well.
     * Following non-empty lines starting with "&lt;" are stored as
     * "name: value" pairs; reading stops at the first line that is empty or
     * does not start with "&lt;".
     *
     * TODO write unit test.
     * @param in pointing at start of ARC record.
     * @param offset into ARC file.
     * @return pairwise headers.
     * @throws IOException if fails to read ARC files or ARC files isn't valid.
     */
    public static Map<String, Object> getHeadersFromARCFile(InputStream in, Long offset) throws IOException {
        Map<String, Object> headers = new HashMap<String, Object>();
        // extra needed headers.
        headers.put(ARCRecordMetaData.VERSION_FIELD_KEY, "");
        headers.put(ARCRecordMetaData.ABSOLUTE_OFFSET_KEY, offset);

        String line = InputStreamUtils.readLine(in);
        // The loop below treats a null line as end of input, so guard against
        // it here too instead of failing with a NullPointerException.
        if (line == null) {
            throw new IOException("Unexpected end of stream: no ARC header line found at offset " + offset);
        }
        String[] tmp = line.split(" ");

        // decode header.
        if (tmp.length != 5) {
            throw new IOException("Does not include required metadata to be a valid ARC header: " + line);
        }
        headers.put(ARCRecordMetaData.URL_FIELD_KEY, tmp[0]);
        headers.put(ARCRecordMetaData.IP_HEADER_FIELD_KEY, tmp[1]);
        headers.put(ARCRecordMetaData.DATE_FIELD_KEY, tmp[2]);
        headers.put(ARCRecordMetaData.MIMETYPE_FIELD_KEY, tmp[3]);
        headers.put(ARCRecordMetaData.LENGTH_FIELD_KEY, tmp[4]);

        // Second line: an HTTP status line, if present. A line that does not
        // match is consumed and ignored.
        line = InputStreamUtils.readLine(in);
        if (line != null) {
            Matcher m = HTTP_HEADER_PATTERN.matcher(line);
            if (m.matches()) {
                headers.put(ARCRecordMetaData.STATUSCODE_FIELD_KEY, m.group(1));
                // not valid META DATA
                headers.put(RESPONSETEXT, line);
            }
        }
        while ((line = InputStreamUtils.readLine(in)) != null && line.length() > 0
                && line.startsWith("<") /* arc/warc header */) {
            int index = line.indexOf(':');
            if (index == -1) {
                throw new IOException("Inputstream does not point to valid ARC record");
            }
            // The value starts after ": "; clamp so that a line ending in ':'
            // yields an empty value instead of an index error.
            int valueStart = Math.min(index + 2, line.length());
            headers.put(line.substring(0, index), line.substring(valueStart));
        }

        return headers;
    }

    /**
     * Tell whether a filename denotes an ARC file, judged by its suffix.
     * The comparison is done on the lowercased filename.
     *
     * @param filename a given filename
     * @return true, if the filename converted to lowercase ends with .arc or .arc.gz
     * @throws ArgumentNotValid if filename is null or empty.
     */
    public static boolean isARC(String filename) {
        ArgumentNotValid.checkNotNullOrEmpty(filename, "filename");
        final String lowered = filename.toLowerCase();
        if (lowered.endsWith(".arc")) {
            return true;
        }
        return lowered.endsWith(".arc.gz");
    }
}