ARCWriterTest.java — JUnit test source for org.archive.io.arc.ARCWriterTest,
part of the Heritrix web crawler (crawler.archive.org). The test exercises
both ARCWriter and ARCReader: it writes ARC files with ARCWriter, then
validates what was written using ARCReader. Source follows.

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.io.arc;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.NullInputStream;
import org.apache.commons.io.output.NullOutputStream;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.io.WriterPoolSettings;
import org.archive.util.ArchiveUtils;
import org.archive.util.TmpDirTestCase;

import com.google.common.io.Closeables;

/**
 * Test ARCWriter class.
 *
 * This code exercises ARCWriter AND ARCReader.  First it writes ARCs w/
 * ARCWriter.  Then it validates what was written w/ ARCReader.
 *
 * @author stack
 */
public class ARCWriterTest extends TmpDirTestCase implements ARCConstants {
    /**
     * Utility ARCWriter subclass for producing deliberately corrupt ARCs:
     * after each record is written, an optional run of junk bytes is
     * appended before the superclass's normal post-record bookkeeping runs.
     */
    public class CorruptibleARCWriter extends ARCWriter {
        // Bytes appended after every record; null means "write clean records".
        byte[] endJunk = null;

        public CorruptibleARCWriter(AtomicInteger serial_no, WriterPoolSettings settings) {
            super(serial_no, settings);
        }

        @Override
        protected void postWriteRecordTasks() throws IOException {
            // Inject the trailing junk before the superclass finishes the record.
            if (endJunk != null) {
                this.write(endJunk);
            }
            super.postWriteRecordTasks();
        }

        /**
         * Set the junk appended after each subsequently written record.
         * (The original declared {@code throws IOException}, but this method
         * only assigns a field; the spurious checked exception is dropped —
         * a source-compatible narrowing for all callers.)
         *
         * @param b junk bytes to append, or null to disable.
         */
        public void setEndJunk(byte[] b) {
            this.endJunk = b;
        }
    }

    /**
     * Suffix to use for ARC files made by JUNIT.
     */
    private static final String SUFFIX = "JUNIT";

    // URL written into most test records; iterateRecords() checks non-header
    // records start with it.
    private static final String SOME_URL = "http://www.archive.org/test/";

    // Shared serial-number source handed to every ARCWriter created by the
    // tests, so each produced file gets a distinct serial.
    private static final AtomicInteger SERIAL_NO = new AtomicInteger();

    /*
     * @see TestCase#setUp()
     */
    @Override
    protected void setUp() throws Exception {
        super.setUp();
    }

    /*
     * @see TestCase#tearDown()
     */
    @Override
    protected void tearDown() throws Exception {
        super.tearDown();
    }

    /** @return the canned HTTP response whose body names "Some Page". */
    protected static String getContent() {
        return getContent(null);
    }

    /**
     * Build a minimal HTTP response to use as record content.
     *
     * @param indexStr optional page index; null yields "Some Page".
     * @return status line, Content-Type header, blank line, and a tiny HTML body.
     */
    protected static String getContent(String indexStr) {
        final String page;
        if (indexStr != null) {
            page = "Page #" + indexStr;
        } else {
            page = "Some Page";
        }
        StringBuilder response = new StringBuilder();
        response.append("HTTP/1.1 200 OK\r\n");
        response.append("Content-Type: text/html\r\n\r\n");
        response.append("<html><head><title>").append(page).append("</title></head>");
        response.append("<body>").append(page).append("</body></html>");
        return response.toString();
    }

    /**
     * Write one synthetic HTTP record to the given writer.
     *
     * @param arcWriter destination writer.
     * @param index page index baked into the record body and URL.
     * @return number of content bytes handed to the writer.
     * @throws IOException on write failure.
     */
    @SuppressWarnings("deprecation")
    protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index) throws IOException {
        final String indexStr = Integer.toString(index);
        // 14-digit timestamp per RFC2540 for the record's fetch date.
        final String timestamp = ArchiveUtils.get14DigitDate();
        final byte[] body = getContent(indexStr).getBytes();
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        buffer.write(body);
        // Records are separated by a single newline; restore it here.
        buffer.write("\n".getBytes());
        final int totalLength = body.length + 1;
        arcWriter.write("http://www.one.net/id=" + indexStr, "text/html", "0.1.2.3",
                Long.parseLong(timestamp), totalLength, buffer);
        return totalLength;
    }

    /**
     * Write {@code recordCount} synthetic records into a fresh ARC.
     *
     * @param baseName base name for the ARC file (old ones are cleaned first).
     * @param compress true to gzip the output.
     * @param maxSize maximum ARC file size handed to the writer settings.
     * @param recordCount number of records to write.
     * @return the file the writer produced.
     */
    private File writeRecords(String baseName, boolean compress, long maxSize, int recordCount) throws IOException {
        cleanUpOldFiles(baseName);
        ARCWriter arcWriter = new ARCWriter(SERIAL_NO,
                new WriterPoolSettingsData(baseName, "${prefix}-" + SUFFIX, maxSize, compress,
                        Arrays.asList(new File[] { getTmpDir() }), null));
        assertNotNull(arcWriter);
        int written = 0;
        while (written < recordCount) {
            writeRandomHTTPRecord(arcWriter, written);
            written++;
        }
        arcWriter.close();
        File produced = arcWriter.getFile();
        assertTrue("Doesn't exist: " + produced.getAbsolutePath(), produced.exists());
        return produced;
    }

    /**
     * Validate the given ARC with ARCReader, then re-read every record by
     * absolute offset (end to start, with a fresh reader each time so no
     * state carries over) and sanity-check its metadata.
     *
     * @param arcFile ARC to check.
     * @param recordCount expected record count (including the leading ARC
     *     header record), or -1 to validate without asserting a count.
     */
    private void validate(File arcFile, int recordCount) throws FileNotFoundException, IOException {
        ARCReader reader = ARCReaderFactory.get(arcFile);
        assertNotNull(reader);
        List<ArchiveRecordHeader> metaDatas = null;
        if (recordCount == -1) {
            metaDatas = reader.validate();
        } else {
            metaDatas = reader.validate(recordCount);
        }
        reader.close();
        // Now, run through each of the records doing absolute get going from
        // the end to start.  Reopen the arc so no context between this test
        // and the previous.
        for (int i = metaDatas.size() - 1; i >= 0; i--) {
            reader = ARCReaderFactory.get(arcFile);
            ARCRecordMetaData meta = (ARCRecordMetaData) metaDatas.get(i);
            ArchiveRecord r = reader.get(meta.getOffset());
            String mimeType = r.getHeader().getMimetype();
            assertTrue("Record is bogus", mimeType != null && mimeType.length() > 0);
            reader.close();
        }
        // BUGFIX: the original asserted the count even when the caller passed
        // -1 ("don't care"), which compared -1 against the real size and
        // could never succeed. Only assert when a real expectation was given.
        if (recordCount != -1) {
            assertEquals("Metadata count not as expected", recordCount, metaDatas.size());
        }
        for (Iterator<ArchiveRecordHeader> i = metaDatas.iterator(); i.hasNext();) {
            ARCRecordMetaData r = (ARCRecordMetaData) i.next();
            assertTrue("Record is empty", r.getLength() > 0);
        }
    }

    // Exercise size-capped writing (1KB cap, 15 records), uncompressed.
    public void testCheckARCFileSize() throws IOException {
        runCheckARCFileSizeTest("checkARCFileSize", false);
    }

    // Same as above but with gzip compression enabled.
    public void testCheckARCFileSizeCompressed() throws IOException {
        runCheckARCFileSizeTest("checkARCFileSize", true);
    }

    // Basic write-then-validate round trip for an uncompressed ARC; the
    // validator expects one extra record for the ARC file header.
    public void testWriteRecord() throws IOException {
        final int recordCount = 2;
        File arcFile = writeRecords("writeRecord", false, DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
        validate(arcFile, recordCount + 1); // Header record.
    }

    /**
     * Check random access into a compressed ARC: remember the offset of the
     * second record during a full scan, then verify both get() at that
     * offset and an iterator opened at that offset behave correctly.
     */
    public void testRandomAccess() throws IOException {
        final int recordCount = 3;
        File arcFile = writeRecords("writeRecord", true, DEFAULT_MAX_ARC_FILE_SIZE, recordCount);

        // Pass 1: full scan. Capture the second record's URL and offset and
        // count the total number of records in the file.
        ARCReader reader = ARCReaderFactory.get(arcFile);
        String url = null;
        long offset = -1;
        long totalRecords = 0;
        Iterator<ArchiveRecord> scan = reader.iterator();
        while (scan.hasNext()) {
            ARCRecord rec = (ARCRecord) scan.next();
            totalRecords++;
            if (totalRecords == 2) {
                // This is the record we will seek back to.
                url = rec.getMetaData().getUrl();
                offset = rec.getMetaData().getOffset();
            }
        }
        reader.close();

        // Absolute get() at the remembered offset must return that record.
        reader = ARCReaderFactory.get(arcFile, offset);
        ArchiveRecord rec = reader.get();
        assertEquals(rec.getHeader().getUrl(), url);
        rec.close();
        reader.close();

        // An iterator opened at the offset must see all but the first record.
        reader = ARCReaderFactory.get(arcFile, offset);
        int count = 0;
        Iterator<ArchiveRecord> tail = reader.iterator();
        while (tail.hasNext()) {
            count++;
            tail.next();
        }
        reader.close();
        assertEquals(totalRecords - 1, count);
    }

    // Same round trip as testWriteRecord but with compression on.
    public void testWriteRecordCompressed() throws IOException {
        final int recordCount = 2;
        File arcFile = writeRecords("writeRecordCompressed", true, DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
        validate(arcFile, recordCount + 1 /*Header record*/);
    }

    // Stream a 3GB record (NullInputStream in, NullOutputStream out) to
    // check that record lengths beyond Integer.MAX_VALUE are handled.
    public void testWriteGiantRecord() throws IOException {
        PrintStream dummyStream = new PrintStream(new NullOutputStream());
        ARCWriter arcWriter = new ARCWriter(SERIAL_NO, dummyStream, new File("dummy"),
                new WriterPoolSettingsData("", "", -1, false, null, null));
        assertNotNull(arcWriter);

        // Start the record with an arbitrary 14-digit date per RFC2540
        long now = System.currentTimeMillis();
        long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3;

        arcWriter.write("dummy:uri", "application/octet-stream", "0.1.2.3", now, recordLength,
                new NullInputStream(recordLength));
        arcWriter.close();
    }

    // Write 15 records under a 1KB max-file-size, then validate (15 records
    // plus the ARC header record).
    private void runCheckARCFileSizeTest(String baseName, boolean compress)
            throws FileNotFoundException, IOException {
        File f = writeRecords(baseName, compress, 1024, 15);
        validate(f, 15 + 1);
    }

    // Build a CorruptibleARCWriter targeting the tmp dir, named with the
    // shared JUNIT suffix and default max file size.
    protected CorruptibleARCWriter createARCWriter(String name, boolean compress) {
        File[] files = { getTmpDir() };
        return new CorruptibleARCWriter(SERIAL_NO, new WriterPoolSettingsData(name, "${prefix}-" + SUFFIX,
                DEFAULT_MAX_ARC_FILE_SIZE, compress, Arrays.asList(files), null));
    }

    // Wrap a string's bytes in a stream. NOTE(review): String.getBytes()
    // uses the platform default charset; fine for this ASCII test content.
    protected static ByteArrayInputStream getBais(String str) throws IOException {
        return new ByteArrayInputStream(str.getBytes());
    }

    /**
     * Writes a record, suppressing normal length-checks (so that 
     * intentionally malformed records may be written). 
     */
    protected static void writeRecord(ARCWriter writer, String url, String type, int len, ByteArrayInputStream bais)
            throws IOException {
        writer.write(url, type, "192.168.1.1", (new Date()).getTime(), len, bais, false);
    }

    /**
     * Iterate every record in the reader, closing each, and check that every
     * record after the first (the ARC file-header record) carries the
     * expected test URL.
     *
     * @param r reader to drain.
     * @return number of records seen.
     */
    protected int iterateRecords(ARCReader r) throws IOException {
        int seen = 0;
        Iterator<ArchiveRecord> it = r.iterator();
        while (it.hasNext()) {
            ARCRecord rec = (ARCRecord) it.next();
            rec.close();
            // Record 0 is the ARC header record; skip its URL check.
            if (seen != 0) {
                assertTrue("Unexpected URL " + rec.getMetaData().getUrl(),
                        rec.getMetaData().getUrl().startsWith(SOME_URL));
            }
            seen++;
        }
        return seen;
    }

    // Make a writer that has already written one well-formed record
    // (SOME_URL, text/html, canned content). Caller closes the writer.
    protected CorruptibleARCWriter createArcWithOneRecord(String name, boolean compressed) throws IOException {
        CorruptibleARCWriter writer = createARCWriter(name, compressed);
        String content = getContent();
        writeRecord(writer, SOME_URL, "text/html", content.length(), getBais(content));
        return writer;
    }

    /**
     * A URL containing a space must be rejected when the metadata line is
     * validated at write time.  (BUGFIX: the original NPE'd on
     * eMessage.startsWith when no exception was thrown; now it fails with a
     * clear message instead.)
     */
    public void testSpaceInURL() {
        String eMessage = null;
        try {
            holeyUrl("testSpaceInURL", false, " ");
            fail("Expected an IOException for the space in the URL");
        } catch (IOException e) {
            eMessage = e.getMessage();
        }
        assertTrue("Didn't get expected exception: " + eMessage,
                eMessage != null && eMessage.startsWith("Metadata line doesn't match"));
    }

    /**
     * A URL containing a tab must likewise be rejected at write time.
     */
    public void testTabInURL() {
        String eMessage = null;
        try {
            holeyUrl("testTabInURL", false, "\t");
            fail("Expected an IOException for the tab in the URL");
        } catch (IOException e) {
            eMessage = e.getMessage();
        }
        assertTrue("Didn't get expected exception: " + eMessage,
                eMessage != null && eMessage.startsWith("Metadata line doesn't match"));
    }

    /**
     * Write a record whose URL has {@code urlInsert} spliced into it (e.g. a
     * space or a tab), after first writing one clean record.  Used to provoke
     * metadata-line validation failures in the writer.
     *
     * @param name base name for the ARC file.
     * @param compress true to gzip the output.
     * @param urlInsert string spliced into the record URL.
     */
    protected void holeyUrl(String name, boolean compress, String urlInsert) throws IOException {
        ARCWriter writer = null;
        try {
            writer = createArcWithOneRecord(name, compress);
            // Write a second record whose URL contains the bad character.
            // (The original comment here — "Add some bytes on the end to mess
            // up the record" — was a copy-paste from lengthTooShort.)
            String content = getContent();
            writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html", content.length(),
                    getBais(content));
        } finally {
            Closeables.close(writer, true);
        }
    }

    // If uncompressed, length has to be right or parse will fail.
    //
    //    public void testLengthTooShort() throws IOException {
    //        lengthTooShort("testLengthTooShort-" + PREFIX, false);
    //    }

    // Non-strict reading should tolerate records whose declared length is
    // shorter than the actual content (trailing junk).
    public void testLengthTooShortCompressed() throws IOException {
        lengthTooShort("testLengthTooShortCompressed", true, false);
    }

    /**
     * Strict reading must raise on the same condition.  (BUGFIX: the
     * original NPE'd on eMessage.startsWith when no exception was thrown;
     * now it fails with a clear message instead.)
     */
    public void testLengthTooShortCompressedStrict() throws IOException {
        String eMessage = null;
        try {
            lengthTooShort("testLengthTooShortCompressedStrict", true, true);
            fail("Expected a RuntimeException in strict mode");
        } catch (RuntimeException e) {
            eMessage = e.getMessage();
        }
        assertTrue("Didn't get expected exception: " + eMessage,
                eMessage != null && eMessage.startsWith("java.io.IOException: Record STARTING at"));
    }

    /**
     * Write an ARC containing records followed by trailing junk bytes (via
     * CorruptibleARCWriter), then read it back while capturing System.err.
     * Non-strict reading should emit a WARNING about the junk; strict
     * reading is expected (by the caller) to raise instead.
     *
     * @param name base name for the ARC file.
     * @param compress true to gzip the output.
     * @param strict true to put the reader in strict mode.
     */
    protected void lengthTooShort(String name, boolean compress, boolean strict) throws IOException {
        CorruptibleARCWriter writer = null;
        try {
            writer = createArcWithOneRecord(name, compress);
            // Add some bytes on the end to mess up the record.
            String content = getContent();
            ByteArrayInputStream bais = getBais(content + "SOME TRAILING BYTES");
            writeRecord(writer, SOME_URL, "text/html", content.length(), bais);
            // Have the writer itself append junk after every later record too.
            writer.setEndJunk("SOME TRAILING BYTES".getBytes());
            writeRecord(writer, SOME_URL, "text/html", content.length(), getBais(content));
        } finally {
            Closeables.close(writer, true);
        }

        // Catch System.err into a byte stream.
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        PrintStream origErr = System.err;
        ARCReader r = null;
        try {
            System.setErr(new PrintStream(os));

            r = ARCReaderFactory.get(writer.getFile());
            r.setStrict(strict);
            // Header record + 3 content records written above.
            int count = iterateRecords(r);
            assertTrue("Count wrong " + count, count == 4);

            // Make sure we get the warning string which complains about the
            // trailing bytes.
            String err = os.toString();
            assertTrue("No message " + err, err.startsWith("WARNING") && (err.indexOf("Record STARTING at") > 0));
            // NOTE(review): r is closed again by Closeables.close() in the
            // finally block below; this explicit close is redundant.
            r.close();
        } finally {
            Closeables.close(r, true);
            // Always restore the real System.err, even on failure.
            System.setErr(origErr);
        }
    }

    //  If uncompressed, length has to be right or parse will fail.
    //
    //    public void testLengthTooLong()
    //    throws IOException {
    //        lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
    //            false, false);
    //    }

    // Non-strict reading tolerates a record whose declared length overruns
    // its actual content.
    public void testLengthTooLongCompressed() throws IOException {
        lengthTooLong("testLengthTooLongCompressed", true, false);
    }

    /**
     * Strict reading must fail with a premature-EOF error.  (BUGFIX: the
     * original reused the non-strict test's base name — a copy-paste slip —
     * and NPE'd on eMessage.startsWith if no exception was thrown.)
     */
    public void testLengthTooLongCompressedStrict() {
        String eMessage = null;
        try {
            lengthTooLong("testLengthTooLongCompressedStrict", true, true);
            fail("Expected an IOException in strict mode");
        } catch (IOException e) {
            eMessage = e.getMessage();
        }
        assertTrue("Didn't get expected exception: " + eMessage,
                eMessage != null && eMessage.startsWith("Premature EOF before end-of-record"));
    }

    /**
     * Write an ARC whose second record declares a length 10 bytes longer
     * than its content, then read it back while capturing System.err.
     * Non-strict reading should emit a premature-EOF WARNING; strict reading
     * is expected (by the caller) to raise instead.
     *
     * @param name base name for the ARC file.
     * @param compress true to gzip the output.
     * @param strict true to put the reader in strict mode.
     */
    protected void lengthTooLong(String name, boolean compress, boolean strict) throws IOException {
        ARCWriter writer = null;
        try {
            writer = createArcWithOneRecord(name, compress);
            // Add a record with a length that is too long.
            String content = getContent();
            writeRecord(writer, SOME_URL + "2", "text/html", content.length() + 10, getBais(content));
            writeRecord(writer, SOME_URL + "3", "text/html", content.length(), getBais(content));
        } finally {
            // BUGFIX: the original only closed the writer on the success
            // path, leaking it if a write threw.  Mirror lengthTooShort()'s
            // try/finally pattern.
            Closeables.close(writer, true);
        }

        // Catch System.err.
        ByteArrayOutputStream os = new ByteArrayOutputStream();

        PrintStream origErr = System.err;
        ARCReader r = null;
        try {
            System.setErr(new PrintStream(os));

            r = ARCReaderFactory.get(writer.getFile());
            r.setStrict(strict);
            // Header record + 3 content records written above.
            int count = iterateRecords(r);
            assertTrue("Count wrong " + count, count == 4);

            // Make sure we get the warning string which complains about the
            // trailing bytes.
            String err = os.toString();
            assertTrue("No message " + err, err.startsWith("WARNING Premature EOF before end-of-record"));
        } finally {
            Closeables.close(r, true);
            // Always restore the real System.err, even on failure.
            System.setErr(origErr);
        }
    }

    /**
     * A source stream that still reports bytes remaining after the declared
     * record length has been consumed must make the writer raise a "gap"
     * error mentioning "Gap between expected and actual".
     */
    public void testGapError() throws IOException {
        ARCWriter writer = createArcWithOneRecord("testGapError", true);
        String content = getContent();
        // Make a 'weird' RIS that returns bad 'remaining' length
        // when remaining should be 0.
        ReplayInputStream ris = new ReplayInputStream(content.getBytes(), content.length(), null) {
            public long remaining() {
                // Report -1 instead of 0 once the stream is exhausted.
                return (super.remaining() == 0) ? -1 : super.remaining();
            }
        };
        String message = null;
        try {
            writer.write(SOME_URL, "text/html", "192.168.1.1", (new Date()).getTime(), content.length(), ris);
        } catch (IOException e) {
            message = e.getMessage();
        } finally {
            IOUtils.closeQuietly(ris);
        }
        writer.close();
        assertTrue("No gap when should be",
                message != null && message.indexOf("Gap between expected and actual") >= 0);
    }

    /**
     * Write an arc file for other tests to use.
     * @param arcdir Directory to write to.
     * @param compress True if file should be compressed.
     * @return ARC written.
     * @throws IOException 
     */
    public static File createARCFile(File arcdir, boolean compress) throws IOException {
        File[] files = { arcdir };
        ARCWriter writer = new ARCWriter(SERIAL_NO, new WriterPoolSettingsData("", "test",
                DEFAULT_MAX_ARC_FILE_SIZE, compress, Arrays.asList(files), null));
        String content = getContent();
        writeRecord(writer, SOME_URL, "text/html", content.length(), getBais(content));
        // NOTE(review): getFile() is called after close(); presumably close()
        // finalizes the file name — confirm against WriterPoolMember.
        writer.close();
        return writer.getFile();
    }

    //    public void testSpeed() throws IOException {
    //        ARCWriter writer = createArcWithOneRecord("speed", true);
    //        // Add a record with a length that is too long.
    //        String content = getContent();
    //        final int count = 100000;
    //        logger.info("Starting speed write of " + count + " records.");
    //        for (int i = 0; i < count; i++) {
    //            writeRecord(writer, SOME_URL, "text/html", content.length(),
    //                    getBaos(content));
    //        }
    //        writer.close();
    //        logger.info("Finished speed write test.");
    //    }

    // A well-formed metadata line (url ip date mimetype length) must pass
    // validation bare and with a trailing line separator.
    public void testValidateMetaLine() throws Exception {
        final String line = "http://www.aandw.net/images/walden2.png "
                + "128.197.34.86 20060111174224 image/png 2160";
        ARCWriter w = createARCWriter("testValidateMetaLine", true);
        try {
            w.validateMetaLine(line);
            w.validateMetaLine(line + LINE_SEPARATOR);
            // NOTE(review): "\\r\\n" is the four literal characters \r\n, not
            // a CRLF — confirm this is intended and not a typo for "\r\n".
            w.validateMetaLine(line + "\\r\\n");
        } finally {
            w.close();
        }
    }

    /**
     * Read from a record into an odd offset of a buffer — a regression test
     * for bad buffer math in ARCRecord reads.
     */
    public void testArcRecordOffsetReads() throws Exception {
        ARCReader r = getSingleRecordReader("testArcRecordInBufferStream");
        ARCRecord ar = getSingleRecord(r);
        // Now try getting some random set of bytes out of it 
        // at an odd offset (used to fail because we were
        // doing bad math to find where in buffer to read).
        final byte[] buffer = new byte[17];
        final int maxRead = 4;
        int totalRead = 0;
        while (totalRead < maxRead) {
            // BUGFIX: the original added read()'s return value blindly, so an
            // EOF (-1) would shrink totalRead and produce a confusing
            // failure; check the return value before accumulating.
            int got = ar.read(buffer, 13 + totalRead, maxRead - totalRead);
            assertTrue("Premature EOF after " + totalRead + " bytes", got > 0);
            totalRead += got;
        }
        r.close();
    }

    // available should always be >= 0; extra read()s should all give EOF
    public void testArchiveRecordAvailableConsistent() throws Exception {
        // first test reading byte-at-a-time via no-param read()
        ARCReader r = getSingleRecordReader("testArchiveRecordAvailableConsistent");
        ARCRecord record = getSingleRecord(r);
        // Drain the record one byte at a time until EOF.
        while (record.read() >= 0) {
            // nothing to do; just consuming bytes
        }
        // consecutive reads after EOR should always give -1, still show zero available()
        for (int attempt = 0; attempt < 5; attempt++) {
            assertTrue("available negative:" + record.available(), record.available() >= 0);
            assertEquals(-1, record.read());
        }
        r.close();
    }

    // should always give -1 on repeated reads past EOR
    public void testArchiveRecordEORConsistent() throws Exception {
        ARCReader r = getSingleRecordReader("testArchiveRecordEORConsistent");
        ARCRecord record = getSingleRecord(r);
        this.readToEOS(record);
        // consecutive reads after EOR should always give -1
        for (int i = 0; i < 5; i++) {
            assertEquals(-1, record.read(new byte[1]));
        }
        r.close();
    }

    // should not throw premature EOF when wrapped with BufferedInputStream
    // [HER-1450] showed this was the case using Apache Tika
    public void testArchiveRecordMarkSupport() throws Exception {
        ARCReader r = getSingleRecordReader("testArchiveRecordMarkSupport");
        ARCRecord record = getSingleRecord(r);
        // Strict mode so a premature EOF would surface as an error.
        record.setStrict(true);
        // ensure mark support
        InputStream stream = new BufferedInputStream(record);
        if (stream.markSupported()) {
            // Repeatedly drain, mark, and reset; none of the passes should
            // trigger a premature-EOF failure.
            for (int i = 0; i < 3; i++) {
                this.readToEOS(stream);
                stream.mark(stream.available());
                stream.reset();
            }
            stream.close();
        }
        r.close();
    }

    /**
     * Test a particular style of using the reader iterator. (Should
     * possibly be on a reader-centric test class, but the best setup 
     * functionality is here.)
     * 
     * @throws IOException
     */
    public void testReadIterator() throws IOException {
        final int recordCount = 3;
        File arcFile = writeRecords("writeRecord", true, DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
        ARCReader reader = ARCReaderFactory.get(arcFile);
        // Walk every record, closing each one as we go.
        for (Iterator<ArchiveRecord> records = reader.iterator(); records.hasNext();) {
            records.next().close();
        }
        reader.close();
    }

    /**
     * Consume the given stream until end-of-stream, discarding the bytes.
     *
     * @param in stream to drain.
     */
    protected void readToEOS(InputStream in) throws Exception {
        final byte[] scratch = new byte[1024];
        while (in.read(scratch) != -1) {
            // discard; we only care about reaching end-of-stream
        }
    }

    /**
     * Create a closed, compressed, single-record ARC and return a reader
     * opened on it.
     */
    protected ARCReader getSingleRecordReader(String name) throws Exception {
        // Get an ARC with one record.
        WriterPoolMember w = createArcWithOneRecord(name, true);
        w.close();
        // Get reader on said ARC.
        ARCReader r = ARCReaderFactory.get(w.getFile());
        return r;
    }

    /**
     * Skip the leading ARC file-header record and return the single content
     * record that follows it.
     */
    protected ARCRecord getSingleRecord(ARCReader r) {
        final Iterator<ArchiveRecord> i = r.iterator();
        // Skip first ARC meta record.
        i.next();
        // BUGFIX: the original called hasNext() and dropped the result;
        // assert it so a header-only ARC fails with a clear message instead
        // of an exception out of next().
        assertTrue("No record after the ARC header record", i.hasNext());
        // Now we're at first and only record in ARC.
        return (ARCRecord) i.next();
    }
}