com.cyberway.issue.io.arc.ARCWriterTest.java Source code

Java tutorial

Introduction

Here is the source code for com.cyberway.issue.io.arc.ARCWriterTest.java

Source

/* ARCWriterTest
 *
 * $Id: ARCWriterTest.java 5478 2007-09-19 01:37:07Z gojomo $
 *
 * Created on Dec 31, 2003.
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.cyberway.issue.io.arc;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.NullInputStream;
import org.apache.commons.io.output.NullOutputStream;
import com.cyberway.issue.io.ArchiveRecord;
import com.cyberway.issue.io.ReplayInputStream;
import com.cyberway.issue.io.WriterPoolMember;
import com.cyberway.issue.util.ArchiveUtils;
import com.cyberway.issue.util.FileUtils;
import com.cyberway.issue.util.TmpDirTestCase;

/**
 * Test ARCWriter class.
 *
 * This code exercises ARCWriter AND ARCReader.  First it writes ARCs w/
 * ARCWriter.  Then it validates what was written w/ ARCReader.
 *
 * @author stack
 */
public class ARCWriterTest extends TmpDirTestCase implements ARCConstants {
    /**
     * ARCWriter variant for producing deliberately bad ARCs: once junk bytes
     * have been set, they are written out from {@link #postWriteRecordTasks()}
     * before the superclass post-write work runs.
     */
    public class CorruptibleARCWriter extends ARCWriter {
        /** Junk to emit at the end of a record; {@code null} means none. */
        byte[] endJunk = null;

        public CorruptibleARCWriter(AtomicInteger serial_no, List<File> name, String name2, boolean compress,
                long default_max_arc_file_size) {
            super(serial_no, name, name2, compress, default_max_arc_file_size);
        }

        /**
         * Sets (or, with {@code null}, clears) the trailing junk.
         *
         * @param b Bytes to write during the post-write step of later records.
         * @throws IOException Declared for callers; not thrown by this body.
         */
        public void setEndJunk(byte[] b) throws IOException {
            endJunk = b;
        }

        @Override
        protected void postWriteRecordTasks() throws IOException {
            final byte[] junk = this.endJunk;
            if (junk != null) {
                this.write(junk);
            }
            super.postWriteRecordTasks();
        }
    }

    /**
     * Marker used in the names of ARC files made by these JUnit tests.
     * Despite the TODO below mentioning a prefix, the tests in this class
     * append it (after a '-') to a per-test base name.
     */
    private static final String SUFFIX = /* TODO DEFAULT_ARC_FILE_PREFIX*/ "JUNIT";

    /** URL given to the records written by the malformed-record tests. */
    private static final String SOME_URL = "http://www.archive.org/test/";

    /** Serial number counter handed to every ARCWriter created in this class. */
    private static final AtomicInteger SERIAL_NO = new AtomicInteger();

    /*
     * Nothing test-specific to prepare; simply delegates to the superclass.
     *
     * @see TestCase#setUp()
     */
    protected void setUp() throws Exception {
        super.setUp();
    }

    /*
     * Nothing test-specific to release; simply delegates to the superclass.
     *
     * @see TestCase#tearDown()
     */
    protected void tearDown() throws Exception {
        super.tearDown();
    }

    /**
     * @return Canned HTTP response (headers plus a tiny HTML page) titled
     * "Some Page".
     */
    protected static String getContent() {
        return getContent(null);
    }

    /**
     * Builds a canned HTTP response: a status line, a Content-Type header and
     * a minimal HTML page whose title and body carry the same label.
     *
     * @param indexStr Page index to embed in the label, or {@code null} for
     * the generic "Some Page" label.
     * @return The response as a single string.
     */
    protected static String getContent(String indexStr) {
        final String label;
        if (indexStr == null) {
            label = "Some Page";
        } else {
            label = "Page #" + indexStr;
        }
        final StringBuilder response = new StringBuilder();
        response.append("HTTP/1.1 200 OK\r\n");
        response.append("Content-Type: text/html\r\n\r\n");
        response.append("<html><head><title>").append(label).append("</title></head>");
        response.append("<body>").append(label).append("</body></html>");
        return response.toString();
    }

    /**
     * Writes one canned HTTP record, followed by a newline, to the passed
     * writer.
     *
     * @param arcWriter Writer to add the record to.
     * @param index Number embedded in the record URL and page content.
     * @return Length of what was written: the content bytes plus the one
     * trailing newline.
     * @throws IOException
     */
    protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index) throws IOException {
        final String id = Integer.toString(index);
        // The current 14-digit date string, handed on to the writer as a long.
        final long timestamp = Long.parseLong(ArchiveUtils.get14DigitDate());
        final byte[] body = getContent(id).getBytes();
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        buffer.write(body);
        // Add the newline between records back in
        buffer.write("\n".getBytes());
        final int recordLength = body.length + 1;
        arcWriter.write("http://www.one.net/id=" + id, "text/html", "0.1.2.3", timestamp, recordLength, buffer);
        return recordLength;
    }

    /**
     * Writes an ARC of canned HTTP records into the test tmp dir.
     *
     * @param baseName Base of the ARC name; '-' and SUFFIX are appended.
     * @param compress True if the ARC should be compressed.
     * @param maxSize Maximum ARC file size passed to the writer.
     * @param recordCount How many records to write.
     * @return The file the writer reports after being closed.
     * @throws IOException
     */
    private File writeRecords(String baseName, boolean compress, long maxSize, int recordCount) throws IOException {
        cleanUpOldFiles(baseName);
        File[] files = { getTmpDir() };
        ARCWriter arcWriter = new ARCWriter(SERIAL_NO, Arrays.asList(files), baseName + '-' + SUFFIX, compress,
                maxSize);
        try {
            for (int i = 0; i < recordCount; i++) {
                writeRandomHTTPRecord(arcWriter, i);
            }
        } finally {
            // Close even when a write fails so the writer is not left open.
            arcWriter.close();
        }
        File arcFile = arcWriter.getFile();
        assertTrue("Doesn't exist: " + arcFile.getAbsolutePath(), arcFile.exists());
        return arcFile;
    }

    /**
     * Validates an ARC with ARCReader, then re-reads every record by absolute
     * offset going from the last record back to the first.
     *
     * @param arcFile ARC file to check.
     * @param recordCount Expected number of records as reported by the
     * reader, or -1 when the count is not known in advance (in which case
     * the count is not asserted).
     * @throws FileNotFoundException
     * @throws IOException
     */
    private void validate(File arcFile, int recordCount) throws FileNotFoundException, IOException {
        final boolean countKnown = recordCount != -1;
        List metaDatas = null;
        ARCReader reader = ARCReaderFactory.get(arcFile);
        assertNotNull(reader);
        try {
            if (countKnown) {
                metaDatas = reader.validate(recordCount);
            } else {
                metaDatas = reader.validate();
            }
        } finally {
            reader.close();
        }
        // Now, run through each of the records doing absolute get going from
        // the end to start.  Reopen the arc so no context between this test
        // and the previous.
        reader = ARCReaderFactory.get(arcFile);
        try {
            for (int i = metaDatas.size() - 1; i >= 0; i--) {
                ARCRecordMetaData meta = (ARCRecordMetaData) metaDatas.get(i);
                ArchiveRecord r = reader.get(meta.getOffset());
                String mimeType = r.getHeader().getMimetype();
                assertTrue("Record is bogus", mimeType != null && mimeType.length() > 0);
            }
        } finally {
            reader.close();
        }
        // The -1 sentinel means "count unknown"; comparing the size against
        // it could never succeed, so only check when a real count was given.
        if (countKnown) {
            assertEquals("Metadatas not equal", recordCount, metaDatas.size());
        }
        for (Iterator i = metaDatas.iterator(); i.hasNext();) {
            ARCRecordMetaData r = (ARCRecordMetaData) i.next();
            assertTrue("Record is empty", r.getLength() > 0);
        }
    }

    /** Runs the ARC file size check with compression switched off. */
    public void testCheckARCFileSize() throws IOException {
        final boolean compress = false;
        runCheckARCFileSizeTest("checkARCFileSize", compress);
    }

    /** Runs the ARC file size check with compression switched on. */
    public void testCheckARCFileSizeCompressed() throws IOException {
        final boolean compress = true;
        runCheckARCFileSizeTest("checkARCFileSize", compress);
    }

    /** Writes two records to an uncompressed ARC and validates the result. */
    public void testWriteRecord() throws IOException {
        final int written = 2;
        // One more than written: the reader also reports the ARC header record.
        final int expected = written + 1;
        final File arc = writeRecords("writeRecord", false, DEFAULT_MAX_ARC_FILE_SIZE, written);
        validate(arc, expected);
    }

    /**
     * Checks random access into a compressed ARC: finds the offset of the
     * second record, opens a reader at that offset and verifies both a single
     * get and iteration from there.
     *
     * @throws IOException
     */
    public void testRandomAccess() throws IOException {
        final int recordCount = 3;
        File arcFile = writeRecords("writeRecord", true, DEFAULT_MAX_ARC_FILE_SIZE, recordCount);

        // Walk the whole ARC once, noting URL and offset of the second record
        // (index 1) and counting every record seen.
        String url = null;
        long offset = -1;
        long totalRecords = 0;
        ARCReader reader = ARCReaderFactory.get(arcFile);
        try {
            for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {
                ARCRecord ar = (ARCRecord) i.next();
                if (totalRecords == 1) {
                    url = ar.getMetaData().getUrl();
                    offset = ar.getMetaData().getOffset();
                }
            }
        } finally {
            reader.close();
        }
        assertTrue("ARC has no second record", offset >= 0);

        // A reader opened at the offset must hand back that same record.
        reader = ARCReaderFactory.get(arcFile, offset);
        try {
            ArchiveRecord ar = reader.get();
            assertEquals(url, ar.getHeader().getUrl());
            ar.close();
        } finally {
            reader.close();
        }

        // Get reader again.  See how iterator works with offset
        int count = 0;
        reader = ARCReaderFactory.get(arcFile, offset);
        try {
            for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) {
                count++;
            }
        } finally {
            reader.close();
        }
        // Everything but the first record should have been iterated.
        assertEquals(totalRecords - 1, count);
    }

    /** Writes two records to a compressed ARC and validates the result. */
    public void testWriteRecordCompressed() throws IOException {
        final int written = 2;
        // One more than written: the reader also reports the ARC header record.
        final int expected = written + 1;
        final File arc = writeRecords("writeRecordCompressed", true, DEFAULT_MAX_ARC_FILE_SIZE, written);
        validate(arc, expected);
    }

    /**
     * Writes a single 3GB record to a writer backed by a null output stream,
     * with the record body coming from a NullInputStream of that size.
     *
     * @throws IOException
     */
    public void testWriteGiantRecord() throws IOException {
        PrintStream dummyStream = new PrintStream(new NullOutputStream());
        ARCWriter arcWriter = new ARCWriter(SERIAL_NO, dummyStream, new File("dummy"), false, null, null);
        try {
            long now = System.currentTimeMillis();
            // Kept as a long: 3GB does not fit in an int.
            long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3;

            arcWriter.write("dummy:uri", "application/octet-stream", "0.1.2.3", now, recordLength,
                    new NullInputStream(recordLength));
        } finally {
            // Close even when the write fails.
            arcWriter.close();
        }
    }

    /**
     * Writes 15 records with a 1024 maximum ARC size, then validates each
     * file that FileUtils.getFilesWithPrefix returns for the tmp dir.
     *
     * NOTE(review): the ARCs are named baseName + '-' + SUFFIX, yet the
     * lookup passes SUFFIX as the prefix -- confirm against
     * FileUtils.getFilesWithPrefix that any file is actually matched here.
     *
     * @param baseName Base name for the ARCs written.
     * @param compress True if the ARCs should be compressed.
     * @throws FileNotFoundException
     * @throws IOException
     */
    private void runCheckARCFileSizeTest(String baseName, boolean compress)
            throws FileNotFoundException, IOException {
        final int maxArcSize = 1024;
        final int records = 15;
        writeRecords(baseName, compress, maxArcSize, records);
        // Now validate all files just created.
        for (File candidate : FileUtils.getFilesWithPrefix(getTmpDir(), SUFFIX)) {
            validate(candidate, -1);
        }
    }

    /**
     * Creates a corruptible writer that writes into the test tmp dir.
     *
     * @param NAME Name passed through to the writer.
     * @param compress True if the ARC should be compressed.
     * @return New writer using the default maximum ARC file size.
     */
    protected CorruptibleARCWriter createARCWriter(String NAME, boolean compress) {
        final List<File> dirs = Arrays.asList(new File[] { getTmpDir() });
        return new CorruptibleARCWriter(SERIAL_NO, dirs, NAME, compress, DEFAULT_MAX_ARC_FILE_SIZE);
    }

    /**
     * Wraps the bytes of a string (platform default charset) in a stream.
     *
     * @param str String to expose as a stream.
     * @return Stream over the string's bytes.
     * @throws IOException Declared for callers; not thrown by this body.
     */
    protected static ByteArrayInputStream getBais(String str) throws IOException {
        final byte[] raw = str.getBytes();
        return new ByteArrayInputStream(raw);
    }

    /**
     * Writes a record stamped with the current time and a fixed IP, passing
     * {@code false} as the writer's final argument so that normal
     * length-checks are suppressed and intentionally malformed records may be
     * written.
     *
     * @param writer Writer to add the record to.
     * @param url Record URL.
     * @param type Record mime type.
     * @param len Length to declare for the record.
     * @param bais Record content.
     * @throws IOException
     */
    protected static void writeRecord(ARCWriter writer, String url, String type, int len, ByteArrayInputStream bais)
            throws IOException {
        final long fetchTime = System.currentTimeMillis();
        final String ip = "192.168.1.1";
        writer.write(url, type, ip, fetchTime, len, bais, false);
    }

    /**
     * Iterates over every record of the reader, closing each one, and checks
     * that every record after the first carries SOME_URL.
     *
     * @param r Reader to iterate.
     * @return Number of records iterated.
     * @throws IOException
     */
    protected int iterateRecords(ARCReader r) throws IOException {
        int seen = 0;
        final Iterator records = r.iterator();
        while (records.hasNext()) {
            final ARCRecord current = (ARCRecord) records.next();
            current.close();
            // The first record is exempt from the URL check.
            if (seen > 0) {
                final String url = current.getMetaData().getUrl();
                assertTrue("Unexpected URL " + url, url.equals(SOME_URL));
            }
            seen++;
        }
        return seen;
    }

    /**
     * Creates a corruptible writer and writes one well-formed record for
     * SOME_URL to it. The writer is returned still open.
     *
     * @param name Name for the ARC.
     * @param compressed True if the ARC should be compressed.
     * @return The open writer.
     * @throws IOException
     */
    protected CorruptibleARCWriter createArcWithOneRecord(String name, boolean compressed) throws IOException {
        final String body = getContent();
        final CorruptibleARCWriter arc = createARCWriter(name, compressed);
        writeRecord(arc, SOME_URL, "text/html", body.length(), getBais(body));
        return arc;
    }

    /**
     * Writing a record whose URL contains a space must fail with an
     * IOException complaining about the metadata line.
     */
    public void testSpaceInURL() {
        String eMessage = null;
        try {
            holeyUrl("testSpaceInURL-" + SUFFIX, false, " ");
        } catch (IOException e) {
            eMessage = e.getMessage();
        }
        // Null-check first: with no exception (or one without a message) this
        // must be a test failure, not a NullPointerException.
        assertTrue("Didn't get expected exception: " + eMessage,
                eMessage != null && eMessage.startsWith("Metadata line doesn't match"));
    }

    /**
     * Writing a record whose URL contains a tab must fail with an
     * IOException complaining about the metadata line.
     */
    public void testTabInURL() {
        String eMessage = null;
        try {
            holeyUrl("testTabInURL-" + SUFFIX, false, "\t");
        } catch (IOException e) {
            eMessage = e.getMessage();
        }
        // Null-check first: with no exception (or one without a message) this
        // must be a test failure, not a NullPointerException.
        assertTrue("Didn't get expected exception: " + eMessage,
                eMessage != null && eMessage.startsWith("Metadata line doesn't match"));
    }

    /**
     * Writes one good record and then a record whose URL has the passed
     * string inserted into it.
     *
     * @param name Name for the ARC.
     * @param compress True if the ARC should be compressed.
     * @param urlInsert Text to splice into the URL of the second record.
     * @throws IOException If the write (or, after a successful write, the
     * close) fails.
     */
    protected void holeyUrl(String name, boolean compress, String urlInsert) throws IOException {
        ARCWriter writer = createArcWithOneRecord(name, compress);
        String content = getContent();
        boolean written = false;
        try {
            // Second record gets the URL with the insert spliced in.
            writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html", content.length(),
                    getBais(content));
            written = true;
        } finally {
            if (written) {
                writer.close();
            } else {
                // The write failed: still release the writer, but let the
                // write's exception (the one callers inspect) propagate.
                try {
                    writer.close();
                } catch (IOException ignored) {
                    // Secondary failure; the original exception matters more.
                }
            }
        }
    }

    // If uncompressed, length has to be right or parse will fail.
    //
    //    public void testLengthTooShort() throws IOException {
    //        lengthTooShort("testLengthTooShort-" + PREFIX, false);
    //    }

    /** Too-short declared length, compressed ARC, reader not strict. */
    public void testLengthTooShortCompressed() throws IOException {
        final boolean compress = true;
        final boolean strict = false;
        lengthTooShort("testLengthTooShortCompressed-" + SUFFIX, compress, strict);
    }

    /**
     * Too-short declared length, compressed ARC, strict reader: expects a
     * RuntimeException whose message reports the wrapped IOException.
     *
     * @throws IOException
     */
    public void testLengthTooShortCompressedStrict() throws IOException {
        String eMessage = null;
        try {
            lengthTooShort("testLengthTooShortCompressedStrict-" + SUFFIX, true, true);
        } catch (RuntimeException e) {
            eMessage = e.getMessage();
        }
        // Null-check first: with no exception (or one without a message) this
        // must be a test failure, not a NullPointerException.
        assertTrue("Didn't get expected exception: " + eMessage,
                eMessage != null && eMessage.startsWith("java.io.IOException: Record ENDING at"));
    }

    /**
     * Writes an ARC whose records carry more bytes than their declared
     * length, reads it back and checks the record count and the warning
     * printed to System.err.
     *
     * @param name Name for the ARC.
     * @param compress True if the ARC should be compressed.
     * @param strict Value handed to the reader's setStrict.
     * @throws IOException
     */
    protected void lengthTooShort(String name, boolean compress, boolean strict) throws IOException {
        CorruptibleARCWriter writer = createArcWithOneRecord(name, compress);
        String content = getContent();
        // Record whose stream holds more bytes than the declared length.
        ByteArrayInputStream bais = getBais(content + "SOME TRAILING BYTES");
        writeRecord(writer, SOME_URL, "text/html", content.length(), bais);
        // From here on the writer also emits junk in its post-write step.
        writer.setEndJunk("SOME TRAILING BYTES".getBytes());
        writeRecord(writer, SOME_URL, "text/html", content.length(), getBais(content));
        writer.close();

        // Catch System.err into a byte stream, restoring the real stream
        // afterwards so later tests are not left writing into our buffer.
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        final PrintStream originalErr = System.err;
        System.setErr(new PrintStream(os));
        try {
            ARCReader r = ARCReaderFactory.get(writer.getFile());
            r.setStrict(strict);
            int count = iterateRecords(r);
            r.close();
            assertTrue("Count wrong " + count, count == 4);
        } finally {
            System.setErr(originalErr);
        }

        // Make sure we get the warning string which complains about the
        // trailing bytes.
        String err = os.toString();
        assertTrue("No message " + err, err.startsWith("WARNING") && (err.indexOf("Record ENDING at") > 0));
    }

    //  If uncompressed, length has to be right or parse will fail.
    //
    //    public void testLengthTooLong()
    //    throws IOException {
    //        lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
    //            false, false);
    //    }

    /** Too-long declared length, compressed ARC, reader not strict. */
    public void testLengthTooLongCompressed() throws IOException {
        final boolean compress = true;
        final boolean strict = false;
        lengthTooLong("testLengthTooLongCompressed-" + SUFFIX, compress, strict);
    }

    /**
     * Too-long declared length, compressed ARC, strict reader: expects an
     * IOException reporting a premature EOF.
     */
    public void testLengthTooLongCompressedStrict() {
        String eMessage = null;
        try {
            lengthTooLong("testLengthTooLongCompressed-" + SUFFIX, true, true);
        } catch (IOException e) {
            eMessage = e.getMessage();
        }
        // Null-check first: with no exception (or one without a message) this
        // must be a test failure, not a NullPointerException.
        assertTrue("Didn't get expected exception: " + eMessage,
                eMessage != null && eMessage.startsWith("Premature EOF before end-of-record"));
    }

    /**
     * Writes an ARC containing a record whose declared length is 10 bytes
     * more than its content, reads it back and checks the record count and
     * the warning printed to System.err.
     *
     * @param name Name for the ARC.
     * @param compress True if the ARC should be compressed.
     * @param strict Value handed to the reader's setStrict.
     * @throws IOException
     */
    protected void lengthTooLong(String name, boolean compress, boolean strict) throws IOException {
        ARCWriter writer = createArcWithOneRecord(name, compress);
        // Add a record with a length that is too long.
        String content = getContent();
        writeRecord(writer, SOME_URL, "text/html", content.length() + 10, getBais(content));
        writeRecord(writer, SOME_URL, "text/html", content.length(), getBais(content));
        writer.close();

        // Catch System.err, restoring the real stream afterwards so later
        // tests are not left writing into our buffer.
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        final PrintStream originalErr = System.err;
        System.setErr(new PrintStream(os));
        try {
            ARCReader r = ARCReaderFactory.get(writer.getFile());
            r.setStrict(strict);
            int count = iterateRecords(r);
            r.close();
            assertTrue("Count wrong " + count, count == 4);
        } finally {
            System.setErr(originalErr);
        }

        // Make sure we get the warning string which complains about the
        // premature end of the over-long record.
        String err = os.toString();
        assertTrue("No message " + err, err.startsWith("WARNING Premature EOF before end-of-record"));
    }

    /**
     * Feeds the writer a ReplayInputStream that misreports its remaining
     * length and expects the write to fail with a "Gap between expected and
     * actual" message.
     *
     * @throws IOException
     */
    public void testGapError() throws IOException {
        final ARCWriter writer = createArcWithOneRecord("testGapError", true);
        final String content = getContent();
        // Make a 'weird' RIS that returns a bad 'remaining' length (-1)
        // when remaining should be 0
        final ReplayInputStream ris = new ReplayInputStream(content.getBytes(), content.length(), null) {
            public long remaining() {
                if (super.remaining() == 0) {
                    return -1;
                }
                return super.remaining();
            }
        };
        String message = null;
        try {
            final long fetchTime = (new Date()).getTime();
            writer.write(SOME_URL, "text/html", "192.168.1.1", fetchTime, content.length(), ris);
        } catch (IOException e) {
            message = e.getMessage();
        } finally {
            IOUtils.closeQuietly(ris);
        }
        writer.close();
        final boolean sawGap = message != null && message.indexOf("Gap between expected and actual") >= 0;
        assertTrue("No gap when should be", sawGap);
    }

    /**
     * Write an arc file, named "test" and holding one record for SOME_URL,
     * for other tests to use.
     *
     * @param arcdir Directory to write to.
     * @param compress True if file should be compressed.
     * @return ARC written.
     * @throws IOException
     */
    public static File createARCFile(File arcdir, boolean compress) throws IOException {
        final List<File> dirs = Arrays.asList(new File[] { arcdir });
        final ARCWriter arc = new ARCWriter(SERIAL_NO, dirs, "test", compress, DEFAULT_MAX_ARC_FILE_SIZE);
        final String body = getContent();
        writeRecord(arc, SOME_URL, "text/html", body.length(), getBais(body));
        arc.close();
        return arc.getFile();
    }

    //    public void testSpeed() throws IOException {
    //        ARCWriter writer = createArcWithOneRecord("speed", true);
    //        // Add a record with a length that is too long.
    //        String content = getContent();
    //        final int count = 100000;
    //        logger.info("Starting speed write of " + count + " records.");
    //        for (int i = 0; i < count; i++) {
    //            writeRecord(writer, SOME_URL, "text/html", content.length(),
    //                    getBaos(content));
    //        }
    //        writer.close();
    //        logger.info("Finished speed write test.");
    //    }

    /**
     * Passes a sample metadata line to the writer's validateMetaLine three
     * ways: bare, with LINE_SEPARATOR appended, and with a third suffix.
     *
     * NOTE(review): the third suffix is the four literal characters
     * backslash-r-backslash-n, not a CR/LF pair -- confirm that is intended.
     *
     * @throws Exception
     */
    public void testValidateMetaLine() throws Exception {
        final String line = "http://www.aandw.net/images/walden2.png "
                + "128.197.34.86 20060111174224 image/png 2160";
        final String[] candidates = { line, line + LINE_SEPARATOR, line + "\\r\\n" };
        final ARCWriter w = createARCWriter("testValidateMetaLine", true);
        try {
            for (int i = 0; i < candidates.length; i++) {
                w.validateMetaLine(candidates[i]);
            }
        } finally {
            w.close();
        }
    }

    /**
     * Reads a few bytes of a record into a buffer at an odd offset.
     *
     * @throws Exception
     */
    public void testArcRecordOffsetReads() throws Exception {
        // Get an ARC with one record.
        WriterPoolMember w = createArcWithOneRecord("testArcRecordInBufferStream", true);
        w.close();
        // Get reader on said ARC.
        ARCReader r = ARCReaderFactory.get(w.getFile());
        try {
            final Iterator i = r.iterator();
            // Skip first ARC meta record.
            ARCRecord ar = (ARCRecord) i.next();
            i.hasNext();
            // Now we're at first and only record in ARC.
            ar = (ARCRecord) i.next();
            // Now try getting some random set of bytes out of it
            // at an odd offset (used to fail because we were
            // doing bad math to find where in buffer to read).
            final byte[] buffer = new byte[17];
            final int maxRead = 4;
            int totalRead = 0;
            while (totalRead < maxRead) {
                totalRead = totalRead + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
                assertTrue(totalRead > 0);
            }
        } finally {
            // Release the reader whether or not the read checks passed.
            r.close();
        }
    }
}