Java tutorial: CommonCrawl ARCFileReader tests (org.commoncrawl.util.shared.ArcFileReaderTests)
package org.commoncrawl.util.shared;

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.zip.GZIPOutputStream;

import junit.framework.Assert;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.Text;
import org.commoncrawl.crawl.common.shared.Constants;
import org.commoncrawl.io.shared.NIOHttpHeaders;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.util.shared.ByteArrayUtils;
import org.commoncrawl.util.shared.CCStringUtils;
import org.commoncrawl.util.shared.GZIPUtils;
import org.commoncrawl.util.shared.GZIPUtils.UnzipResult;
import org.commoncrawl.util.shared.IPAddressUtils;
import org.commoncrawl.util.shared.Tuples.Pair;
import org.junit.Test;

import com.google.common.collect.Lists;

/**
 * ARCFileReader tests
 *
 * @author rana
 */
public class ArcFileReaderTests {

  private static final Log LOG = LogFactory.getLog(ArcFileReaderTests.class);

  static String getMetaLine(String uri, String arcFileName, String contentType,
      String hostIP, long fetchBeginTimeStamp, long recordLength) throws IOException {
    if (fetchBeginTimeStamp <= 0) {
      throw new IOException("Bogus fetchBeginTimestamp: " + Long.toString(fetchBeginTimeStamp));
    }
    return createMetaline(uri, arcFileName, hostIP,
        TIMESTAMP14.format(new Date(fetchBeginTimeStamp)), contentType,
        Long.toString(recordLength));
  }

  static SimpleDateFormat TIMESTAMP14 = new SimpleDateFormat("yyyyMMddHHmmss");

  static final char   HEADER_FIELD_SEPARATOR = ' ';
  static final String UTF8 = "UTF-8";
  static final char   LINE_SEPARATOR = '\n';
  static final byte[] ARC_GZIP_EXTRA_FIELD = { 8, 0, 'L', 'X', 4, 0, 0, 0, 0, 0 };
  static final String DEFAULT_ENCODING = "ISO-8859-1";
  static final String ARC_MAGIC_NUMBER = "filedesc://";

  /**
   * An override so we get access to the underlying output stream and can offer
   * an end() that does not also close the underlying stream.
   *
   * @author stack
   */
  static class CompressedStream extends GZIPOutputStream {

    public CompressedStream(OutputStream out) throws IOException {
      super(out);
    }

    /**
     * @return Reference to stream being compressed.
     */
    OutputStream getWrappedStream() {
      return this.out;
    }

    /**
     * Release the deflater's native process resources, which otherwise would
     * not occur until either finalization or DeflaterOutputStream.close()
     * (which would also close the underlying stream).
     */
    public void end() {
      def.end();
    }
  }

  static String createMetaline(String uri, String arcFileName, String hostIP,
      String timeStamp, String mimetype, String recordLength) {
    return uri + HEADER_FIELD_SEPARATOR + hostIP + HEADER_FIELD_SEPARATOR
        + timeStamp + HEADER_FIELD_SEPARATOR + mimetype + HEADER_FIELD_SEPARATOR
        + recordLength + LINE_SEPARATOR;
  }

  static byte[] generateARCFileMetaData(String arcFileName, String date) throws IOException {

    String metadataHeaderLinesTwoAndThree = getMetadataHeaderLinesTwoAndThree("1 " + "0");
    int recordLength = metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length;
    String metadataHeaderStr = ARC_MAGIC_NUMBER + arcFileName + " 0.0.0.0 " + date
        + " text/plain " + recordLength + metadataHeaderLinesTwoAndThree;

    ByteArrayOutputStream metabaos = new ByteArrayOutputStream(recordLength);
    // Write the metadata header.
    metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
    // Write out a LINE_SEPARATOR to end this record.
    metabaos.write(LINE_SEPARATOR);

    // Now get the bytes of all just written and compress.
    byte[] bytes = metabaos.toByteArray();

    // GZIP the header but catch the gzipping into a byte array so we
    // can add the special IA GZIP header to the product. After
    // manipulations, write to the output stream (the Java GZIP
    // implementation does not give access to the GZIP header; it
    // produces a 'default' header only). We can get away with these
    // manipulations because the GZIP 'default' header doesn't
    // do the 'optional' CRC'ing of the header.
    byte[] gzippedMetaData = gzip(bytes);

    if (gzippedMetaData[3] != 0) {
      throw new IOException("The GZIP FLG header is unexpectedly "
          + " non-zero. Need to add smarter code that can deal "
          + " when already extant extra GZIP header fields.");
    }

    // Set the GZIP FLG header to '4' which says that the GZIP header
    // has extra fields. Then insert the {'L', 'X', '0', '0', '0',
    // '0'} 'extra' field. The IA GZIP header will also set byte
    // 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same.
    gzippedMetaData[3] = 4;
    gzippedMetaData[9] = 3;

    byte[] assemblyBuffer = new byte[gzippedMetaData.length + ARC_GZIP_EXTRA_FIELD.length];
    // '10' in the below is a pointer past the following bytes of the
    // GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See
    // RFC1952 for an explanation of the abbreviations just used.
    System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
    System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10, ARC_GZIP_EXTRA_FIELD.length);
    System.arraycopy(gzippedMetaData, 10, assemblyBuffer, 10 + ARC_GZIP_EXTRA_FIELD.length,
        gzippedMetaData.length - 10);
    bytes = assemblyBuffer;

    // System.out.println("Header Bytes:" + HexDump.dumpHexString(bytes));
    return bytes;
  }

  static String getMetadataHeaderLinesTwoAndThree(String version) {
    StringBuffer buffer = new StringBuffer();
    buffer.append(LINE_SEPARATOR);
    buffer.append(version);
    buffer.append(" CommonCrawl");
    buffer.append(LINE_SEPARATOR);
    buffer.append("URL IP-address Archive-date Content-type Archive-length");
    buffer.append(LINE_SEPARATOR);
    return buffer.toString();
  }

  /**
   * Gzip passed bytes. Use only when bytes is small.
   *
   * @param bytes
   *          What to gzip.
   * @return A gzip member of bytes.
   * @throws IOException
   */
  static byte[] gzip(byte[] bytes) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    GZIPOutputStream gzipOS = new GZIPOutputStream(baos);
    gzipOS.write(bytes, 0, bytes.length);
    gzipOS.close();
    return baos.toByteArray();
  }

  public static void writeFirstRecord(final OutputStream os, final String fileName, long ts)
      throws IOException {
    os.write(generateARCFileMetaData(fileName, TIMESTAMP14.format(new Date(ts))));
  }

  public static boolean write(OutputStream os, String normalizedURL, String arcFileName,
      int segmentid, int crawlNumber, byte[] crawlData, int crawlDataOffset, int crawlDataLen,
      NIOHttpHeaders headers, String contentType, String signature, int hostIP,
      long lastAttemptTime) throws IOException {

    String encodedURI = normalizedURL;
    String hostIPStr = IPAddressUtils.IntegerToIPAddressString(hostIP);
    long fetchBeginTimestamp = lastAttemptTime;
    String encoding = headers.findValue("Content-Encoding");
    String truncationFlags = "";

    {
      if (crawlData != null && encoding != null && encoding.equalsIgnoreCase("gzip")) {
        int compressedSize = crawlData.length;
        try {
          UnzipResult result = GZIPUtils.unzipBestEffort(crawlData, 2 << 20);
          crawlData = result.data;
          crawlDataOffset = 0;
          crawlDataLen = result.data.length;
          if (result.wasTruncated) {
            if (truncationFlags.length() != 0)
              truncationFlags += ",";
            truncationFlags += ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInInflate);
          }
        } catch (Exception e) {
          LOG.error("URL:" + normalizedURL + " Rejected - GZIP Decompression Failed");
          crawlData = null;
        }
      }

      // content must not be null
      if (crawlData == null) {
        LOG.error("URL:" + normalizedURL + " Rejected - Content is NULL");
      } else {
        // add in our custom headers ...
        headers.add(Constants.ARCFileHeader_ParseSegmentId, ((Integer) segmentid).toString());
        headers.add(Constants.ARCFileHeader_OriginalURL, normalizedURL);
        headers.add(Constants.ARCFileHeader_Signature, signature);
        headers.add(Constants.ARCFileHeader_CrawlNumber, Integer.toString(crawlNumber));
        headers.add(Constants.ARCFileHeader_FetchTimeStamp, Long.toString(fetchBeginTimestamp));
        // headers.add(Environment.ARCFileHeader_CrawlerId,
        //     Integer.toString((int)urlItem.get));
        if (truncationFlags.length() != 0) {
          headers.add(Constants.ARCFileHeader_ContentTruncated, truncationFlags);
        }

        String headerString = headers.toString() + "\r\n";
        byte[] headerBytes = headerString.getBytes("UTF-8");

        // content is truncated further upstream, so this redundant check /
        // truncation is problematic
        // int contentLength = Math.min(crawlData.length, CONTENT_SIZE_LIMIT);

        // extract the metadata line up front, since if the url exceeds a certain
        // size limit, we are going to reject the entry ...
        byte metaDataLine[];
        try {
          metaDataLine = getMetaLine(encodedURI, arcFileName, contentType, hostIPStr,
              fetchBeginTimestamp, crawlDataLen + headerBytes.length).getBytes(UTF8);
        } catch (IOException e) {
          LOG.error("Metadata Line Validation FAILED with Exception:"
              + CCStringUtils.stringifyException(e));
          // bail here ...
          return false;
        }

        // get ready to write out a new gzipped entry ...
        OutputStream compressedStream = preWriteRecordTasks(os, headerBytes.length, crawlDataLen,
            contentType);
        try {
          // ready to write an entry ...
          compressedStream.write(metaDataLine);
          // write out the headers ...
          compressedStream.write(headerBytes, 0, headerBytes.length);
          // write out the content
          compressedStream.write(crawlData, 0, crawlDataLen);
          // line separator ...
          compressedStream.write(LINE_SEPARATOR);
        } finally {
          // flush the gzip stream ...
          postWriteRecordTasks(compressedStream);
        }
      }
      return true;
    }
  }

  static OutputStream preWriteRecordTasks(OutputStream os, int headerBytesLength,
      int contentBytesLength, String contentType) throws IOException {
    // Wrap stream in GZIP Writer.
    // The below construction immediately writes the GZIP 'default'
    // header out on the underlying stream.
    return new CompressedStream(os);
  }

  static OutputStream postWriteRecordTasks(OutputStream os) throws IOException {
    CompressedStream o = (CompressedStream) os;
    o.finish();
    o.flush();
    o.end();
    return o.getWrappedStream();
  }

  static String randomConstrainedString(final Random random, char validChars[],
      final int minLength, final int maxLength) {
    final int length = random.nextInt(maxLength - minLength) + minLength;
    final char[] chars = new char[length];
    for (int i = 0, x = chars.length; i < x;)
      chars[i++] = validChars[random.nextInt(validChars.length)];
    return new String(chars);
  }

  static String randomString(final Random random, final int minLength, final int maxLength) {
    final int length = random.nextInt(maxLength - minLength) + minLength;
    final char[] chars = new char[length];
    for (int i = 0, x = chars.length; i < x;)
      do {
        final int cp = random.nextInt(0x10FFFF + 1);
        if (!Character.isDefined(cp))
          continue;
        final char[] chs = Character.toChars(cp);
        if (chs.length > x - i)
          continue;
        for (final char ch : chs) {
          if (!Character.isWhitespace(ch)) {
            chars[i++] = ch;
          }
        }
        break;
      } while (true);
    return new String(chars);
  }

  static final String[] testHeaderKeys = { "x-cc-test-header-1", "x-cc-test-header-2",
      "x-cc-test-header-3" };
  static final String validHeaderChars =
      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

  public static class TestRecord {
    public String url;
    public byte[] data;
    public List<Pair<String, String>> headers;
    public int streamPos;
    public int rawSize;
  }

  public static List<TestRecord> buildTestRecords(int recordCount) {
    Random random = new Random();
    List<TestRecord> records = Lists.newArrayList();
    char headerChars[] = validHeaderChars.toCharArray();

    for (int i = 0; i < recordCount; ++i) {
      TestRecord record = new TestRecord();
      // intentionally add a space in the url to mimic malformed headers
      record.url = "http://foo/ " + randomString(random, 5, 100);
      record.data = randomString(random, 1000, 3000).getBytes(Charset.forName("UTF-8"));
      record.headers = Lists.newArrayList();
      for (int j = 0; j < testHeaderKeys.length; ++j) {
        record.headers.add(new Pair<String, String>(testHeaderKeys[j],
            randomConstrainedString(random, headerChars, 100, 200)));
      }
      records.add(record);
    }
    return records;
  }

  public static final int BASIC_TEST_RECORD_COUNT = 100;

  /**
   * test basic reader functionality by creating a mock ARCFile in memory and
   * then reading it back and validating the contents ...
   */
  @Test
  public void testReader() {
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    try {
      // write the ARC File into memory
      writeFirstRecord(os, "test", timestamp);

      List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);

      long testAttemptTime = System.currentTimeMillis();

      for (TestRecord record : records) {
        NIOHttpHeaders headers = new NIOHttpHeaders();
        for (int i = 0; i < record.headers.size(); ++i) {
          headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
        }
        write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers,
            "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
      }
      os.flush();
      os.close();

      final AtomicBoolean streamClosed = new AtomicBoolean();

      // setup ArcFileReader to read the file
      InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {

        // force single-byte reads from the underlying buffer
        public synchronized int read(byte b[], int off, int len) {
          len = 1;
          return super.read(b, off, len);
        }

        // record the fact that the reader closed the stream
        public void close() throws IOException {
          super.close();
          streamClosed.set(true);
        }
      };

      ARCFileReader reader = new ARCFileReader(in);

      int index = 0;
      Text key = new Text();
      BytesWritable value = new BytesWritable();

      // iterate and validate stuff ...
      while (reader.hasMoreItems()) {
        reader.nextKeyValue(key, value);
        TestRecord testRecord = records.get(index++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against the raw key bytes to validate the key is the same
        // (Text's utf-8 mapping code replaces invalid characters with '?',
        // which would break this test case, since the generated keys
        // intentionally contain invalid characters).
        Assert.assertTrue(
            compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);

        // the returned bytes represent the header (encoded in utf-8),
        // terminated by a \r\n\r\n. The content follows this terminator.
        // We search for this specific byte pattern to locate the start of
        // content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0,
            value.getLength(), "\r\n\r\n".getBytes());
        if (indexofHeaderTerminator == -1) {
          throw new IOException("No Header Terminator found in Value!");
        }
        indexofHeaderTerminator += 4;

        // read headers ...
        String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator,
            Charset.forName("UTF-8"));
        NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
        for (int i = 0; i < testRecord.headers.size(); ++i) {
          Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
          Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
          Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
        }

        Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length,
            value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
      }
      reader.close();

      Assert.assertEquals(index, BASIC_TEST_RECORD_COUNT);
      Assert.assertTrue(streamClosed.get());

    } catch (IOException e) {
      e.printStackTrace();
      throw new RuntimeException(e);
    }
  }

  /**
   * helper offset based byte array comparator
   *
   * @param buffer1
   * @param offset1
   * @param length1
   * @param buffer2
   * @param offset2
   * @param length2
   * @return
   */
  public static int compareTo(byte[] buffer1, int offset1, int length1, byte[] buffer2,
      int offset2, int length2) {
    // Short circuit equal case
    if (buffer1 == buffer2 && offset1 == offset2 && length1 == length2) {
      return 0;
    }
    // Bring WritableComparator code local
    int end1 = offset1 + length1;
    int end2 = offset2 + length2;
    for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) {
      int a = (buffer1[i] & 0xff);
      int b = (buffer2[j] & 0xff);
      if (a != b) {
        return a - b;
      }
    }
    return length1 - length2;
  }
}
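The test above drives ARCFileReader entirely from an in-memory buffer, but the same calls it exercises (the InputStream constructor, hasMoreItems(), nextKeyValue(Text, BytesWritable), and close()) can be pointed at an ARC file on disk. The following is a minimal sketch based only on what the test demonstrates: it assumes ARCFileReader lives in the same org.commoncrawl.util.shared package (it is used unqualified above), and the class name and the "sample.arc.gz" path are placeholders.

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.commoncrawl.util.shared.ARCFileReader;

public class ArcFileDumpExample {

  public static void main(String[] args) throws Exception {
    // "sample.arc.gz" is a placeholder path to a gzipped ARC file on local disk.
    InputStream in = new BufferedInputStream(new FileInputStream("sample.arc.gz"));
    ARCFileReader reader = new ARCFileReader(in);

    Text key = new Text();                      // receives the record URL
    BytesWritable value = new BytesWritable();  // receives headers + "\r\n\r\n" + content

    while (reader.hasMoreItems()) {
      reader.nextKeyValue(key, value);
      System.out.println(key.toString() + " (" + value.getLength() + " bytes)");
    }
    // per the streamClosed assertion in testReader, closing the reader also
    // closes the wrapped input stream
    reader.close();
  }
}

As testReader verifies, each value holds the HTTP-style headers followed by a blank line and then the raw content, so downstream code can split a record on the first "\r\n\r\n" exactly as the test does.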