org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java Source code

Introduction

Here is the source code for org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java

Source

package org.commoncrawl.hadoop.io.mapreduce;

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 **/

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.List;

import org.junit.Assert;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.util.shared.ArcFileReaderTests.TestRecord;
import org.commoncrawl.util.shared.ArcFileItemUtils;
import org.commoncrawl.util.shared.ArcFileReaderTests;
import org.commoncrawl.crawl.common.shared.Constants;
import org.commoncrawl.io.shared.NIOHttpHeaders;
import org.commoncrawl.util.shared.ByteArrayUtils;
import org.commoncrawl.util.shared.Tuples.Pair;
import org.junit.Test;

import com.google.common.collect.Lists;

/** 
 * Tests for ARCFileInputFormat and ARCFileItemInputFormat.
 * 
 * @author rana
 *
 */
public class ArcFileInputFormatTests {

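    /**
     * Builds a single gzip-compressed ARC file under the given directory, recording each
     * test record's byte offset and on-disk size as it is written so the validation
     * routines below can cross-check reader output against the source data.
     */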
    static Pair<Path, List<TestRecord>> buildTestARCFile(Path directoryPath, FileSystem fs, int fileId)
            throws IOException {
        List<TestRecord> recordSet = ArcFileReaderTests
                .buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
        Path filePath = new Path(directoryPath, Integer.toString(fileId) + ".arc.gz");
        FSDataOutputStream os = fs.create(filePath);
        try {
            // write the leading ARC file header record, then append each test record 
            ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

            long testAttemptTime = System.currentTimeMillis();

            NIOHttpHeaders testHeaders = new NIOHttpHeaders();
            testHeaders.add("test", "test-value");

            for (TestRecord record : recordSet) {
                long preWritePos = os.getPos();
                ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                        testHeaders, "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
                long postWritePos = os.getPos();
                record.streamPos = (int) preWritePos;
                record.rawSize = (int) (postWritePos - preWritePos);
            }
            os.flush();
        } finally {
            os.close();
        }
        return new Pair<Path, List<TestRecord>>(filePath, recordSet);
    }

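    /** Builds numOfTestFiles ARC files under rootPath, returning one (path, records) pair per file. */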
    List<Pair<Path, List<TestRecord>>> buildTestFiles(Path rootPath, FileSystem fs, int numOfTestFiles)
            throws IOException {
        List<Pair<Path, List<TestRecord>>> list = Lists.newArrayList();
        for (int i = 0; i < numOfTestFiles; ++i) {
            list.add(buildTestARCFile(rootPath, fs, i));
        }
        return list;
    }

    static final int NUM_ITERATIONS = 1;
    static final int NUM_TEST_FILES = 10;

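    /** Locates the test-data entry whose file name matches the given split's path, or -1 if none matches. */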
    static int getIndexOfSplit(List<Pair<Path, List<TestRecord>>> splits, InputSplit targetSplit) {
        for (int i = 0; i < splits.size(); ++i) {
            Path pathAtIndex = splits.get(i).e0;
            if (((FileSplit) targetSplit).getPath().getName().equals(pathAtIndex.getName())) {
                return i;
            }
        }
        return -1;
    }

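    /**
     * Drains the raw-bytes record reader for one split, asserting that each key matches the
     * source URL byte-for-byte and that each value's body (the bytes following the \r\n\r\n
     * header terminator) matches the source content. The matched entry is removed from the
     * list so callers can verify that every split was consumed exactly once.
     */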
    static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
            RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException {

        int splitDataIndex = getIndexOfSplit(splits, split);

        Assert.assertTrue(splitDataIndex != -1);

        List<TestRecord> records = splits.get(splitDataIndex).e1;

        int itemIndex = 0;
        // iterate over the reader's records and validate each against the source test data ... 
        while (reader.nextKeyValue()) {
            Text key = reader.getCurrentKey();
            BytesWritable value = reader.getCurrentValue();

            TestRecord testRecord = records.get(itemIndex++);
            // get test key bytes as utf-8 bytes ... 
            byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
            // compare against the raw key bytes to validate the key is unchanged (Text's UTF-8 mapping code replaces
            // invalid characters with '?', which would break this test, since the test keys deliberately contain
            // invalid characters) ...
            Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                    key.getLength()) == 0);
            // the returned bytes contain the HTTP headers (encoded in UTF-8) terminated by \r\n\r\n; the content
            // follows this terminator. search for that byte pattern to locate the start of the content, then
            // compare the content against the source ... 
            int indexOfHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                    "\r\n\r\n".getBytes());
            // skip past the four terminator bytes to the first content byte
            indexOfHeaderTerminator += 4;
            Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                    value.getBytes(), indexOfHeaderTerminator, testRecord.data.length) == 0);
        }
        reader.close();

        Assert.assertEquals(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT, itemIndex);

        splits.remove(splitDataIndex);

    }

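    /**
     * Same validation as validateSplit, but against the structured ArcFileItem representation:
     * content bytes, HTTP-style headers, mime type, ARC file position/size, and the
     * originating file name are all checked.
     */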
    static void validateArcFileItemSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
            RecordReader<Text, ArcFileItem> reader) throws IOException, InterruptedException {

        int splitDataIndex = getIndexOfSplit(splits, split);

        Assert.assertTrue(splitDataIndex != -1);

        List<TestRecord> records = splits.get(splitDataIndex).e1;

        int itemIndex = 0;
        // iterate over the reader's records and validate each against the source test data ...
        while (reader.nextKeyValue()) {

            Text key = reader.getCurrentKey();
            ArcFileItem value = reader.getCurrentValue();

            TestRecord testRecord = records.get(itemIndex++);

            // get test key bytes as utf-8 bytes ... 
            byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
            // compare against the raw key bytes to validate the key is unchanged (Text's UTF-8 mapping code replaces
            // invalid characters with '?', which would break this test, since the test keys deliberately contain
            // invalid characters) ...
            Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                    key.getLength()) == 0);
            // the ArcFileItem exposes the content buffer directly (no header terminator to skip);
            // compare it against the source data ... 
            Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                    value.getContent().getReadOnlyBytes(), value.getContent().getOffset(),
                    value.getContent().getCount()) == 0);
            NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems());
            // validate metadata 
            Assert.assertEquals("text/html", headers.findValue(Constants.ARCFileHeader_ARC_MimeType));
            Assert.assertEquals(testRecord.streamPos, value.getArcFilePos());
            Assert.assertEquals(testRecord.rawSize, value.getArcFileSize());
            Assert.assertEquals("test-value", headers.findValue("test"));
            Assert.assertEquals(((FileSplit) split).getPath().getName(), value.getArcFileName());

        }
        reader.close();

        Assert.assertEquals(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT, itemIndex);

        splits.remove(splitDataIndex);

    }

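    /** End-to-end test of ARCFileInputFormat: write test ARC files, enumerate splits, read and validate each split. */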
    @Test
    public void testArcInputFormat() throws IOException, InterruptedException {
        for (int i = 0; i < NUM_ITERATIONS; ++i) {
            Job job = new Job();
            FileSystem fs = LocalFileSystem.get(job.getConfiguration());
            // createTempFile is used only to obtain a unique name; delete the file so a
            // directory can be created at the same path
            File tempFile = File.createTempFile("ARCInputFormat", "test");
            tempFile.delete();
            Path path = new Path("/tmp/" + tempFile.getName());
            fs.mkdirs(path);

            List<Pair<Path, List<TestRecord>>> fileList = buildTestFiles(path, fs, NUM_TEST_FILES);

            FileInputFormat.setInputPaths(job, path);

            ARCFileInputFormat inputFormat = new ARCFileInputFormat();

            List<InputSplit> splits = inputFormat.getSplits(job);

            for (InputSplit split : splits) {
                RecordReader<Text, BytesWritable> reader = inputFormat.createRecordReader(split,
                        new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()));
                validateSplit(fs, split, fileList, reader);
            }

            Assert.assertEquals(0, fileList.size());

            fs.delete(path, true);
        }

    }

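    /** Same end-to-end test, but through ARCFileItemInputFormat's structured ArcFileItem records. */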
    @Test
    public void testArcFileItemInputFormat() throws IOException, InterruptedException {
        for (int i = 0; i < NUM_ITERATIONS; ++i) {
            Job job = new Job();
            FileSystem fs = LocalFileSystem.get(job.getConfiguration());
            // createTempFile is used only to obtain a unique name; delete the file so a
            // directory can be created at the same path
            File tempFile = File.createTempFile("ARCInputFormat", "test");
            tempFile.delete();
            Path path = new Path("/tmp/" + tempFile.getName());
            fs.mkdirs(path);

            List<Pair<Path, List<TestRecord>>> fileList = buildTestFiles(path, fs, NUM_TEST_FILES);

            FileInputFormat.setInputPaths(job, path);

            ARCFileItemInputFormat inputFormat = new ARCFileItemInputFormat();

            List<InputSplit> splits = inputFormat.getSplits(job);

            for (InputSplit split : splits) {
                RecordReader<Text, ArcFileItem> reader = inputFormat.createRecordReader(split,
                        new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()));
                validateArcFileItemSplit(fs, split, fileList, reader);
            }

            Assert.assertEquals(0, fileList.size());

            fs.delete(path, true);
        }

    }

}
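
Usage

For context, here is a minimal driver sketch built on the contract the tests exercise: ARCFileInputFormat yields Text keys (the document URL) and BytesWritable values (HTTP headers plus content). The driver, the SizeMapper class, and the map-only output layout are illustrative assumptions for this page, not part of the CommonCrawl codebase.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.commoncrawl.hadoop.io.mapreduce.ARCFileInputFormat;

public class ArcSizeDriver {

    /** Hypothetical mapper: emits each document URL with the size of its raw payload. */
    public static class SizeMapper extends Mapper<Text, BytesWritable, Text, LongWritable> {
        @Override
        protected void map(Text url, BytesWritable payload, Context context)
                throws IOException, InterruptedException {
            context.write(url, new LongWritable(payload.getLength()));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = new Job(); // same old-style mapreduce API the tests use
        job.setJarByClass(ArcSizeDriver.class);
        job.setInputFormatClass(ARCFileInputFormat.class); // keys: Text (URL), values: BytesWritable
        job.setMapperClass(SizeMapper.class);
        job.setNumReduceTasks(0); // map-only for this sketch
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Swapping in ARCFileItemInputFormat would instead change the mapper's value type to ArcFileItem, giving structured access to the content buffer, headers, and ARC metadata that validateArcFileItemSplit above exercises.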