Java tutorial: CommonCrawl ARCFileReader tests (org.commoncrawl.util.shared.ArcFileReaderTests)
package org.commoncrawl.util.shared;

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.zip.GZIPOutputStream;

import junit.framework.Assert;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.Text;
import org.commoncrawl.crawl.common.shared.Constants;
import org.commoncrawl.io.shared.NIOHttpHeaders;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.util.shared.ByteArrayUtils;
import org.commoncrawl.util.shared.CCStringUtils;
import org.commoncrawl.util.shared.GZIPUtils;
import org.commoncrawl.util.shared.GZIPUtils.UnzipResult;
import org.commoncrawl.util.shared.IPAddressUtils;
import org.commoncrawl.util.shared.Tuples.Pair;
import org.junit.Test;

import com.google.common.collect.Lists;

/**
 * ARCFileReader tests
 *
 * @author rana
 */
public class ArcFileReaderTests {

  private static final Log LOG = LogFactory.getLog(ArcFileReaderTests.class);

  static String getMetaLine(String uri, String arcFileName, String contentType,
      String hostIP, long fetchBeginTimeStamp, long recordLength) throws IOException {
    if (fetchBeginTimeStamp <= 0) {
      throw new IOException("Bogus fetchBeginTimestamp: " + Long.toString(fetchBeginTimeStamp));
    }
    return createMetaline(uri, arcFileName, hostIP,
        TIMESTAMP14.format(new Date(fetchBeginTimeStamp)), contentType,
        Long.toString(recordLength));
  }

  static SimpleDateFormat TIMESTAMP14 = new SimpleDateFormat("yyyyMMddHHmmss");

  static final char   HEADER_FIELD_SEPARATOR = ' ';
  static final String UTF8 = "UTF-8";
  static final char   LINE_SEPARATOR = '\n';
  static final byte[] ARC_GZIP_EXTRA_FIELD = { 8, 0, 'L', 'X', 4, 0, 0, 0, 0, 0 };
  static final String DEFAULT_ENCODING = "ISO-8859-1";
  static final String ARC_MAGIC_NUMBER = "filedesc://";

  /**
   * An override so we get access to the underlying output stream and can offer
   * an end() that does not also close the underlying stream.
   *
   * @author stack
   */
  static class CompressedStream extends GZIPOutputStream {

    public CompressedStream(OutputStream out) throws IOException {
      super(out);
    }

    /**
     * @return Reference to stream being compressed.
     */
    OutputStream getWrappedStream() {
      return this.out;
    }

    /**
     * Release the deflater's native process resources, which otherwise would
     * not occur until either finalization or DeflaterOutputStream.close()
     * (which would also close the underlying stream).
     */
    public void end() {
      def.end();
    }
  }

  static String createMetaline(String uri, String arcFileName, String hostIP,
      String timeStamp, String mimetype, String recordLength) {
    return uri + HEADER_FIELD_SEPARATOR + hostIP + HEADER_FIELD_SEPARATOR
        + timeStamp + HEADER_FIELD_SEPARATOR + mimetype + HEADER_FIELD_SEPARATOR
        + recordLength + LINE_SEPARATOR;
  }

  static byte[] generateARCFileMetaData(String arcFileName, String date) throws IOException {

    String metadataHeaderLinesTwoAndThree = getMetadataHeaderLinesTwoAndThree("1 " + "0");
    int recordLength = metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length;
    String metadataHeaderStr = ARC_MAGIC_NUMBER + arcFileName + " 0.0.0.0 " + date
        + " text/plain " + recordLength + metadataHeaderLinesTwoAndThree;

    ByteArrayOutputStream metabaos = new ByteArrayOutputStream(recordLength);
    // Write the metadata header.
    metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
    // Write out a LINE_SEPARATOR to end this record.
    metabaos.write(LINE_SEPARATOR);

    // Now get the bytes of all just written and compress.
    byte[] bytes = metabaos.toByteArray();

    // GZIP the header but catch the gzipping into a byte array so we
    // can add the special IA GZIP header to the product. After
    // manipulations, write to the output stream (the Java GZIP
    // implementation does not give access to the GZIP header; it
    // produces a 'default' header only). We can get away with these
    // manipulations because the GZIP 'default' header doesn't
    // do the 'optional' CRC'ing of the header.
    byte[] gzippedMetaData = gzip(bytes);

    if (gzippedMetaData[3] != 0) {
      throw new IOException("The GZIP FLG header is unexpectedly "
          + " non-zero. Need to add smarter code that can deal "
          + " when already extant extra GZIP header fields.");
    }

    // Set the GZIP FLG header to '4' which says that the GZIP header
    // has extra fields. Then insert the {'L', 'X', '0', '0', '0',
    // '0'} 'extra' field. The IA GZIP header will also set byte
    // 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same.
    gzippedMetaData[3] = 4;
    gzippedMetaData[9] = 3;

    byte[] assemblyBuffer = new byte[gzippedMetaData.length + ARC_GZIP_EXTRA_FIELD.length];
    // '10' in the below is a pointer past the following bytes of the
    // GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See
    // RFC1952 for an explanation of the abbreviations just used.
    System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
    System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10, ARC_GZIP_EXTRA_FIELD.length);
    System.arraycopy(gzippedMetaData, 10, assemblyBuffer, 10 + ARC_GZIP_EXTRA_FIELD.length,
        gzippedMetaData.length - 10);
    bytes = assemblyBuffer;

    // System.out.println("Header Bytes:" + HexDump.dumpHexString(bytes));
    return bytes;
  }

  static String getMetadataHeaderLinesTwoAndThree(String version) {
    StringBuffer buffer = new StringBuffer();
    buffer.append(LINE_SEPARATOR);
    buffer.append(version);
    buffer.append(" CommonCrawl");
    buffer.append(LINE_SEPARATOR);
    buffer.append("URL IP-address Archive-date Content-type Archive-length");
    buffer.append(LINE_SEPARATOR);
    return buffer.toString();
  }

  /**
   * Gzip passed bytes. Use only when bytes is small.
   *
   * @param bytes
   *          What to gzip.
   * @return A gzip member of bytes.
   * @throws IOException
   */
  static byte[] gzip(byte[] bytes) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    GZIPOutputStream gzipOS = new GZIPOutputStream(baos);
    gzipOS.write(bytes, 0, bytes.length);
    gzipOS.close();
    return baos.toByteArray();
  }

  public static void writeFirstRecord(final OutputStream os, final String fileName, long ts)
      throws IOException {
    os.write(generateARCFileMetaData(fileName, TIMESTAMP14.format(new Date(ts))));
  }

  public static boolean write(OutputStream os, String normalizedURL, String arcFileName,
      int segmentid, int crawlNumber, byte[] crawlData, int crawlDataOffset, int crawlDataLen,
      NIOHttpHeaders headers, String contentType, String signature, int hostIP,
      long lastAttemptTime) throws IOException {

    String encodedURI = normalizedURL;
    String hostIPStr = IPAddressUtils.IntegerToIPAddressString(hostIP);
    long fetchBeginTimestamp = lastAttemptTime;
    String encoding = headers.findValue("Content-Encoding");
    String truncationFlags = "";

    {
      if (crawlData != null && encoding != null && encoding.equalsIgnoreCase("gzip")) {
        int compressedSize = crawlData.length;
        try {
          UnzipResult result = GZIPUtils.unzipBestEffort(crawlData, 2 << 20);
          crawlData = result.data;
          crawlDataOffset = 0;
          crawlDataLen = result.data.length;
          if (result.wasTruncated) {
            if (truncationFlags.length() != 0)
              truncationFlags += ",";
            truncationFlags += ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInInflate);
          }
        } catch (Exception e) {
          LOG.error("URL:" + normalizedURL + " Rejected - GZIP Decompression Failed");
          crawlData = null;
        }
      }

      // content must not be null
      if (crawlData == null) {
        LOG.error("URL:" + normalizedURL + " Rejected - Content is NULL");
      } else {
        // add in our custom headers ...
        headers.add(Constants.ARCFileHeader_ParseSegmentId, ((Integer) segmentid).toString());
        headers.add(Constants.ARCFileHeader_OriginalURL, normalizedURL);
        headers.add(Constants.ARCFileHeader_Signature, signature);
        headers.add(Constants.ARCFileHeader_CrawlNumber, Integer.toString(crawlNumber));
        headers.add(Constants.ARCFileHeader_FetchTimeStamp, Long.toString(fetchBeginTimestamp));
        // headers.add(Environment.ARCFileHeader_CrawlerId,
        //     Integer.toString((int)urlItem.get));
        if (truncationFlags.length() != 0) {
          headers.add(Constants.ARCFileHeader_ContentTruncated, truncationFlags);
        }

        String headerString = headers.toString() + "\r\n";
        byte[] headerBytes = headerString.getBytes("UTF-8");

        // content is truncated further upstream, so this redundant check /
        // truncation is problematic
        // int contentLength = Math.min(crawlData.length, CONTENT_SIZE_LIMIT);

        // extract the metadata line up front, since if the url exceeds a certain
        // size limit, we are going to reject the entry ...
        byte metaDataLine[];
        try {
          metaDataLine = getMetaLine(encodedURI, arcFileName, contentType, hostIPStr,
              fetchBeginTimestamp, crawlDataLen + headerBytes.length).getBytes(UTF8);
        } catch (IOException e) {
          LOG.error("Metadata Line Validation FAILED with Exception:"
              + CCStringUtils.stringifyException(e));
          // bail here ...
          return false;
        }

        // get ready to write out a new gzipped entry ...
        OutputStream compressedStream = preWriteRecordTasks(os, headerBytes.length, crawlDataLen,
            contentType);
        try {
          // ready to write an entry ...
          compressedStream.write(metaDataLine);
          // write out the headers ...
          compressedStream.write(headerBytes, 0, headerBytes.length);
          // write out the content
          compressedStream.write(crawlData, 0, crawlDataLen);
          // line separator ...
          compressedStream.write(LINE_SEPARATOR);
        } finally {
          // flush the gzip stream ...
          postWriteRecordTasks(compressedStream);
        }
      }
      return true;
    }
  }

  static OutputStream preWriteRecordTasks(OutputStream os, int headerBytesLength,
      int contentBytesLength, String contentType) throws IOException {
    // Wrap stream in GZIP Writer.
    // The below construction immediately writes the GZIP 'default'
    // header out on the underlying stream.
    return new CompressedStream(os);
  }

  static OutputStream postWriteRecordTasks(OutputStream os) throws IOException {
    CompressedStream o = (CompressedStream) os;
    o.finish();
    o.flush();
    o.end();
    return o.getWrappedStream();
  }

  static String randomConstrainedString(final Random random, char validChars[],
      final int minLength, final int maxLength) {
    final int length = random.nextInt(maxLength - minLength) + minLength;
    final char[] chars = new char[length];
    for (int i = 0, x = chars.length; i < x;)
      chars[i++] = validChars[random.nextInt(validChars.length)];
    return new String(chars);
  }

  static String randomString(final Random random, final int minLength, final int maxLength) {
    final int length = random.nextInt(maxLength - minLength) + minLength;
    final char[] chars = new char[length];
    for (int i = 0, x = chars.length; i < x;)
      do {
        final int cp = random.nextInt(0x10FFFF + 1);
        if (!Character.isDefined(cp))
          continue;
        final char[] chs = Character.toChars(cp);
        if (chs.length > x - i)
          continue;
        for (final char ch : chs) {
          if (!Character.isWhitespace(ch)) {
            chars[i++] = ch;
          }
        }
        break;
      } while (true);
    return new String(chars);
  }

  static final String[] testHeaderKeys = { "x-cc-test-header-1", "x-cc-test-header-2",
      "x-cc-test-header-3" };
  static final String validHeaderChars =
      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

  public static class TestRecord {
    public String url;
    public byte[] data;
    public List<Pair<String, String>> headers;
    public int streamPos;
    public int rawSize;
  }

  public static List<TestRecord> buildTestRecords(int recordCount) {
    Random random = new Random();
    List<TestRecord> records = Lists.newArrayList();
    char headerChars[] = validHeaderChars.toCharArray();

    for (int i = 0; i < recordCount; ++i) {
      TestRecord record = new TestRecord();
      // intentionally add a space in the url to mimic malformed headers
      record.url = "http://foo/ " + randomString(random, 5, 100);
      record.data = randomString(random, 1000, 3000).getBytes(Charset.forName("UTF-8"));
      record.headers = Lists.newArrayList();
      for (int j = 0; j < testHeaderKeys.length; ++j) {
        record.headers.add(new Pair<String, String>(testHeaderKeys[j],
            randomConstrainedString(random, headerChars, 100, 200)));
      }
      records.add(record);
    }
    return records;
  }

  public static final int BASIC_TEST_RECORD_COUNT = 100;

  /**
   * test basic reader functionality by creating a mock ARCFile in memory and
   * then reading it back and validating the contents ...
   */
  @Test
  public void testReader() {
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    try {
      // write the ARC File into memory
      writeFirstRecord(os, "test", timestamp);

      List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);

      long testAttemptTime = System.currentTimeMillis();

      for (TestRecord record : records) {
        NIOHttpHeaders headers = new NIOHttpHeaders();
        for (int i = 0; i < record.headers.size(); ++i) {
          headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
        }
        write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers,
            "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
      }
      os.flush();
      os.close();

      final AtomicBoolean streamClosed = new AtomicBoolean();

      // setup ArcFileReader to read the file
      InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {

        // force single-byte reads from the underlying buffer
        public synchronized int read(byte b[], int off, int len) {
          len = 1;
          return super.read(b, off, len);
        }

        // record the fact that the reader closed the stream
        public void close() throws IOException {
          super.close();
          streamClosed.set(true);
        }
      };

      ARCFileReader reader = new ARCFileReader(in);

      int index = 0;
      Text key = new Text();
      BytesWritable value = new BytesWritable();

      // iterate and validate stuff ...
      while (reader.hasMoreItems()) {
        reader.nextKeyValue(key, value);
        TestRecord testRecord = records.get(index++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against the raw key bytes to validate the key is the same
        // (Text's utf-8 mapping code replaces invalid characters with '?',
        // which would break this test case, since the generated keys
        // intentionally contain invalid characters).
        Assert.assertTrue(
            compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);

        // the returned bytes represent the header (encoded in utf-8),
        // terminated by a \r\n\r\n. The content follows this terminator.
        // We search for this specific byte pattern to locate the start of
        // content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0,
            value.getLength(), "\r\n\r\n".getBytes());
        if (indexofHeaderTerminator == -1) {
          throw new IOException("No Header Terminator found in Value!");
        }
        indexofHeaderTerminator += 4;

        // read headers ...
        String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator,
            Charset.forName("UTF-8"));
        NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
        for (int i = 0; i < testRecord.headers.size(); ++i) {
          Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
          Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
          Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
        }

        Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length,
            value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
      }
      reader.close();

      Assert.assertEquals(index, BASIC_TEST_RECORD_COUNT);
      Assert.assertTrue(streamClosed.get());

    } catch (IOException e) {
      e.printStackTrace();
      throw new RuntimeException(e);
    }
  }

  /**
   * helper offset based byte array comparator
   *
   * @param buffer1
   * @param offset1
   * @param length1
   * @param buffer2
   * @param offset2
   * @param length2
   * @return
   */
  public static int compareTo(byte[] buffer1, int offset1, int length1, byte[] buffer2,
      int offset2, int length2) {
    // Short circuit equal case
    if (buffer1 == buffer2 && offset1 == offset2 && length1 == length2) {
      return 0;
    }
    // Bring WritableComparator code local
    int end1 = offset1 + length1;
    int end2 = offset2 + length2;
    for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) {
      int a = (buffer1[i] & 0xff);
      int b = (buffer2[j] & 0xff);
      if (a != b) {
        return a - b;
      }
    }
    return length1 - length2;
  }
}
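The test above drives ARCFileReader entirely from an in-memory buffer, but the same calls it exercises (the InputStream constructor, hasMoreItems(), nextKeyValue(Text, BytesWritable), and close()) can be pointed at an ARC file on disk. The following is a minimal sketch based only on what the test demonstrates: it assumes ARCFileReader lives in the same org.commoncrawl.util.shared package (it is used unqualified above), and the class name and the "sample.arc.gz" path are placeholders.

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.commoncrawl.util.shared.ARCFileReader;

public class ArcFileDumpExample {

  public static void main(String[] args) throws Exception {
    // "sample.arc.gz" is a placeholder path to a gzipped ARC file on local disk.
    InputStream in = new BufferedInputStream(new FileInputStream("sample.arc.gz"));
    ARCFileReader reader = new ARCFileReader(in);

    Text key = new Text();                      // receives the record URL
    BytesWritable value = new BytesWritable();  // receives headers + "\r\n\r\n" + content

    while (reader.hasMoreItems()) {
      reader.nextKeyValue(key, value);
      System.out.println(key.toString() + " (" + value.getLength() + " bytes)");
    }
    // per the streamClosed assertion in testReader, closing the reader also
    // closes the wrapped input stream
    reader.close();
  }
}

As testReader verifies, each value holds the HTTP-style headers followed by a blank line and then the raw content, so downstream code can split a record on the first "\r\n\r\n" exactly as the test does.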