org.commoncrawl.service.listcrawler.CacheWriterThread.java Source code

Introduction

Here is the source code for org.commoncrawl.service.listcrawler.CacheWriterThread.java. The class is a worker thread that takes CacheWriteRequest objects off the CacheManager's write-request queue, GZIP-compresses any item content that is not already compressed, and appends each item as a checksummed record to the local cache log file.
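
Because the class implements Runnable and is constructed with the CacheManager whose queue it drains, it is typically run on a dedicated thread. A minimal wiring sketch, assuming an already-initialized CacheManager instance named cacheManager (hypothetical, not part of the listing below):

// Hypothetical wiring; the thread exits when a request of type ExitThreadRequest
// arrives on the manager's write-request queue.
Thread cacheWriter = new Thread(new CacheWriterThread(cacheManager), "cache-writer");
cacheWriter.start();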

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.listcrawler;

import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.zip.CRC32;
import java.util.zip.CheckedOutputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.DataOutputBuffer;
import org.commoncrawl.protocol.CacheItem;
import org.commoncrawl.util.ByteStream;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FlexBuffer;

/** 
 * Worker thread that writes crawled content to the disk cache. 
 * @author rana
 *
 */
final class CacheWriterThread implements Runnable {

    /** GZIPOutputStream variant that ends its Deflater as soon as the stream is finished
     *  or closed, releasing the Deflater's native buffers immediately (hence "thrifty").
     *  The underlying output stream is deliberately left open. */
    private class ThriftyGZIPOutputStream extends GZIPOutputStream {
        public ThriftyGZIPOutputStream(OutputStream os) throws IOException {
            super(os);
        }

        @Override
        public void finish() throws IOException {
            super.finish();
            def.end();
        }

        @Override
        public void close() throws IOException {
            super.finish();
            def.end();
        }
    }

    public static final Log LOG = LogFactory.getLog(CacheWriterThread.class);

    private CRC32 _crc32Out = new CRC32();
    private CacheManager _manager;
    private File _dataDirectory;
    private LinkedBlockingQueue<CacheWriteRequest> _writeRequestQueue;

    public CacheWriterThread(CacheManager cacheManager) {
        _manager = cacheManager;
        _dataDirectory = cacheManager.getLocalDataDirectory();
        _writeRequestQueue = cacheManager.getWriteRequestQueue();
    }

    @Override
    public void run() {

        boolean shutdown = false;

        while (!shutdown) {
            try {
                final CacheWriteRequest request = _writeRequestQueue.take();

                switch (request._requestType) {

                case ExitThreadRequest: {
                    // shutdown condition ... 
                    CacheManager.LOG.info("Disk Writer Thread Received Shutdown. Exiting!");
                    shutdown = true;
                }
                    break;

                case WriteRequest: {

                    long timeStart = System.currentTimeMillis();

                    try {
                        // reset crc calculator (single thread so no worries on synchronization)
                        _crc32Out.reset();

                        // figure out if we need to compress the item ... 
                        if ((request._item.getFlags() & CacheItem.Flags.Flag_IsCompressed) == 0
                                && request._item.getContent().getCount() != 0) {
                            LOG.info("Incoming Cache Request Content for:" + request._item.getUrl()
                                    + " is not compressed. Compressing...");
                            ByteStream compressedBytesOut = new ByteStream(request._item.getContent().getCount());
                            ThriftyGZIPOutputStream gzipOutputStream = new ThriftyGZIPOutputStream(
                                    compressedBytesOut);
                            gzipOutputStream.write(request._item.getContent().getReadOnlyBytes(), 0,
                                    request._item.getContent().getCount());
                            gzipOutputStream.finish();
                            LOG.info("Finished Compressing Incoming Content for:" + request._item.getUrl()
                                    + " BytesIn:" + request._item.getContent().getCount() + " BytesOut:"
                                    + compressedBytesOut.size());
                            // replace buffer

                            request._item.setContent(
                                    new FlexBuffer(compressedBytesOut.getBuffer(), 0, compressedBytesOut.size()));
                            request._item.setFlags((request._item.getFlags() | CacheItem.Flags.Flag_IsCompressed));
                        }

                        // create streams ...
                        ByteStream bufferOutputStream = new ByteStream(8192);

                        CheckedOutputStream checkedStream = new CheckedOutputStream(bufferOutputStream, _crc32Out);
                        DataOutputStream dataOutputStream = new DataOutputStream(checkedStream);

                        // remember if this item has content ... 
                        boolean hasContent = request._item.isFieldDirty(CacheItem.Field_CONTENT);
                        // now mark the content field as clean, so that it will not be serialized in our current serialization attempt ... 
                        request._item.setFieldClean(CacheItem.Field_CONTENT);
                        // and go ahead and write out the data to the intermediate buffer while also computing partial checksum 
                        request._item.write(dataOutputStream);

                        request._item.setFieldDirty(CacheItem.Field_CONTENT);
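
                        // the record written out below has the layout:
                        //   [CacheItemHeader][item fields minus content][4-byte content length]
                        //   [content bytes][8-byte CRC32 value][4-byte total record length]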

                        // ok, now ... write out file header ... 
                        CacheItemHeader itemHeader = new CacheItemHeader(_manager.getLocalLogSyncBytes());

                        itemHeader._status = CacheItemHeader.STATUS_ALIVE;
                        itemHeader._lastAccessTime = System.currentTimeMillis();
                        itemHeader._fingerprint = request._itemFingerprint;
                        // compute total length ... 

                        // first the serialized item fields (content excluded) ... 
                        itemHeader._dataLength = bufferOutputStream.size();
                        // next the content length prefix (4 bytes) plus the content bytes ... 
                        itemHeader._dataLength += 4 + request._item.getContent().getCount();
                        // lastly the crc value itself (8 bytes) ... 
                        itemHeader._dataLength += 8;
                        // buffer the complete record in memory before writing it to the log file ... 
                        DataOutputBuffer logStream = new DataOutputBuffer();

                        // ok, go ahead and write the header 
                        itemHeader.writeHeader(logStream);
                        // ok now write out the item data minus content... 
                        logStream.write(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size());
                        // now create a checked stream for the content, reusing the same CRC32 so the
                        // final checksum covers both the item fields and the content ... 
                        CheckedOutputStream checkedStream2 = new CheckedOutputStream(logStream,
                                checkedStream.getChecksum());

                        dataOutputStream = new DataOutputStream(checkedStream2);

                        // content size 
                        dataOutputStream.writeInt(request._item.getContent().getCount());
                        // now write out the content (via checked stream so that we can calc checksum on content)
                        dataOutputStream.write(request._item.getContent().getReadOnlyBytes(), 0,
                                request._item.getContent().getCount());
                        // ok ... lastly write out the checksum bytes ... 
                        dataOutputStream.writeLong(checkedStream2.getChecksum().getValue());
                        // and FINALLY, write out the total record length (so that a reader can seek
                        // backwards from the end of the log to the start of the last record)
                        logStream.writeInt(CacheItemHeader.SIZE + itemHeader._dataLength);

                        // ok, flush everything to the memory stream 
                        dataOutputStream.flush();

                        //ok - time to acquire the log semaphore 
                        //LOG.info("Acquiring Local Log Semaphore");
                        _manager.getLocalLogAccessSemaphore().acquireUninterruptibly();

                        try {

                            // now time to acquire the write semaphore ... 
                            _manager.getLocalLogWriteAccessSemaphore().acquireUninterruptibly();

                            // get the current file position 
                            long recordOffset = _manager.getLocalLogFilePos();

                            try {

                                long ioTimeStart = System.currentTimeMillis();

                                RandomAccessFile logFile = new RandomAccessFile(_manager.getActiveLogFilePath(),
                                        "rw");

                                try {
                                    // seek to our known record offset 
                                    logFile.seek(recordOffset);
                                    // write out the data
                                    logFile.write(logStream.getData(), 0, logStream.getLength());
                                } finally {
                                    logFile.close();
                                }
                                // now we need to update the file header 
                                _manager.updateLogFileHeader(_manager.getActiveLogFilePath(), 1,
                                        CacheItemHeader.SIZE + itemHeader._dataLength + 4 /*trailing bytes*/);

                                CacheManager.LOG
                                        .info("#### Wrote Cache Item in:" + (System.currentTimeMillis() - timeStart)
                                                + " iotime:" + (System.currentTimeMillis() - ioTimeStart)
                                                + " QueueSize:" + _writeRequestQueue.size());

                            } finally {
                                // release write semaphore quickly 
                                _manager.getLocalLogWriteAccessSemaphore().release();
                            }

                            // now inform the manager of the completed request ... 
                            _manager.writeRequestComplete(request, recordOffset);
                        } finally {
                            //LOG.info("Releasing Local Log Semaphore");
                            _manager.getLocalLogAccessSemaphore().release();
                        }
                    } catch (IOException e) {
                        CacheManager.LOG.error("### FUC# BATMAN! - GONNA LOSE THIS REQUEST!!!!:"
                                + CCStringUtils.stringifyException(e));
                        _manager.writeRequestFailed(request, e);
                    }
                }
                    break;
                }
            } catch (InterruptedException e) {
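                // interrupted while waiting on the queue; loop around and re-check the shutdown flag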

            }
        }
    }
}
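
The 4-byte length written after each record (the final logStream.writeInt call above) is what lets a reader walk the log backwards. A minimal sketch of that idea, assuming a RandomAccessFile positioned just past the end of a record; the helper class below is hypothetical and not part of the CommonCrawl sources:

import java.io.IOException;
import java.io.RandomAccessFile;

final class LogTailSeek {
    /** Hypothetical helper: seek to the start of the record whose 4-byte length trailer ends at endPos. */
    static long seekToRecordStart(RandomAccessFile log, long endPos) throws IOException {
        log.seek(endPos - 4); // the trailer holds CacheItemHeader.SIZE + itemHeader._dataLength
        int recordLength = log.readInt(); // record size, excluding the 4-byte trailer itself
        long recordStart = endPos - 4 - recordLength;
        log.seek(recordStart); // now positioned at the record's CacheItemHeader
        return recordStart;
    }
}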