org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter.java

Source

package org.commoncrawl.hadoop.mergeutils;

/*
 *    Copyright 2010 - CommonCrawl Foundation
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.commoncrawl.util.shared.CCStringUtils;
import org.apache.hadoop.io.compress.CompressionCodec;

/**
 * 
 * An output sink that writes to a sequence file
 * 
 * @author rana
 * 
 * @param <KeyType>
 * @param <ValueType>
 */
public class SequenceFileSpillWriter<KeyType extends WritableComparable, ValueType extends Writable>
        implements RawDataSpillWriter<KeyType, ValueType> {

    private static final Log LOG = LogFactory.getLog(SequenceFileSpillWriter.class);
    private static final int BUFFER_QUEUE_CAPACITY = 100;

    private SequenceFileIndexWriter<KeyType, ValueType> _indexWriter = null;
    private SequenceFile.Writer writer = null;
    private long _recordCount = 0;
    private FSDataOutputStream _outputStream;
    private ByteBuffer _activeBuffer = null;
    private Thread _writerThread = null;
    private IOException _writerException = null;
    private int _spillBufferSize = -1;

    // default buffer size
    public static final int DEFAULT_SPILL_BUFFER_SIZE = 1000000;
    // the number of index items we can store in ram
    public static final String SPILL_WRITER_BUFFER_SIZE_PARAM = "commoncrawl.spillwriter.buffer.size";
    // the queue capacity parameter 
    public static final String QUEUE_CAPACITY_PARAM = "commoncrawl.spillwriter.queue.capacity";

    private static class QueuedBufferItem {

        QueuedBufferItem(ByteBuffer buffer) {
            _buffer = buffer;
        }

        public ByteBuffer _buffer;
    }

    private LinkedBlockingQueue<QueuedBufferItem> _bufferQueue;

    public SequenceFileSpillWriter(FileSystem fileSystem, Configuration conf, Path outputFilePath,
            Class<KeyType> keyClass, Class<ValueType> valueClass,
            SequenceFileIndexWriter<KeyType, ValueType> optionalIndexWriter, CompressionCodec codec,
            short replicationFactor) throws IOException {

        _bufferQueue = new LinkedBlockingQueue<QueuedBufferItem>(
                conf.getInt(QUEUE_CAPACITY_PARAM, BUFFER_QUEUE_CAPACITY));
        _spillBufferSize = conf.getInt(SPILL_WRITER_BUFFER_SIZE_PARAM, DEFAULT_SPILL_BUFFER_SIZE);
        _outputStream = fileSystem.create(outputFilePath, true, 10 * 1024 * 1024, replicationFactor,
                fileSystem.getDefaultBlockSize());
        // allocate buffer ...
        _activeBuffer = ByteBuffer.allocate(_spillBufferSize);
        // assign index writer ..
        _indexWriter = optionalIndexWriter;

        if (codec != null) {
            writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.BLOCK,
                    codec);
        } else {
            writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.NONE,
                    null);
        }

        _writerThread = new Thread(new Runnable() {

            @Override
            public void run() {
                // LOG.info("Writer Thread Starting");

                while (true) {

                    QueuedBufferItem queuedBufferItem = null;

                    try {
                        queuedBufferItem = _bufferQueue.take();
                    } catch (InterruptedException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                    if (queuedBufferItem._buffer == null) {
                        // LOG.info("Writer Thread received empty buffer item. Exiting");
                        return;
                    } else {

                        ByteBuffer theBuffer = queuedBufferItem._buffer;

                        // LOG.info("Writer Thread received item. Limit:" +
                        // theBuffer.limit());

                        // get byte pointer
                        byte[] bufferAsBytes = theBuffer.array();

                        int itemsWritten = 0;
                        long timeStart = System.currentTimeMillis();

                        while (theBuffer.remaining() != 0) {

                            // now read in key length
                            int keyLen = theBuffer.getInt();
                            // mark key position
                            int keyPos = theBuffer.position();
                            // now skip past key length
                            theBuffer.position(keyPos + keyLen);
                            // read value length
                            int valueLen = theBuffer.getInt();
                            // mark value position
                            int valuePosition = theBuffer.position();
                            // now skip past it ...
                            theBuffer.position(valuePosition + valueLen);
                            // now write this out to the sequence file ...

                            try {
                                spillRawRecord2(bufferAsBytes, keyPos, keyLen, bufferAsBytes, valuePosition,
                                        valueLen);
                            } catch (IOException e) {
                                LOG.error("Writer Thread Failed with Error:" + CCStringUtils.stringifyException(e));
                                _writerException = e;
                                return;
                            }
                            itemsWritten++;
                        }
                        // LOG.info("Writer Thread Finished With Buffer. Wrote:"+
                        // itemsWritten + " in:" + (System.currentTimeMillis() -
                        // timeStart));
                    }
                }
            }

        });
        _writerThread.start();
    }

    @Override
    public void close() throws IOException {

        // flush any active buffers ...
        flushActiveBuffer();

        // wait for writer thread to terminate gracefully
        if (_writerThread != null) {
            try {
                _bufferQueue.put(new QueuedBufferItem(null));
                // LOG.info("Waiting for Writer Thread to Die");
                _writerThread.join();
                // LOG.info("Writer Thread Died. Continuing Close");
                _writerThread = null;
            } catch (InterruptedException e) {
            }
        }

        if (writer != null) {
            // close sequence file writer
            writer.close();
            writer = null;
        }

        // and index writer (if open);
        if (_indexWriter != null) {
            _indexWriter.close();
            _indexWriter = null;
        }
        // and finally output stream
        if (_outputStream != null) {
            _outputStream.close();
            _outputStream = null;
        }

        // LOG.info("Closing Spill File. RecordCount:" + _recordCount);
    }

    private static OutputStream newOutputStream(final ByteBuffer buf) {
        return new OutputStream() {

            @Override
            public void write(int b) throws IOException {
                buf.put((byte) (b & 0xff));
            }

            public void write(byte src[], int off, int len) throws IOException {
                buf.put(src, off, len);
            }
        };
    }

    @Override
    public void spillRecord(KeyType key, ValueType value) throws IOException {

        // mark buffer position ...
        int startPosition = -1;
        // LOG.info("Buffer start position:" + startPositon);

        boolean done = false;

        while (!done) {

            if (_activeBuffer == null) {
                allocateActiveBuffer();
            }

            startPosition = _activeBuffer.position();

            DataOutputStream outputStream = new DataOutputStream(newOutputStream(_activeBuffer));
            int keySizePos = 0;
            int valueSizePos = 0;
            int keySize = 0;
            int endPosition = 0;
            int valueSize = 0;
            boolean overflow = false;
            try {
                // save key size position
                keySizePos = _activeBuffer.position();
                // LOG.info("keySizePos:" + keySizePos);
                // skip past key length
                _activeBuffer.position(keySizePos + 4);
                // next write key and value
                key.write(outputStream);
                // now save value size position
                valueSizePos = _activeBuffer.position();
                // and calculate key size
                keySize = valueSizePos - keySizePos - 4;
                // reseek back
                _activeBuffer.position(keySizePos);
                // write out real key size
                _activeBuffer.putInt(keySize);
                // skip to value size position + 4
                _activeBuffer.position(valueSizePos + 4);
                // write out actual value
                value.write(outputStream);
                // save end position
                endPosition = _activeBuffer.position();
                // calculate value size
                valueSize = endPosition - valueSizePos - 4;
                // reseek back to value size pos
                _activeBuffer.position(valueSizePos);
                // write value size
                _activeBuffer.putInt(valueSize);
                // seek forward to end position
                _activeBuffer.position(endPosition);

                // LOG.info("keySizePos:" + keySizePos + " keySize:" + keySize +
                // " valueSizePos:" + valueSizePos + " valueSize:" + valueSize);

                done = true;
            }
            // trap for buffer overflow specifically
            catch (IllegalArgumentException e) {
                // LOG.error("IllegalArgumentException detected while writing to Spill Data Buffer. Flushing");
                overflow = true;
            }
            // trap for buffer overflow specifically
            catch (BufferOverflowException e) {
                // LOG.error("Buffer Overflow detected while writing to Spill Data Buffer. Flushing");
                overflow = true;
            }

            if (overflow) {
                // LOG.error("Overflow Condition StartPosition:" + startPosition +
                // " KeySizePos:" + keySizePos + " keySize:" + keySize +
                // " valueSizePos:" + valueSizePos + " valueSize:" + valueSize);

                if (startPosition == 0) {
                    throw new IOException("Key + Value Size too Big for SpillBuffer !");
                }
                // reset to start position
                _activeBuffer.position(startPosition);
                // sort and spill
                flushActiveBuffer();
            }

        }

    }

    @Override
    public void spillRawRecord(byte[] keyData, int keyOffset, int keyLength, final byte[] valueData,
            final int valueOffset, final int valueLength) throws IOException {

        // mark buffer position ...
        int startPosition = -1;
        // LOG.info("Buffer start position:" + startPositon);

        boolean done = false;

        while (!done) {

            if (_activeBuffer == null) {
                allocateActiveBuffer();
            }

            startPosition = _activeBuffer.position();
            int keySizePos = 0;
            int valueSizePos = 0;
            int keySize = 0;
            int endPosition = 0;
            int valueSize = 0;
            boolean overflow = false;

            try {
                // save key size position
                keySizePos = _activeBuffer.position();
                // LOG.info("keySizePos:" + keySizePos);
                // skip past key length
                _activeBuffer.position(keySizePos + 4);
                // next write key and value
                _activeBuffer.put(keyData, keyOffset, keyLength);
                // now save value size position
                valueSizePos = _activeBuffer.position();
                // and calculate key size
                keySize = valueSizePos - keySizePos - 4;
                // reseek back
                _activeBuffer.position(keySizePos);
                // write out real key size
                _activeBuffer.putInt(keySize);
                // skip to value size position + 4
                _activeBuffer.position(valueSizePos + 4);
                // write out actual value
                _activeBuffer.put(valueData, valueOffset, valueLength);
                // save end position
                endPosition = _activeBuffer.position();
                // calculate value size
                valueSize = endPosition - valueSizePos - 4;
                // reseek back to value size pos
                _activeBuffer.position(valueSizePos);
                // write value size
                _activeBuffer.putInt(valueSize);
                // seek forward to end position
                _activeBuffer.position(endPosition);

                // LOG.info("keySizePos:" + keySizePos + " keySize:" + keySize +
                // " valueSizePos:" + valueSizePos + " valueSize:" + valueSize);

                done = true;
            }
            // trap for buffer overflow specifically
            catch (IllegalArgumentException e) {
                // LOG.error("IllegalArgumentException detected while writing to Spill Data Buffer. Flushing");
                overflow = true;
            }
            // trap for buffer overflow specifically
            catch (BufferOverflowException e) {
                // LOG.error("Buffer Overflow detected while writing to Spill Data Buffer. Flushing");
                overflow = true;
            }

            if (overflow) {
                // LOG.error("Overflow Condition StartPosition:" + startPosition +
                // " KeySizePos:" + keySizePos + " keySize:" + keySize +
                // " valueSizePos:" + valueSizePos + " valueSize:" + valueSize);

                if (startPosition == 0) {
                    throw new IOException("Key + Value Size too Big for SpillBuffer !");
                }
                // reset to start position
                _activeBuffer.position(startPosition);
                // sort and spill
                flushActiveBuffer();
            }
        }
    }

    private void allocateActiveBuffer() throws IOException {
        // LOG.info("Allocating Buffer of Size:" + BUFFER_SIZE);
        _activeBuffer = ByteBuffer.allocate(_spillBufferSize);
    }

    private void flushActiveBuffer() throws IOException {
        try {
            if (_activeBuffer != null && _activeBuffer.position() != 0) {
                // flip the buffer ...
                _activeBuffer.flip();
                // LOG.info("Flipped Active Buffer.Limit:" + _activeBuffer.limit()+
                // ". Queueing Buffer");
                // and queue it up for a write ...
                _bufferQueue.put(new QueuedBufferItem(_activeBuffer));
            }
        } catch (InterruptedException e) {
        }
        _activeBuffer = null;
    }

    public void spillRawRecord2(byte[] keyData, int keyOffset, int keyLength, final byte[] valueData,
            final int valueOffset, final int valueLength) throws IOException {

        if (_indexWriter != null)
            _indexWriter.indexItem(keyData, keyOffset, keyLength, valueData, valueOffset, valueLength,
                    writer.getLength());

        _recordCount++;

        writer.appendRaw(keyData, keyOffset, keyLength, new ValueBytes() {

            @Override
            public int getSize() {
                return valueLength;
            }

            @Override
            public void writeCompressedBytes(DataOutputStream outStream)
                    throws IllegalArgumentException, IOException {
                throw new IOException("UnSupported Method");
            }

            @Override
            public void writeUncompressedBytes(DataOutputStream outStream) throws IOException {
                outStream.write(valueData, valueOffset, valueLength);
            }

        });
    }
}