org.apache.distributedlog.LogRecord.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.distributedlog.LogRecord.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.distributedlog;

import com.google.common.annotations.VisibleForTesting;
import io.netty.buffer.ByteBuf;
import io.netty.buffer.ByteBufInputStream;
import io.netty.buffer.Unpooled;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import javax.annotation.concurrent.NotThreadSafe;
import org.apache.distributedlog.common.util.ByteBufUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Log record is the basic element in a log.
 *
 * <p>A log is a sequence of log records. Each log record is a sequence of bytes.
 * Log records are written sequentially into a stream, and each of them is assigned
 * a unique, system-generated sequence number {@link DLSN} (distributedlog sequence
 * number). Besides {@link DLSN}, application can assign its own sequence number
 * while constructing log records. The application defined sequence number is called
 * <code>TransactionID</code> (<i>txid</i>). Either {@link DLSN} or <code>TransactionId</code>
 * could be used to position readers to start from specific log records.
 *
 * <h3>User Record</h3>
 *
 * <p>User records are the records written by applications and read by applications. They
 * are constructed via {@link #LogRecord(long, byte[])} by applications and appended to
 * logs by writers. And they would be deserialized from bytes by the readers and return
 * to applications.
 *
 * <h3>Control Record</h3>
 *
 * <p>Control records are special records that are written by distributedlog. They are invisible
 * to applications. They could be treated as <i>commit requests</i> as what people could find
 * in distributed consensus algorithms, since they are usually written by distributedlog to
 * commit application written records. <i>Commit</i> means making application written records
 * visible to readers to achieve consistent views among them.
 *
 * <p>They are named as 'Control Records' for controlling visibility of application written records.
 *
 * <p>The transaction id of 'Control Records' are assigned by distributedlog by inheriting from last
 * written user records. So we could indicate what user records that a control record is committing
 * by looking at its transaction id.
 *
 * <h4>EndOfStream Record</h4>
 *
 * <p><code>EoS</code>(EndOfStream) is a special control record that would be written by a writer
 * to seal a log. After a <i>EoS</i> record is written to a log, no writers could append any record
 * after that and readers will get {@link org.apache.distributedlog.exceptions.EndOfStreamException}
 * when they reach EoS.
 *
 * <p>TransactionID of EoS is <code>Long.MAX_VALUE</code>.
 *
 * <h3>Serialization &amp; Deserialization</h3>
 *
 * <p>Data type in brackets. Interpretation should be on the basis of data types and not individual
 * bytes to honor Endianness.
 *
 * <pre>
 * LogRecord structure:
 * -------------------
 * Bytes 0 - 7                      : Metadata (Long)
 * Bytes 8 - 15                     : TxId (Long)
 * Bytes 16 - 19                    : Payload length (Integer)
 * Bytes 20 - 20+payload.length-1   : Payload (Byte[])
 *
 * Metadata: 8 Bytes (Long)
 * --------
 *
 * 0x 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 *            |_____________| |_____|
 *                   |           |
 *               position      flags
 *
 * Flags: 2 Bytes (least significant)
 * -----
 * Bit  0      : If set, control record, else record with payload.
 * Bit  1      : If set, end of stream.
 * Bit  2      : If set, the payload is a serialized record set.
 * Bits 3 - 15 : Unused
 * </pre>
 *
 * <h3>Sequence Numbers</h3>
 *
 * <p>A record is associated with three types of sequence numbers. They are generated
 * and used for different purposes. Check {@link LogRecordWithDLSN} for more details.
 *
 * @see LogRecordWithDLSN
 */
@NotThreadSafe
public class LogRecord {

    private static final Logger LOG = LoggerFactory.getLogger(LogRecord.class);

    // Size limit for a single record: 1MB minus 8KB, i.e. a further 4K of headroom below
    // MAX_LOGRECORDSET_SIZE (presumably reserved for metadata -- confirm against the writers).
    public static final int MAX_LOGRECORD_SIZE = 1024 * 1024 - 8 * 1024; //1MB - 8KB
    // Size limit for a record set: 1MB minus 4K, leaving 4K for transmission overhead.
    public static final int MAX_LOGRECORDSET_SIZE = 1024 * 1024 - 4 * 1024; //1MB - 4KB

    // NOTE(review): not referenced anywhere in the visible code -- possibly a leftover.
    private static final int INPUTSTREAM_MARK_LIMIT = 16;

    // Layout of the 64-bit metadata word:
    //   bits  0 - 15 : flags
    //   bits 16 - 47 : position within the log segment
    //   bits 48 - 63 : unused
    // Each *_UMASK is the bitwise complement of the matching *_MASK and is used to clear
    // that field while keeping the remaining bits.
    static final long LOGRECORD_METADATA_FLAGS_MASK = 0xffffL;
    static final long LOGRECORD_METADATA_FLAGS_UMASK = 0xffffffffffff0000L;
    static final long LOGRECORD_METADATA_POSITION_MASK = 0x0000ffffffff0000L;
    static final long LOGRECORD_METADATA_POSITION_UMASK = 0xffff00000000ffffL;
    static final int LOGRECORD_METADATA_POSITION_SHIFT = 16;
    static final long LOGRECORD_METADATA_UNUSED_MASK = 0xffff000000000000L;

    // Individual flag bits stored in the low 16 bits of the metadata word.
    // TODO: Replace with EnumSet
    static final long LOGRECORD_FLAGS_CONTROL_MESSAGE = 0x1;
    static final long LOGRECORD_FLAGS_END_OF_STREAM = 0x2;
    static final long LOGRECORD_FLAGS_RECORD_SET = 0x4;

    // Packed flags and position; see the layout above.
    private long metadata;
    // Application defined transaction id.
    private long txid;
    // Record data; stays null until a payload is supplied by a constructor or setPayloadBuf.
    private ByteBuf payload;

    /**
     * Create a log record with no payload and with both the metadata and the
     * transaction id set to zero.
     *
     * <p>NOTE: only deserializer should call this constructor.
     */
    protected LogRecord() {
        this.metadata = 0;
        this.txid = 0;
    }

    /**
     * Create a log record from a <i>TransactionId</i> and a byte array payload.
     *
     * <p>This is the constructor writers typically use when building records to append.
     * The array is wrapped, not copied, via {@link Unpooled#wrappedBuffer(byte[])}.
     *
     * @param txid
     *          application defined transaction id.
     * @param payload
     *          record data
     */
    public LogRecord(long txid, byte[] payload) {
        this(txid, Unpooled.wrappedBuffer(payload));
    }

    /**
     * Create a log record from a <i>txid</i> and an NIO payload <i>buffer</i>.
     *
     * <p>The buffer is wrapped via {@link Unpooled#wrappedBuffer(ByteBuffer)}.
     *
     * @param txid application defined transaction id.
     * @param buffer payload buffer.
     */
    public LogRecord(long txid, ByteBuffer buffer) {
        this(txid, Unpooled.wrappedBuffer(buffer));
    }

    /**
     * Create a log record from a <i>txid</i> and a ByteBuf <i>payload</i>.
     *
     * <p>The given buffer is stored as-is and the metadata starts out as zero.
     *
     * @param txid transaction id
     * @param payload payload
     */
    public LogRecord(long txid, ByteBuf payload) {
        this.metadata = 0;
        this.payload = payload;
        this.txid = txid;
    }

    //
    // Accessors
    //

    /**
     * Get the application defined transaction id of this record.
     *
     * @return transaction id.
     */
    public long getTransactionId() {
        return this.txid;
    }

    /**
     * Replace the application defined transaction id of this record.
     *
     * @param txid application defined transaction id.
     */
    protected void setTransactionId(long txid) {
        final long newTxid = txid;
        this.txid = newTxid;
    }

    /**
     * Get the payload of this log record as a byte array.
     *
     * <p>The conversion is delegated to {@code ByteBufUtils.getArray}.
     *
     * @return payload of this log record.
     */
    public byte[] getPayload() {
        final ByteBuf current = this.payload;
        return ByteBufUtils.getArray(current);
    }

    /**
     * Get a slice over the readable bytes of this record's payload buffer.
     *
     * <p>The slice is obtained with {@link ByteBuf#slice()}, so it is not retained
     * on behalf of the caller.
     *
     * @return payload buf of this record.
     * @since 0.5.0
     */
    public ByteBuf getPayloadBuf() {
        final ByteBuf view = this.payload.slice();
        return view;
    }

    /**
     * Replace the payload of this record, releasing the payload it held before (if any).
     *
     * <p>The replacement buffer is created <em>before</em> the previous payload is released.
     * This keeps the record usable if copying fails, and makes it safe to pass a buffer that
     * is (or is derived from) the current payload when {@code copyData} is true.
     *
     * @param payload the new payload buffer.
     * @param copyData if true, the record stores a copy made by
     *                 {@link Unpooled#copiedBuffer(ByteBuf)}; if false, the given buffer is
     *                 stored directly.
     */
    void setPayloadBuf(ByteBuf payload, boolean copyData) {
        // Build the replacement first: copying may throw, and the source may alias the
        // buffer we are about to release.
        final ByteBuf replacement = copyData ? Unpooled.copiedBuffer(payload) : payload;
        final ByteBuf previous = this.payload;
        this.payload = replacement;
        if (null != previous) {
            previous.release();
        }
    }

    /**
     * Expose the payload as an {@link InputStream}.
     *
     * <p>The stream reads from a retained slice of the payload and is created with
     * release-on-close enabled.
     *
     * @return payload as input stream
     */
    public InputStream getPayLoadInputStream() {
        final ByteBuf retained = payload.retainedSlice();
        final boolean releaseOnClose = true;
        return new ByteBufInputStream(retained, releaseOnClose);
    }

    //
    // Metadata & Flags
    //

    /**
     * Overwrite the whole metadata word (flags and position) of this record.
     *
     * @param metadata the new metadata value.
     */
    protected void setMetadata(long metadata) {
        final long newMetadata = metadata;
        this.metadata = newMetadata;
    }

    /**
     * Get the whole metadata word (flags and position) of this record.
     *
     * @return the metadata value.
     */
    protected long getMetadata() {
        return metadata;
    }

    /**
     * Store the position in the log segment into the metadata word.
     *
     * <p>The existing position bits are cleared first; all other metadata bits are kept.
     *
     * @see #getPositionWithinLogSegment()
     * @param positionWithinLogSegment position in the log segment.
     */
    void setPositionWithinLogSegment(int positionWithinLogSegment) {
        assert (positionWithinLogSegment >= 0);
        final long withoutPosition = metadata & LOGRECORD_METADATA_POSITION_UMASK;
        final long positionBits = ((long) positionWithinLogSegment) << LOGRECORD_METADATA_POSITION_SHIFT;
        metadata = withoutPosition | positionBits;
    }

    /**
     * The position in the log segment means how many records (inclusive) added to the log segment so far.
     *
     * <p>The value is extracted from the position bits of the metadata word.
     *
     * @return position of the record in the log segment.
     * @throws IllegalArgumentException if the stored position does not fit in an {@code int}.
     */
    public int getPositionWithinLogSegment() {
        final long position =
                (metadata & LOGRECORD_METADATA_POSITION_MASK) >> LOGRECORD_METADATA_POSITION_SHIFT;
        final boolean fitsInInt = position >= 0 && position <= Integer.MAX_VALUE;
        if (!fitsInInt) {
            throw new IllegalArgumentException(position + " position should never exceed max integer value");
        }
        return (int) position;
    }

    /**
     * Get the last position covered by this record in the log segment.
     *
     * <p>For a plain record this equals {@link #getPositionWithinLogSegment()}. For a record set
     * it is {@link #getPositionWithinLogSegment()} + numRecords - 1. If the number of records
     * cannot be determined (an {@link IOException} is raised while inspecting the record set),
     * it falls back to {@link #getPositionWithinLogSegment()}.
     *
     * @return last position of this record in the log segment.
     */
    int getLastPositionWithinLogSegment() {
        final int firstPosition = getPositionWithinLogSegment();
        if (!isRecordSet()) {
            return firstPosition;
        }
        try {
            return firstPosition + LogRecordSet.numRecords(this) - 1;
        } catch (IOException e) {
            // if it is unrecognized record set, we will return the position of this record set.
            return firstPosition;
        }
    }

    /**
     * Flag this record as carrying a set of records.
     *
     * <p>The bytes in this record is the serialized format of {@link LogRecordSet}.
     */
    public void setRecordSet() {
        metadata |= LOGRECORD_FLAGS_RECORD_SET;
    }

    /**
     * Tell whether this record carries a set of records.
     *
     * @return true if the record represents a set of records, otherwise false.
     * @see #setRecordSet()
     */
    public boolean isRecordSet() {
        final long current = this.metadata;
        return isRecordSet(current);
    }

    /**
     * Check a metadata value to see if its record-set flag is set.
     *
     * @param metadata record metadata
     * @return true if the record-set flag is set, otherwise false.
     */
    public static boolean isRecordSet(long metadata) {
        final long recordSetBit = metadata & LOGRECORD_FLAGS_RECORD_SET;
        return 0 != recordSetBit;
    }

    /**
     * Flag this record as a control record.
     */
    @VisibleForTesting
    public void setControl() {
        metadata |= LOGRECORD_FLAGS_CONTROL_MESSAGE;
    }

    /**
     * Tell whether this record is a control record.
     *
     * @return true if the record is a control record, otherwise false.
     */
    public boolean isControl() {
        final long current = this.metadata;
        return isControl(current);
    }

    /**
     * Inspect the given flags and report whether the control flag is set.
     *
     * @param flags record flags
     * @return true if the record is a control record, otherwise false.
     */
    public static boolean isControl(long flags) {
        final long controlBit = flags & LOGRECORD_FLAGS_CONTROL_MESSAGE;
        return 0 != controlBit;
    }

    /**
     * Flag this record as the <code>EoS</code> mark.
     *
     * @see #isEndOfStream()
     */
    void setEndOfStream() {
        metadata |= LOGRECORD_FLAGS_END_OF_STREAM;
    }

    /**
     * Tell whether this record is the <code>EoS</code> mark.
     *
     * <p><code>EoS</code> mark is a special record that a writer adds to seal a log.
     * After the <code>EoS</code> mark is written, writers can't write any more records
     * and readers will get
     * {@link org.apache.distributedlog.exceptions.EndOfStreamException}
     * when they reach <code>EoS</code>.
     *
     * @return true if the end-of-stream flag is set on this record, otherwise false.
     */
    boolean isEndOfStream() {
        final long endOfStreamBit = metadata & LOGRECORD_FLAGS_END_OF_STREAM;
        return 0 != endOfStreamBit;
    }

    //
    // Serialization & Deserialization
    //

    /**
     * Read a length-prefixed payload from {@code in} and install it as this record's payload.
     *
     * <p>The 4-byte length is read first; the payload is then taken as a slice of {@code in}
     * starting at the current reader index, and the reader index is moved past it. Only a
     * negative length is rejected here; the length is not compared against the bytes
     * remaining in {@code in}.
     *
     * @param in buffer positioned at the payload length field.
     * @param copyData if true, the payload bytes are copied; if false, a retained slice of
     *                 {@code in} is stored instead.
     * @throws IOException ({@link EOFException}) if the length field is negative.
     */
    protected void readPayload(ByteBuf in, boolean copyData) throws IOException {
        final int length = in.readInt();
        if (length < 0) {
            throw new EOFException("Log Record is corrupt: Negative length " + length);
        }
        final int start = in.readerIndex();
        final ByteBuf data = copyData ? in.slice(start, length) : in.retainedSlice(start, length);
        setPayloadBuf(data, copyData);
        in.skipBytes(length);
    }

    /**
     * Append the payload to {@code out} as a 4-byte length followed by the readable bytes.
     *
     * <p>The payload's own reader index is left untouched.
     *
     * @param out buffer to append to.
     */
    private void writePayload(ByteBuf out) {
        final int size = payload.readableBytes();
        out.writeInt(size);
        out.writeBytes(payload, payload.readerIndex(), size);
    }

    /**
     * Serialize this record into {@code out}: metadata, transaction id, then the
     * length-prefixed payload.
     *
     * @param out buffer to append to.
     */
    private void writeToStream(ByteBuf out) {
        out.writeLong(metadata).writeLong(txid);
        writePayload(out);
    }

    /**
     * The size of the serialized log record.
     *
     * <p>This is used to estimate how much will be appended to the in-memory buffer.
     *
     * @return serialized size
     */
    int getPersistentSize() {
        final int longBytes = Long.SIZE / 8;
        final int intBytes = Integer.SIZE / 8;
        // metadata + txid + payload length field + payload
        return 2 * longBytes + intBytes + payload.readableBytes();
    }

    /**
     * Writer class that serializes log records into an output buffer.
     */
    public static class Writer {
        private final ByteBuf out;

        /**
         * Create a writer appending to the given buffer.
         *
         * @param out buffer that receives the serialized records.
         */
        public Writer(ByteBuf out) {
            this.out = out;
        }

        /**
         * Serialize a record and append it to the output buffer.
         *
         * @param record The operation to write
         */
        public void writeOp(LogRecord record) {
            record.writeToStream(this.out);
        }

        /**
         * Number of readable bytes currently held by the output buffer.
         *
         * @return readable bytes in the output buffer.
         */
        public int getPendingBytes() {
            return this.out.readableBytes();
        }
    }

    /**
     * Reader class to read log records from an input {@code stream}.
      */
    public static class Reader {
        // Supplies the current position for each record and is advanced as records are consumed.
        private final RecordStream recordStream;
        // Buffer holding the serialized records.
        private final ByteBuf in;
        // Passed to every LogRecordWithDLSN created by this reader.
        private final long startSequenceId;
        // Whether record sets are expanded into their individual records while reading.
        private final boolean deserializeRecordSet;
        // NOTE(review): not referenced anywhere in the visible code -- possibly a leftover.
        private static final int SKIP_BUFFER_SIZE = 512;
        // Reader over the record set currently being expanded; null when none is active.
        private LogRecordSet.Reader recordSetReader = null;
        // Record located inside a record set by skipTo(); handed out by the next readOp() call.
        private LogRecordWithDLSN lastRecordSkipTo = null;

        /**
         * Construct a reader that expands record sets into individual records.
         *
         * @param recordStream the record stream for generating {@code DLSN}s.
         * @param in The buffer to read from.
         * @param startSequenceId the start sequence id.
         */
        public Reader(RecordStream recordStream, ByteBuf in, long startSequenceId) {
            this(recordStream, in, startSequenceId, /* deserializeRecordSet */ true);
        }

        /**
         * Construct a reader.
         *
         * @param recordStream the record stream for generating {@code DLSN}s.
         * @param in The buffer to read from.
         * @param startSequenceId the start sequence id.
         * @param deserializeRecordSet if true, {@link #readOp()} returns the individual records
         *                             of a record set; if false, the record set is returned as
         *                             a single record.
         */
        public Reader(RecordStream recordStream, ByteBuf in, long startSequenceId, boolean deserializeRecordSet) {
            this.deserializeRecordSet = deserializeRecordSet;
            this.startSequenceId = startSequenceId;
            this.in = in;
            this.recordStream = recordStream;
        }

        /**
         * Read the next log record from the input buffer.
         *
         * <p>Records are handed out in this order of precedence: a record left pending by a
         * previous {@code skipTo} call, then the remaining records of the record set that is
         * currently being expanded, and finally the next record decoded from the buffer.
         *
         * @return the next record, or null when the buffer has no readable bytes left or an
         *         {@link EOFException} is raised while decoding (for example a negative
         *         payload length).
         * @throws IOException on error.
         */
        public LogRecordWithDLSN readOp() throws IOException {
            for (;;) {
                // a record located by skipTo() inside a record set goes out first
                if (null != lastRecordSkipTo) {
                    final LogRecordWithDLSN pending = lastRecordSkipTo;
                    recordStream.advance(1);
                    lastRecordSkipTo = null;
                    return pending;
                }

                // then drain the record set being expanded, if there is one
                if (null != recordSetReader) {
                    final LogRecordWithDLSN fromSet = recordSetReader.nextRecord();
                    if (null != fromSet) {
                        recordStream.advance(1);
                        return fromSet;
                    }
                    recordSetReader = null;
                }

                if (in.readableBytes() <= 0) {
                    return null;
                }

                try {
                    final long metadata = in.readLong();
                    // The record stream has not been advanced for this record yet, so its
                    // current position is the position of the record being decoded.
                    final LogRecordWithDLSN record =
                            new LogRecordWithDLSN(recordStream.getCurrentPosition(), startSequenceId);
                    record.setMetadata(metadata);
                    record.setTransactionId(in.readLong());

                    // The payload is copied unless this is a record set that is about to be
                    // expanded here; in that case copying is deferred to the deserialization of
                    // the individual records. Copying in the other cases means applications
                    // don't have to deal with reference counts.
                    final boolean copyData = !(isRecordSet(metadata) && deserializeRecordSet);
                    record.readPayload(in, copyData);

                    if (LOG.isTraceEnabled()) {
                        LOG.trace(record.isControl()
                                        ? "Reading {} Control DLSN {}"
                                        : "Reading {} Valid DLSN {}",
                                recordStream.getName(), record.getDlsn());
                    }

                    final boolean recordSet = record.isRecordSet();
                    if (recordSet && deserializeRecordSet) {
                        // expand the set; its records are returned by the following iterations
                        recordSetReader = LogRecordSet.of(record);
                        continue;
                    }

                    // an unexpanded record set accounts for all of the records it contains
                    final int numRecords = recordSet ? LogRecordSet.numRecords(record) : 1;
                    recordStream.advance(numRecords);
                    return record;
                } catch (EOFException eof) {
                    // Expected
                    return null;
                }
            }
        }

        /**
         * Skip forward to the first record whose transaction id is not less than {@code txId}.
         *
         * @param txId transaction id to skip to.
         * @param skipControl if true, control records are not accepted as a match.
         * @return true if such a record was found, otherwise false.
         * @throws IOException on error.
         */
        public boolean skipTo(long txId, boolean skipControl) throws IOException {
            final Long targetTxId = txId;
            return skipTo(targetTxId, null, skipControl);
        }

        /**
         * Skip forward until the record stream position is not less than {@code dlsn}.
         *
         * <p>Control records are accepted as a match.
         *
         * @param dlsn position to skip to.
         * @return true if such a position was reached, otherwise false.
         * @throws IOException on error.
         */
        public boolean skipTo(DLSN dlsn) throws IOException {
            final boolean skipControl = false;
            return skipTo(null, dlsn, skipControl);
        }

        /**
         * Scan forward until the reader is positioned on the first record matching the target.
         *
         * <p>The public overloads pass exactly one of {@code txId} and {@code dlsn}. A record
         * matches when:
         * <ul>
         *   <li>{@code dlsn} is given and the record stream's current position is not less than
         *       it, or</li>
         *   <li>{@code txId} is given, the record's transaction id is not less than it, and the
         *       record is not a control record when {@code skipControl} is true.</li>
         * </ul>
         *
         * <p>When the match is a record read directly from the buffer, the reader index is reset
         * to the start of that record so the next {@link #readOp()} decodes it. When the match
         * is inside a record set, the record is kept in {@code lastRecordSkipTo} and returned by
         * the next {@link #readOp()}.
         *
         * @param txId transaction id to skip to, or null when skipping by {@code dlsn}.
         * @param dlsn position to skip to, or null when skipping by {@code txId}.
         * @param skipControl if true, control records never match a {@code txId} target.
         * @return true if a matching record was found; false if the buffer was exhausted or a
         *         record with a negative length was encountered first.
         * @throws IOException on error.
         */
        private boolean skipTo(Long txId, DLSN dlsn, boolean skipControl) throws IOException {
            boolean found = false;
            while (true) {
                try {
                    long flags;
                    long currTxId;

                    // if there is not record set, read next record
                    if (null == recordSetReader) {
                        // ByteBuf reads report exhaustion with IndexOutOfBoundsException rather
                        // than EOFException, so the catch block below would not handle it.
                        // Check for remaining data explicitly, as readOp() does, so that a
                        // target beyond the last record yields "not found".
                        if (in.readableBytes() <= 0) {
                            break;
                        }
                        // remember where this record starts so a match can be rewound to it
                        in.markReaderIndex();
                        flags = in.readLong();
                        currTxId = in.readLong();
                    } else {
                        // check record set until reach end of record set
                        lastRecordSkipTo = recordSetReader.nextRecord();
                        if (null == lastRecordSkipTo) {
                            // reach end of record set
                            recordSetReader = null;
                            continue;
                        }
                        flags = lastRecordSkipTo.getMetadata();
                        currTxId = lastRecordSkipTo.getTransactionId();
                    }

                    if ((null != dlsn) && (recordStream.getCurrentPosition().compareTo(dlsn) >= 0)) {
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Found position {} beyond {}", recordStream.getCurrentPosition(), dlsn);
                        }
                        // a record read from the buffer is rewound; one from a record set stays
                        // in lastRecordSkipTo for readOp()
                        if (null == lastRecordSkipTo) {
                            in.resetReaderIndex();
                        }
                        found = true;
                        break;
                    }
                    if ((null != txId) && (currTxId >= txId)) {
                        if (!skipControl || !isControl(flags)) {
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("Found position {} beyond {}", currTxId, txId);
                            }
                            if (null == lastRecordSkipTo) {
                                in.resetReaderIndex();
                            }
                            found = true;
                            break;
                        }
                    }

                    // a non-matching record from a record set: count it and move on
                    if (null != lastRecordSkipTo) {
                        recordStream.advance(1);
                        continue;
                    }

                    // get the num of records to skip
                    if (isRecordSet(flags)) {
                        // read record set; its records are examined by the following iterations
                        LogRecordWithDLSN record = new LogRecordWithDLSN(recordStream.getCurrentPosition(),
                                startSequenceId);
                        record.setMetadata(flags);
                        record.setTransactionId(currTxId);
                        record.readPayload(in, false);
                        recordSetReader = LogRecordSet.of(record);
                    } else {
                        int length = in.readInt();
                        if (length < 0) {
                            // We should never really see this as we only write complete entries to
                            // BK and BK client has logic to detect torn writes (through checksum)
                            LOG.info("Encountered Record with negative length at TxId: {}", currTxId);
                            break;
                        }
                        // skip single record
                        in.skipBytes(length);
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Skipped Record with TxId {} DLSN {}", currTxId,
                                    recordStream.getCurrentPosition());
                        }
                        recordStream.advance(1);
                    }
                } catch (EOFException eof) {
                    LOG.debug("Skip encountered end of file Exception", eof);
                    break;
                }
            }
            return found;
        }
    }
}