co.cask.cdap.data.stream.StreamDataFileWriter.java Source code

Introduction

Here is the source code for co.cask.cdap.data.stream.StreamDataFileWriter.java, a writer that persists stream events to an event file and maintains a companion timestamp index file. A usage sketch and a small end-marker reader follow the listing.

Source

/*
 * Copyright © 2014-2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.data.stream;

import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.common.io.BinaryEncoder;
import co.cask.cdap.common.io.BufferedEncoder;
import co.cask.cdap.common.io.Encoder;
import co.cask.cdap.common.stream.StreamEventDataCodec;
import co.cask.cdap.data.file.FileWriter;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import com.google.common.io.Closeables;
import com.google.common.io.OutputSupplier;
import com.google.common.primitives.Longs;
import org.apache.hadoop.fs.Syncable;

import java.io.Flushable;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Iterator;
import java.util.Map;
import javax.annotation.concurrent.NotThreadSafe;

/**
 * File format
 *
 * Stream event file:
 *
 * <pre>
 * {@code
 *
 * event_file = <header> <data>* <end_marker>
 * header = "E" "1" <properties>
 * properties = Avro encoded with the properties schema
 * data = <timestamp> <length> <stream_event>+
 * timestamp = 8 bytes int64 for timestamp in milliseconds
 * length = Avro encoded int32 for size in bytes for all <stream_event>s
 * stream_event = Avro encoded bytes according to the StreamData schema
 * end_marker = 8 bytes int64 with value == -(close_timestamp)
 *
 * }
 * </pre>
 *
 * Stream index file:
 *
 * <pre>
 * {@code
 *
 * meta_file = <header> <index>*
 * header = "I" "1" <properties>
 * properties = Avro encoded with the properties schema
 * index = <timestamp> <offset>
 * timestamp = 8 bytes int64 for timestamp in milliseconds
 * offset = 8 bytes int64 for offset to data block in the event file
 *
 * }
 * </pre>
 */
@NotThreadSafe
public final class StreamDataFileWriter implements TimestampCloseable, Flushable, FileWriter<StreamEvent> {

    private static final int BUFFER_SIZE = 256 * 1024; // 256K

    private final OutputStream eventOutput;
    private final OutputStream indexOutput;
    private final long indexInterval;
    private final BufferedEncoder encoder;
    private final BufferedEncoder lengthEncoder;

    // Timestamp for the current block
    private long currentTimestamp;
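    // Number of bytes written to the event file so far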
    private long position;
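    // Event timestamp at or after which the next index entry will be written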
    private long nextIndexTime;
    private boolean synced;
    private boolean closed;
    private long closeTimestamp;

    /**
     * Constructs a new instance that writes to given outputs. Same as calling
     * {@link StreamDataFileWriter#StreamDataFileWriter(OutputSupplier, OutputSupplier, long, Map)}
     * with an empty property map.
     */
    public StreamDataFileWriter(OutputSupplier<? extends OutputStream> eventOutputSupplier,
            OutputSupplier<? extends OutputStream> indexOutputSupplier, long indexInterval) throws IOException {
        this(eventOutputSupplier, indexOutputSupplier, indexInterval, ImmutableMap.<String, String>of());
    }

    /**
     * Constructs a new instance that writes to given outputs.
     *
     * @param eventOutputSupplier the provider of the {@link OutputStream} for writing events
     * @param indexOutputSupplier the provider of the {@link OutputStream} for writing the index
     * @param indexInterval the time interval in milliseconds for emitting a new index entry
     * @param properties the property set that will be stored as file properties
     * @throws IOException if there is an error in preparing the output streams
     */
    public StreamDataFileWriter(OutputSupplier<? extends OutputStream> eventOutputSupplier,
            OutputSupplier<? extends OutputStream> indexOutputSupplier, long indexInterval,
            Map<String, String> properties) throws IOException {
        this.eventOutput = eventOutputSupplier.getOutput();
        try {
            this.indexOutput = indexOutputSupplier.getOutput();
        } catch (IOException e) {
            Closeables.closeQuietly(this.eventOutput);
            throw e;
        }
        this.indexInterval = indexInterval;
        this.currentTimestamp = -1L;
        this.closeTimestamp = -1L;

        Function<OutputStream, Encoder> encoderFactory = createEncoderFactory();
        this.encoder = new BufferedEncoder(BUFFER_SIZE, encoderFactory);
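        // 5 bytes is the maximum length of an Avro zig-zag varint encoding of an int32, which is all this encoder holds.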
        this.lengthEncoder = new BufferedEncoder(5, encoderFactory);

        try {
            init(properties);
        } catch (IOException e) {
            Closeables.closeQuietly(eventOutput);
            Closeables.closeQuietly(indexOutput);
            throw e;
        }
    }

    @Override
    public void append(StreamEvent event) throws IOException {
        doAppend(event, BUFFER_SIZE);
    }

    /**
     * Writes multiple events to the stream file. Events provided by the iterator must be sorted by timestamp.
     * This method guarantees that events with the same timestamp are written in the same data block. Note that
     * events with the same timestamp are buffered entirely in memory, since the data block length
     * must be known before the block can be written to disk.
     *
     * @param events an {@link Iterator} that provides events to append
     * @throws IOException if there is an error writing the events
     */
    @Override
    public void appendAll(Iterator<? extends StreamEvent> events) throws IOException {
        while (events.hasNext()) {
            doAppend(events.next(), Integer.MAX_VALUE);
        }
    }

    @Override
    public void close() throws IOException {
        if (closed) {
            return;
        }

        try {
            flushBlock(false);
            // Write the tail marker, which is the negation of the close timestamp.
            closeTimestamp = System.currentTimeMillis();
            eventOutput.write(Longs.toByteArray(-closeTimestamp));
        } finally {
            closed = true;
            try {
                eventOutput.close();
            } finally {
                indexOutput.close();
            }
        }
    }

    @Override
    public void flush() throws IOException {
        try {
            flushBlock(true);
        } catch (IOException e) {
            throw closeWithException(e);
        }
    }

    @Override
    public long getCloseTimestamp() {
        Preconditions.checkState(closed, "Writer not closed");
        return closeTimestamp;
    }

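    /**
     * Appends a single event, starting a new data block when the event timestamp advances and
     * flushing the current block once the encode buffer reaches {@code flushLimit} bytes.
     */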
    private void doAppend(StreamEvent event, int flushLimit) throws IOException {
        if (closed) {
            throw new IOException("Writer already closed.");
        }

        synced = false;
        long eventTimestamp = event.getTimestamp();
        if (eventTimestamp < currentTimestamp) {
            throw closeWithException(new IOException("Out of order events written."));
        }

        try {
            if (eventTimestamp > currentTimestamp) {
                flushBlock(false);

                currentTimestamp = eventTimestamp;

                // Write the timestamp directly to output
                eventOutput.write(Bytes.toBytes(currentTimestamp));
                position += Bytes.SIZEOF_LONG;
            }

            // Encodes the event data into buffer.
            StreamEventDataCodec.encode(event, encoder);

            // Flush early if the buffer has reached the flush limit.
            if (encoder.size() >= flushLimit) {
                flushBlock(false);
            }

        } catch (IOException e) {
            throw closeWithException(e);
        }
    }

    private void init(Map<String, String> properties) throws IOException {
        // Writes the header for the event file
        encoder.writeRaw(StreamDataFileConstants.MAGIC_HEADER_V2);

        Map<String, String> headers = Maps.newHashMap(properties);
        headers.put(StreamDataFileConstants.Property.Key.SCHEMA,
                StreamEventDataCodec.STREAM_DATA_SCHEMA.toString());
        StreamUtils.encodeMap(headers, encoder);

        long headerSize = encoder.size();
        encoder.writeTo(eventOutput);
        sync(eventOutput);
        position = headerSize;

        // Writes the header for the index file
        encoder.writeRaw(StreamDataFileConstants.INDEX_MAGIC_HEADER_V1);

        // Empty properties map for now; a future version may add properties.
        StreamUtils.encodeMap(ImmutableMap.<String, String>of(), encoder);
        encoder.writeTo(indexOutput);
        sync(indexOutput);
    }

    /**
     * Writes the buffered data to underlying output stream.
     *
     * @param sync If {@code true}, perform a sync call to the underlying output stream.
     * @throws IOException If failed to flush.
     */
    private void flushBlock(boolean sync) throws IOException {
        if (encoder.size() == 0) {
            if (sync && !synced) {
                sync(eventOutput);
                sync(indexOutput);
                synced = true;
            }
            return;
        }

        // Record the current event output position if the index needs to be updated
        long indexOffset = -1L;
        if (currentTimestamp >= nextIndexTime) {
            // The index offset is the start of the current block, i.e. the current position minus the 8-byte timestamp already written.
            indexOffset = position - Bytes.SIZEOF_LONG;
        }

        // Writes the total size of the encoded events in this block
        lengthEncoder.writeInt(encoder.size());
        int size = lengthEncoder.size();
        lengthEncoder.writeTo(eventOutput);
        position += size;

        // Writes all encoded data from the buffer to the output.
        size = encoder.size();
        encoder.writeTo(eventOutput);
        position += size;
        if (sync) {
            sync(eventOutput);
        }

        if (indexOffset >= 0) {
            encoder.writeRaw(Bytes.toBytes(currentTimestamp));
            encoder.writeRaw(Bytes.toBytes(indexOffset));
            encoder.writeTo(indexOutput);
            if (sync) {
                sync(indexOutput);
            }

            nextIndexTime = currentTimestamp + indexInterval;
        } else if (sync) {
            sync(indexOutput);
        }

        // Reset the current timestamp so that the next append starts a new data block.
        currentTimestamp = -1L;
        synced = sync;
    }

    private void sync(OutputStream output) throws IOException {
        if (output instanceof Syncable) {
            ((Syncable) output).hsync();
        } else {
            output.flush();
        }
    }

    /**
     * Closes this writer due to an exception.
     * This method always throws the given exception.
     */
    private IOException closeWithException(IOException ex) throws IOException {
        closed = true;
        Closeables.closeQuietly(eventOutput);
        Closeables.closeQuietly(indexOutput);
        throw ex;
    }

    private static Function<OutputStream, Encoder> createEncoderFactory() {
        return new Function<OutputStream, Encoder>() {
            @Override
            public Encoder apply(OutputStream input) {
                return new BinaryEncoder(input);
            }
        };
    }
}
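
Example

The sketch below shows one way to drive the writer: a minimal, hypothetical example assuming file-backed outputs. The StreamDataFileWriterDemo class, the file names, and the StreamEvent(Map, ByteBuffer, long) constructor used here are illustrative assumptions, not taken from the listing above.

import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.data.stream.StreamDataFileWriter;
import com.google.common.collect.ImmutableMap;
import com.google.common.io.OutputSupplier;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

public class StreamDataFileWriterDemo {

    // Adapts a File to the OutputSupplier the writer's constructors expect.
    private static OutputSupplier<OutputStream> supplierOf(final File file) {
        return new OutputSupplier<OutputStream>() {
            @Override
            public OutputStream getOutput() throws IOException {
                return new FileOutputStream(file);
            }
        };
    }

    public static void main(String[] args) throws IOException {
        StreamDataFileWriter writer = new StreamDataFileWriter(
                supplierOf(new File("stream.dat")),  // event file
                supplierOf(new File("stream.idx")),  // index file
                10000L);                             // at most one index entry per 10 seconds

        try {
            long now = System.currentTimeMillis();
            // Events must be appended in non-decreasing timestamp order; an out of order
            // event makes the writer close itself and throw an IOException.
            writer.append(new StreamEvent(ImmutableMap.<String, String>of(),
                    ByteBuffer.wrap("first".getBytes(StandardCharsets.UTF_8)), now));
            writer.append(new StreamEvent(ImmutableMap.<String, String>of(),
                    ByteBuffer.wrap("second".getBytes(StandardCharsets.UTF_8)), now + 1));
            writer.flush();
        } finally {
            writer.close();
        }

        // Only valid after close(): the timestamp recorded in the end marker.
        System.out.println("Closed at " + writer.getCloseTimestamp());
    }
}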
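
Per the file-format Javadoc, a properly closed event file ends with an 8-byte int64 whose value is -(close_timestamp). The sketch below recovers that timestamp from a sealed file; the file name is a placeholder, and the code relies on the big-endian layout that Longs.toByteArray produces in close(). It does not apply to a file that was never closed, since such a file has no end marker.

import java.io.IOException;
import java.io.RandomAccessFile;

public class ReadCloseTimestamp {
    public static void main(String[] args) throws IOException {
        try (RandomAccessFile raf = new RandomAccessFile("stream.dat", "r")) {
            // The end marker is the last 8 bytes of the file.
            raf.seek(raf.length() - 8);
            // readLong() reads big-endian, matching Longs.toByteArray in close().
            long marker = raf.readLong();
            System.out.println("Close timestamp: " + (-marker));
        }
    }
}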