org.apache.drill.exec.work.batch.SpoolingRawBatchBuffer.java Source code

Introduction

Here is the source code for org.apache.drill.exec.work.batch.SpoolingRawBatchBuffer.java, a RawBatchBuffer implementation that Drill uses on the receiving side of an exchange: incoming record batches accumulate in memory, and once the buffer crosses a configured size threshold, new batches are spooled to a temporary file on disk while the original arrival order is preserved.
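
The core decision is a simple hysteresis: spooling starts once the in-memory size exceeds a configured threshold (ExecConstants.SPOOLING_BUFFER_MEMORY) and pauses again once consumption drains the buffer below STOP_SPOOLING_FRACTION (0.5) of that threshold. Here is a minimal standalone sketch of just that high/low-water-mark logic; the names (MemoryGate, onEnqueue, onDequeue) are hypothetical and only illustrate the idea, while Drill's real code tracks currentSizeInMemory and hands batches to a background Spooler thread:

// Minimal sketch of the spool/pause hysteresis. All names here are
// illustrative, not Drill API.
class MemoryGate {
    private final long thresholdBytes;   // start spooling above this size
    private final double stopFraction;   // pause again below stopFraction * threshold
    private long inMemoryBytes;
    private boolean spooling;

    MemoryGate(long thresholdBytes, double stopFraction) {
        this.thresholdBytes = thresholdBytes;
        this.stopFraction = stopFraction;
    }

    // Called for every arriving batch; returns true if it should go to disk.
    boolean onEnqueue(long batchBytes) {
        boolean spoolThisBatch = spooling;   // decided before accounting, as in enqueueInner()
        inMemoryBytes += batchBytes;
        if (!spooling && inMemoryBytes > thresholdBytes) {
            spooling = true;                 // crossed the high-water mark
        }
        return spoolThisBatch;
    }

    // Called when a consumed batch leaves memory, as in upkeep().
    void onDequeue(long batchBytes) {
        inMemoryBytes -= batchBytes;
        if (spooling && inMemoryBytes < thresholdBytes * stopFraction) {
            spooling = false;                // fell below the low-water mark
        }
    }
}

The gap between the two marks avoids flapping between spooling and in-memory buffering when the size hovers around the threshold.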

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.work.batch;

import io.netty.buffer.ByteBuf;
import io.netty.buffer.DrillBuf;

import java.io.EOFException;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;

import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.memory.BufferAllocator;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.proto.BitData;
import org.apache.drill.exec.proto.ExecProtos;
import org.apache.drill.exec.proto.helper.QueryIdHelper;
import org.apache.drill.exec.record.RawFragmentBatch;
import org.apache.drill.exec.store.LocalSyncableFileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Queues;

/**
 * This implementation of RawBatchBuffer starts writing incoming batches to disk once the buffer size reaches a threshold.
 * The order of the incoming batches is maintained.
 */
public class SpoolingRawBatchBuffer extends BaseRawBatchBuffer<SpoolingRawBatchBuffer.RawFragmentBatchWrapper> {
    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(SpoolingRawBatchBuffer.class);

    private static final String DRILL_LOCAL_IMPL_STRING = "fs.drill-local.impl";
    private static final float STOP_SPOOLING_FRACTION = 0.5f;
    public static final long ALLOCATOR_INITIAL_RESERVATION = 1 * 1024 * 1024;
    public static final long ALLOCATOR_MAX_RESERVATION = 20L * 1000 * 1000 * 1000;

    private enum SpoolingState {
        NOT_SPOOLING,   // initial state: batches are kept in memory
        SPOOLING,       // in-memory size exceeded the threshold; incoming batches go to disk
        PAUSE_SPOOLING, // size fell below STOP_SPOOLING_FRACTION * threshold; spooling suspended
        STOP_SPOOLING   // terminal state, entered on terminate(); never left
    }

    private final BufferAllocator allocator;
    private final long threshold;
    private final int oppositeId;
    private final int bufferIndex;

    private volatile SpoolingState spoolingState;
    private volatile long currentSizeInMemory = 0;
    private volatile Spooler spooler;

    private FileSystem fs;
    private Path path;
    private FSDataOutputStream outputStream;

    public SpoolingRawBatchBuffer(FragmentContext context, int fragmentCount, int oppositeId, int bufferIndex) {
        super(context, fragmentCount);
        this.allocator = context.getNewChildAllocator("SpoolingRawBatchBuffer", 100, ALLOCATOR_INITIAL_RESERVATION,
                ALLOCATOR_MAX_RESERVATION);
        this.threshold = context.getConfig().getLong(ExecConstants.SPOOLING_BUFFER_MEMORY);
        this.oppositeId = oppositeId;
        this.bufferIndex = bufferIndex;
        this.bufferQueue = new SpoolingBufferQueue();
    }
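
    // The child allocator is created with a 1 MB initial reservation and a
    // 20 GB ceiling (the constants above); the spill threshold itself is read
    // from the Drill config via ExecConstants.SPOOLING_BUFFER_MEMORY.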

    private class SpoolingBufferQueue implements BufferQueue<RawFragmentBatchWrapper> {

        private final LinkedBlockingDeque<RawFragmentBatchWrapper> buffer = Queues.newLinkedBlockingDeque();

        @Override
        public void addOomBatch(RawFragmentBatch batch) {
            RawFragmentBatchWrapper batchWrapper = new RawFragmentBatchWrapper(batch, true);
            batchWrapper.setOutOfMemory(true);
            buffer.addFirst(batchWrapper);
        }

        @Override
        public RawFragmentBatch poll() throws IOException {
            RawFragmentBatchWrapper batchWrapper = buffer.poll();
            if (batchWrapper != null) {
                try {
                    return batchWrapper.get();
                } catch (InterruptedException e) {
                    // restore the interrupt flag so callers can observe the interruption
                    Thread.currentThread().interrupt();
                    return null;
                }
            }
            return null;
        }

        @Override
        public RawFragmentBatch take() throws IOException, InterruptedException {
            return buffer.take().get();
        }

        @Override
        public boolean checkForOutOfMemory() {
            // peek() returns null on an empty queue; guard against the NPE
            RawFragmentBatchWrapper head = buffer.peek();
            return head != null && head.isOutOfMemory();
        }

        @Override
        public int size() {
            return buffer.size();
        }

        @Override
        public boolean isEmpty() {
            return buffer.isEmpty();
        }

        public void add(RawFragmentBatchWrapper batchWrapper) {
            buffer.add(batchWrapper);
        }
    }

    private synchronized void setSpoolingState(SpoolingState newState) {
        // STOP_SPOOLING is terminal, and the initial NOT_SPOOLING state is
        // never re-entered; every other transition is accepted.
        SpoolingState currentState = spoolingState;
        if (newState == SpoolingState.NOT_SPOOLING || currentState == SpoolingState.STOP_SPOOLING) {
            return;
        }
        spoolingState = newState;
    }

    private boolean isCurrentlySpooling() {
        return spoolingState == SpoolingState.SPOOLING;
    }

    private void startSpooling() {
        setSpoolingState(SpoolingState.SPOOLING);
    }

    private void pauseSpooling() {
        setSpoolingState(SpoolingState.PAUSE_SPOOLING);
    }

    private boolean isSpoolingStopped() {
        return spoolingState == SpoolingState.STOP_SPOOLING;
    }

    private void stopSpooling() {
        setSpoolingState(SpoolingState.STOP_SPOOLING);
    }

    public String getDir() {
        // pick one of the configured temp directories at random to spread
        // spill files across the available locations
        List<String> dirs = context.getConfig().getStringList(ExecConstants.TEMP_DIRECTORIES);
        return dirs.get(ThreadLocalRandom.current().nextInt(dirs.size()));
    }

    private synchronized void initSpooler() throws IOException {
        if (spooler != null) {
            return;
        }

        Configuration conf = new Configuration();
        conf.set(FileSystem.FS_DEFAULT_NAME_KEY, context.getConfig().getString(ExecConstants.TEMP_FILESYSTEM));
        conf.set(DRILL_LOCAL_IMPL_STRING, LocalSyncableFileSystem.class.getName());
        fs = FileSystem.get(conf);
        path = getPath();
        outputStream = fs.create(path);
        final String spoolingThreadName = QueryIdHelper.getExecutorThreadName(context.getHandle())
                .concat(":Spooler-" + oppositeId + "-" + bufferIndex);
        spooler = new Spooler(spoolingThreadName);
        spooler.start();
    }

    @Override
    protected void enqueueInner(RawFragmentBatch batch) throws IOException {
        assert batch.getHeader().getSendingMajorFragmentId() == oppositeId;

        logger.debug("Enqueue batch. Current buffer size: {}. Last batch: {}. Sending fragment: {}",
                bufferQueue.size(), batch.getHeader().getIsLastBatch(),
                batch.getHeader().getSendingMajorFragmentId());
        boolean spoolCurrentBatch = isCurrentlySpooling();
        RawFragmentBatchWrapper wrapper = new RawFragmentBatchWrapper(batch, !spoolCurrentBatch);
        currentSizeInMemory += wrapper.getBodySize();
        if (spoolCurrentBatch) {
            if (spooler == null) {
                initSpooler();
            }
            spooler.addBatchForSpooling(wrapper);
        }
        bufferQueue.add(wrapper);
        if (!spoolCurrentBatch && currentSizeInMemory > threshold) {
            logger.debug("Buffer size {} greater than threshold {}. Start spooling to disk", currentSizeInMemory,
                    threshold);
            startSpooling();
        }
    }

    @Override
    public void kill(FragmentContext context) {
        allocator.close();
        if (spooler != null) {
            spooler.terminate();
        }
    }

    @Override
    protected void upkeep(RawFragmentBatch batch) {
        if (context.isOverMemoryLimit()) {
            outOfMemory.set(true);
        }

        DrillBuf body = batch.getBody();
        if (body != null) {
            currentSizeInMemory -= body.capacity();
        }
        if (isCurrentlySpooling() && currentSizeInMemory < threshold * STOP_SPOOLING_FRACTION) {
            logger.debug("Buffer size {} less than {}x threshold. Pausing spooling.", currentSizeInMemory,
                    STOP_SPOOLING_FRACTION);
            pauseSpooling();
        }
        logger.debug("Got batch. Current buffer size: {}", bufferQueue.size());
    }

    @Override
    public void close() {
        if (spooler != null) {
            spooler.terminate();
            while (spooler.isAlive()) {
                try {
                    spooler.join();
                } catch (InterruptedException e) {
                    logger.warn("Interrupted while waiting for spooling thread to exit");
                    continue;
                }
            }
        }
        allocator.close();
        try {
            if (outputStream != null) {
                outputStream.close();
            }
        } catch (IOException e) {
            logger.warn("Failed to cleanup I/O streams", e);
        }
        if (context.getConfig().getBoolean(ExecConstants.SPOOLING_BUFFER_DELETE)) {
            try {
                if (fs != null) {
                    fs.delete(path, false);
                    logger.debug("Deleted file {}", path.toString());
                }
            } catch (IOException e) {
                logger.warn("Failed to delete temporary files", e);
            }
        }
        super.close();
    }

    private class Spooler extends Thread {

        private final LinkedBlockingDeque<RawFragmentBatchWrapper> spoolingQueue;
        private volatile boolean shouldContinue = true;

        public Spooler(String name) {
            setDaemon(true);
            setName(name);
            spoolingQueue = Queues.newLinkedBlockingDeque();
        }

        @Override
        public void run() {
            try {
                while (shouldContinue) {
                    RawFragmentBatchWrapper batch;
                    try {
                        batch = spoolingQueue.take();
                    } catch (InterruptedException e) {
                        if (shouldContinue) {
                            continue;
                        } else {
                            break;
                        }
                    }
                    try {
                        batch.writeToStream(outputStream);
                    } catch (IOException e) {
                        context.fail(e);
                    }
                }
            } catch (Throwable e) {
                context.fail(e);
            } finally {
                logger.info("Spooler thread exiting");
            }
        }

        public void addBatchForSpooling(RawFragmentBatchWrapper batchWrapper) {
            if (!isSpoolingStopped()) {
                spoolingQueue.add(batchWrapper);
            } else {
                // spooling has been terminated; will not spill this batch. Keep it
                // in memory and release waiters so get() does not block forever.
                batchWrapper.available = true;
                batchWrapper.batch.sendOk();
                batchWrapper.latch.countDown();
            }
        }

        public void terminate() {
            stopSpooling();
            shouldContinue = false;
            // this Spooler is itself the spooling thread; interrupt a take()
            // that may be blocked on the empty queue
            if (isAlive()) {
                interrupt();
            }
        }
    }
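
    // Design note: all writes to outputStream happen on the single daemon
    // Spooler thread, which drains spoolingQueue in FIFO order; this keeps the
    // spill file sequential and requires no locking around the stream.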

    class RawFragmentBatchWrapper {
        private RawFragmentBatch batch;
        private volatile boolean available;           // true while the batch is resident in memory
        private CountDownLatch latch;                 // released once the batch can be handed out
        private volatile int bodyLength;              // body size captured at spill time
        private volatile boolean outOfMemory = false;
        private long start = -1;                      // file offset of this record in the spill file
        private long check;                           // random marker written ahead of the record

        public RawFragmentBatchWrapper(RawFragmentBatch batch, boolean available) {
            Preconditions.checkNotNull(batch);
            this.batch = batch;
            this.available = available;
            this.latch = new CountDownLatch(available ? 0 : 1);
            if (available) {
                batch.sendOk();
            }
        }

        public boolean isNull() {
            return batch == null;
        }

        public RawFragmentBatch get() throws InterruptedException, IOException {
            if (available) {
                assert batch.getHeader() != null : "batch header null";
                return batch;
            } else {
                latch.await();
                readFromStream();
                available = true;
                return batch;
            }
        }

        public long getBodySize() {
            if (batch.getBody() == null) {
                return 0;
            }
            assert batch.getBody().readableBytes() >= 0;
            return batch.getBody().readableBytes();
        }

        public void writeToStream(FSDataOutputStream stream) throws IOException {
            Stopwatch watch = Stopwatch.createStarted();
            available = false;
            check = ThreadLocalRandom.current().nextLong();
            start = stream.getPos();
            logger.debug("Writing check value {} at position {}", check, start);
            stream.writeLong(check);
            batch.getHeader().writeDelimitedTo(stream);
            ByteBuf buf = batch.getBody();
            if (buf != null) {
                bodyLength = buf.capacity();
            } else {
                bodyLength = 0;
            }
            if (bodyLength > 0) {
                buf.getBytes(0, stream, bodyLength);
            }
            stream.hsync();
            FileStatus status = fs.getFileStatus(path);
            long len = status.getLen();
            logger.debug("After spooling batch, stream at position {}. File length {}", stream.getPos(), len);
            batch.sendOk();
            latch.countDown();
            // clamp to 1 us to avoid divide-by-zero for very small writes
            long t = Math.max(1, watch.elapsed(TimeUnit.MICROSECONDS));
            logger.debug("Took {} us to spool {} bytes to disk. Rate {} MB/s", t, bodyLength, bodyLength / t);
            if (buf != null) {
                buf.release();
            }
        }
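
        // Spill record layout produced above: an 8-byte check value, a
        // varint-delimited FragmentRecordBatch header, then the raw body
        // bytes. readFromStream() below seeks back to 'start' and re-validates
        // the check value before parsing.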

        public void readFromStream() throws IOException, InterruptedException {
            long pos = start;
            boolean tryAgain = true;
            int duration = 0;

            while (tryAgain) {

                // Sometimes, the file isn't quite done writing when we attempt to read it. As such, we need to wait and retry.
                Thread.sleep(duration);

                try (final FSDataInputStream stream = fs.open(path);
                        final DrillBuf buf = allocator.buffer(bodyLength)) {
                    stream.seek(start);
                    final long currentPos = stream.getPos();
                    final long check = stream.readLong();
                    pos = stream.getPos();
                    assert check == this.check : String.format("Check values don't match: %d %d, Position %d",
                            this.check, check, currentPos);
                    Stopwatch watch = Stopwatch.createStarted();
                    BitData.FragmentRecordBatch header = BitData.FragmentRecordBatch.parseDelimitedFrom(stream);
                    pos = stream.getPos();
                    assert header != null : "header null after parsing from stream";
                    buf.writeBytes(stream, bodyLength);
                    pos = stream.getPos();
                    batch = new RawFragmentBatch(header, buf, null);
                    available = true;
                    latch.countDown();
                    // clamp to 1 us to avoid divide-by-zero for very small reads
                    long t = Math.max(1, watch.elapsed(TimeUnit.MICROSECONDS));
                    logger.debug("Took {} us to read {} bytes from disk. Rate {} MB/s", t, bodyLength, bodyLength / t);
                    tryAgain = false;
                } catch (EOFException e) {
                    FileStatus status = fs.getFileStatus(path);
                    logger.warn("EOF reading from file {} at pos {}. Current file size: {}", path, pos,
                            status.getLen());
                    // exponential backoff: 1 ms, 2 ms, 4 ms, ...; give up once
                    // a single wait would reach one minute
                    duration = Math.max(1, duration * 2);
                    if (duration < 60000) {
                        continue;
                    } else {
                        throw e;
                    }
                } finally {
                    if (tryAgain) {
                        // we had a premature exit; release batch memory so we don't leak it
                        if (batch != null && batch.getBody() != null) {
                            batch.getBody().release();
                        }
                    }
                }
                }
            }
        }

        private boolean isOutOfMemory() {
            return outOfMemory;
        }

        private void setOutOfMemory(boolean outOfMemory) {
            this.outOfMemory = outOfMemory;
        }
    }

    private Path getPath() {
        ExecProtos.FragmentHandle handle = context.getHandle();

        String qid = QueryIdHelper.getQueryId(handle.getQueryId());

        int majorFragmentId = handle.getMajorFragmentId();
        int minorFragmentId = handle.getMinorFragmentId();

        // spill file path: <tempDir>/<queryId>/<majorFragmentId>/<minorFragmentId>/<oppositeId>/<bufferIndex>
        String fileName = Joiner.on(Path.SEPARATOR).join(getDir(), qid, majorFragmentId, minorFragmentId,
                oppositeId, bufferIndex);

        return new Path(fileName);
    }
}
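
The spill-file format that writeToStream() and readFromStream() agree on is worth spelling out: each record is an 8-byte random check value, followed by a varint-delimited FragmentRecordBatch protobuf header, followed by the raw body bytes, and the reader seeks to the recorded offset and re-validates the check value before parsing. Below is a self-contained sketch of the same framing round trip; it uses a 4-byte length-prefixed header as a stand-in for protobuf's writeDelimitedTo/parseDelimitedFrom, and all names (SpoolFramingDemo, writeRecord, readRecord) are illustrative rather than Drill API.

import java.io.*;

// Illustrative round trip of the spill-file framing: [check][header][body].
public class SpoolFramingDemo {
    static void writeRecord(DataOutputStream out, long check, byte[] header, byte[] body)
            throws IOException {
        out.writeLong(check);          // marker re-validated on read
        out.writeInt(header.length);   // stand-in for varint delimiting
        out.write(header);
        out.write(body);               // raw body bytes; the reader knows their length
    }

    static byte[] readRecord(DataInputStream in, long expectedCheck, int bodyLength)
            throws IOException {
        long check = in.readLong();
        if (check != expectedCheck) {
            throw new IOException("Check values don't match: " + expectedCheck + " " + check);
        }
        byte[] header = new byte[in.readInt()];
        in.readFully(header);
        byte[] body = new byte[bodyLength]; // Drill records bodyLength at spill time
        in.readFully(body);
        return body;
    }

    public static void main(String[] args) throws IOException {
        byte[] header = "header".getBytes("UTF-8");
        byte[] body = "body-bytes".getBytes("UTF-8");
        ByteArrayOutputStream file = new ByteArrayOutputStream();
        writeRecord(new DataOutputStream(file), 42L, header, body);
        byte[] back = readRecord(new DataInputStream(
                new ByteArrayInputStream(file.toByteArray())), 42L, body.length);
        System.out.println(new String(back, "UTF-8")); // prints: body-bytes
    }
}

Re-validating the marker before parsing guards against reading a region of the file that has not been completely written and synced yet, which is also why readFromStream() retries with backoff on EOFException.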