com.palantir.atlasdb.sweep.BackgroundSweeperImpl.java Source code

Java tutorial

Introduction

Here is the source code for com.palantir.atlasdb.sweep.BackgroundSweeperImpl.java

Source

/**
 * Copyright 2015 Palantir Technologies
 *
 * Licensed under the BSD-3 License (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://opensource.org/licenses/BSD-3-Clause
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.palantir.atlasdb.sweep;

import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import javax.annotation.Nullable;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Functions;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.base.Supplier;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.primitives.UnsignedBytes;
import com.palantir.atlasdb.AtlasDbConstants;
import com.palantir.atlasdb.encoding.PtBytes;
import com.palantir.atlasdb.keyvalue.api.InsufficientConsistencyException;
import com.palantir.atlasdb.keyvalue.api.KeyValueService;
import com.palantir.atlasdb.keyvalue.api.RowResult;
import com.palantir.atlasdb.keyvalue.api.SweepResults;
import com.palantir.atlasdb.protos.generated.TableMetadataPersistence.SweepStrategy;
import com.palantir.atlasdb.schema.generated.SweepPriorityTable;
import com.palantir.atlasdb.schema.generated.SweepPriorityTable.SweepPriorityRow;
import com.palantir.atlasdb.schema.generated.SweepPriorityTable.SweepPriorityRowResult;
import com.palantir.atlasdb.schema.generated.SweepProgressTable;
import com.palantir.atlasdb.schema.generated.SweepProgressTable.SweepProgressRow;
import com.palantir.atlasdb.schema.generated.SweepProgressTable.SweepProgressRowResult;
import com.palantir.atlasdb.schema.generated.SweepTableFactory;
import com.palantir.atlasdb.transaction.api.LockAwareTransactionManager;
import com.palantir.atlasdb.transaction.api.RuntimeTransactionTask;
import com.palantir.atlasdb.transaction.api.Transaction;
import com.palantir.atlasdb.transaction.api.TransactionReadSentinelBehavior;
import com.palantir.atlasdb.transaction.impl.TxTask;
import com.palantir.atlasdb.transaction.impl.UnmodifiableTransaction;
import com.palantir.common.base.Throwables;
import com.palantir.lock.LockDescriptor;
import com.palantir.lock.LockMode;
import com.palantir.lock.LockRefreshToken;
import com.palantir.lock.LockRequest;
import com.palantir.lock.StringLockDescriptor;

public class BackgroundSweeperImpl implements BackgroundSweeper {
    private static final Logger log = LoggerFactory.getLogger(BackgroundSweeperImpl.class);
    private final LockAwareTransactionManager txManager;
    private final KeyValueService kvs;
    private final SweepTaskRunner sweepRunner;
    private final Supplier<Boolean> isSweepEnabled;
    private final Supplier<Long> sweepPauseMillis;
    private final Supplier<Integer> sweepBatchSize;
    private final SweepTableFactory tableFactory;
    private volatile float batchSizeMultiplier = 1.0f;
    private Thread daemon;

    // weights one month of no sweeping with the same priority as about 100000 expected cells to sweep.
    private static final double MILLIS_SINCE_SWEEP_PRIORITY_WEIGHT = 100000.0
            / TimeUnit.MILLISECONDS.convert(30, TimeUnit.DAYS);

    public BackgroundSweeperImpl(LockAwareTransactionManager txManager, KeyValueService kvs,
            SweepTaskRunner sweepRunner, Supplier<Boolean> isSweepEnabled, Supplier<Long> sweepPauseMillis,
            Supplier<Integer> sweepBatchSize, SweepTableFactory tableFactory) {
        this.txManager = txManager;
        this.kvs = kvs;
        this.sweepRunner = sweepRunner;
        this.isSweepEnabled = isSweepEnabled;
        this.sweepPauseMillis = sweepPauseMillis;
        this.sweepBatchSize = sweepBatchSize;
        this.tableFactory = tableFactory;
    }

    @Override
    public synchronized void runInBackground() {
        Preconditions.checkState(daemon == null);
        daemon = new Thread(this);
        daemon.setDaemon(true);
        daemon.setName("BackgroundSweeper");
        daemon.start();
    }

    @Override
    public void run() {
        Optional<LockRefreshToken> locks = Optional.absent();
        try {
            // Wait a while before starting so short lived clis don't try to sweep.
            Thread.sleep(20 * (1000 + sweepPauseMillis.get()));
            log.debug("Starting background sweeper.");
            while (true) {
                boolean sweptSuccessfully = false;
                try {
                    if (isSweepEnabled.get()) {
                        locks = lockOrRefresh(locks);
                        if (locks.isPresent()) {
                            sweptSuccessfully = runOnce();
                        } else {
                            log.debug("Skipping sweep because sweep is running elsewhere.");
                        }
                    } else {
                        log.debug("Skipping sweep because it is currently disabled.");
                    }
                } catch (InsufficientConsistencyException e) {
                    log.warn("Could not sweep because not all nodes of the database are online.", e);
                } catch (RuntimeException e) {
                    if (checkAndRepairTableDrop()) {
                        log.error("The table being swept by the background sweeper was dropped, moving on...");
                    } else {
                        log.error("The background sweep job failed unexpectedly with a batch size of "
                                + ((int) (batchSizeMultiplier * sweepBatchSize.get()))
                                + ". Attempting to continue with a lower batch size...", e);
                        batchSizeMultiplier = Math.min(batchSizeMultiplier / 2, 1.0f / sweepBatchSize.get());
                    }
                }
                if (sweptSuccessfully) {
                    batchSizeMultiplier = Math.min(1.0f, batchSizeMultiplier * 1.01f);
                    Thread.sleep(sweepPauseMillis.get());
                } else {
                    Thread.sleep(20 * (1000 + sweepPauseMillis.get()));
                }
            }
        } catch (InterruptedException e) {
            log.debug("Shutting down background sweeper.");
        } finally {
            if (locks.isPresent()) {
                txManager.getLockService().unlock(locks.get());
            }
        }
    }

    private boolean runOnce() {
        SweepProgressRowResult progress = txManager
                .runTaskWithRetry(new RuntimeTransactionTask<SweepProgressRowResult>() {
                    @Override
                    public SweepProgressRowResult execute(Transaction t) {
                        SweepProgressTable progressTable = tableFactory.getSweepProgressTable(t);
                        SweepProgressRowResult result = progressTable.getRow(SweepProgressRow.of(0)).orNull();
                        if (result == null) {
                            result = chooseNextTableToSweep(new SweepTransaction(t,
                                    sweepRunner.getSweepTimestamp(SweepStrategy.CONSERVATIVE)));
                        }
                        return result;
                    }
                });
        if (progress == null) {
            // Don't change this log statement. It's parsed by test automation code.
            log.debug("Skipping sweep because no table has enough new writes to be worth sweeping at the moment.");
            return false;
        }
        int batchSize = Math.max(1, (int) (sweepBatchSize.get() * batchSizeMultiplier));
        Stopwatch watch = Stopwatch.createStarted();
        try {
            SweepResults results = sweepRunner.run(progress.getFullTableName(), batchSize, progress.getStartRow());
            log.debug("Swept {} unique cells from {} starting at {} and performed {} deletions in {} ms.",
                    results.getCellsExamined(), progress.getFullTableName(),
                    progress.getStartRow() == null ? "0" : PtBytes.encodeHexString(progress.getStartRow()),
                    results.getCellsDeleted(), watch.elapsed(TimeUnit.MILLISECONDS));
            saveSweepResults(progress, results);
            return true;
        } catch (RuntimeException e) {
            // Error logged at a higher log level above.
            log.debug("Failed to sweep {} with batch size {} starting from row {}", progress.getFullTableName(),
                    batchSize,
                    progress.getStartRow() == null ? "0" : PtBytes.encodeHexString(progress.getStartRow()));
            throw e;
        }
    }

    @Nullable
    private SweepProgressRowResult chooseNextTableToSweep(SweepTransaction t) {
        Set<String> allTables = Sets.difference(kvs.getAllTableNames(), AtlasDbConstants.hiddenTables);
        SweepPriorityTable oldPriorityTable = tableFactory.getSweepPriorityTable(t);
        SweepPriorityTable newPriorityTable = tableFactory.getSweepPriorityTable(t.delegate());

        // We read priorities from the past because we should prioritize based on what the sweeper will
        // actually be able to sweep. We read priorities from the present to make sure we don't repeatedly
        // sweep the same table while waiting for the past to catch up.
        List<SweepPriorityRowResult> oldPriorities = oldPriorityTable.getAllRowsUnordered().immutableCopy();
        List<SweepPriorityRowResult> newPriorities = newPriorityTable.getAllRowsUnordered().immutableCopy();
        Map<String, SweepPriorityRowResult> newPrioritiesByTableName = Maps.uniqueIndex(newPriorities,
                Functions.compose(SweepPriorityRow.getFullTableNameFun(), SweepPriorityRowResult.getRowNameFun()));
        String tableName = getTableToSweep(t, allTables, oldPriorities, newPrioritiesByTableName);
        if (tableName == null) {
            return null;
        }
        RowResult<byte[]> rawResult = RowResult.<byte[]>create(SweepProgressRow.of(0).persistToBytes(),
                ImmutableSortedMap.<byte[], byte[]>orderedBy(UnsignedBytes.lexicographicalComparator())
                        .put(SweepProgressTable.SweepProgressNamedColumn.FULL_TABLE_NAME.getShortName(),
                                SweepProgressTable.FullTableName.of(tableName).persistValue())
                        .build());

        log.debug("Now starting to sweep {}.", tableName);
        return SweepProgressRowResult.of(rawResult);
    }

    @Nullable
    private String getTableToSweep(SweepTransaction t, Set<String> allTables,
            List<SweepPriorityRowResult> oldPriorities,
            Map<String, SweepPriorityRowResult> newPrioritiesByTableName) {
        Set<String> unsweptTables = Sets.difference(allTables, newPrioritiesByTableName.keySet());
        if (!unsweptTables.isEmpty()) {
            return Iterables.get(unsweptTables, 0);
        }
        double maxPriority = 0.0;
        String toSweep = null;
        Collection<SweepPriorityRow> toDelete = Lists.newArrayList();
        for (SweepPriorityRowResult oldPriority : oldPriorities) {
            String tableName = oldPriority.getRowName().getFullTableName();
            if (allTables.contains(tableName)) {
                SweepPriorityRowResult newPriority = newPrioritiesByTableName.get(tableName);
                double priority = getSweepPriority(oldPriority, newPriority);
                if (priority > maxPriority) {
                    maxPriority = priority;
                    toSweep = tableName;
                }
            } else {
                toDelete.add(oldPriority.getRowName());
            }
        }

        // Clean up rows for tables that no longer exist.
        tableFactory.getSweepPriorityTable(t.delegate()).delete(toDelete);

        return toSweep;
    }

    private double getSweepPriority(SweepPriorityRowResult oldPriority, SweepPriorityRowResult newPriority) {
        if (AtlasDbConstants.hiddenTables.contains(newPriority.getRowName().getFullTableName())) {
            // Never sweep hidden tables
            return 0.0;
        }
        if (!newPriority.hasLastSweepTime()) {
            // Highest priority if we've never swept it before
            return Double.MAX_VALUE;
        }
        if (oldPriority.getWriteCount() > newPriority.getWriteCount()) {
            // We just swept this, or it got truncated.
            return 0.0;
        }
        long cellsDeleted = Math.max(1, oldPriority.getCellsDeleted());
        long cellsExamined = Math.max(1, oldPriority.getCellsExamined());
        long writeCount = Math.max(1, oldPriority.getWriteCount());
        double previousEfficacy = 1.0 * cellsDeleted / cellsExamined;
        double estimatedCellsToSweep = previousEfficacy * writeCount;
        long millisSinceSweep = System.currentTimeMillis() - oldPriority.getLastSweepTime();

        if (writeCount <= 100 + cellsExamined / 100
                && TimeUnit.DAYS.convert(millisSinceSweep, TimeUnit.MILLISECONDS) < 180) {
            // Not worth the effort if fewer than 1% of cells are new and we've swept in the last 6 months.
            return 0.0;
        }

        // This ordering function weights one month of no sweeping
        // with the same priority as about 100000 expected cells to sweep.
        return estimatedCellsToSweep + millisSinceSweep * MILLIS_SINCE_SWEEP_PRIORITY_WEIGHT;
    }

    private void saveSweepResults(final SweepProgressRowResult progress, final SweepResults results) {
        final long cellsDeleted = fromNullable(progress.getCellsDeleted()) + results.getCellsDeleted();
        final long cellsExamined = fromNullable(progress.getCellsExamined()) + results.getCellsExamined();
        if (results.getNextStartRow().isPresent()) {
            saveIntermediateSweepResults(progress, results.getNextStartRow().get(), cellsDeleted, cellsExamined);
            return;
        }

        saveFinalSweepResults(progress, cellsDeleted, cellsExamined);

        log.debug("Finished sweeping {}, examined {} unique cells, deleted {} cells.", progress.getFullTableName(),
                cellsExamined, cellsDeleted);

        if (cellsDeleted > 0) {
            Stopwatch watch = Stopwatch.createStarted();
            kvs.compactInternally(progress.getFullTableName());
            log.debug("Finished performing compactInternally on {} in {} ms.", progress.getFullTableName(),
                    watch.elapsed(TimeUnit.MILLISECONDS));
        }

        // Truncate instead of delete because the progress table contains only
        // a single row that has accumulated many overwrites.
        kvs.truncateTable(tableFactory.getSweepProgressTable(null).getTableName());
    }

    private void saveIntermediateSweepResults(final SweepProgressRowResult progress, final byte[] nextStartRow,
            final long cellsDeleted, final long cellsExamined) {
        txManager.runTaskWithRetry(new TxTask() {
            @Override
            public Void execute(Transaction t) {
                SweepProgressTable progressTable = tableFactory.getSweepProgressTable(t);
                SweepProgressRow row = SweepProgressRow.of(0);
                progressTable.putFullTableName(row, progress.getFullTableName());
                progressTable.putStartRow(row, nextStartRow);
                progressTable.putCellsDeleted(row, cellsDeleted);
                progressTable.putCellsExamined(row, cellsExamined);
                if (!progress.hasStartRow()) {
                    // This is the first set of results being written for this table.
                    tableFactory.getSweepPriorityTable(t)
                            .putWriteCount(SweepPriorityRow.of(progress.getFullTableName()), 0L);
                }
                return null;
            }
        });
    }

    private void saveFinalSweepResults(final SweepProgressRowResult progress, final long cellsDeleted,
            final long cellsExamined) {
        txManager.runTaskWithRetry(new TxTask() {
            @Override
            public Void execute(Transaction t) {
                SweepPriorityTable priorityTable = tableFactory.getSweepPriorityTable(t);
                SweepPriorityRow row = SweepPriorityRow.of(progress.getFullTableName());
                priorityTable.putCellsDeleted(row, cellsDeleted);
                priorityTable.putCellsExamined(row, cellsExamined);
                priorityTable.putLastSweepTime(row, System.currentTimeMillis());
                if (!progress.hasStartRow()) {
                    // This is the first (and only) set of results being written for this table.
                    priorityTable.putWriteCount(row, 0L);
                }
                return null;
            }
        });
    }

    /**
     * Check whether the table being swept was dropped. If so, stop sweeping it and move on.
     * @return Whether the table being swept was dropped.
     */
    private boolean checkAndRepairTableDrop() {
        try {
            Set<String> tables = kvs.getAllTableNames();
            SweepProgressRowResult result = txManager
                    .runTaskReadOnly(new RuntimeTransactionTask<SweepProgressRowResult>() {
                        @Override
                        public SweepProgressRowResult execute(Transaction t) {
                            return tableFactory.getSweepProgressTable(t).getRow(SweepProgressRow.of(0L)).orNull();
                        }
                    });
            if (result == null || tables.contains(result.getFullTableName())) {
                return false;
            }
            kvs.truncateTable(tableFactory.getSweepProgressTable(null).getTableName());
            return true;
        } catch (RuntimeException e) {
            log.error(
                    "Failed to check whether the table being swept was dropped. Continuing under the assumption that it wasn't...",
                    e);
            return false;
        }
    }

    private long fromNullable(Long num) {
        return num == null ? 0L : num.longValue();
    }

    private Optional<LockRefreshToken> lockOrRefresh(Optional<LockRefreshToken> previousLocks)
            throws InterruptedException {
        if (previousLocks.isPresent()) {
            LockRefreshToken refreshToken = previousLocks.get();
            Set<LockRefreshToken> refreshedTokens = txManager.getLockService()
                    .refreshLockRefreshTokens(ImmutableList.of(refreshToken));
            if (refreshedTokens.isEmpty()) {
                return Optional.absent();
            } else {
                return previousLocks;
            }
        } else {
            LockDescriptor lock = StringLockDescriptor.of("atlas sweep");
            LockRequest request = LockRequest.builder(ImmutableSortedMap.of(lock, LockMode.WRITE)).doNotBlock()
                    .build();
            LockRefreshToken token = txManager.getLockService().lockAnonymously(request);
            return Optional.fromNullable(token);
        }
    }

    @Override
    public synchronized void shutdown() {
        if (daemon == null) {
            return;
        }
        log.debug("Signalling background sweeper to shut down.");
        daemon.interrupt();
        try {
            daemon.join();
            daemon = null;
        } catch (InterruptedException e) {
            throw Throwables.rewrapAndThrowUncheckedException(e);
        }
    }

    private static class SweepTransaction extends UnmodifiableTransaction {
        private final long sweepTimestamp;

        public SweepTransaction(Transaction delegate, long sweepTimestamp) {
            super(delegate);
            this.sweepTimestamp = sweepTimestamp;
        }

        @Override
        public long getTimestamp() {
            return sweepTimestamp;
        }

        @Override
        public TransactionReadSentinelBehavior getReadSentinelBehavior() {
            return TransactionReadSentinelBehavior.IGNORE;
        }
    }
}