Java tutorial
/** * Copyright 2015 Palantir Technologies * * Licensed under the BSD-3 License (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://opensource.org/licenses/BSD-3-Clause * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.palantir.atlasdb.sweep; import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedSet; import javax.annotation.Nullable; import org.apache.commons.lang.Validate; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Preconditions; import com.google.common.base.Supplier; import com.google.common.collect.HashMultimap; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; import com.google.common.collect.Multimap; import com.google.common.collect.PeekingIterator; import com.google.common.collect.Sets; import com.google.common.primitives.UnsignedBytes; import com.palantir.atlasdb.AtlasDbConstants; import com.palantir.atlasdb.cleaner.Follower; import com.palantir.atlasdb.keyvalue.api.Cell; import com.palantir.atlasdb.keyvalue.api.KeyAlreadyExistsException; import com.palantir.atlasdb.keyvalue.api.KeyValueService; import com.palantir.atlasdb.keyvalue.api.RangeRequest; import com.palantir.atlasdb.keyvalue.api.RangeRequests; import com.palantir.atlasdb.keyvalue.api.RowResult; import com.palantir.atlasdb.keyvalue.api.SweepResults; import com.palantir.atlasdb.keyvalue.api.Value; import com.palantir.atlasdb.protos.generated.TableMetadataPersistence.SweepStrategy; import com.palantir.atlasdb.transaction.api.Transaction.TransactionType; import com.palantir.atlasdb.transaction.api.TransactionFailedRetriableException; import com.palantir.atlasdb.transaction.api.TransactionManager; import com.palantir.atlasdb.transaction.impl.SweepStrategyManager; import com.palantir.atlasdb.transaction.impl.TransactionConstants; import com.palantir.atlasdb.transaction.service.TransactionService; import com.palantir.common.annotation.Modified; import com.palantir.common.annotation.Output; import com.palantir.common.base.ClosableIterator; import com.palantir.common.base.ClosableIterators; /** * Sweeps one individual table. * * @author jweel */ public class SweepTaskRunner { private static final Logger log = LoggerFactory.getLogger(SweepTaskRunner.class); private static final Set<Long> invalidTimestamps = ImmutableSet.of(Value.INVALID_VALUE_TIMESTAMP); private final TransactionManager txManager; private final KeyValueService keyValueService; private final Supplier<Long> unreadableTimestampSupplier; private final Supplier<Long> immutableTimestampSupplier; private final TransactionService transactionService; private final SweepStrategyManager sweepStrategyManager; private final Supplier<Integer> batchSizeSupplier; private final Collection<Follower> followers; public SweepTaskRunner(TransactionManager txManager, KeyValueService keyValueService, Supplier<Long> unreadableTimestampSupplier, Supplier<Long> immutableTimestampSupplier, TransactionService transactionService, SweepStrategyManager sweepStrategyManager, Supplier<Integer> batchSizeSupplier, Collection<Follower> followers) { this.txManager = txManager; this.keyValueService = keyValueService; this.unreadableTimestampSupplier = unreadableTimestampSupplier; this.immutableTimestampSupplier = immutableTimestampSupplier; this.transactionService = transactionService; this.sweepStrategyManager = sweepStrategyManager; this.batchSizeSupplier = batchSizeSupplier; this.followers = followers; } public SweepResults run(String tableName, @Nullable byte[] startRow) { Preconditions.checkNotNull(tableName); Preconditions.checkState(!tableName.startsWith(AtlasDbConstants.NAMESPACE_PREFIX), "The sweeper should not be run on tables passed through namespace mapping."); Preconditions.checkState(!AtlasDbConstants.hiddenTables.contains(tableName)); // Earliest start timestamp of any currently open transaction, with two caveats: // (1) unreadableTimestamps are calculated via wall-clock time, and so may not be correct // under pathological clock conditions // (2) immutableTimestamps do not account for locks have timed out after checking their locks; // such a transaction may have a start timestamp less than the immutableTimestamp, and it // could still get successfully committed (its commit timestamp may or may not be less than // the immutableTimestamp // Note that this is fine, because we'll either // (1) force old readers to abort (if they read a garbage collection sentinel), or // (2) force old writers to retry (note that we must roll back any uncommitted transactions that // we encounter SweepStrategy sweepStrategy = sweepStrategyManager.get().get(tableName); if (sweepStrategy == null) { sweepStrategy = SweepStrategy.CONSERVATIVE; } else if (sweepStrategy == SweepStrategy.NOTHING) { return new SweepResults(null, 0, 0); } if (startRow == null) { startRow = new byte[0]; } int batchSize = batchSizeSupplier.get(); RangeRequest rangeRequest = RangeRequest.builder().startRowInclusive(startRow).batchHint(batchSize).build(); long sweepTimestamp = getSweepTimestamp(tableName); ClosableIterator<RowResult<Value>> valueResults; if (sweepStrategy == SweepStrategy.CONSERVATIVE) { valueResults = ClosableIterators.wrap(ImmutableList.<RowResult<Value>>of().iterator()); } else { valueResults = keyValueService.getRange(tableName, rangeRequest, sweepTimestamp); } ClosableIterator<RowResult<Set<Long>>> rowResults = keyValueService.getRangeOfTimestamps(tableName, rangeRequest, sweepTimestamp); try { List<RowResult<Set<Long>>> rowResultTimestamps = ImmutableList .copyOf(Iterators.limit(rowResults, batchSize)); PeekingIterator<RowResult<Value>> peekingValues = Iterators.peekingIterator(valueResults); Set<Cell> sentinelsToAdd = Sets.newHashSet(); Multimap<Cell, Long> rowTimestamps = getTimestampsFromRowResults(rowResultTimestamps, sweepStrategy); Multimap<Cell, Long> cellTsPairsToSweep = getCellTsPairsToSweep(rowTimestamps, peekingValues, sweepTimestamp, sweepStrategy, sentinelsToAdd); sweepCells(tableName, cellTsPairsToSweep, sentinelsToAdd); byte[] nextRow = rowResultTimestamps.size() < batchSize ? null : RangeRequests.getNextStartRow(false, Iterables.getLast(rowResultTimestamps).getRowName()); return new SweepResults(nextRow, rowResultTimestamps.size(), cellTsPairsToSweep.size()); } finally { rowResults.close(); valueResults.close(); } } public long getSweepTimestamp(String tableName) { SweepStrategy sweepStrategy = sweepStrategyManager.get().get(tableName); if (sweepStrategy == SweepStrategy.CONSERVATIVE) { return Math.min(unreadableTimestampSupplier.get(), immutableTimestampSupplier.get()); } else { return immutableTimestampSupplier.get(); } } private Multimap<Cell, Long> getTimestampsFromRowResults(List<RowResult<Set<Long>>> cellsToSweep, SweepStrategy sweepStrategy) { Multimap<Cell, Long> cellTsMappings = HashMultimap.create(); for (RowResult<Set<Long>> rowResult : cellsToSweep) { for (Map.Entry<Cell, Set<Long>> entry : rowResult.getCells()) { if (sweepStrategy == SweepStrategy.CONSERVATIVE) { cellTsMappings.putAll(entry.getKey(), Sets.difference(entry.getValue(), invalidTimestamps)); } else { cellTsMappings.putAll(entry.getKey(), entry.getValue()); } } } return cellTsMappings; } private Multimap<Cell, Long> getCellTsPairsToSweep(Multimap<Cell, Long> cellTsMappings, PeekingIterator<RowResult<Value>> values, long sweepTimestamp, SweepStrategy sweepStrategy, @Output Set<Cell> sentinelsToAdd) { Multimap<Cell, Long> cellTsMappingsToSweep = HashMultimap.create(); Map<Long, Long> startTsToCommitTs = transactionService.get(cellTsMappings.values()); for (Map.Entry<Cell, Collection<Long>> entry : cellTsMappings.asMap().entrySet()) { Cell cell = entry.getKey(); Collection<Long> timestamps = entry.getValue(); boolean sweepLastCommitted = isLatestValueEmpty(cell, values); Iterable<? extends Long> timestampsToSweep = getTimestampsToSweep(cell, timestamps, startTsToCommitTs, sentinelsToAdd, sweepTimestamp, sweepLastCommitted, sweepStrategy); cellTsMappingsToSweep.putAll(entry.getKey(), timestampsToSweep); } return cellTsMappingsToSweep; } private boolean isLatestValueEmpty(Cell cell, PeekingIterator<RowResult<Value>> values) { while (values.hasNext()) { RowResult<Value> result = values.peek(); int comparison = UnsignedBytes.lexicographicalComparator().compare(cell.getRowName(), result.getRowName()); if (comparison == 0) { Value matchingValue = result.getColumns().get(cell.getColumnName()); return matchingValue != null && matchingValue.getContents().length == 0; } else if (comparison < 0) { return false; } else { values.next(); } } return false; } private Set<Long> getTimestampsToSweep(Cell cell, Collection<Long> timestamps /* start timestamps */, @Modified Map<Long, Long> startTsToCommitTs, @Output Set<Cell> sentinelsToAdd, long sweepTimestamp, boolean sweepLastCommitted, SweepStrategy sweepStrategy) { Set<Long> uncommittedTimestamps = Sets.newHashSet(); SortedSet<Long> committedTimestampsToSweep = Sets.newTreeSet(); long maxStartTs = TransactionConstants.FAILED_COMMIT_TS; boolean maxStartTsIsCommitted = false; for (long startTs : timestamps) { long commitTs = ensureCommitTimestampExists(startTs, startTsToCommitTs); if (startTs > maxStartTs && commitTs < sweepTimestamp) { maxStartTs = startTs; maxStartTsIsCommitted = commitTs != TransactionConstants.FAILED_COMMIT_TS; } // Note: there could be an open transaction whose start timestamp is equal to // sweepTimestamp; thus we want to sweep all cells such that: // (1) their commit timestamp is less than sweepTimestamp // (2) their start timestamp is NOT the greatest possible start timestamp // passing condition (1) if (commitTs > 0 && commitTs < sweepTimestamp) { committedTimestampsToSweep.add(startTs); } else if (commitTs == TransactionConstants.FAILED_COMMIT_TS) { uncommittedTimestamps.add(startTs); } } if (committedTimestampsToSweep.isEmpty()) { return uncommittedTimestamps; } if (sweepStrategy == SweepStrategy.CONSERVATIVE && committedTimestampsToSweep.size() > 1) { // We need to add a sentinel if we are removing a committed value sentinelsToAdd.add(cell); } if (sweepLastCommitted && maxStartTsIsCommitted) { return Sets.union(uncommittedTimestamps, committedTimestampsToSweep); } return Sets.union(uncommittedTimestamps, committedTimestampsToSweep.subSet(0L, committedTimestampsToSweep.last())); } private long ensureCommitTimestampExists(Long startTs, @Modified Map<Long, Long> startTsToCommitTs) { Long commitTs = startTsToCommitTs.get(startTs); if (commitTs == null) { // Roll back this transaction (note that rolling back arbitrary transactions // can never cause correctness issues, only liveness issues) try { // TODO: carrino: use the batched version of putUnlessExists when it is available. transactionService.putUnlessExists(startTs, TransactionConstants.FAILED_COMMIT_TS); } catch (KeyAlreadyExistsException e) { String msg = "Could not roll back transaction with start timestamp " + startTs + "; either" + " it was already rolled back (by a different transaction), or it committed successfully" + " before we could roll it back."; log.error("This isn't a bug but it should be very infrequent. " + msg, new TransactionFailedRetriableException(msg, e)); } commitTs = transactionService.get(startTs); Validate.notNull(commitTs); startTsToCommitTs.put(startTs, commitTs); } return commitTs; } private void sweepCells(String tableName, Multimap<Cell, Long> cellTsPairsToSweep, Set<Cell> sentinelsToAdd) { if (cellTsPairsToSweep.isEmpty()) { return; } for (Follower follower : followers) { follower.run(txManager, tableName, cellTsPairsToSweep.keySet(), TransactionType.HARD_DELETE); } if (!sentinelsToAdd.isEmpty()) { keyValueService.addGarbageCollectionSentinelValues(tableName, sentinelsToAdd); } keyValueService.delete(tableName, cellTsPairsToSweep); } }