org.apache.tephra.hbase.txprune.DataJanitorState.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tephra.hbase.txprune.DataJanitorState.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.tephra.hbase.txprune;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.tephra.hbase.coprocessor.TransactionProcessor;
import org.apache.tephra.txprune.RegionPruneInfo;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import javax.annotation.Nullable;

/**
 * Persist data janitor state into an HBase table.
 * This is used by both {@link TransactionProcessor} and by the {@link HBaseTransactionPruningPlugin}
 * to persist and read the compaction state.
 *
 * <p>All record types share one table and one column family ({@link #FAMILY}); they are told apart by a
 * single-byte row key prefix (0x1 to 0x5). Each prefix has a matching {@code *_STOP} constant that is used as
 * the stop row when scanning that record type.
 */
@SuppressWarnings("WeakerAccess")
public class DataJanitorState {
    private static final Log LOG = LogFactory.getLog(DataJanitorState.class);

    public static final byte[] FAMILY = { 'f' };
    public static final byte[] PRUNE_UPPER_BOUND_COL = { 'p' };

    private static final byte[] REGION_TIME_COL = { 'r' };
    private static final byte[] INACTIVE_TRANSACTION_BOUND_TIME_COL = { 'i' };
    private static final byte[] EMPTY_REGION_TIME_COL = { 'e' };

    // Row key prefixes, one per record type, each paired with the stop row used to bound scans of that type
    private static final byte[] REGION_KEY_PREFIX = { 0x1 };
    private static final byte[] REGION_KEY_PREFIX_STOP = { 0x2 };

    private static final byte[] REGION_TIME_KEY_PREFIX = { 0x2 };
    private static final byte[] REGION_TIME_KEY_PREFIX_STOP = { 0x3 };

    private static final byte[] INACTIVE_TRANSACTION_BOUND_TIME_KEY_PREFIX = { 0x3 };
    private static final byte[] INACTIVE_TRANSACTION_BOUND_TIME_KEY_PREFIX_STOP = { 0x4 };

    private static final byte[] EMPTY_REGION_TIME_KEY_PREFIX = { 0x4 };
    private static final byte[] EMPTY_REGION_TIME_KEY_PREFIX_STOP = { 0x5 };

    private static final byte[] REGION_TIME_COUNT_KEY_PREFIX = { 0x5 };
    private static final byte[] REGION_TIME_COUNT_KEY_PREFIX_STOP = { 0x6 };

    private static final byte[] EMPTY_BYTE_ARRAY = new byte[0];
    // This value can be used when we don't care about the value we write in a column
    private static final byte[] COL_VAL = Bytes.toBytes('1');

    private final TableSupplier stateTableSupplier;

    /**
     * Creates a state accessor that obtains the state table from the given supplier. A fresh table handle is
     * requested from the supplier, and closed again, for every public operation.
     *
     * @param stateTableSupplier supplier of the HBase table holding the state
     */
    public DataJanitorState(TableSupplier stateTableSupplier) {
        this.stateTableSupplier = stateTableSupplier;
    }

    // ----------------------------------------------------------------
    // ------- Methods for prune upper bound for a given region -------
    // ----------------------------------------------------------------
    // The data is stored in the following format -
    // Key: 0x1<region-id>
    // Col 'p': <prune upper bound>
    // ----------------------------------------------------------------

    /**
     * Persist the latest prune upper bound for a given region. This is called by {@link TransactionProcessor}
     * after major compaction.
     *
     * @param regionId region id
     * @param pruneUpperBound the latest prune upper bound for the region
     * @throws IOException when not able to persist the data to HBase
     */
    public void savePruneUpperBoundForRegion(byte[] regionId, long pruneUpperBound) throws IOException {
        try (Table stateTable = stateTableSupplier.get()) {
            Put put = new Put(makeRegionKey(regionId));
            put.addColumn(FAMILY, PRUNE_UPPER_BOUND_COL, Bytes.toBytes(pruneUpperBound));
            stateTable.put(put);
        }
    }

    /**
     * Get latest prune upper bound for a given region. This indicates the largest invalid transaction that no
     * longer has writes in this region.
     *
     * @param regionId region id
     * @return latest prune upper bound for the region, or -1 if nothing is recorded for the region
     * @throws IOException when not able to read the data from HBase
     */
    public long getPruneUpperBoundForRegion(byte[] regionId) throws IOException {
        RegionPruneInfo regionPruneInfo = getPruneInfoForRegion(regionId);
        return (regionPruneInfo == null) ? -1 : regionPruneInfo.getPruneUpperBound();
    }

    /**
     * Get the latest {@link RegionPruneInfo} for a given region.
     *
     * @param regionId region id
     * @return {@link RegionPruneInfo} for the region, or null if nothing is recorded for the region
     * @throws IOException when not able to read the data from HBase
     */
    @Nullable
    public RegionPruneInfo getPruneInfoForRegion(byte[] regionId) throws IOException {
        try (Table stateTable = stateTableSupplier.get()) {
            Get get = new Get(makeRegionKey(regionId));
            get.addColumn(FAMILY, PRUNE_UPPER_BOUND_COL);
            Cell cell = stateTable.get(get).getColumnLatestCell(FAMILY, PRUNE_UPPER_BOUND_COL);
            return (cell == null) ? null : toRegionPruneInfo(regionId, cell);
        }
    }

    /**
     * Get latest prune upper bounds for given regions. This is a batch operation of method
     * {@link #getPruneUpperBoundForRegion(byte[])}
     *
     * @param regions a set of regions
     * @return an unmodifiable map containing region id and its latest prune upper bound value; regions without a
     *         recorded prune upper bound are absent from the map
     * @throws IOException when not able to read the data from HBase
     */
    public Map<byte[], Long> getPruneUpperBoundForRegions(SortedSet<byte[]> regions) throws IOException {
        // byte[] has identity equals/hashCode, so the map must be ordered by content via the comparator
        Map<byte[], Long> resultMap = new TreeMap<>(Bytes.BYTES_COMPARATOR);
        List<RegionPruneInfo> regionPruneInfos = getPruneInfoForRegions(regions);
        for (RegionPruneInfo regionPruneInfo : regionPruneInfos) {
            resultMap.put(regionPruneInfo.getRegionName(), regionPruneInfo.getPruneUpperBound());
        }
        return Collections.unmodifiableMap(resultMap);
    }

    /**
     * Gets a list of {@link RegionPruneInfo} for given regions. Returns all regions if the given regions set is null.
     * The implementation scans every prune upper bound row and filters by the given set on the client side.
     *
     * @param regions a set of regions
     * @return unmodifiable list of {@link RegionPruneInfo}s.
     * @throws IOException when not able to read the data from HBase
     */
    public List<RegionPruneInfo> getPruneInfoForRegions(@Nullable SortedSet<byte[]> regions) throws IOException {
        List<RegionPruneInfo> regionPruneInfos = new ArrayList<>();
        try (Table stateTable = stateTableSupplier.get()) {
            byte[] startRow = makeRegionKey(EMPTY_BYTE_ARRAY);
            Scan scan = new Scan(startRow, REGION_KEY_PREFIX_STOP);
            scan.addColumn(FAMILY, PRUNE_UPPER_BOUND_COL);

            try (ResultScanner scanner = stateTable.getScanner(scan)) {
                Result next;
                while ((next = scanner.next()) != null) {
                    byte[] region = getRegionFromKey(next.getRow());
                    if (regions == null || regions.contains(region)) {
                        Cell cell = next.getColumnLatestCell(FAMILY, PRUNE_UPPER_BOUND_COL);
                        if (cell != null) {
                            regionPruneInfos.add(toRegionPruneInfo(region, cell));
                        }
                    }
                }
            }
        }
        return Collections.unmodifiableList(regionPruneInfos);
    }

    /**
     * Delete prune upper bounds for the regions that are not in the given exclude set, and the
     * prune upper bound is less than the given value.
     * After the invalid list is pruned up to deletionPruneUpperBound, we do not need entries for regions that have
     * prune upper bound less than deletionPruneUpperBound. We however limit the deletion to only regions that are
     * no longer in existence (due to deletion, etc.), to avoid update/delete race conditions.
     *
     * @param deletionPruneUpperBound prune upper bound below which regions will be deleted
     * @param excludeRegions set of regions that should not be deleted
     * @throws IOException when not able to delete data in HBase
     */
    public void deletePruneUpperBounds(long deletionPruneUpperBound, SortedSet<byte[]> excludeRegions)
            throws IOException {
        try (Table stateTable = stateTableSupplier.get()) {
            byte[] startRow = makeRegionKey(EMPTY_BYTE_ARRAY);
            Scan scan = new Scan(startRow, REGION_KEY_PREFIX_STOP);
            scan.addColumn(FAMILY, PRUNE_UPPER_BOUND_COL);

            try (ResultScanner scanner = stateTable.getScanner(scan)) {
                Result next;
                while ((next = scanner.next()) != null) {
                    byte[] region = getRegionFromKey(next.getRow());
                    if (!excludeRegions.contains(region)) {
                        byte[] timeBytes = next.getValue(FAMILY, PRUNE_UPPER_BOUND_COL);
                        if (timeBytes != null) {
                            long pruneUpperBoundRegion = Bytes.toLong(timeBytes);
                            if (pruneUpperBoundRegion < deletionPruneUpperBound) {
                                stateTable.delete(new Delete(next.getRow()));
                            }
                        }
                    }
                }
            }
        }
    }

    // ---------------------------------------------------
    // ------- Methods for regions at a given time -------
    // ---------------------------------------------------
    // Key: 0x2<inverted time><region-id>
    // Col 'r': <COL_VAL, the value is not significant>
    //
    // Key: 0x5<inverted time>
    // Col 'r': <number of regions saved for the time (int)>
    // ---------------------------------------------------

    /**
     * Persist the regions for the given time. {@link HBaseTransactionPruningPlugin} saves the set of
     * transactional regions existing in the HBase instance periodically.
     *
     * <p>Regions are written one row at a time, followed by a separate row holding the number of regions.
     * Readers compare that count against the rows they find to detect an incompletely written set.
     *
     * @param time timestamp in milliseconds
     * @param regions set of regions at the time
     * @throws IOException when not able to persist the data to HBase
     */
    public void saveRegionsForTime(long time, Set<byte[]> regions) throws IOException {
        byte[] timeBytes = Bytes.toBytes(getInvertedTime(time));
        try (Table stateTable = stateTableSupplier.get()) {
            for (byte[] region : regions) {
                Put put = new Put(makeTimeRegionKey(timeBytes, region));
                put.addColumn(FAMILY, REGION_TIME_COL, COL_VAL);
                stateTable.put(put);
            }

            // Save the count of regions as a checksum
            saveRegionCountForTime(stateTable, timeBytes, regions.size());
        }
    }

    /**
     * Writes the region count row for a time.
     *
     * @param stateTable table to write to
     * @param timeBytes serialized inverted time, as used in the row key
     * @param count number of regions recorded for the time
     * @throws IOException when not able to persist the data to HBase
     */
    @VisibleForTesting
    void saveRegionCountForTime(Table stateTable, byte[] timeBytes, int count) throws IOException {
        Put put = new Put(makeTimeRegionCountKey(timeBytes));
        put.addColumn(FAMILY, REGION_TIME_COL, Bytes.toBytes(count));
        stateTable.put(put);
    }

    /**
     * Return the set of regions saved for the time at or before the given time. This method finds the greatest time
     * that is less than or equal to the given time, and then returns all regions with that exact time, but none that are
     * older than that.
     *
     * <p>A set of regions is only returned if the number of region rows found matches the count saved for that
     * time; otherwise a warning is logged and the next older time is tried.
     *
     * @param time timestamp in milliseconds
     * @return set of regions and time at which they were recorded, or null if no regions found
     * @throws IOException when not able to read the data from HBase
     */
    @Nullable
    public TimeRegions getRegionsOnOrBeforeTime(long time) throws IOException {
        try (Table stateTable = stateTableSupplier.get()) {
            TimeRegions timeRegions;
            while ((timeRegions = getNextSetOfTimeRegions(stateTable, time)) != null) {
                // -1 means no count row exists for the time
                int count = getRegionCountForTime(stateTable, timeRegions.getTime());
                if (count != -1 && count == timeRegions.getRegions().size()) {
                    return timeRegions;
                } else {
                    LOG.warn(String.format(
                            "Got incorrect count for regions saved at time %s, expected = %s but actual = %s",
                            timeRegions.getTime(), count, timeRegions.getRegions().size()));
                    // Skip this set and continue with times strictly older than it
                    time = timeRegions.getTime() - 1;
                }
            }
            return null;
        }
    }

    /**
     * Reads the regions recorded for the most recent time that is less than or equal to the given time.
     * Row keys hold the inverted time, so the scan starting at the given time visits rows from newest to oldest;
     * reading stops as soon as a row with an older time than the first one is seen.
     *
     * @param stateTable table to read from
     * @param time timestamp in milliseconds
     * @return the regions and their recorded time, or null if no region rows were found
     * @throws IOException when not able to read the data from HBase
     * @throws IllegalStateException if a row with a newer time follows a row with an older time
     */
    @Nullable
    private TimeRegions getNextSetOfTimeRegions(Table stateTable, long time) throws IOException {
        byte[] timeBytes = Bytes.toBytes(getInvertedTime(time));
        Scan scan = new Scan(makeTimeRegionKey(timeBytes, EMPTY_BYTE_ARRAY), REGION_TIME_KEY_PREFIX_STOP);
        scan.addColumn(FAMILY, REGION_TIME_COL);

        // -1 marks "no row seen yet"
        long currentRegionTime = -1;
        SortedSet<byte[]> regions = new TreeSet<>(Bytes.BYTES_COMPARATOR);
        Result next;
        try (ResultScanner scanner = stateTable.getScanner(scan)) {
            while ((next = scanner.next()) != null) {
                Map.Entry<Long, byte[]> timeRegion = getTimeRegion(next.getRow());
                // Stop if reached next time value
                if (currentRegionTime == -1) {
                    currentRegionTime = timeRegion.getKey();
                } else if (timeRegion.getKey() < currentRegionTime) {
                    break;
                } else if (timeRegion.getKey() > currentRegionTime) {
                    throw new IllegalStateException(
                            String.format("Got out of order time %d when expecting time less than or equal to %d",
                                    timeRegion.getKey(), currentRegionTime));
                }
                regions.add(timeRegion.getValue());
            }
        }
        return regions.isEmpty() ? null
                : new TimeRegions(currentRegionTime, Collections.unmodifiableSortedSet(regions));
    }

    /**
     * Reads the region count saved for the exact given time.
     *
     * @param stateTable table to read from
     * @param time timestamp in milliseconds
     * @return the saved region count, or -1 if no count is recorded for the time
     * @throws IOException when not able to read the data from HBase
     */
    @VisibleForTesting
    int getRegionCountForTime(Table stateTable, long time) throws IOException {
        Get get = new Get(makeTimeRegionCountKey(Bytes.toBytes(getInvertedTime(time))));
        get.addColumn(FAMILY, REGION_TIME_COL);
        Result result = stateTable.get(get);
        byte[] value = result.getValue(FAMILY, REGION_TIME_COL);
        return value == null ? -1 : Bytes.toInt(value);
    }

    /**
     * Delete all the regions that were recorded for all times equal or less than the given time.
     * The region count rows for those times are deleted as well.
     *
     * @param time timestamp in milliseconds
     * @throws IOException when not able to delete data in HBase
     */
    public void deleteAllRegionsOnOrBeforeTime(long time) throws IOException {
        byte[] timeBytes = Bytes.toBytes(getInvertedTime(time));
        try (Table stateTable = stateTableSupplier.get()) {
            // Delete the regions
            Scan scan = new Scan(makeTimeRegionKey(timeBytes, EMPTY_BYTE_ARRAY), REGION_TIME_KEY_PREFIX_STOP);
            scan.addColumn(FAMILY, REGION_TIME_COL);
            deleteFromScan(stateTable, scan);

            // Delete the count
            scan = new Scan(makeTimeRegionCountKey(timeBytes), REGION_TIME_COUNT_KEY_PREFIX_STOP);
            scan.addColumn(FAMILY, REGION_TIME_COL);
            deleteFromScan(stateTable, scan);
        }
    }

    // ---------------------------------------------------------------------
    // ------- Methods for inactive transaction bound for given time -------
    // ---------------------------------------------------------------------
    // Key: 0x3<inverted time>
    // Col 'i': <inactive transaction bound>
    // ---------------------------------------------------------------------

    /**
     * Persist inactive transaction bound for a given time. This is the smallest not in-progress transaction that
     * will not have writes in any HBase regions that are created after the given time.
     *
     * @param time time in milliseconds
     * @param inactiveTransactionBound inactive transaction bound for the given time
     * @throws IOException when not able to persist the data to HBase
     */
    public void saveInactiveTransactionBoundForTime(long time, long inactiveTransactionBound) throws IOException {
        try (Table stateTable = stateTableSupplier.get()) {
            Put put = new Put(makeInactiveTransactionBoundTimeKey(Bytes.toBytes(getInvertedTime(time))));
            put.addColumn(FAMILY, INACTIVE_TRANSACTION_BOUND_TIME_COL, Bytes.toBytes(inactiveTransactionBound));
            stateTable.put(put);
        }
    }

    /**
     * Return inactive transaction bound for the given time.
     *
     * @param time time in milliseconds
     * @return inactive transaction bound for the given time, or -1 if none is recorded for exactly that time
     * @throws IOException when not able to read the data from HBase
     */
    public long getInactiveTransactionBoundForTime(long time) throws IOException {
        try (Table stateTable = stateTableSupplier.get()) {
            Get get = new Get(makeInactiveTransactionBoundTimeKey(Bytes.toBytes(getInvertedTime(time))));
            get.addColumn(FAMILY, INACTIVE_TRANSACTION_BOUND_TIME_COL);
            byte[] result = stateTable.get(get).getValue(FAMILY, INACTIVE_TRANSACTION_BOUND_TIME_COL);
            return result == null ? -1 : Bytes.toLong(result);
        }
    }

    /**
     * Delete all inactive transaction bounds recorded for a time less than or equal to the given time.
     *
     * @param time time in milliseconds
     * @throws IOException when not able to delete data in HBase
     */
    public void deleteInactiveTransactionBoundsOnOrBeforeTime(long time) throws IOException {
        try (Table stateTable = stateTableSupplier.get()) {
            // Keys hold the inverted time, so the row for the given time is the scan start and older times follow it
            Scan scan = new Scan(makeInactiveTransactionBoundTimeKey(Bytes.toBytes(getInvertedTime(time))),
                    INACTIVE_TRANSACTION_BOUND_TIME_KEY_PREFIX_STOP);
            scan.addColumn(FAMILY, INACTIVE_TRANSACTION_BOUND_TIME_COL);
            deleteFromScan(stateTable, scan);
        }
    }

    // --------------------------------------------------------
    // ------- Methods for empty regions at a given time -------
    // --------------------------------------------------------
    // Key: 0x4<time><region-id>   (time is NOT inverted here)
    // Col 'e': <COL_VAL, the value is not significant>
    // --------------------------------------------------------

    /**
     * Save the given region as empty as of the given time.
     *
     * @param time time in milliseconds
     * @param regionId region id
     * @throws IOException when not able to persist the data to HBase
     */
    public void saveEmptyRegionForTime(long time, byte[] regionId) throws IOException {
        byte[] timeBytes = Bytes.toBytes(time);
        try (Table stateTable = stateTableSupplier.get()) {
            Put put = new Put(makeEmptyRegionTimeKey(timeBytes, regionId));
            put.addColumn(FAMILY, EMPTY_REGION_TIME_COL, COL_VAL);
            stateTable.put(put);
        }
    }

    /**
     * Return regions that were recorded as empty after the given time.
     *
     * @param time time in milliseconds
     * @param includeRegions If not null, the returned set will be an intersection of the includeRegions set
     *                       and the empty regions after the given time
     * @return unmodifiable sorted set of the matching empty regions
     * @throws IOException when not able to read the data from HBase
     */
    public SortedSet<byte[]> getEmptyRegionsAfterTime(long time, @Nullable SortedSet<byte[]> includeRegions)
            throws IOException {
        SortedSet<byte[]> emptyRegions = new TreeSet<>(Bytes.BYTES_COMPARATOR);
        try (Table stateTable = stateTableSupplier.get()) {
            // Start at time + 1 so that records saved exactly at the given time are not included
            Scan scan = new Scan(makeEmptyRegionTimeKey(Bytes.toBytes(time + 1), EMPTY_BYTE_ARRAY),
                    EMPTY_REGION_TIME_KEY_PREFIX_STOP);
            scan.addColumn(FAMILY, EMPTY_REGION_TIME_COL);

            try (ResultScanner scanner = stateTable.getScanner(scan)) {
                Result next;
                while ((next = scanner.next()) != null) {
                    byte[] emptyRegion = getEmptyRegionFromKey(next.getRow());
                    if (includeRegions == null || includeRegions.contains(emptyRegion)) {
                        emptyRegions.add(emptyRegion);
                    }
                }
            }
        }
        return Collections.unmodifiableSortedSet(emptyRegions);
    }

    /**
     * Delete empty region records saved on or before the given time.
     *
     * @param time time in milliseconds
     * @throws IOException when not able to delete data in HBase
     */
    public void deleteEmptyRegionsOnOrBeforeTime(long time) throws IOException {
        try (Table stateTable = stateTableSupplier.get()) {
            // Bound the scan to the empty-region key range: begin at the 0x4 prefix rather than at the start of
            // the table, and stop before the first possible key for (time + 1).
            Scan scan = new Scan(EMPTY_REGION_TIME_KEY_PREFIX,
                    makeEmptyRegionTimeKey(Bytes.toBytes(time + 1), EMPTY_BYTE_ARRAY));
            scan.addColumn(FAMILY, EMPTY_REGION_TIME_COL);
            deleteFromScan(stateTable, scan);
        }
    }

    /**
     * Deletes every row returned by the given scan. Rows are deleted one at a time, and the whole row is
     * deleted, not only the scanned column.
     *
     * @param stateTable table to scan and delete from
     * @param scan scan selecting the rows to delete
     * @throws IOException when not able to read or delete data in HBase
     */
    @VisibleForTesting
    void deleteFromScan(Table stateTable, Scan scan) throws IOException {
        try (ResultScanner scanner = stateTable.getScanner(scan)) {
            Result next;
            while ((next = scanner.next()) != null) {
                stateTable.delete(new Delete(next.getRow()));
            }
        }
    }

    /**
     * Builds a {@link RegionPruneInfo} from a prune upper bound cell: the cell value is the prune upper bound
     * and the cell timestamp is passed on as the record time.
     */
    private RegionPruneInfo toRegionPruneInfo(byte[] regionId, Cell cell) {
        byte[] pruneUpperBoundBytes = CellUtil.cloneValue(cell);
        long timestamp = cell.getTimestamp();
        return new RegionPruneInfo(regionId, Bytes.toStringBinary(regionId), Bytes.toLong(pruneUpperBoundBytes),
                timestamp);
    }

    /** Row key for the prune upper bound of a region: 0x1 followed by the region id. */
    private byte[] makeRegionKey(byte[] regionId) {
        return Bytes.add(REGION_KEY_PREFIX, regionId);
    }

    /** Extracts the region id from a key created by {@link #makeRegionKey(byte[])}. */
    private byte[] getRegionFromKey(byte[] regionKey) {
        int prefixLen = REGION_KEY_PREFIX.length;
        return Bytes.copy(regionKey, prefixLen, regionKey.length - prefixLen);
    }

    /** Row key for a region recorded at a time: 0x2, the serialized (inverted) time, then the region id. */
    private byte[] makeTimeRegionKey(byte[] time, byte[] regionId) {
        return Bytes.add(REGION_TIME_KEY_PREFIX, time, regionId);
    }

    /** Row key for the region count of a time: 0x5 followed by the serialized (inverted) time. */
    private byte[] makeTimeRegionCountKey(byte[] time) {
        return Bytes.add(REGION_TIME_COUNT_KEY_PREFIX, time);
    }

    /** Row key for the inactive transaction bound of a time: 0x3 followed by the serialized (inverted) time. */
    private byte[] makeInactiveTransactionBoundTimeKey(byte[] time) {
        return Bytes.add(INACTIVE_TRANSACTION_BOUND_TIME_KEY_PREFIX, time);
    }

    /**
     * Splits a key created by {@link #makeTimeRegionKey(byte[], byte[])} into the original (un-inverted) time
     * and the region id.
     */
    private Map.Entry<Long, byte[]> getTimeRegion(byte[] key) {
        int offset = REGION_TIME_KEY_PREFIX.length;
        long time = getInvertedTime(Bytes.toLong(key, offset));
        offset += Bytes.SIZEOF_LONG;
        byte[] regionName = Bytes.copy(key, offset, key.length - offset);
        return Maps.immutableEntry(time, regionName);
    }

    /** Row key for an empty region record: 0x4, the serialized time, then the region id. */
    private byte[] makeEmptyRegionTimeKey(byte[] time, byte[] regionId) {
        return Bytes.add(EMPTY_REGION_TIME_KEY_PREFIX, time, regionId);
    }

    /** Extracts the region id from a key created by {@link #makeEmptyRegionTimeKey(byte[], byte[])}. */
    private byte[] getEmptyRegionFromKey(byte[] key) {
        int prefixLen = EMPTY_REGION_TIME_KEY_PREFIX.length + Bytes.SIZEOF_LONG;
        return Bytes.copy(key, prefixLen, key.length - prefixLen);
    }

    /**
     * Converts between a time and its inverted form ({@code Long.MAX_VALUE - time}). Applying it twice yields
     * the original value, so it is used both to build keys and to decode them.
     */
    private long getInvertedTime(long time) {
        return Long.MAX_VALUE - time;
    }

    /**
     * Supplies table for persisting state
     */
    public interface TableSupplier {
        /**
         * @return the table used to persist state; every caller in this class closes the returned table after
         *         a single operation
         * @throws IOException when the table cannot be obtained
         */
        Table get() throws IOException;
    }
}