org.apache.jackrabbit.oak.segment.file.TarReader.java Source code

Introduction

Here is the source code for org.apache.jackrabbit.oak.segment.file.TarReader.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.jackrabbit.oak.segment.file;

import static com.google.common.base.Charsets.UTF_8;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.Lists.newArrayList;
import static com.google.common.collect.Lists.newArrayListWithCapacity;
import static com.google.common.collect.Maps.newHashMap;
import static com.google.common.collect.Maps.newLinkedHashMap;
import static com.google.common.collect.Maps.newTreeMap;
import static com.google.common.collect.Sets.newHashSet;
import static com.google.common.collect.Sets.newHashSetWithExpectedSize;
import static java.nio.ByteBuffer.wrap;
import static java.util.Collections.singletonList;
import static org.apache.jackrabbit.oak.segment.Segment.REF_COUNT_OFFSET;
import static org.apache.jackrabbit.oak.segment.Segment.getGcGeneration;
import static org.apache.jackrabbit.oak.segment.SegmentId.isDataSegmentId;
import static org.apache.jackrabbit.oak.segment.file.TarWriter.GRAPH_MAGIC;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.CRC32;

import javax.annotation.Nonnull;

import com.google.common.base.Predicate;
import org.apache.commons.io.FileUtils;
import org.apache.jackrabbit.oak.plugins.blob.ReferenceCollector;
import org.apache.jackrabbit.oak.segment.SegmentGraph.SegmentGraphVisitor;
import org.apache.jackrabbit.oak.segment.SegmentId;
import org.apache.jackrabbit.oak.segment.SegmentStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

class TarReader implements Closeable {

    /** Logger instance */
    private static final Logger log = LoggerFactory.getLogger(TarReader.class);

    /**
     * Separate logger, named after this class with a "-GC" suffix, that
     * receives the list of segments removed when a tar file is cleaned up.
     */
    private static final Logger GC_LOG = LoggerFactory.getLogger(TarReader.class.getName() + "-GC");

    /** Magic byte sequence at the end of the index block. */
    private static final int INDEX_MAGIC = TarWriter.INDEX_MAGIC;

    /**
     * Pattern of the segment entry names. Group 1 is the segment UUID and
     * group 3 is the optional checksum of the entry data, written as eight
     * hex digits. Note the trailing (\\..*)? group
     * that's included for compatibility with possible future extensions.
     */
    private static final Pattern NAME_PATTERN = Pattern.compile(
            "([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})" + "(\\.([0-9a-f]{8}))?(\\..*)?");

    /** The tar file block size. */
    private static final int BLOCK_SIZE = TarWriter.BLOCK_SIZE;

    /**
     * Computes the number of bytes an entry with {@code size} bytes of data
     * takes up in a tar file: one header block, the data itself and the
     * padding reported by {@link TarWriter#getPaddingSize(int)}.
     *
     * @param size size of the entry data in bytes
     * @return total size of the entry in bytes
     */
    static int getEntrySize(int size) {
        int padding = TarWriter.getPaddingSize(size);
        int headerAndData = BLOCK_SIZE + size;
        return headerAndData + padding;
    }

    /**
     * Opens a single tar file, which must carry a valid index.
     *
     * @param file the tar file to open
     * @param memoryMapping whether memory mapped access should be attempted
     * @return a reader for the file, never {@code null}
     * @throws IOException if the file could not be opened with a valid index
     */
    static TarReader open(File file, boolean memoryMapping) throws IOException {
        List<File> candidates = singletonList(file);
        TarReader opened = openFirstFileWithValidIndex(candidates, memoryMapping);
        if (opened == null) {
            throw new IOException("Failed to open tar file " + file);
        }
        return opened;
    }

    /**
     * Creates a TarReader instance for reading content from a tar file that
     * may exist in several generations.
     * <p>
     * The generations are tried from the highest key to the lowest. The first
     * one with a valid tar index is opened and the other generations are
     * removed. If no generation has a valid index, the entries of all
     * generations are collected (lowest key first), the original files are
     * backed up, and the file of the lowest generation is regenerated from
     * the collected entries and opened.
     *
     * @param files the generations of the tar file, keyed by generation character
     * @param memoryMapping whether memory mapped access should be attempted
     * @return a reader for the opened or regenerated tar file, never {@code null}
     * @throws IOException if recovery fails or the regenerated file cannot be opened
     */
    static TarReader open(Map<Character, File> files, boolean memoryMapping) throws IOException {
        SortedMap<Character, File> generations = newTreeMap();
        generations.putAll(files);

        // candidates ordered from the latest generation to the earliest
        List<File> latestFirst = newArrayList(generations.values());
        Collections.reverse(latestFirst);

        TarReader valid = openFirstFileWithValidIndex(latestFirst, memoryMapping);
        if (valid != null) {
            return valid;
        }

        // none of the generations is usable as is: salvage whatever entries
        // can still be read, going from the earliest generation to the latest
        log.warn("Could not find a valid tar index in {}, recovering...", latestFirst);
        LinkedHashMap<UUID, byte[]> recovered = newLinkedHashMap();
        for (File generation : generations.values()) {
            collectFileEntries(generation, recovered, true);
        }

        // the earliest generation is rewritten from the salvaged entries
        File target = generations.values().iterator().next();
        generateTarFile(recovered, target);

        TarReader regenerated = openFirstFileWithValidIndex(singletonList(target), memoryMapping);
        if (regenerated == null) {
            throw new IOException("Failed to open recovered tar file " + target);
        }
        return regenerated;
    }

    /**
     * Opens the latest generation of a tar file for a read-only store. Only
     * the generation with the highest key is considered, to prevent any
     * rollback or rewrite of the existing files.
     * <p>
     * If that file has no valid index and {@code recover} is set, its entries
     * are collected without touching the original file and written to a new
     * '.ro.bak' file, which is then opened instead.
     *
     * @param files the generations of the tar file, keyed by generation character
     * @param memoryMapping whether memory mapped access should be attempted
     * @param recover whether to fall back to a regenerated '.ro.bak' copy
     * @return a reader, never {@code null}
     * @throws IOException if no reader could be opened
     */
    static TarReader openRO(Map<Character, File> files, boolean memoryMapping, boolean recover) throws IOException {
        Character latest = Collections.max(files.keySet());
        File file = files.get(latest);

        TarReader reader = openFirstFileWithValidIndex(singletonList(file), memoryMapping);
        if (reader == null && recover) {
            log.warn("Could not find a valid tar index in {}, recovering read-only", file);
            // salvage the entries into a separate artificial tar file so that
            // the original stays untouched
            LinkedHashMap<UUID, byte[]> salvaged = newLinkedHashMap();
            collectFileEntries(file, salvaged, false);
            file = findAvailGen(file, ".ro.bak");
            generateTarFile(salvaged, file);
            reader = openFirstFileWithValidIndex(singletonList(file), memoryMapping);
        }

        if (reader == null) {
            throw new IOException("Failed to open tar file " + file);
        }
        return reader;
    }

    /**
     * Adds all segment entries that can be recovered from the given file to
     * {@code entries} and optionally backs the file up afterwards by moving
     * it to a ".bak" name. A file that cannot be read is logged and skipped;
     * the backup, if requested, still takes place.
     *
     * @param file the tar file to scan
     * @param entries receives the recovered entries
     * @param backup whether to back up the file once it has been scanned
     * @throws IOException if the backup fails
     */
    private static void collectFileEntries(File file, LinkedHashMap<UUID, byte[]> entries, boolean backup)
            throws IOException {
        log.info("Recovering segments from tar file {}", file);
        try {
            RandomAccessFile raf = new RandomAccessFile(file, "r");
            try {
                recoverEntries(file, raf, entries);
            } finally {
                raf.close();
            }
        } catch (IOException e) {
            // best effort: keep whatever was recovered before the failure
            log.warn("Could not read tar file {}, skipping...", file, e);
        }

        if (!backup) {
            return;
        }
        backupSafely(file);
    }

    /**
     * Regenerates a tar file from a list of entries. The entries are written
     * in the iteration order of {@code entries}, each with the generation
     * extracted from its data.
     *
     * @param entries the segments to write, keyed by segment id
     * @param file the tar file to write
     * @throws IOException if an entry cannot be written or the writer cannot
     *         be closed
     */
    private static void generateTarFile(LinkedHashMap<UUID, byte[]> entries, File file) throws IOException {
        log.info("Regenerating tar file {}", file);
        TarWriter writer = new TarWriter(file);
        try {
            for (Map.Entry<UUID, byte[]> entry : entries.entrySet()) {
                UUID uuid = entry.getKey();
                byte[] data = entry.getValue();
                int generation = getGcGeneration(wrap(data), uuid);
                writer.writeEntry(uuid.getMostSignificantBits(), uuid.getLeastSignificantBits(), data, 0, data.length,
                        generation);
            }
        } finally {
            // always release the writer, even if writing an entry failed,
            // so that the underlying file is not left open
            writer.close();
        }
    }

    /**
     * Moves the given tar file out of the way for manual inspection.
     * Something went wrong earlier, so the data must not be accidentally
     * removed or overwritten. The file is renamed to an unused ".bak" name;
     * if renaming fails it is copied there and the original is deleted.
     *
     * @param file the tar file to back up
     * @throws IOException if the copy fails or the original cannot be deleted
     */
    private static void backupSafely(File file) throws IOException {
        File target = findAvailGen(file, ".bak");
        log.info("Backing up {} to {}", file, target.getName());
        if (file.renameTo(target)) {
            return;
        }

        // rename did not work, fall back to copy followed by delete
        log.warn("Renaming failed, so using copy to backup {}", file);
        FileUtils.copyFile(file, target);
        boolean removed = file.delete();
        if (!removed) {
            throw new IOException("Could not remove broken tar file " + file);
        }
    }

    /**
     * Finds the next available file name so that a generated file doesn't
     * overwrite another existing file. The first candidate is the name of
     * {@code file} followed by {@code ext}; if that exists, a counter starting
     * at 2 is inserted before the extension until an unused name is found.
     *
     * @param file the file whose name serves as the base name
     * @param ext the extension to append, including the leading dot
     * @return a file in the same directory that does not exist yet
     */
    private static File findAvailGen(File file, String ext) {
        File directory = file.getParentFile();
        String baseName = file.getName();

        File candidate = new File(directory, baseName + ext);
        int counter = 2;
        while (candidate.exists()) {
            candidate = new File(directory, baseName + "." + counter + ext);
            counter++;
        }
        return candidate;
    }

    /**
     * Opens the first of the given files that carries a valid tar index.
     * <p>
     * The files are probed in list order. As soon as a file with a valid
     * index is found, all other files of the list are deleted and a reader
     * for that file is returned. Files that cannot be read or that lack a
     * valid index are logged and skipped.
     *
     * @param files the candidate files, most preferred first
     * @param memoryMapping whether to try memory mapped access; normal file
     *        access is used as a fallback when mapping fails
     * @return a reader for the first valid file, or {@code null} if none of
     *         the files has a valid index
     */
    private static TarReader openFirstFileWithValidIndex(List<File> files, boolean memoryMapping) {
        for (File file : files) {
            String name = file.getName();
            try {
                RandomAccessFile access = new RandomAccessFile(file, "r");
                try {
                    ByteBuffer index = loadAndValidateIndex(access, name);
                    if (index == null) {
                        log.info("No index found in tar file {}, skipping...", name);
                    } else {
                        // found a file with a valid index, drop the others
                        for (File other : files) {
                            if (other != file) {
                                log.info("Removing unused tar file {}", other.getName());
                                if (!other.delete()) {
                                    // don't fail the open, but leave a trace of the leftover file
                                    log.warn("Could not remove unused tar file {}", other.getName());
                                }
                            }
                        }

                        if (memoryMapping) {
                            try {
                                FileAccess mapped = new FileAccess.Mapped(access);
                                // re-read the index, now with memory mapping. The index
                                // ends right before its 16 bytes of metadata and the
                                // two trailing zero blocks, the same location that
                                // loadAndValidateIndex read it from.
                                int indexSize = index.remaining();
                                index = mapped.read(mapped.length() - indexSize - 16 - 2 * BLOCK_SIZE, indexSize);
                                // NOTE(review): on this path the finally block below closes
                                // the RandomAccessFile; this assumes FileAccess.Mapped no
                                // longer needs it once constructed - confirm.
                                return new TarReader(file, mapped, index);
                            } catch (IOException e) {
                                log.warn(
                                        "Failed to mmap tar file {}. Falling back to normal file "
                                                + "IO, which will negatively impact repository performance. "
                                                + "This problem may have been caused by restrictions on the "
                                                + "amount of virtual memory available to the JVM. Please make "
                                                + "sure that a 64-bit JVM is being used and that the process "
                                                + "has access to unlimited virtual memory (ulimit option -v).",
                                        name, e);
                            }
                        }

                        FileAccess random = new FileAccess.Random(access);
                        // prevent the finally block from closing the file
                        // as the returned TarReader will take care of that
                        access = null;
                        return new TarReader(file, random, index);
                    }
                } finally {
                    if (access != null) {
                        access.close();
                    }
                }
            } catch (IOException e) {
                log.warn("Could not read tar file {}, skipping...", name, e);
            }
        }

        return null;
    }

    /**
     * Tries to read an existing index from the given tar file. The index is
     * returned if it is found and looks valid (correct checksum, passes
     * sanity checks).
     * <p>
     * The 16 bytes of index metadata (checksum, entry count, index size and
     * magic number) are read from just before the two final blocks of the
     * file; the index entries directly precede the metadata. Entries must be
     * strictly ordered by segment id, start at block boundaries and lie
     * within the part of the file that precedes the index.
     *
     * @param file tar file
     * @param name name of the tar file, for logging purposes
     * @return tar index, or {@code null} if not found or not valid
     * @throws IOException if the tar file could not be read
     */
    private static ByteBuffer loadAndValidateIndex(RandomAccessFile file, String name) throws IOException {
        long length = file.length();
        if (length % BLOCK_SIZE != 0 || length < 6 * BLOCK_SIZE || length > Integer.MAX_VALUE) {
            log.warn("Unexpected size {} of tar file {}", length, name);
            return null; // unexpected file size
        }

        // read the index metadata just before the two final zero blocks
        ByteBuffer meta = ByteBuffer.allocate(16);
        file.seek(length - 2 * BLOCK_SIZE - 16);
        file.readFully(meta.array());
        int crc32 = meta.getInt();
        int count = meta.getInt();
        int bytes = meta.getInt();
        int magic = meta.getInt();

        if (magic != INDEX_MAGIC) {
            return null; // magic byte mismatch
        }

        // Number of bytes available to segment entries: everything in front
        // of the index entry (one header block plus "bytes" of data) and the
        // two final blocks. A negative value means the index claims to be
        // larger than the file can hold.
        long limit = length - 2 * BLOCK_SIZE - bytes - BLOCK_SIZE;

        // computed as a long so that a corrupt entry count can't overflow
        long indexSize = (long) count * TarEntry.SIZE;

        if (count < 1 || bytes < indexSize + 16 || bytes % BLOCK_SIZE != 0 || limit < 0) {
            log.warn("Invalid index metadata in tar file {}", name);
            return null; // impossible entry and/or byte counts
        }

        // At this point indexSize < bytes <= length <= Integer.MAX_VALUE, so
        // the cast is safe and the allocation is bounded by the file size.
        // Reading involves seeking backwards in the file, which might not
        // perform well, but that's OK since we only do this once per file
        ByteBuffer index = ByteBuffer.allocate((int) indexSize);
        file.seek(length - 2 * BLOCK_SIZE - 16 - indexSize);
        file.readFully(index.array());
        index.mark();

        CRC32 checksum = new CRC32();
        long lastmsb = Long.MIN_VALUE;
        long lastlsb = Long.MIN_VALUE;
        byte[] entry = new byte[TarEntry.SIZE];
        for (int i = 0; i < count; i++) {
            index.get(entry);
            checksum.update(entry);

            ByteBuffer buffer = wrap(entry);
            long msb = buffer.getLong();
            long lsb = buffer.getLong();
            int offset = buffer.getInt();
            int size = buffer.getInt();

            if (lastmsb > msb || (lastmsb == msb && lastlsb > lsb)) {
                log.warn("Incorrect index ordering in tar file {}", name);
                return null;
            } else if (lastmsb == msb && lastlsb == lsb && i > 0) {
                log.warn("Duplicate index entry in tar file {}", name);
                return null;
            } else if (offset < 0 || offset % BLOCK_SIZE != 0) {
                log.warn("Invalid index entry offset in tar file {}", name);
                return null;
            } else if (size < 1 || (long) offset + size > limit) {
                // the sum is computed as a long: two large ints would
                // otherwise wrap around and pass the check
                log.warn("Invalid index entry size in tar file {}", name);
                return null;
            }

            lastmsb = msb;
            lastlsb = lsb;
        }

        if (crc32 != (int) checksum.getValue()) {
            log.warn("Invalid index checksum in tar file {}", name);
            return null; // checksum mismatch
        }

        // rewind to the mark set before the validation pass
        index.reset();
        return index;
    }

    /**
     * Scans through the tar file, looking for all segment entries.
     * <p>
     * The file is read block by block. A block whose header checksum does
     * not match is skipped. For a valid header, the entry data is added to
     * {@code entries} if the entry name matches the segment name pattern and
     * either the name carries a checksum that matches the data, or the name
     * carries no checksum and the segment has not been seen before. The data
     * of every other entry (tar index, unexpected entries, already known
     * segments) is skipped. Scanning stops at the end of the file or at the
     * first entry whose data extends beyond it.
     *
     * @param file the tar file, used for logging and to recognise the index entry
     * @param access the opened tar file, read from its current position
     * @param entries receives the recovered segments, keyed by segment id
     * @throws IOException if the tar file could not be read
     */
    private static void recoverEntries(File file, RandomAccessFile access, LinkedHashMap<UUID, byte[]> entries)
            throws IOException {
        byte[] header = new byte[BLOCK_SIZE];
        while (access.getFilePointer() + BLOCK_SIZE <= access.length()) {
            // read the tar header block
            access.readFully(header);

            // compute the header checksum
            int sum = 0;
            for (int i = 0; i < BLOCK_SIZE; i++) {
                sum += header[i] & 0xff;
            }

            // identify possible zero block
            if (sum == 0 && access.getFilePointer() + 2 * BLOCK_SIZE == access.length()) {
                return; // found the zero blocks at the end of the file
            }

            // replace the actual stored checksum with spaces for comparison
            for (int i = 148; i < 148 + 8; i++) {
                sum -= header[i] & 0xff;
                sum += ' ';
            }

            byte[] checkbytes = String.format("%06o\0 ", sum).getBytes(UTF_8);
            boolean validHeader = true;
            for (int i = 0; i < checkbytes.length && validHeader; i++) {
                validHeader = checkbytes[i] == header[148 + i];
            }
            if (!validHeader) {
                // this block is not a tar header: do not interpret its
                // content, just move on to the next block
                log.warn("Invalid entry checksum at offset {} in tar file {}, skipping...",
                        access.getFilePointer() - BLOCK_SIZE, file);
                continue;
            }

            // The header checksum passes, so read the entry name and size
            ByteBuffer buffer = wrap(header);
            String name = readString(buffer, 100);
            buffer.position(124);
            int size = readNumber(buffer, 12);
            if (access.getFilePointer() + size > access.length()) {
                // checksum was correct, so the size field should be accurate
                log.warn("Partial entry {} in tar file {}, ignoring...", name, file);
                return;
            }

            // position of the next header: the end of the entry data,
            // rounded up to the next block boundary
            long nextHeader = access.getFilePointer() + size;
            long remainder = nextHeader % BLOCK_SIZE;
            if (remainder != 0) {
                nextHeader += BLOCK_SIZE - remainder;
            }

            Matcher matcher = NAME_PATTERN.matcher(name);
            if (matcher.matches()) {
                UUID id = UUID.fromString(matcher.group(1));

                String checksum = matcher.group(3);
                if (checksum != null || !entries.containsKey(id)) {
                    byte[] data = new byte[size];
                    access.readFully(data);

                    boolean validData = true;
                    if (checksum != null) {
                        CRC32 crc = new CRC32();
                        crc.update(data);
                        if (crc.getValue() != Long.parseLong(checksum, 16)) {
                            log.warn("Checksum mismatch in entry {} of tar file {}, skipping...", name, file);
                            validData = false;
                        }
                    }

                    if (validData) {
                        entries.put(id, data);
                    }
                }
            } else if (!name.equals(file.getName() + ".idx")) {
                log.warn("Unexpected entry {} in tar file {}, skipping...", name, file);
            }

            // Continue with the next header regardless of whether the data
            // was read, so that entry data is never mistaken for a header.
            access.seek(nextHeader);
        }
    }

    /** The tar file this reader operates on. */
    private final File file;

    /** Access to the content of the tar file; segment data is read through it. */
    private final FileAccess access;

    /**
     * The tar index: consecutive records of {@code TarEntry.SIZE} bytes
     * between the buffer's position and limit. Each record holds the segment
     * id (two longs at offsets 0 and 8) followed by the offset, size and
     * generation of the entry (ints at offsets 16, 20 and 24).
     */
    private final ByteBuffer index;

    /**
     * Closed flag. NOTE(review): it is written outside the visible part of
     * this file, presumably by close() - confirm.
     */
    private volatile boolean closed;

    /**
     * NOTE(review): presumably caches whether this tar file contains a
     * pre-compiled segment graph; it is maintained outside the visible part
     * of this file - confirm.
     */
    private volatile boolean hasGraph;

    /**
     * Creates a reader from an already opened and validated tar file.
     *
     * @param file the tar file
     * @param access access to the content of the tar file
     * @param index the index of the tar file
     */
    private TarReader(File file, FileAccess access, ByteBuffer index) {
        this.index = index;
        this.access = access;
        this.file = file;
    }

    /**
     * Returns the current length of the underlying tar file as reported by
     * the file system.
     *
     * @return length of the tar file in bytes
     */
    long size() {
        long length = file.length();
        return length;
    }

    /**
     * Returns the number of segments in this tar file, derived from the
     * capacity of the index buffer and the fixed size of an index record.
     *
     * @return number of segments
     */
    int count() {
        int indexBytes = index.capacity();
        return indexBytes / TarEntry.SIZE;
    }

    /**
     * Iterates over all entries in this tar file, in index order, and calls
     * {@link TarEntryVisitor#visit(long, long, File, int, int)} on them with
     * the segment id, this tar file and the offset and size of the entry.
     *
     * @param visitor entry visitor
     */
    void accept(TarEntryVisitor visitor) {
        for (int pos = index.position(); pos < index.limit(); pos += TarEntry.SIZE) {
            long msb = index.getLong(pos);
            long lsb = index.getLong(pos + 8);
            int offset = index.getInt(pos + 16);
            int size = index.getInt(pos + 20);
            visitor.visit(msb, lsb, file, offset, size);
        }
    }

    /**
     * Returns the ids of all segments listed in the index of this tar file.
     *
     * @return a new set with the segment ids
     */
    Set<UUID> getUUIDs() {
        int expected = index.remaining() / TarEntry.SIZE;
        Set<UUID> ids = newHashSetWithExpectedSize(expected);
        for (int pos = index.position(); pos < index.limit(); pos += TarEntry.SIZE) {
            long msb = index.getLong(pos);
            long lsb = index.getLong(pos + 8);
            ids.add(new UUID(msb, lsb));
        }
        return ids;
    }

    /**
     * Checks whether the given segment is listed in the index of this tar file.
     *
     * @param msb the most significant bits of the segment id
     * @param lsb the least significant bits of the segment id
     * @return {@code true} if the segment is in this file
     */
    boolean containsEntry(long msb, long lsb) {
        int position = findEntry(msb, lsb);
        return position != -1;
    }

    /**
     * If the given segment is in this file, get the byte buffer that allows
     * reading it.
     * <p>
     * Whether or not this will read from the file depends on whether memory
     * mapped files are used or not.
     *
     * @param msb the most significant bits of the segment id
     * @param lsb the least significant bits of the segment id
     * @return the byte buffer, or null if not in this file
     * @throws IOException if the segment could not be read
     */
    ByteBuffer readEntry(long msb, long lsb) throws IOException {
        int position = findEntry(msb, lsb);
        if (position == -1) {
            return null;
        }
        // the index record stores the offset at +16 and the size at +20
        int offset = index.getInt(position + 16);
        int size = index.getInt(position + 20);
        return access.read(offset, size);
    }

    /**
     * Finds the index record of the given segment.
     * <p>
     * The segment identifiers are randomly generated with uniform
     * distribution, so interpolation search on the most significant bits is
     * used to find the matching record in the sorted index. The average
     * runtime is O(log log n).
     *
     * @param msb the most significant bits of the segment id
     * @param lsb the least significant bits of the segment id
     * @return the position of the record in the index buffer, or -1 if not found
     */
    private int findEntry(long msb, long lsb) {
        int lo = 0;
        int hi = index.remaining() / TarEntry.SIZE - 1;
        float loValue = Long.MIN_VALUE;
        float hiValue = Long.MAX_VALUE;
        float target = msb;

        while (lo <= hi) {
            // interpolate the probable location of the target between the bounds
            int guess = lo
                    + Math.round((hi - lo) * (target - loValue) / (hiValue - loValue));
            int position = index.position() + guess * TarEntry.SIZE;
            long m = index.getLong(position);

            // compare the id at the guessed record with the wanted id,
            // looking at the least significant bits only on a tie
            int order;
            if (msb < m) {
                order = -1;
            } else if (msb > m) {
                order = 1;
            } else {
                long l = index.getLong(position + 8);
                if (lsb < l) {
                    order = -1;
                } else if (lsb > l) {
                    order = 1;
                } else {
                    order = 0;
                }
            }

            if (order == 0) {
                return position;
            }
            if (order < 0) {
                hi = guess - 1;
                hiValue = m;
            } else {
                lo = guess + 1;
                loValue = m;
            }
        }

        return -1;
    }

    /**
     * Returns all entries listed in the index of this tar file, sorted by
     * {@code TarEntry.OFFSET_ORDER}.
     *
     * @return a new array with one element per index record
     */
    @Nonnull
    private TarEntry[] getEntries() {
        int total = index.remaining() / TarEntry.SIZE;
        TarEntry[] result = new TarEntry[total];

        int next = 0;
        int position = index.position();
        while (position < index.limit()) {
            long msb = index.getLong(position);
            long lsb = index.getLong(position + 8);
            int offset = index.getInt(position + 16);
            int size = index.getInt(position + 20);
            int generation = index.getInt(position + 24);
            result[next++] = new TarEntry(msb, lsb, offset, size, generation);
            position += TarEntry.SIZE;
        }

        Arrays.sort(result, TarEntry.OFFSET_ORDER);
        return result;
    }

    /**
     * Returns the ids of the segments referenced by the given segment.
     * <p>
     * If a graph is given, the references are looked up there and a segment
     * missing from the graph has no references. Otherwise they are read from
     * the beginning of the segment itself: the reference count is the
     * unsigned byte at {@code REF_COUNT_OFFSET} and the referenced ids follow
     * as 16 byte records starting at offset 16.
     *
     * @param entry the tar entry of the segment
     * @param id the id of the segment
     * @param graph the pre-compiled graph of this tar file, or {@code null}
     * @return the referenced segment ids, possibly empty
     * @throws IOException if the segment could not be read
     */
    @Nonnull
    private List<UUID> getReferences(TarEntry entry, UUID id, Map<UUID, List<UUID>> graph) throws IOException {
        if (graph != null) {
            List<UUID> known = graph.get(id);
            if (known == null) {
                return Collections.<UUID>emptyList();
            }
            return known;
        }

        // no pre-compiled graph: only the leading part of the segment is
        // needed, at most 256 records of 16 bytes
        int offset = entry.offset();
        int length = Math.min(entry.size(), 16 * 256);
        ByteBuffer segment = access.read(offset, length);

        int base = segment.position();
        int refCount = segment.get(base + REF_COUNT_OFFSET) & 0xff;
        List<UUID> references = newArrayList();
        for (int k = 1; k <= refCount; k++) {
            int refPos = base + 16 * k;
            references.add(new UUID(segment.getLong(refPos), segment.getLong(refPos + 8)));
        }
        return references;
    }

    /**
     * Walks the graph of segments reachable from an initial set of segments.
     * <p>
     * The entries of this tar file are visited from the highest offset to
     * the lowest. A data segment found in {@code roots} is removed from the
     * set, each of its references is reported to the visitor and added to
     * {@code roots}. Every other segment is reported once with a
     * {@code null} reference; a bulk segment found in {@code roots} is still
     * removed from the set.
     *
     * @param roots     the initial set of segments; modified by this call
     * @param visitor   visitor receiving call back while following the segment graph
     * @throws IOException if the references of a segment could not be read
     */
    public void traverseSegmentGraph(@Nonnull Set<UUID> roots, @Nonnull SegmentGraphVisitor visitor)
            throws IOException {
        checkNotNull(roots);
        checkNotNull(visitor);
        Map<UUID, List<UUID>> graph = getGraph(false);

        TarEntry[] entries = getEntries();
        int i = entries.length;
        while (--i >= 0) {
            TarEntry entry = entries[i];
            UUID id = new UUID(entry.msb(), entry.lsb());

            boolean follow = roots.remove(id) && isDataSegmentId(entry.lsb());
            if (!follow) {
                visitor.accept(id, null);
                continue;
            }

            // a referenced data segment: report and follow its references
            for (UUID refId : getReferences(entry, id, graph)) {
                visitor.accept(id, refId);
                roots.add(refId);
            }
        }
    }

    /**
     * Calculate the ids of the segments directly referenced from {@code referencedIds}
     * through forward references.
     * <p>
     * The entries of this tar file are visited from the highest offset to
     * the lowest. Each entry found in the set is removed from it and, if it
     * is a data segment, replaced by its references.
     *
     * @param referencedIds  The initial set of ids to start from. On return it
     *                       contains the set of direct forward references.
     *
     * @throws IOException if the references of a segment could not be read
     */
    void calculateForwardReferences(Set<UUID> referencedIds) throws IOException {
        Map<UUID, List<UUID>> graph = getGraph(false);
        TarEntry[] entries = getEntries();
        int i = entries.length;
        while (--i >= 0) {
            TarEntry entry = entries[i];
            UUID id = new UUID(entry.msb(), entry.lsb());
            if (referencedIds.remove(id) && isDataSegmentId(entry.lsb())) {
                List<UUID> references = getReferences(entry, id, graph);
                referencedIds.addAll(references);
            }
        }
    }

    /**
     * Collect the references of those blobs that are reachable from any segment
     * of this tar file with a generation at or above {@code minGeneration}.
     *
     * @param store          the store used to look up the segments
     * @param collector      receives the blob references
     * @param minGeneration  segments of a lower generation are ignored
     */
    void collectBlobReferences(SegmentStore store, ReferenceCollector collector, int minGeneration) {
        for (TarEntry entry : getEntries()) {
            if (entry.generation() < minGeneration) {
                continue;
            }
            // FIXME OAK-4201: Add an index of binary references in a tar file
            // Fetch the blob references from the tar index instead reading them from the segment
            SegmentId segmentId = store.newSegmentId(entry.msb(), entry.lsb());
            segmentId.getSegment().collectBlobReferences(collector);
        }
    }

    /**
     * Collect reclaimable segments.
     * <p>
     * The entries of this tar file are visited from the highest offset to
     * the lowest. A data segment is reclaimable iff its generation matches
     * the {@code reclaimGeneration} predicate. A bulk segment is reclaimable
     * iff it is not contained in {@code bulkRefs} when it is visited; a bulk
     * segment found in {@code bulkRefs} is removed from that set. The bulk
     * segments referenced from a data segment that is not reclaimable are
     * added to {@code bulkRefs}.
     *
     * @param bulkRefs  bulk segment gc roots; modified by this call
     * @param reclaim   receives the reclaimable segments
     * @param reclaimGeneration  reclaim generation predicate for data segments
     * @throws IOException if the references of a segment could not be read
     */
    void mark(Set<UUID> bulkRefs, Set<UUID> reclaim, Predicate<Integer> reclaimGeneration) throws IOException {
        Map<UUID, List<UUID>> graph = getGraph(true);
        TarEntry[] entries = getEntries();
        for (int i = entries.length - 1; i >= 0; i--) {
            TarEntry entry = entries[i];
            UUID id = new UUID(entry.msb(), entry.lsb());
            boolean dataSegment = isDataSegmentId(entry.lsb());

            boolean reclaimable;
            if (dataSegment) {
                // old data segment?
                reclaimable = reclaimGeneration.apply(entry.generation());
            } else {
                // non referenced bulk segment?
                reclaimable = !bulkRefs.remove(id);
            }

            if (reclaimable) {
                reclaim.add(id);
                continue;
            }
            if (!dataSegment) {
                continue;
            }

            for (UUID refId : getReferences(entry, id, graph)) {
                // keep the extra check for bulk segments for the case where a
                // pre-compiled graph is not available and getReferences also
                // includes data references
                if (isDataSegmentId(refId.getLeastSignificantBits())) {
                    continue;
                }
                // NOTE(review): this removes the referencing segment (id), not
                // the referenced one (refId), from the reclaim set - confirm
                // that this is intended.
                if (!reclaim.remove(id)) {
                    bulkRefs.add(refId);
                }
            }
        }
    }

    /**
     * Remove reclaimable segments and collect actually reclaimed segments.
     * <p>
     * The entries that are not in {@code reclaim} are copied into the next
     * generation of this tar file, whose name is derived by incrementing the
     * generation character in front of the ".tar" extension. No new file is
     * written if nothing would remain, if the space savings are below 25%
     * (unless this file lacks a segment graph) or if generation 'z' has been
     * reached.
     *
     * @param reclaim       segments to reclaim
     * @param reclaimed     actually reclaimed segments; only added to when a
     *                      new generation has been written and opened
     * @return              the reader of the new generation, this reader if
     *                      the file was left as is or the new generation could
     *                      not be opened, or {@code null} if none of the
     *                      entries remains
     * @throws IOException  if reading this file or writing the new generation fails
     */
    TarReader sweep(@Nonnull Set<UUID> reclaim, @Nonnull Set<UUID> reclaimed) throws IOException {
        String name = file.getName();
        log.debug("Cleaning up {}", name);

        // determine the remaining entries and the size of a tar file
        // that contains just those
        Set<UUID> cleaned = newHashSet();
        int size = 0;
        int count = 0;
        TarEntry[] entries = getEntries();
        for (int i = 0; i < entries.length; i++) {
            TarEntry entry = entries[i];
            UUID id = new UUID(entry.msb(), entry.lsb());
            if (reclaim.contains(id)) {
                cleaned.add(id);
                entries[i] = null;
            } else {
                size += getEntrySize(entry.size());
                count += 1;
            }
        }
        // account for the index entry and the two trailing blocks
        size += getEntrySize(TarEntry.SIZE * count + 16);
        size += 2 * BLOCK_SIZE;

        if (count == 0) {
            log.debug("None of the entries of {} are referenceable.", name);
            logCleanedSegments(cleaned);
            return null;
        }
        // the threshold is computed as a long so that the multiplication
        // cannot overflow for large tar files
        if (size >= (long) access.length() * 3 / 4 && hasGraph()) {
            // the space savings are not worth it at less than 25%,
            // unless this tar file lacks a pre-compiled segment graph
            // in which case we'll always generate a new tar file with
            // the graph to speed up future garbage collection runs.
            log.debug("Not enough space savings. ({}/{}). Skipping clean up of {}", access.length() - size,
                    access.length(), name);
            return this;
        }
        if (!hasGraph()) {
            log.warn("Recovering {}, which is missing its graph.", name);
        }

        // the generation is the character in front of the ".tar" extension
        int pos = name.length() - "a.tar".length();
        char generation = name.charAt(pos);
        if (generation == 'z') {
            log.debug("No garbage collection after reaching generation z: {}", name);
            return this;
        }

        File newFile = new File(file.getParentFile(), name.substring(0, pos) + (char) (generation + 1) + ".tar");

        log.debug("Writing new generation {}", newFile.getName());
        TarWriter writer = new TarWriter(newFile);
        try {
            for (TarEntry entry : entries) {
                if (entry != null) {
                    byte[] data = new byte[entry.size()];
                    access.read(entry.offset(), entry.size()).get(data);
                    writer.writeEntry(entry.msb(), entry.lsb(), data, 0, entry.size(), entry.generation());
                }
            }
        } finally {
            // release the writer even if copying an entry failed
            writer.close();
        }

        TarReader reader = openFirstFileWithValidIndex(singletonList(newFile), access.isMemoryMapped());
        if (reader != null) {
            logCleanedSegments(cleaned);
            reclaimed.addAll(cleaned);
            return reader;
        } else {
            // report the file that could not be opened, i.e. the new generation
            log.warn("Failed to open cleaned up tar file {}", newFile);
            return this;
        }
    }

    /**
     * Writes the ids of the given segments to the GC log, four ids per line,
     * each line indented by eight spaces.
     *
     * @param cleaned  ids of the segments to report
     */
    private void logCleanedSegments(Set<UUID> cleaned) {
        String lineBreak = System.getProperty("line.separator", "\n") + "        ";
        StringBuilder text = new StringBuilder();

        int index = 0;
        for (UUID segment : cleaned) {
            // every id but the first one is preceded by a separator
            if (index > 0) {
                text.append(", ");
            }
            // start a new line before every fourth id, including the first
            if (index % 4 == 0) {
                text.append(lineBreak);
            }
            text.append(segment);
            index++;
        }

        GC_LOG.info("TarMK cleaned segments from {}: {}", file.getName(), text);
    }

    /**
     * Tells whether {@link #close()} has been called on this reader.
     *
     * @return  {@code true} iff this reader has been closed
     * @see #close()
     */
    boolean isClosed() {
        return this.closed;
    }

    /**
     * Marks this reader as closed and closes the underlying file access.
     * The closed flag is set before the file access is closed.
     *
     * @throws IOException if closing the file access fails
     */
    @Override
    public void close() throws IOException {
        this.closed = true;
        this.access.close();
    }

    //-----------------------------------------------------------< private >--

    /**
     * Reads the optional pre-compiled graph entry of this tar file and
     * converts it into a map from segment ids to the ids they reference.
     *
     * @param bulkOnly  passed through to the graph parser; when {@code true}
     *                  references to data segments are left out of the result
     * @return the parsed graph, or {@code null} if one was not found
     * @throws IOException if the tar file could not be read
     */
    Map<UUID, List<UUID>> getGraph(boolean bulkOnly) throws IOException {
        ByteBuffer rawGraph = loadGraph();
        return rawGraph == null ? null : parseGraph(rawGraph, bulkOnly);
    }

    /**
     * Checks whether this tar file contains a valid pre-compiled graph.
     * <p>
     * If no graph has been loaded successfully so far, an attempt to load it
     * is made. A failure to read the file is logged and treated the same as
     * a missing graph.
     *
     * @return {@code true} iff a valid graph has been loaded from this file
     */
    private boolean hasGraph() {
        if (!hasGraph) {
            try {
                // a successful load sets the hasGraph flag as a side effect
                loadGraph();
            } catch (IOException e) {
                // best effort: report the problem instead of hiding it, but
                // keep treating the file as one without a graph
                log.warn("Unable to read the graph of tar file {}", file, e);
            }
        }
        return hasGraph;
    }

    /**
     * Loads the optional pre-compiled graph entry from the given tar file.
     * <p>
     * The graph is located through a 16 byte trailer stored right before the
     * tar index entry. The trailer consists of four ints: the CRC32 checksum
     * of the graph data, the number of UUIDs, the total size of the graph in
     * bytes (including the trailer) and a magic number. On success the
     * {@code hasGraph} flag of this reader is set.
     *
     * @return graph buffer, or {@code null} if one was not found
     * @throws IOException if the tar file could not be read
     */
    private ByteBuffer loadGraph() throws IOException {
        // read the graph metadata just before the tar index entry
        int pos = access.length() - 2 * BLOCK_SIZE - getEntrySize(index.remaining() + 16);
        ByteBuffer meta = access.read(pos - 16, 16);
        int crc32 = meta.getInt();
        int count = meta.getInt();
        int bytes = meta.getInt();
        int magic = meta.getInt();

        if (magic != GRAPH_MAGIC) {
            return null; // magic byte mismatch
        }

        // The sanity checks are evaluated in long arithmetic: with corrupt
        // metadata "count * 16 + 16" or "BLOCK_SIZE + bytes" could overflow
        // an int and let impossible values slip through.
        if (count < 0 || bytes < (long) count * 16 + 16 || (long) BLOCK_SIZE + bytes > pos) {
            log.warn("Invalid graph metadata in tar file {}", file);
            return null; // impossible uuid and/or byte counts
        }

        // this involves seeking backwards in the file, which might not
        // perform well, but that's OK since we only do this once per file
        ByteBuffer graph = access.read(pos - bytes, bytes);

        // copy everything but the 16 byte trailer for the checksum, then
        // rewind so that the returned buffer is positioned at its start
        byte[] b = new byte[bytes - 16];
        graph.mark();
        graph.get(b);
        graph.reset();

        CRC32 checksum = new CRC32();
        checksum.update(b);
        if (crc32 != (int) checksum.getValue()) {
            log.warn("Invalid graph checksum in tar file {}", file);
            return null; // checksum mismatch
        }

        hasGraph = true;
        return graph;
    }

    /**
     * Converts the raw graph entry into a map from segment ids to the ids
     * they reference.
     * <p>
     * The UUID count is read from the 16 byte trailer at the end of the
     * buffer. The buffer starts with that many UUIDs (two longs each),
     * followed by records made of the index of a UUID and the indices of the
     * UUIDs it references, each record being terminated by {@code -1}.
     *
     * @param graphByteBuffer  raw graph including its 16 byte trailer
     * @param bulkOnly         if {@code true} only references whose id is not
     *                         a data segment id are kept
     * @return the parsed graph
     */
    private static Map<UUID, List<UUID>> parseGraph(ByteBuffer graphByteBuffer, boolean bulkOnly) {
        int end = graphByteBuffer.limit();
        int uuidCount = graphByteBuffer.getInt(end - 12);

        // work on a view that excludes the trailer
        ByteBuffer data = graphByteBuffer.duplicate();
        data.limit(end - 16);

        List<UUID> table = newArrayListWithCapacity(uuidCount);
        for (int n = 0; n < uuidCount; n++) {
            long msb = data.getLong();
            long lsb = data.getLong();
            table.add(new UUID(msb, lsb));
        }

        Map<UUID, List<UUID>> result = newHashMap();
        while (data.hasRemaining()) {
            UUID source = table.get(data.getInt());
            List<UUID> targets = newArrayList();
            for (int index = data.getInt(); index != -1; index = data.getInt()) {
                UUID target = table.get(index);
                boolean skip = bulkOnly && isDataSegmentId(target.getLeastSignificantBits());
                if (!skip) {
                    targets.add(target);
                }
            }
            result.put(source, targets);
        }
        return result;
    }

    /**
     * Reads a fixed size field from the buffer and decodes it as a UTF-8
     * string that ends at the first zero byte, or at the end of the field
     * if there is none.
     *
     * @param buffer     buffer to read from; advanced by {@code fieldSize}
     * @param fieldSize  size of the field in bytes
     * @return the decoded string
     */
    private static String readString(ByteBuffer buffer, int fieldSize) {
        byte[] field = new byte[fieldSize];
        buffer.get(field);

        // count the bytes in front of the first zero byte
        int length = 0;
        for (byte value : field) {
            if (value == 0) {
                break;
            }
            length++;
        }
        return new String(field, 0, length, UTF_8);
    }

    /**
     * Reads a fixed size field from the buffer and parses its leading octal
     * digits as a number. Parsing stops at the first byte that is not an
     * octal digit; a field without leading digits yields {@code 0}.
     *
     * @param buffer     buffer to read from; advanced by {@code fieldSize}
     * @param fieldSize  size of the field in bytes
     * @return the parsed number
     */
    private static int readNumber(ByteBuffer buffer, int fieldSize) {
        byte[] field = new byte[fieldSize];
        buffer.get(field);

        int value = 0;
        for (byte raw : field) {
            int c = raw & 0xff;
            if (c < '0' || c > '7') {
                break;
            }
            value = value * 8 + (c - '0');
        }
        return value;
    }

    /**
     * @return  the tar file this reader operates on
     */
    File getFile() {
        return this.file;
    }

    //------------------------------------------------------------< Object >--

    /**
     * @return  the string representation of the underlying tar file
     */
    @Override
    public String toString() {
        return this.file.toString();
    }

}