org.apache.cassandra.db.commitlog.CommitLogReplayer.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.cassandra.db.commitlog.CommitLogReplayer.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.cassandra.db.commitlog;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;

import com.google.common.base.Predicate;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.github.tjake.ICRC32;

import org.apache.cassandra.concurrent.Stage;
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.Schema;
import org.apache.cassandra.db.*;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.io.compress.CompressionParameters;
import org.apache.cassandra.io.compress.ICompressor;
import org.apache.cassandra.io.util.ByteBufferDataInput;
import org.apache.cassandra.io.util.FastByteArrayInputStream;
import org.apache.cassandra.io.util.FileDataInput;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.io.util.RandomAccessReader;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.CRC32Factory;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.JVMStabilityInspector;
import org.apache.cassandra.utils.WrappedRunnable;
import org.cliffc.high_scale_lib.NonBlockingHashSet;

public class CommitLogReplayer {
    static final String IGNORE_REPLAY_ERRORS_PROPERTY = "cassandra.commitlog.ignorereplayerrors";
    private static final Logger logger = LoggerFactory.getLogger(CommitLogReplayer.class);
    private static final int MAX_OUTSTANDING_REPLAY_COUNT = Integer
            .getInteger("cassandra.commitlog_max_outstanding_replay_count", 1024);
    private static final int LEGACY_END_OF_SEGMENT_MARKER = 0;

    private final Set<Keyspace> keyspacesRecovered;
    private final List<Future<?>> futures;
    private final Map<UUID, AtomicInteger> invalidMutations;
    private final AtomicInteger replayedCount;
    private final Map<UUID, ReplayPosition> cfPositions;
    private final ReplayPosition globalPosition;
    private final ICRC32 checksum;
    private byte[] buffer;
    private byte[] uncompressedBuffer;

    private final ReplayFilter replayFilter;
    private final CommitLogArchiver archiver;

    CommitLogReplayer(CommitLog commitLog, ReplayPosition globalPosition, Map<UUID, ReplayPosition> cfPositions,
            ReplayFilter replayFilter) {
        this.keyspacesRecovered = new NonBlockingHashSet<Keyspace>();
        this.futures = new ArrayList<Future<?>>();
        this.buffer = new byte[4096];
        this.uncompressedBuffer = new byte[4096];
        this.invalidMutations = new HashMap<UUID, AtomicInteger>();
        // count the number of replayed mutation. We don't really care about atomicity, but we need it to be a reference.
        this.replayedCount = new AtomicInteger();
        this.checksum = CRC32Factory.instance.create();
        this.cfPositions = cfPositions;
        this.globalPosition = globalPosition;
        this.replayFilter = replayFilter;
        this.archiver = commitLog.archiver;
    }

    public static CommitLogReplayer construct(CommitLog commitLog) {
        // compute per-CF and global replay positions
        Map<UUID, ReplayPosition> cfPositions = new HashMap<UUID, ReplayPosition>();
        Ordering<ReplayPosition> replayPositionOrdering = Ordering.from(ReplayPosition.comparator);
        ReplayFilter replayFilter = ReplayFilter.create();
        for (ColumnFamilyStore cfs : ColumnFamilyStore.all()) {
            // it's important to call RP.gRP per-cf, before aggregating all the positions w/ the Ordering.min call
            // below: gRP will return NONE if there are no flushed sstables, which is important to have in the
            // list (otherwise we'll just start replay from the first flush position that we do have, which is not correct).
            ReplayPosition rp = ReplayPosition.getReplayPosition(cfs.getSSTables());

            // but, if we've truncated the cf in question, then we need to need to start replay after the truncation
            ReplayPosition truncatedAt = SystemKeyspace.getTruncatedPosition(cfs.metadata.cfId);
            if (truncatedAt != null) {
                // Point in time restore is taken to mean that the tables need to be recovered even if they were
                // deleted at a later point in time. Any truncation record after that point must thus be cleared prior
                // to recovery (CASSANDRA-9195).
                long restoreTime = commitLog.archiver.restorePointInTime;
                long truncatedTime = SystemKeyspace.getTruncatedAt(cfs.metadata.cfId);
                if (truncatedTime > restoreTime) {
                    if (replayFilter.includes(cfs.metadata)) {
                        logger.info(
                                "Restore point in time is before latest truncation of table {}.{}. Clearing truncation record.",
                                cfs.metadata.ksName, cfs.metadata.cfName);
                        SystemKeyspace.removeTruncationRecord(cfs.metadata.cfId);
                    }
                } else {
                    rp = replayPositionOrdering.max(Arrays.asList(rp, truncatedAt));
                }
            }

            cfPositions.put(cfs.metadata.cfId, rp);
        }
        ReplayPosition globalPosition = replayPositionOrdering.min(cfPositions.values());
        logger.trace("Global replay position is {} from columnfamilies {}", globalPosition,
                FBUtilities.toString(cfPositions));
        return new CommitLogReplayer(commitLog, globalPosition, cfPositions, replayFilter);
    }

    public void recover(File[] clogs) throws IOException {
        int i;
        for (i = 0; i < clogs.length; ++i)
            recover(clogs[i], i + 1 == clogs.length);
    }

    public int blockForWrites() {
        for (Map.Entry<UUID, AtomicInteger> entry : invalidMutations.entrySet())
            logger.warn(String.format("Skipped %d mutations from unknown (probably removed) CF with id %s",
                    entry.getValue().intValue(), entry.getKey()));

        // wait for all the writes to finish on the mutation stage
        FBUtilities.waitOnFutures(futures);
        logger.trace("Finished waiting on mutations from recovery");

        // flush replayed keyspaces
        futures.clear();
        for (Keyspace keyspace : keyspacesRecovered)
            futures.addAll(keyspace.flush());
        FBUtilities.waitOnFutures(futures);
        return replayedCount.get();
    }

    private int readSyncMarker(CommitLogDescriptor descriptor, int offset, RandomAccessReader reader,
            boolean tolerateTruncation) throws IOException {
        if (offset > reader.length() - CommitLogSegment.SYNC_MARKER_SIZE) {
            // There was no room in the segment to write a final header. No data could be present here.
            return -1;
        }
        reader.seek(offset);
        ICRC32 crc = CRC32Factory.instance.create();
        crc.updateInt((int) (descriptor.id & 0xFFFFFFFFL));
        crc.updateInt((int) (descriptor.id >>> 32));
        crc.updateInt((int) reader.getPosition());
        int end = reader.readInt();
        long filecrc = reader.readInt() & 0xffffffffL;
        if (crc.getValue() != filecrc) {
            if (end != 0 || filecrc != 0) {
                handleReplayError(false,
                        "Encountered bad header at position %d of commit log %s, with invalid CRC. "
                                + "The end of segment marker should be zero.",
                        offset, reader.getPath());
            }
            return -1;
        } else if (end < offset || end > reader.length()) {
            handleReplayError(tolerateTruncation,
                    "Encountered bad header at position %d of commit log %s, with bad position but valid CRC",
                    offset, reader.getPath());
            return -1;
        }
        return end;
    }

    abstract static class ReplayFilter {
        public abstract Iterable<ColumnFamily> filter(Mutation mutation);

        public abstract boolean includes(CFMetaData metadata);

        public static ReplayFilter create() {
            // If no replaylist is supplied an empty array of strings is used to replay everything.
            if (System.getProperty("cassandra.replayList") == null)
                return new AlwaysReplayFilter();

            Multimap<String, String> toReplay = HashMultimap.create();
            for (String rawPair : System.getProperty("cassandra.replayList").split(",")) {
                String[] pair = rawPair.trim().split("\\.");
                if (pair.length != 2)
                    throw new IllegalArgumentException(
                            "Each table to be replayed must be fully qualified with keyspace name, e.g., 'system.peers'");

                Keyspace ks = Schema.instance.getKeyspaceInstance(pair[0]);
                if (ks == null)
                    throw new IllegalArgumentException("Unknown keyspace " + pair[0]);
                ColumnFamilyStore cfs = ks.getColumnFamilyStore(pair[1]);
                if (cfs == null)
                    throw new IllegalArgumentException(String.format("Unknown table %s.%s", pair[0], pair[1]));

                toReplay.put(pair[0], pair[1]);
            }
            return new CustomReplayFilter(toReplay);
        }
    }

    private static class AlwaysReplayFilter extends ReplayFilter {
        public Iterable<ColumnFamily> filter(Mutation mutation) {
            return mutation.getColumnFamilies();
        }

        public boolean includes(CFMetaData metadata) {
            return true;
        }
    }

    private static class CustomReplayFilter extends ReplayFilter {
        private Multimap<String, String> toReplay;

        public CustomReplayFilter(Multimap<String, String> toReplay) {
            this.toReplay = toReplay;
        }

        public Iterable<ColumnFamily> filter(Mutation mutation) {
            final Collection<String> cfNames = toReplay.get(mutation.getKeyspaceName());
            if (cfNames == null)
                return Collections.emptySet();

            return Iterables.filter(mutation.getColumnFamilies(), new Predicate<ColumnFamily>() {
                public boolean apply(ColumnFamily cf) {
                    return cfNames.contains(cf.metadata().cfName);
                }
            });
        }

        public boolean includes(CFMetaData metadata) {
            return toReplay.containsEntry(metadata.ksName, metadata.cfName);
        }
    }

    @SuppressWarnings("resource")
    public void recover(File file, boolean tolerateTruncation) throws IOException {
        CommitLogDescriptor desc = CommitLogDescriptor.fromFileName(file.getName());
        RandomAccessReader reader = RandomAccessReader.open(new File(file.getAbsolutePath()));
        try {
            if (desc.version < CommitLogDescriptor.VERSION_21) {
                if (logAndCheckIfShouldSkip(file, desc))
                    return;
                if (globalPosition.segment == desc.id)
                    reader.seek(globalPosition.position);
                replaySyncSection(reader, (int) reader.getPositionLimit(), desc, desc.fileName(),
                        tolerateTruncation);
                return;
            }

            final long segmentId = desc.id;
            try {
                desc = CommitLogDescriptor.readHeader(reader);
            } catch (IOException e) {
                desc = null;
            }
            if (desc == null) {
                handleReplayError(false, "Could not read commit log descriptor in file %s", file);
                return;
            }
            if (segmentId != desc.id) {
                handleReplayError(false, "Segment id mismatch (filename %d, descriptor %d) in file %s", segmentId,
                        desc.id, file);
                // continue processing if ignored.
            }

            if (logAndCheckIfShouldSkip(file, desc))
                return;

            ICompressor compressor = null;
            if (desc.compression != null) {
                try {
                    compressor = CompressionParameters.createCompressor(desc.compression);
                } catch (ConfigurationException e) {
                    handleReplayError(false, "Unknown compression: %s", e.getMessage());
                    return;
                }
            }

            assert reader.length() <= Integer.MAX_VALUE;
            int end = (int) reader.getFilePointer();
            int replayEnd = end;

            while ((end = readSyncMarker(desc, end, reader, tolerateTruncation)) >= 0) {
                int replayPos = replayEnd + CommitLogSegment.SYNC_MARKER_SIZE;

                if (logger.isTraceEnabled())
                    logger.trace("Replaying {} between {} and {}", file, reader.getFilePointer(), end);
                if (compressor != null) {
                    int uncompressedLength = reader.readInt();
                    replayEnd = replayPos + uncompressedLength;
                } else {
                    replayEnd = end;
                }

                if (segmentId == globalPosition.segment && replayEnd < globalPosition.position)
                    // Skip over flushed section.
                    continue;

                FileDataInput sectionReader = reader;
                String errorContext = desc.fileName();
                // In the uncompressed case the last non-fully-flushed section can be anywhere in the file.
                boolean tolerateErrorsInSection = tolerateTruncation;
                if (compressor != null) {
                    // In the compressed case we know if this is the last section.
                    tolerateErrorsInSection &= end == reader.length() || end < 0;

                    int start = (int) reader.getFilePointer();
                    try {
                        int compressedLength = end - start;
                        if (logger.isTraceEnabled())
                            logger.trace("Decompressing {} between replay positions {} and {}", file, replayPos,
                                    replayEnd);
                        if (compressedLength > buffer.length)
                            buffer = new byte[(int) (1.2 * compressedLength)];
                        reader.readFully(buffer, 0, compressedLength);
                        int uncompressedLength = replayEnd - replayPos;
                        if (uncompressedLength > uncompressedBuffer.length)
                            uncompressedBuffer = new byte[(int) (1.2 * uncompressedLength)];
                        compressedLength = compressor.uncompress(buffer, 0, compressedLength, uncompressedBuffer,
                                0);
                        sectionReader = new ByteBufferDataInput(ByteBuffer.wrap(uncompressedBuffer),
                                reader.getPath(), replayPos, 0);
                        errorContext = "compressed section at " + start + " in " + errorContext;
                    } catch (IOException | ArrayIndexOutOfBoundsException e) {
                        handleReplayError(tolerateErrorsInSection,
                                "Unexpected exception decompressing section at %d: %s", start, e);
                        continue;
                    }
                }

                if (!replaySyncSection(sectionReader, replayEnd, desc, errorContext, tolerateErrorsInSection))
                    break;
            }
        } finally {
            FileUtils.closeQuietly(reader);
            logger.debug("Finished reading {}", file);
        }
    }

    public boolean logAndCheckIfShouldSkip(File file, CommitLogDescriptor desc) {
        logger.debug("Replaying {} (CL version {}, messaging version {}, compression {})", file.getPath(),
                desc.version, desc.getMessagingVersion(), desc.compression);

        if (globalPosition.segment > desc.id) {
            logger.trace("skipping replay of fully-flushed {}", file);
            return true;
        }
        return false;
    }

    /**
     * Replays a sync section containing a list of mutations.
     *
     * @return Whether replay should continue with the next section.
     */
    private boolean replaySyncSection(FileDataInput reader, int end, CommitLogDescriptor desc, String errorContext,
            boolean tolerateErrors) throws IOException {
        /* read the logs populate Mutation and apply */
        while (reader.getFilePointer() < end && !reader.isEOF()) {
            long mutationStart = reader.getFilePointer();
            if (logger.isTraceEnabled())
                logger.trace("Reading mutation at {}", mutationStart);

            long claimedCRC32;
            int serializedSize;
            try {
                // any of the reads may hit EOF
                serializedSize = reader.readInt();
                if (serializedSize == LEGACY_END_OF_SEGMENT_MARKER) {
                    logger.trace("Encountered end of segment marker at {}", reader.getFilePointer());
                    return false;
                }

                // Mutation must be at LEAST 10 bytes:
                // 3 each for a non-empty Keyspace and Key (including the
                // 2-byte length from writeUTF/writeWithShortLength) and 4 bytes for column count.
                // This prevents CRC by being fooled by special-case garbage in the file; see CASSANDRA-2128
                if (serializedSize < 10) {
                    handleReplayError(tolerateErrors, "Invalid mutation size %d at %d in %s", serializedSize,
                            mutationStart, errorContext);
                    return false;
                }

                long claimedSizeChecksum;
                if (desc.version < CommitLogDescriptor.VERSION_21)
                    claimedSizeChecksum = reader.readLong();
                else
                    claimedSizeChecksum = reader.readInt() & 0xffffffffL;
                checksum.reset();
                if (desc.version < CommitLogDescriptor.VERSION_20)
                    checksum.update(serializedSize);
                else
                    checksum.updateInt(serializedSize);

                if (checksum.getValue() != claimedSizeChecksum) {
                    handleReplayError(tolerateErrors, "Mutation size checksum failure at %d in %s", mutationStart,
                            errorContext);
                    return false;
                }
                // ok.

                if (serializedSize > buffer.length)
                    buffer = new byte[(int) (1.2 * serializedSize)];
                reader.readFully(buffer, 0, serializedSize);
                if (desc.version < CommitLogDescriptor.VERSION_21)
                    claimedCRC32 = reader.readLong();
                else
                    claimedCRC32 = reader.readInt() & 0xffffffffL;
            } catch (EOFException eof) {
                handleReplayError(tolerateErrors, "Unexpected end of segment", mutationStart, errorContext);
                return false; // last CL entry didn't get completely written. that's ok.
            }

            checksum.update(buffer, 0, serializedSize);
            if (claimedCRC32 != checksum.getValue()) {
                handleReplayError(tolerateErrors, "Mutation checksum failure at %d in %s", mutationStart,
                        errorContext);
                continue;
            }
            replayMutation(buffer, serializedSize, reader.getFilePointer(), desc);
        }
        return true;
    }

    /**
     * Deserializes and replays a commit log entry.
     */
    void replayMutation(byte[] inputBuffer, int size, final long entryLocation, final CommitLogDescriptor desc)
            throws IOException {

        final Mutation mutation;
        try (FastByteArrayInputStream bufIn = new FastByteArrayInputStream(inputBuffer, 0, size)) {
            mutation = Mutation.serializer.deserialize(new DataInputStream(bufIn), desc.getMessagingVersion(),
                    ColumnSerializer.Flag.LOCAL);
            // doublecheck that what we read is [still] valid for the current schema
            for (ColumnFamily cf : mutation.getColumnFamilies())
                for (Cell cell : cf)
                    cf.getComparator().validate(cell.name());
        } catch (UnknownColumnFamilyException ex) {
            if (ex.cfId == null)
                return;
            AtomicInteger i = invalidMutations.get(ex.cfId);
            if (i == null) {
                i = new AtomicInteger(1);
                invalidMutations.put(ex.cfId, i);
            } else
                i.incrementAndGet();
            return;
        } catch (Throwable t) {
            JVMStabilityInspector.inspectThrowable(t);
            File f = File.createTempFile("mutation", "dat");

            try (DataOutputStream out = new DataOutputStream(new FileOutputStream(f))) {
                out.write(inputBuffer, 0, size);
            }

            // Checksum passed so this error can't be permissible.
            handleReplayError(false, "Unexpected error deserializing mutation; saved to %s.  "
                    + "This may be caused by replaying a mutation against a table with the same name but incompatible schema.  "
                    + "Exception follows: %s", f.getAbsolutePath(), t);
            return;
        }

        if (logger.isTraceEnabled())
            logger.trace("replaying mutation for {}.{}: {}", mutation.getKeyspaceName(),
                    ByteBufferUtil.bytesToHex(mutation.key()),
                    "{" + StringUtils.join(mutation.getColumnFamilies().iterator(), ", ") + "}");

        Runnable runnable = new WrappedRunnable() {
            public void runMayThrow() throws IOException {
                if (Schema.instance.getKSMetaData(mutation.getKeyspaceName()) == null)
                    return;
                if (pointInTimeExceeded(mutation))
                    return;

                final Keyspace keyspace = Keyspace.open(mutation.getKeyspaceName());

                // Rebuild the mutation, omitting column families that
                //    a) the user has requested that we ignore,
                //    b) have already been flushed,
                // or c) are part of a cf that was dropped.
                // Keep in mind that the cf.name() is suspect. do every thing based on the cfid instead.
                Mutation newMutation = null;
                for (ColumnFamily columnFamily : replayFilter.filter(mutation)) {
                    if (Schema.instance.getCF(columnFamily.id()) == null)
                        continue; // dropped

                    ReplayPosition rp = cfPositions.get(columnFamily.id());

                    // replay if current segment is newer than last flushed one or,
                    // if it is the last known segment, if we are after the replay position
                    if (desc.id > rp.segment || (desc.id == rp.segment && entryLocation > rp.position)) {
                        if (newMutation == null)
                            newMutation = new Mutation(mutation.getKeyspaceName(), mutation.key());
                        newMutation.add(columnFamily);
                        replayedCount.incrementAndGet();
                    }
                }
                if (newMutation != null) {
                    assert !newMutation.isEmpty();
                    Keyspace.open(newMutation.getKeyspaceName()).apply(newMutation, false);
                    keyspacesRecovered.add(keyspace);
                }
            }
        };
        futures.add(StageManager.getStage(Stage.MUTATION).submit(runnable));
        if (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT) {
            FBUtilities.waitOnFutures(futures);
            futures.clear();
        }
    }

    protected boolean pointInTimeExceeded(Mutation fm) {
        long restoreTarget = archiver.restorePointInTime;

        for (ColumnFamily families : fm.getColumnFamilies()) {
            if (archiver.precision.toMillis(families.maxTimestamp()) > restoreTarget)
                return true;
        }
        return false;
    }

    static void handleReplayError(boolean permissible, String message, Object... messageArgs) throws IOException {
        String msg = String.format(message, messageArgs);
        IOException e = new CommitLogReplayException(msg);
        if (permissible)
            logger.error("Ignoring commit log replay error likely due to incomplete flush to disk", e);
        else if (Boolean.getBoolean(IGNORE_REPLAY_ERRORS_PROPERTY))
            logger.error("Ignoring commit log replay error", e);
        else if (!CommitLog.handleCommitError("Failed commit log replay", e)) {
            logger.error(
                    "Replay stopped. If you wish to override this error and continue starting the node ignoring "
                            + "commit log replay problems, specify -D" + IGNORE_REPLAY_ERRORS_PROPERTY + "=true "
                            + "on the command line");
            throw e;
        }
    }

    @SuppressWarnings("serial")
    public static class CommitLogReplayException extends IOException {
        public CommitLogReplayException(String message, Throwable cause) {
            super(message, cause);
        }

        public CommitLogReplayException(String message) {
            super(message);
        }
    }
}