org.voltcore.agreement.MeshArbiter.java Source code

Java tutorial

Introduction

Here is the source code for org.voltcore.agreement.MeshArbiter.java

Source

/* This file is part of VoltDB.
 * Copyright (C) 2008-2013 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltcore.agreement;

import static com.google.common.base.Predicates.equalTo;
import static com.google.common.base.Predicates.not;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.FaultMessage;
import org.voltcore.messaging.Mailbox;
import org.voltcore.messaging.SiteFailureForwardMessage;
import org.voltcore.messaging.SiteFailureMessage;
import org.voltcore.messaging.Subject;
import org.voltcore.messaging.VoltMessage;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.Pair;
import org.voltdb.VoltDB;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.primitives.Longs;

public class MeshArbiter {

    protected static int FORWARD_STALL_COUNT = 20 * 5; // 5 seconds

    protected static final VoltLogger m_recoveryLog = new VoltLogger("REJOIN");

    protected static final Subject[] justFailures = new Subject[] { Subject.FAILURE };
    protected static final Subject[] receiveSubjects = new Subject[] { Subject.FAILURE, Subject.SITE_FAILURE_UPDATE,
            Subject.SITE_FAILURE_FORWARD };

    /**
     * During arbitration this map keys contain failed sites we are seeking
     * resolution for, and the values indicate whether or not the fault was
     * witnessed directly or relayed by others
     */
    protected final Map<Long, Boolean> m_inTrouble = Maps.newTreeMap();
    /**
     * The invoking agreement site hsid
     */
    protected final long m_hsId;
    protected final Mailbox m_mailbox;
    /**
     * Companion interface that aides in pinging, and getting safe site zookeeper
     * transaction ids
     */
    protected final MeshAide m_meshAide;
    /**
     * A map whree the keys describe graph links between alive sites and
     * sites listed in the {@link #m_inTrouble} map, and the values are
     * the safe zookeeper transaction ids reported by alive sites
     */
    protected final HashMap<Pair<Long, Long>, Long> m_failedSitesLedger = Maps.newHashMap();
    /**
     * Historic list of failed sites
     */
    protected final Set<Long> m_failedSites = Sets.newTreeSet();

    protected final Map<Long, SiteFailureForwardMessage> m_forwardCandidates = Maps.newHashMap();
    /**
     * it builds mesh graphs, and determines the the kill set to resolve
     * an arbitration
     */
    protected final AgreementSeeker m_seeker;

    /**
     * useful when probing the state of this mesh arbiter
     */
    protected volatile int m_inTroubleCount = 0;
    /**
     * useful when probing the state of this mesh arbiter. Each
     * resolved arbitration increments this counter
     */
    protected volatile int m_failedSitesCount = 0;

    public MeshArbiter(final long hsId, final Mailbox mailbox, final MeshAide meshAide) {

        m_hsId = hsId;
        m_mailbox = mailbox;
        m_meshAide = meshAide;
        m_seeker = new AgreementSeeker(ArbitrationStrategy.MATCHING_CARDINALITY, m_hsId);
    }

    public boolean isInArbitration() {
        return m_inTroubleCount > 0;
    }

    public int getFailedSitesCount() {
        return m_failedSitesCount;
    }

    enum Discard {
        Suicide {
            @Override
            void log(FaultMessage fm) {
                m_recoveryLog.info("Agreement, Discarding " + name() + " reporter: "
                        + CoreUtils.hsIdToString(fm.reportingSite));
            }
        },
        AlreadyFailed {
            @Override
            void log(FaultMessage fm) {
                m_recoveryLog.info("Agreement, Discarding " + name() + " " + CoreUtils.hsIdToString(fm.failedSite));
            }
        },
        ReporterFailed {
            @Override
            void log(FaultMessage fm) {
                m_recoveryLog
                        .info("Agreement, Discarding " + name() + " " + CoreUtils.hsIdToString(fm.reportingSite));
            }
        },
        Unknown {
            @Override
            void log(FaultMessage fm) {
                m_recoveryLog.info("Agreement, Discarding " + name() + " " + CoreUtils.hsIdToString(fm.failedSite));
            }
        },
        ReporterUnknown {
            @Override
            void log(FaultMessage fm) {
                m_recoveryLog
                        .info("Agreement, Discarding " + name() + " " + CoreUtils.hsIdToString(fm.reportingSite));
            }
        },
        ReporterWitnessed {
            @Override
            void log(FaultMessage fm) {
                m_recoveryLog
                        .info("Agreement, Discarding " + name() + " " + CoreUtils.hsIdToString(fm.reportingSite));
            }
        },
        SelfUnwitnessed {
            @Override
            void log(FaultMessage fm) {
                m_recoveryLog.info("Agreement, Discarding " + name() + " " + CoreUtils.hsIdToString(fm.failedSite));
            }
        },
        AlreadyKnow {
            @Override
            void log(FaultMessage fm) {
                m_recoveryLog.info("Agreement, Discarding " + name() + " " + CoreUtils.hsIdToString(fm.failedSite)
                        + " reporter: " + CoreUtils.hsIdToString(fm.reportingSite)
                        + (fm.decided ? " decided: true" : ""));
            }
        },
        OtherUnwitnessed {
            @Override
            void log(FaultMessage fm) {
                m_recoveryLog
                        .info("Agreement, Discarding " + name() + " other: " + CoreUtils.hsIdToString(fm.failedSite)
                                + ", repoter: " + CoreUtils.hsIdToString(fm.reportingSite) + ", survivors: ["
                                + CoreUtils.hsIdCollectionToString(fm.survivors) + "]");
            }
        },
        SoleSurvivor {
            @Override
            void log(FaultMessage fm) {
                m_recoveryLog.info("Agreement, Discarding " + name() + " repoter: "
                        + CoreUtils.hsIdToString(fm.reportingSite));
            }
        },
        DoNot {
            @Override
            void log(FaultMessage fm) {
            }
        };

        abstract void log(FaultMessage fm);
    }

    protected Discard mayIgnore(Set<Long> hsIds, FaultMessage fm) {
        Boolean alreadyWitnessed = m_inTrouble.get(fm.failedSite);

        if (fm.failedSite == m_hsId) {
            return Discard.Suicide;
        } else if (m_failedSites.contains(fm.failedSite)) {
            return Discard.AlreadyFailed;
        } else if (!hsIds.contains(fm.failedSite)) {
            return Discard.Unknown;
        } else if (m_failedSites.contains(fm.reportingSite)) {
            return Discard.ReporterFailed;
        } else if (!hsIds.contains(fm.reportingSite)) {
            return Discard.ReporterUnknown;
        } else if (fm.isSoleSurvivor()) {
            return Discard.SoleSurvivor;
        } else if (Boolean.TRUE.equals(m_inTrouble.get(fm.reportingSite))) {
            return Discard.ReporterWitnessed;
        } else if (!fm.witnessed && fm.reportingSite == m_hsId) {
            return Discard.SelfUnwitnessed;
        } else if (alreadyWitnessed != null
                && (alreadyWitnessed || alreadyWitnessed == (fm.witnessed || fm.decided))) {
            return Discard.AlreadyKnow;
        } else if (fm.survivors.contains(fm.failedSite)) {
            /*
             * by the time we get here we fm.failedSite is
             * within our survivors: not among failed, and among
             * hsids (not(not(among hsids)))
             */
            return Discard.OtherUnwitnessed;
        } else {
            return Discard.DoNot;
        }
    }

    /**
     * Process the fault message, and if necessary start arbitration.
     * @param hsIds pre-failure mesh ids
     * @param fm a {@link FaultMessage}
     * @return a map where the keys are the sites we need to disconnect from, and
     *   the values the last know safe zookeeper transaction ids for the sites
     *   we need to disconnect from. A map with entries indicate that an
     *   arbitration resolutions has been reached, while a map without entries
     *   indicate either a stale message, or that an agreement has not been
     *   reached
     */
    public Map<Long, Long> reconfigureOnFault(Set<Long> hsIds, FaultMessage fm) {
        boolean proceed = false;
        do {
            Discard ignoreIt = mayIgnore(hsIds, fm);
            if (Discard.DoNot == ignoreIt) {
                m_inTrouble.put(fm.failedSite, fm.witnessed || fm.decided);
                m_recoveryLog.info("Agreement, Processing " + fm);
                proceed = true;
            } else {
                ignoreIt.log(fm);
            }

            fm = (FaultMessage) m_mailbox.recv(justFailures);
        } while (fm != null);

        if (!proceed) {
            return ImmutableMap.of();
        }

        m_inTroubleCount = m_inTrouble.size();

        // we are here if failed site was not previously recorded
        // or it was previously recorded but it became witnessed from unwitnessed
        m_seeker.startSeekingFor(Sets.difference(hsIds, m_failedSites), m_inTrouble);

        discoverGlobalFaultData_send(hsIds);

        if (discoverGlobalFaultData_rcv(hsIds)) {
            Map<Long, Long> lastTxnIdByFailedSite = extractGlobalFaultData(hsIds);
            if (lastTxnIdByFailedSite.isEmpty()) {
                return ImmutableMap.of();
            }

            Set<Long> witnessed = Maps.filterValues(m_inTrouble, equalTo(Boolean.TRUE)).keySet();
            Set<Long> notClosed = Sets.difference(witnessed, lastTxnIdByFailedSite.keySet());
            if (!notClosed.isEmpty()) {
                m_recoveryLog.warn("Agreement, witnessed but not decided: ["
                        + CoreUtils.hsIdCollectionToString(notClosed) + "] seeker: " + m_seeker);
            }

            notifyOnKill(hsIds, lastTxnIdByFailedSite);

            m_failedSites.addAll(lastTxnIdByFailedSite.keySet());
            m_failedSitesCount = m_failedSites.size();

            m_recoveryLog
                    .info("Agreement, Adding " + CoreUtils.hsIdCollectionToString(lastTxnIdByFailedSite.keySet())
                            + " to failed sites history");

            clearInTrouble(lastTxnIdByFailedSite.keySet());
            m_seeker.clear();

            return lastTxnIdByFailedSite;
        } else {
            return ImmutableMap.of();
        }
    }

    /**
     * Notify all survivors when you are closing links to nodes
     * @param decision map where the keys contain the kill sites
     *   and its values are their last known safe transaction ids
     */
    protected void notifyOnKill(Set<Long> hsIds, Map<Long, Long> decision) {

        SiteFailureMessage.Builder sfmb = SiteFailureMessage.builder().decisions(decision.keySet())
                .failures(decision.keySet());

        Set<Long> dests = Sets.filter(m_seeker.getSurvivors(), not(equalTo(m_hsId)));
        if (dests.isEmpty())
            return;

        sfmb.survivors(Sets.difference(m_seeker.getSurvivors(), decision.keySet()));
        sfmb.safeTxnIds(getSafeTxnIdsForSites(hsIds));

        SiteFailureMessage sfm = sfmb.build();
        m_mailbox.send(Longs.toArray(dests), sfm);

        m_recoveryLog.info("Agreement, Sending [" + CoreUtils.hsIdCollectionToString(dests) + "]  " + sfm);
    }

    protected void clearInTrouble(Set<Long> decision) {
        m_forwardCandidates.clear();
        m_failedSitesLedger.clear();
        m_inTrouble.clear();
        m_inTroubleCount = 0;
    }

    protected Map<Long, Long> getSafeTxnIdsForSites(Set<Long> hsIds) {
        ImmutableMap.Builder<Long, Long> safeb = ImmutableMap.builder();
        for (long h : Sets.filter(hsIds, not(equalTo(m_hsId)))) {
            safeb.put(h, m_meshAide.getNewestSafeTransactionForInitiator(h));
        }
        return safeb.build();
    }

    /**
     * Send one message to each surviving execution site providing this site's
     * multi-partition commit point and this site's safe txnid
     * (the receiver will filter the later for its
     * own partition). Do this once for each failed initiator that we know about.
     * Sends all data all the time to avoid a need for request/response.
     */
    private void discoverGlobalFaultData_send(Set<Long> hsIds) {
        Set<Long> dests = Sets.filter(m_seeker.getSurvivors(), not(equalTo(m_hsId)));

        SiteFailureMessage.Builder msgBuilder = SiteFailureMessage.builder().survivors(m_seeker.getSurvivors())
                .failures(m_inTrouble.keySet()).safeTxnIds(getSafeTxnIdsForSites(hsIds));

        SiteFailureMessage sfm = msgBuilder.build();
        sfm.m_sourceHSId = m_hsId;

        updateFailedSitesLedger(hsIds, sfm);
        m_seeker.add(sfm);

        m_mailbox.send(Longs.toArray(dests), sfm);

        m_recoveryLog.info("Agreement, Sending survivors " + sfm);
    }

    protected void updateFailedSitesLedger(Set<Long> hsIds, SiteFailureMessage sfm) {
        for (Map.Entry<Long, Long> e : sfm.m_safeTxnIds.entrySet()) {

            if (!hsIds.contains(e.getKey()) || m_hsId == e.getKey() || e.getKey() == sfm.m_sourceHSId)
                continue;

            m_failedSitesLedger.put(Pair.of(sfm.m_sourceHSId, e.getKey()), e.getValue());
        }
    }

    protected void addForwardCandidate(SiteFailureForwardMessage sffm) {
        SiteFailureForwardMessage prev = m_forwardCandidates.get(sffm.m_reportingHSId);

        if (prev != null && prev.m_survivors.size() < sffm.m_survivors.size())
            return;

        m_forwardCandidates.put(sffm.m_reportingHSId, sffm);
    }

    /**
     * Collect the failure site update messages from all sites This site sent
     * its own mailbox the above broadcast the maximum is local to this site.
     * This also ensures at least one response.
     *
     * Concurrent failures can be detected by additional reports from the FaultDistributor
     * or a mismatch in the set of failed hosts reported in a message from another site
     */
    private boolean discoverGlobalFaultData_rcv(Set<Long> hsIds) {

        long blockedOnReceiveStart = System.currentTimeMillis();
        long lastReportTime = 0;
        boolean haveEnough = false;
        int[] forwardStallCount = new int[] { FORWARD_STALL_COUNT };

        do {
            VoltMessage m = m_mailbox.recvBlocking(receiveSubjects, 5);

            /*
             * If fault resolution takes longer then 10 seconds start logging
             */
            final long now = System.currentTimeMillis();
            if (now - blockedOnReceiveStart > 10000) {
                if (now - lastReportTime > 60000) {
                    lastReportTime = System.currentTimeMillis();
                    haveNecessaryFaultInfo(m_seeker.getSurvivors(), true);
                }
            }

            if (m == null) {
                // Send a heartbeat to keep the dead host timeout active.  Needed because IV2 doesn't
                // generate its own heartbeats to keep this running.
                m_meshAide.sendHeartbeats(m_seeker.getSurvivors());

            } else if (m.getSubject() == Subject.SITE_FAILURE_UPDATE.getId()) {

                SiteFailureMessage sfm = (SiteFailureMessage) m;

                if (!m_seeker.getSurvivors().contains(m.m_sourceHSId) || m_failedSites.contains(m.m_sourceHSId)
                        || m_failedSites.containsAll(sfm.getFailedSites()))
                    continue;

                updateFailedSitesLedger(hsIds, sfm);

                m_seeker.add(sfm);
                addForwardCandidate(new SiteFailureForwardMessage(sfm));

                m_recoveryLog.info("Agreement, Received " + sfm);

            } else if (m.getSubject() == Subject.SITE_FAILURE_FORWARD.getId()) {

                SiteFailureForwardMessage fsfm = (SiteFailureForwardMessage) m;

                addForwardCandidate(fsfm);

                if (!hsIds.contains(fsfm.m_sourceHSId) || m_seeker.getSurvivors().contains(fsfm.m_reportingHSId)
                        || m_failedSites.contains(fsfm.m_reportingHSId)
                        || m_failedSites.containsAll(fsfm.getFailedSites()))
                    continue;

                m_seeker.add(fsfm);

                m_recoveryLog.info("Agreement, Received forward " + fsfm);

                forwardStallCount[0] = FORWARD_STALL_COUNT;

            } else if (m.getSubject() == Subject.FAILURE.getId()) {
                /*
                 * If the fault distributor reports a new fault, ignore it if it is known , otherwise
                 * re-deliver the message to ourself and then abort so that the process can restart.
                 */
                FaultMessage fm = (FaultMessage) m;

                Discard ignoreIt = mayIgnore(hsIds, fm);
                if (Discard.DoNot == ignoreIt) {
                    m_mailbox.deliverFront(m);
                    m_recoveryLog
                            .info("Agreement, Detected a concurrent failure from FaultDistributor, new failed site "
                                    + CoreUtils.hsIdToString(fm.failedSite));
                    return false;
                } else {
                    if (m_recoveryLog.isDebugEnabled()) {
                        ignoreIt.log(fm);
                    }
                }
            }

            haveEnough = haveEnough || haveNecessaryFaultInfo(m_seeker.getSurvivors(), false);
            if (haveEnough) {

                Iterator<Map.Entry<Long, SiteFailureForwardMessage>> itr = m_forwardCandidates.entrySet()
                        .iterator();

                while (itr.hasNext()) {
                    Map.Entry<Long, SiteFailureForwardMessage> e = itr.next();
                    Set<Long> unseenBy = m_seeker.forWhomSiteIsDead(e.getKey());
                    if (unseenBy.size() > 0) {
                        m_mailbox.send(Longs.toArray(unseenBy), e.getValue());
                        m_recoveryLog.info("Agreement, fowarding to " + CoreUtils.hsIdCollectionToString(unseenBy)
                                + " " + e.getValue());
                    }
                    itr.remove();
                }
            }

        } while (!haveEnough || m_seeker.needForward(forwardStallCount));

        return true;
    }

    private boolean haveNecessaryFaultInfo(Set<Long> survivors, boolean log) {
        List<Pair<Long, Long>> missingMessages = new ArrayList<Pair<Long, Long>>();
        for (long survivingSite : survivors) {
            for (Long failingSite : m_inTrouble.keySet()) {
                Pair<Long, Long> key = Pair.of(survivingSite, failingSite);
                if (survivingSite != failingSite && !m_failedSitesLedger.containsKey(key)) {
                    missingMessages.add(key);
                }
            }
        }
        if (log) {
            StringBuilder sb = new StringBuilder();
            sb.append('[');
            boolean first = true;
            for (Pair<Long, Long> p : missingMessages) {
                if (!first)
                    sb.append(", ");
                first = false;
                sb.append(CoreUtils.hsIdToString(p.getFirst()));
                sb.append("+>");
                sb.append(CoreUtils.hsIdToString(p.getSecond()));
            }
            sb.append(']');
            if (missingMessages.isEmpty() && m_seeker.needForward()) {
                sb.append(" ");
                sb.append(m_seeker);
            }

            m_recoveryLog.warn("Agreement, Failure resolution stalled waiting for (Reporter +> Failed) "
                    + "information: " + sb.toString());
        }
        return missingMessages.isEmpty();
    }

    private Map<Long, Long> extractGlobalFaultData(Set<Long> hsIds) {

        if (!haveNecessaryFaultInfo(m_seeker.getSurvivors(), false)) {
            VoltDB.crashLocalVoltDB("Error extracting fault data", true, null);
        }

        Set<Long> toBeKilled = m_seeker.nextKill();
        if (toBeKilled.isEmpty()) {
            m_recoveryLog.warn("Agreement, seeker failed to yield a kill set: " + m_seeker);
        }

        Map<Long, Long> initiatorSafeInitPoint = new HashMap<Long, Long>();

        Iterator<Map.Entry<Pair<Long, Long>, Long>> iter = m_failedSitesLedger.entrySet().iterator();

        while (iter.hasNext()) {

            final Map.Entry<Pair<Long, Long>, Long> entry = iter.next();
            final Pair<Long, Long> key = entry.getKey();
            final Long safeTxnId = entry.getValue();

            if (!hsIds.contains(key.getFirst()) || !toBeKilled.contains(key.getSecond())) {
                continue;
            }

            Long initiatorId = key.getSecond();
            if (!initiatorSafeInitPoint.containsKey(initiatorId)) {
                initiatorSafeInitPoint.put(initiatorId, Long.MIN_VALUE);
            }

            initiatorSafeInitPoint.put(initiatorId, Math.max(initiatorSafeInitPoint.get(initiatorId), safeTxnId));
        }

        assert (!initiatorSafeInitPoint.containsValue(Long.MIN_VALUE));

        return ImmutableMap.copyOf(initiatorSafeInitPoint);
    }

}