From a5f56036458bb74a92ccc5bc6c52b573b604a4c9 Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Sat, 15 Apr 2023 12:23:44 -0700 Subject: [PATCH] Reset our WAL on edge conditions instead of trying to recover. Also, if we are timing out and trying to become a candidate but are still catching up, check whether the catchup has stalled. Signed-off-by: Derek Collison --- server/raft.go | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/server/raft.go b/server/raft.go index 4c631258..e27f1a56 100644 --- a/server/raft.go +++ b/server/raft.go @@ -1792,6 +1792,10 @@ func (n *raft) runAsFollower() { n.debug("Not switching to candidate, observer only") } else if n.isCatchingUp() { n.debug("Not switching to candidate, catching up") + // Check to see if our catchup has stalled. + if n.catchupStalled() { + n.cancelCatchup() + } } else { n.switchToCandidate() return @@ -2523,14 +2527,16 @@ func (n *raft) applyCommit(index uint64) error { var err error if ae, err = n.loadEntry(index); err != nil { if err != ErrStoreClosed && err != ErrStoreEOF { - if err == errBadMsg { - n.warn("Got an error loading %d index: %v - will reset", index, err) - n.resetWAL() - } else { - n.warn("Got an error loading %d index: %v", index, err) + n.warn("Got an error loading %d index: %v - will reset", index, err) + if n.state == Leader { + n.stepdown.push(n.selectNextLeader()) } + // Reset and cancel any catchup. + n.resetWAL() + n.cancelCatchup() + } else { + n.commit = original } - n.commit = original return errEntryLoadFailed } } else { @@ -3275,17 +3281,13 @@ func (n *raft) storeToWAL(ae *appendEntry) error { // Sanity checking for now. if index := ae.pindex + 1; index != seq { - n.warn("Wrong index, ae is %+v, index stored was %d, n.pindex is %d", ae, seq, n.pindex) - if index > seq { - // We are missing store state from our state. We need to stepdown at this point. 
- if n.state == Leader { - n.stepdown.push(n.selectNextLeader()) - } - } else { - // Truncate back to our last known. - n.truncateWAL(n.pterm, n.pindex) - n.cancelCatchup() + n.warn("Wrong index, ae is %+v, index stored was %d, n.pindex is %d, will reset", ae, seq, n.pindex) + if n.state == Leader { + n.stepdown.push(n.selectNextLeader()) } + // Reset and cancel any catchup. + n.resetWAL() + n.cancelCatchup() return errEntryStoreFailed }