From 1ba617bba0e0b038fd2cac790bae9db1e777066d Mon Sep 17 00:00:00 2001 From: Ivan Kozlovic Date: Tue, 12 Apr 2022 18:56:28 -0600 Subject: [PATCH] Fixed data race with RAFT node election timer Got this race: ``` ================== WARNING: DATA RACE Read at 0x00c001c880e8 by goroutine 342: github.com/nats-io/nats-server/v2/server.(*raft).resetElect() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:1525 +0x44 github.com/nats-io/nats-server/v2/server.(*raft).resetElectionTimeout() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:1520 +0xa4 github.com/nats-io/nats-server/v2/server.(*raft).handleAppendEntry() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:2537 +0x12e github.com/nats-io/nats-server/v2/server.(*raft).handleAppendEntry-fm() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:2525 +0xcc ... Previous write at 0x00c001c880e8 by goroutine 587: github.com/nats-io/nats-server/v2/server.(*raft).resetElect() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:1526 +0x113 github.com/nats-io/nats-server/v2/server.(*raft).resetElectionTimeout() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:1520 +0xa4 github.com/nats-io/nats-server/v2/server.(*Server).startRaftNode() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:484 +0x20d1 github.com/nats-io/nats-server/v2/server.(*jetStream).createRaftGroup() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/jetstream_cluster.go:1497 +0x9ed github.com/nats-io/nats-server/v2/server.(*jetStream).processClusterCreateConsumer() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/jetstream_cluster.go:3063 +0xba4 ... ================== WARNING: DATA RACE Read at 0x00c0006671f0 by goroutine 342: time.(*Timer).Stop() /usr/local/go/src/time/sleep.go:78 +0x84 github.com/nats-io/nats-server/v2/server.(*raft).resetElect() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:1528 +0x58 github.com/nats-io/nats-server/v2/server.(*raft).resetElectionTimeout() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:1520 +0xa4 github.com/nats-io/nats-server/v2/server.(*raft).handleAppendEntry() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:2537 +0x12e github.com/nats-io/nats-server/v2/server.(*raft).handleAppendEntry-fm() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:2525 +0xcc ... Previous write at 0x00c0006671f0 by goroutine 587: time.NewTimer() /usr/local/go/src/time/sleep.go:92 +0xb3 github.com/nats-io/nats-server/v2/server.(*raft).resetElect() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:1526 +0x104 github.com/nats-io/nats-server/v2/server.(*raft).resetElectionTimeout() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:1520 +0xa4 github.com/nats-io/nats-server/v2/server.(*Server).startRaftNode() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/raft.go:484 +0x20d1 github.com/nats-io/nats-server/v2/server.(*jetStream).createRaftGroup() /Users/ivan/dev/go/src/github.com/nats-io/nats-server/server/jetstream_cluster.go:1497 +0x9ed ... ``` Looked at all places where resetElect() or resetElectionTimeout() was invoked without being protected by the raft's lock and added it. Signed-off-by: Ivan Kozlovic --- server/raft.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/server/raft.go b/server/raft.go index e47feb14..dcf288c7 100644 --- a/server/raft.go +++ b/server/raft.go @@ -1631,10 +1631,14 @@ func (n *raft) runAsFollower() { case <-elect.C: // If we are out of resources we just want to stay in this state for the moment. if n.outOfResources() { + n.Lock() n.resetElectionTimeout() + n.Unlock() n.debug("Not switching to candidate, no resources") } else if n.isObserver() { + n.Lock() n.resetElect(48 * time.Hour) + n.Unlock() n.debug("Not switching to candidate, observer only") } else if n.isCatchingUp() { n.debug("Not switching to candidate, catching up") @@ -2534,7 +2538,9 @@ func (n *raft) handleAppendEntry(sub *subscription, c *client, _ *Account, subje } else { n.warn("AppendEntry failed to be placed on internal channel: corrupt entry") } + n.Lock() n.resetElectionTimeout() + n.Unlock() } // Lock should be held.