From 92c70712debf5f89056ec0b6f4a6fb2949e815b5 Mon Sep 17 00:00:00 2001 From: Byron Ruth Date: Sun, 14 May 2023 13:06:01 -0400 Subject: [PATCH 1/8] Add workflow for stale issues Signed-off-by: Byron Ruth --- .github/workflows/stale-issues.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/workflows/stale-issues.yaml diff --git a/.github/workflows/stale-issues.yaml b/.github/workflows/stale-issues.yaml new file mode 100644 index 00000000..75a70ead --- /dev/null +++ b/.github/workflows/stale-issues.yaml @@ -0,0 +1,17 @@ +name: stale-issues +on: + schedule: + - cron: "30 1 * * *" + +jobs: + - name: stale-issues + stale: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v8 + with: + debug-only: true # Set until the behavior is tuned. + days-before-stale: 28 # Mark stale after 4 weeks (28 days) of inactivity + days-before-close: 7 # Close 1 week after marked `Stale` and without follow-up activity + exempt-all-milestones: true # Any issue/PR within a milestone will be omitted + #exempt-assigness: "foo,bar" # Exempt issues/PRs assigned to particular users From 7e9a427d131a05ed758618cae36d407768fcc3bf Mon Sep 17 00:00:00 2001 From: Byron Ruth Date: Sun, 14 May 2023 13:19:30 -0400 Subject: [PATCH 2/8] Fix workflow structure Signed-off-by: Byron Ruth --- .github/workflows/stale-issues.yaml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/stale-issues.yaml b/.github/workflows/stale-issues.yaml index 75a70ead..9d00094c 100644 --- a/.github/workflows/stale-issues.yaml +++ b/.github/workflows/stale-issues.yaml @@ -4,14 +4,14 @@ on: - cron: "30 1 * * *" jobs: - - name: stale-issues - stale: - runs-on: ubuntu-latest - steps: - - uses: actions/stale@v8 - with: - debug-only: true # Set until the behavior is tuned. - days-before-stale: 28 # Mark stale after 4 weeks (28 days) of inactivity - days-before-close: 7 # Close 1 week after marked `Stale` and without follow-up activity - exempt-all-milestones: true # Any issue/PR within a milestone will be omitted - #exempt-assigness: "foo,bar" # Exempt issues/PRs assigned to particular users + stale: + runs-on: ubuntu-latest + + steps: + - uses: actions/stale@v8 + with: + debug-only: true # Set until the behavior is tuned. + days-before-stale: 28 # Mark stale after 4 weeks (28 days) of inactivity + days-before-close: 7 # Close 1 week after marked `Stale` and without follow-up activity + exempt-all-milestones: true # Any issue/PR within a milestone will be omitted + #exempt-assigness: "foo,bar" # Exempt issues/PRs assigned to particular users From f764853e00487315140196ee57d50889eb48c6a3 Mon Sep 17 00:00:00 2001 From: Byron Ruth Date: Mon, 15 May 2023 09:14:31 -0400 Subject: [PATCH 3/8] Disable auto-closing, increase stale threshold Signed-off-by: Byron Ruth --- .github/workflows/stale-issues.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stale-issues.yaml b/.github/workflows/stale-issues.yaml index 9d00094c..1d518508 100644 --- a/.github/workflows/stale-issues.yaml +++ b/.github/workflows/stale-issues.yaml @@ -3,6 +3,10 @@ on: schedule: - cron: "30 1 * * *" +permissions: + issues: write + pull-requests: write + jobs: stale: runs-on: ubuntu-latest @@ -11,7 +15,7 @@ jobs: - uses: actions/stale@v8 with: debug-only: true # Set until the behavior is tuned. - days-before-stale: 28 # Mark stale after 4 weeks (28 days) of inactivity - days-before-close: 7 # Close 1 week after marked `Stale` and without follow-up activity + days-before-stale: 56 # Mark stale after 8 weeks (56 days) of inactivity + days-before-close: -1 # Disable auto-closing exempt-all-milestones: true # Any issue/PR within a milestone will be omitted #exempt-assigness: "foo,bar" # Exempt issues/PRs assigned to particular users From 5db57fb053a03dd84928ed84fb14da9e2267d1eb Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Tue, 16 May 2023 14:02:29 -0700 Subject: [PATCH 4/8] Bump to 2.9.17-RC.2 Signed-off-by: Derek Collison --- server/const.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/const.go b/server/const.go index c2675cea..4749abed 100644 --- a/server/const.go +++ b/server/const.go @@ -41,7 +41,7 @@ var ( const ( // VERSION is the current version for the server. - VERSION = "2.9.17-RC.1" + VERSION = "2.9.17-RC.2" // PROTO is the currently supported protocol. // 0 was the original From f3553791b11598c3cd049621b22a39ee7eec1b9d Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Wed, 17 May 2023 13:14:33 -0700 Subject: [PATCH 5/8] Updates to stream reset logic. 1. When catching up do not try forever and if needed reset cluster state. 2. In checking if a stream is healthy check for node drift. 3. When restarting a stream make sure the current node is stopped. Signed-off-by: Derek Collison --- server/jetstream_cluster.go | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/server/jetstream_cluster.go b/server/jetstream_cluster.go index 60354874..5972ba5b 100644 --- a/server/jetstream_cluster.go +++ b/server/jetstream_cluster.go @@ -456,6 +456,9 @@ func (js *jetStream) restartStream(acc *Account, csa *streamAssignment) { } // Make sure to clear out the raft node if still present in the meta layer. if rg := sa.Group; rg != nil && rg.node != nil { + if rg.node.State() != Closed { + rg.node.Stop() + } rg.node = nil } js.mu.Unlock() @@ -493,7 +496,7 @@ func (js *jetStream) restartStream(acc *Account, csa *streamAssignment) { // For R1 it will make sure the stream is present on this server. func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool { js.mu.Lock() - cc := js.cluster + s, cc := js.srv, js.cluster if cc == nil { // Non-clustered mode js.mu.Unlock() @@ -523,7 +526,11 @@ func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool { if !mset.isCatchingUp() { return true } + } else if node != nil && node != mset.raftNode() { + s.Warnf("Detected stream cluster node skew '%s > %s'", acc.GetName(), streamName) + mset.resetClusteredState(nil) } + return false } @@ -7590,6 +7597,10 @@ func (mset *stream) processSnapshot(snap *streamSnapshot) (e error) { } } + // Do not let this go on forever. + const maxRetries = 3 + var numRetries int + RETRY: // On retry, we need to release the semaphore we got. Call will be no-op // if releaseSem boolean has not been set to true on successfully getting @@ -7606,13 +7617,20 @@ RETRY: sub = nil } - // Block here if we have too many requests in flight. - <-s.syncOutSem - releaseSem = true if !s.isRunning() { return ErrServerNotRunning } + numRetries++ + if numRetries >= maxRetries { + // Force a hard reset here. + return errFirstSequenceMismatch + } + + // Block here if we have too many requests in flight. + <-s.syncOutSem + releaseSem = true + // We may have been blocked for a bit, so the reset need to ensure that we // consume the already fired timer. if !notActive.Stop() { From 44a5875968ab246e3253d0c2b3885352628c1770 Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Wed, 17 May 2023 16:05:46 -0700 Subject: [PATCH 6/8] Avoimd deadlock with usage lock for an account during checkAndSyncUsage(). Signed-off-by: Derek Collison --- server/jetstream.go | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/server/jetstream.go b/server/jetstream.go index 50cc8a7d..0c0381ce 100644 --- a/server/jetstream.go +++ b/server/jetstream.go @@ -1813,6 +1813,17 @@ func (jsa *jsAccount) checkAndSyncUsage(tierName string, storeType StorageType) } s := js.srv + // We need to collect the stream stores before we acquire the usage lock since in storeUpdates the + // stream lock could be held if deletion are inline with storing a new message, e.g. via limits. + var stores []StreamStore + for _, mset := range jsa.streams { + mset.mu.RLock() + if mset.tier == tierName && mset.stype == storeType && mset.store != nil { + stores = append(stores, mset.store) + } + mset.mu.RUnlock() + } + // Now range and qualify, hold usage lock to prevent updates. jsa.usageMu.Lock() defer jsa.usageMu.Unlock() @@ -1822,15 +1833,12 @@ func (jsa *jsAccount) checkAndSyncUsage(tierName string, storeType StorageType) return } + // Collect current total for all stream stores that matched. var total int64 var state StreamState - for _, mset := range jsa.streams { - mset.mu.RLock() - if mset.tier == tierName && mset.stype == storeType { - mset.store.FastState(&state) - total += int64(state.Bytes) - } - mset.mu.RUnlock() + for _, store := range stores { + store.FastState(&state) + total += int64(state.Bytes) } var needClusterUpdate bool From a8d7d3886e81ac9aae160f279e7d5c3a1dfad669 Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Wed, 17 May 2023 16:19:39 -0700 Subject: [PATCH 7/8] Make sure to delete the stream assignment node here Signed-off-by: Derek Collison --- server/jetstream_cluster.go | 1 + 1 file changed, 1 insertion(+) diff --git a/server/jetstream_cluster.go b/server/jetstream_cluster.go index 5972ba5b..0c6e0fd7 100644 --- a/server/jetstream_cluster.go +++ b/server/jetstream_cluster.go @@ -528,6 +528,7 @@ func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool { } } else if node != nil && node != mset.raftNode() { s.Warnf("Detected stream cluster node skew '%s > %s'", acc.GetName(), streamName) + node.Delete() mset.resetClusteredState(nil) } From 7dfe5e528e9736d1a74a3f8322ad5911ef0ed9cd Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Wed, 17 May 2023 16:46:10 -0700 Subject: [PATCH 8/8] Bump to 2.9.17-RC.3 Signed-off-by: Derek Collison --- server/const.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/const.go b/server/const.go index 4749abed..20113e4a 100644 --- a/server/const.go +++ b/server/const.go @@ -41,7 +41,7 @@ var ( const ( // VERSION is the current version for the server. - VERSION = "2.9.17-RC.2" + VERSION = "2.9.17-RC.3" // PROTO is the currently supported protocol. // 0 was the original