From cae91b8cad1847ddbb160a7cd85e8f0b0bb86ca2 Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Mon, 24 Apr 2023 22:14:04 -0700 Subject: [PATCH] In single server mode healthz could mistake a snapshot staging directory during a restore as an account. If the restore took a long time, stalled, or was aborted, this would cause healthz to fail. Signed-off-by: Derek Collison --- server/jetstream_test.go | 34 ++++++++++++++++++++++++++++++++++ server/monitor.go | 3 +++ 2 files changed, 37 insertions(+) diff --git a/server/jetstream_test.go b/server/jetstream_test.go index adc11269..98f2279a 100644 --- a/server/jetstream_test.go +++ b/server/jetstream_test.go @@ -19968,3 +19968,37 @@ func TestJetStreamKVHistoryRegression(t *testing.T) { }) } } + +func TestJetStreamSnapshotRestoreStallAndHealthz(t *testing.T) { + s := RunBasicJetStreamServer(t) + defer s.Shutdown() + + nc, js := jsClientConnect(t, s) + defer nc.Close() + + _, err := js.AddStream(&nats.StreamConfig{ + Name: "ORDERS", + Subjects: []string{"orders.*"}, + }) + require_NoError(t, err) + + for i := 0; i < 1000; i++ { + sendStreamMsg(t, nc, "orders.created", "new order") + } + + hs := s.healthz(nil) + if hs.Status != "ok" || hs.Error != _EMPTY_ { + t.Fatalf("Expected health to be ok, got %+v", hs) + } + + // Simulate the staging directory for restores. This is normally cleaned up + // but since it's at the root of the storage directory make sure healthz is not affected. + snapDir := filepath.Join(s.getJetStream().config.StoreDir, snapStagingDir) + require_NoError(t, os.MkdirAll(snapDir, defaultDirPerms)) + + // Make sure healthz ok. 
+ hs = s.healthz(nil) + if hs.Status != "ok" || hs.Error != _EMPTY_ { + t.Fatalf("Expected health to be ok, got %+v", hs) + } +} diff --git a/server/monitor.go b/server/monitor.go index 57b7380f..7d118ab4 100644 --- a/server/monitor.go +++ b/server/monitor.go @@ -3083,6 +3083,9 @@ func (s *Server) healthz(opts *HealthzOptions) *HealthStatus { // Whip through account folders and pull each stream name. fis, _ := os.ReadDir(sdir) for _, fi := range fis { + if fi.Name() == snapStagingDir { + continue + } acc, err := s.LookupAccount(fi.Name()) if err != nil { health.Status = na