From e1f0361b983844ec83f777fb39eac4edac406e36 Mon Sep 17 00:00:00 2001 From: Ivan Kozlovic Date: Wed, 14 Sep 2022 17:48:06 -0600 Subject: [PATCH] [ADDED] JetStream: ability to remove a server by peer ID instead of name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This can be helpful after a partial cluster restart since in that case the server name may not be known. However "server report jetstream" would report the peer ID that then can be used. For instance here is the output after a cluster restart where server "C" is not restarted. ``` nats -s nats://sys:pwd@localhost:4222 server report jetstream ... ╭────────────────────────────────────────────────────────────────────────────────────────────────╮ │ RAFT Meta Group Information │ ├─────────────────────────────────────────────────────┬────────┬─────────┬────────┬────────┬─────┤ │ Name │ Leader │ Current │ Online │ Active │ Lag │ ├─────────────────────────────────────────────────────┼────────┼─────────┼────────┼────────┼─────┤ │ A │ yes │ true │ true │ 0.00s │ 0 │ │ B │ │ true │ true │ 0.53s │ 0 │ │ Server name unknown at this time (peerID: jZ6RvVRH) │ │ false │ false │ 0.00s │ 0 │ ╰─────────────────────────────────────────────────────┴────────┴─────────┴────────┴────────┴─────╯ ``` With a change to the NATS CLI we could have something like: ``` nats -s nats://sys:pwd@localhost:4222 server raft peer-remove jZ6RvVRH --by_id ``` Signed-off-by: Ivan Kozlovic --- server/jetstream_api.go | 8 +++ server/jetstream_cluster_2_test.go | 84 ++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/server/jetstream_api.go b/server/jetstream_api.go index 0be83e5f..12023985 100644 --- a/server/jetstream_api.go +++ b/server/jetstream_api.go @@ -581,6 +581,7 @@ const JSApiLeaderStepDownResponseType = "io.nats.jetstream.api.v1.meta_leader_st type JSApiMetaServerRemoveRequest struct { // Server name of the peer to be removed. Server string `json:"peer"` + ByID bool `json:"by_id,omitempty"` } // JSApiMetaServerRemoveResponse is the response to a peer removal request in the meta group. @@ -2215,6 +2216,13 @@ func (s *Server) jsLeaderServerRemoveRequest(sub *subscription, c *client, _ *Ac var found string js.mu.RLock() for _, p := range cc.meta.Peers() { + if req.ByID { + if p.ID == req.Server { + found = req.Server + break + } + continue + } si, ok := s.nodeToInfo.Load(p.ID) if ok && si.(nodeInfo).name == req.Server { found = p.ID diff --git a/server/jetstream_cluster_2_test.go b/server/jetstream_cluster_2_test.go index cf42cacb..01f4920e 100644 --- a/server/jetstream_cluster_2_test.go +++ b/server/jetstream_cluster_2_test.go @@ -7259,3 +7259,87 @@ func TestJetStreamClusterCompressedStreamMessages(t *testing.T) { t.Fatalf("Did not receive completion signal") } } + +func TestJetStreamClusterRemovePeerByID(t *testing.T) { + c := createJetStreamClusterExplicit(t, "R3S", 3) + defer c.shutdown() + + s := c.randomNonLeader() + nc, js := jsClientConnect(t, s) + defer nc.Close() + + _, err := js.AddStream(&nats.StreamConfig{ + Name: "TEST", + Subjects: []string{"foo", "bar"}, + Replicas: 3, + }) + require_NoError(t, err) + + // Wait for a leader + c.waitOnStreamLeader(globalAccountName, "TEST") + + // Get the name of the one that is not restarted + srvName := c.opts[2].ServerName + // And its node ID + peerID := c.servers[2].Node() + + nc.Close() + // Now stop the whole cluster + c.stopAll() + // Restart all but one + for i := 0; i < 2; i++ { + opts := c.opts[i] + s, o := RunServerWithConfig(opts.ConfigFile) + c.servers[i] = s + c.opts[i] = o + } + + c.waitOnClusterReadyWithNumPeers(2) + c.waitOnStreamLeader(globalAccountName, "TEST") + + // Now attempt to remove by name, this should fail because the cluster + // was restarted and names are not persisted. + ml := c.leader() + nc, err = nats.Connect(ml.ClientURL(), nats.UserInfo("admin", "s3cr3t!")) + require_NoError(t, err) + defer nc.Close() + + req := &JSApiMetaServerRemoveRequest{Server: srvName} + jsreq, err := json.Marshal(req) + require_NoError(t, err) + rmsg, err := nc.Request(JSApiRemoveServer, jsreq, 2*time.Second) + require_NoError(t, err) + + var resp JSApiMetaServerRemoveResponse + err = json.Unmarshal(rmsg.Data, &resp) + require_NoError(t, err) + require_True(t, resp.Error != nil) + require_True(t, IsNatsErr(resp.Error, JSClusterServerNotMemberErr)) + + // Now try by ID, but first with an ID that does not match any peerID + req.Server = "some_bad_id" + req.ByID = true + jsreq, err = json.Marshal(req) + require_NoError(t, err) + rmsg, err = nc.Request(JSApiRemoveServer, jsreq, 2*time.Second) + require_NoError(t, err) + + resp = JSApiMetaServerRemoveResponse{} + err = json.Unmarshal(rmsg.Data, &resp) + require_NoError(t, err) + require_True(t, resp.Error != nil) + require_True(t, IsNatsErr(resp.Error, JSClusterServerNotMemberErr)) + + // Now with the proper peer ID + req.Server = peerID + jsreq, err = json.Marshal(req) + require_NoError(t, err) + rmsg, err = nc.Request(JSApiRemoveServer, jsreq, 2*time.Second) + require_NoError(t, err) + + resp = JSApiMetaServerRemoveResponse{} + err = json.Unmarshal(rmsg.Data, &resp) + require_NoError(t, err) + require_True(t, resp.Error == nil) + require_True(t, resp.Success) +}