From 61cccbce0204498cff8d27a23f1c3659dda54d92 Mon Sep 17 00:00:00 2001 From: Ivan Kozlovic Date: Fri, 12 Jun 2020 16:01:13 -0600 Subject: [PATCH 1/2] [FIXED] LeafNode solicit failure race could leave conn registered This was found due to a recent test that was flapping. The test was not checking the correct server for the leafnode connection, but that uncovered the following bug: When a leafnode connection is solicited, the read/write loops are started. Then, the connection lock is released and several functions are invoked to register the connection with an account and add it to the server's leafs map. The problem is that the readloop (for instance) could get a read error and close the connection *before* the aforementioned code executes, which would leave the connection incorrectly registered. This could be fixed either by delaying the start of the read/write loops until after the registration is done, or, as in this PR, by checking the connection close status after registration and, if closed, manually undoing the registration with the account/leafs map. Signed-off-by: Ivan Kozlovic --- server/leafnode.go | 45 ++++++++++++++++++++++++++++------------- server/leafnode_test.go | 4 ++-- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/server/leafnode.go b/server/leafnode.go index 2305dccf..8ccbeacf 100644 --- a/server/leafnode.go +++ b/server/leafnode.go @@ -604,6 +604,7 @@ func (s *Server) createLeafNode(conn net.Conn, remote *leafNodeCfg) *client { // Determines if we are soliciting the connection or not. var solicited bool var sendSysConnectEvent bool + var acc *Account c.mu.Lock() c.initClient() @@ -625,7 +626,8 @@ func (s *Server) createLeafNode(conn net.Conn, remote *leafNodeCfg) *client { // TODO: Decide what should be the optimal behavior here. // For now, if lookup fails, we will constantly try // to recreate this LN connection. 
- acc, err := s.LookupAccount(remote.LocalAccount) + var err error + acc, err = s.LookupAccount(remote.LocalAccount) if err != nil { c.Errorf("No local account %q for leafnode: %v", remote.LocalAccount, err) c.closeConnection(MissingAccount) @@ -726,18 +728,16 @@ func (s *Server) createLeafNode(conn net.Conn, remote *leafNodeCfg) *client { // Force handshake c.mu.Unlock() if err = conn.Handshake(); err != nil { - if solicited { - // If we overrode and used the saved tlsName but that failed - // we will clear that here. This is for the case that another server - // does not have the same tlsName, maybe only IPs. - // https://github.com/nats-io/nats-server/issues/1256 - if _, ok := err.(x509.HostnameError); ok { - remote.Lock() - if host == remote.tlsName { - remote.tlsName = "" - } - remote.Unlock() + // If we overrode and used the saved tlsName but that failed + // we will clear that here. This is for the case that another server + // does not have the same tlsName, maybe only IPs. + // https://github.com/nats-io/nats-server/issues/1256 + if _, ok := err.(x509.HostnameError); ok { + remote.Lock() + if host == remote.tlsName { + remote.tlsName = "" } + remote.Unlock() } c.Errorf("TLS handshake error: %v", err) c.closeConnection(TLSHandshakeError) @@ -829,11 +829,28 @@ func (s *Server) createLeafNode(conn net.Conn, remote *leafNodeCfg) *client { // Also send our local subs. if solicited { // Make sure we register with the account here. - c.registerWithAccount(c.acc) + c.registerWithAccount(acc) s.addLeafNodeConnection(c) s.initLeafNodeSmapAndSendSubs(c) if sendSysConnectEvent { - s.sendLeafNodeConnect(c.acc) + s.sendLeafNodeConnect(acc) + } + + // The above functions are not done atomically under the client + // lock. It is possible - since we + // have started the read/write loops - that the connection + // is closed before or in between. 
This would leave the + closed LN connection possibly registered with the account + and/or the server's leafs map. So check if the connection + is closed, and if so, manually clean up. + c.mu.Lock() + closed := c.isClosed() + c.mu.Unlock() + if closed { + s.removeLeafNodeConnection(c) + if prev := acc.removeClient(c); prev == 1 { + s.decActiveAccounts() + } } } diff --git a/server/leafnode_test.go b/server/leafnode_test.go index cccf10b9..2afb1520 100644 --- a/server/leafnode_test.go +++ b/server/leafnode_test.go @@ -1566,10 +1566,10 @@ func TestLeafNodeTLSVerifyAndMap(t *testing.T) { if !test.provideCert { // Wait a bit and make sure we are not connecting time.Sleep(100 * time.Millisecond) - checkLeafNodeConnectedCount(t, sl, 0) + checkLeafNodeConnectedCount(t, s, 0) return } - checkLeafNodeConnected(t, sl) + checkLeafNodeConnected(t, s) var uname string var accname string From b36672a6bc260f2dd7b0e97d3e95af8f2a85c3a5 Mon Sep 17 00:00:00 2001 From: Ivan Kozlovic Date: Fri, 12 Jun 2020 16:51:40 -0600 Subject: [PATCH 2/2] Fixed flapper Signed-off-by: Ivan Kozlovic --- test/leafnode_test.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/test/leafnode_test.go b/test/leafnode_test.go index 6d629075..30bee447 100644 --- a/test/leafnode_test.go +++ b/test/leafnode_test.go @@ -2158,10 +2158,15 @@ func TestLeafNodeConnectionLimitsSingleServer(t *testing.T) { checkAccConnectionCounts(t, 2) - // Make sure s4 has 0 still. - if nln := s4.NumLeafNodes(); nln != 0 { - t.Fatalf("Expected no leafnodes accounted for in s4, got %d", nln) - } + // Make sure s4 has 0 still. We need checkFor because it is possible + that when we check we actually have the connection registered + for a short period before it is closed due to the limit. 
+ checkFor(t, time.Second, 15*time.Millisecond, func() error { + if nln := s4.NumLeafNodes(); nln != 0 { + return fmt.Errorf("Expected no leafnodes accounted for in s4, got %d", nln) + } + return nil + }) // Make sure this is still 2. checkLeafNodeConnections(t, s, 2)