Merge pull request #1475 from nats-io/fix_leafnode_solicit_failure_race

[FIXED] LeafNode solicit failure race could leave conn registered
This commit is contained in:
Ivan Kozlovic
2020-06-12 17:04:16 -06:00
committed by GitHub
3 changed files with 42 additions and 20 deletions

View File

@@ -604,6 +604,7 @@ func (s *Server) createLeafNode(conn net.Conn, remote *leafNodeCfg) *client {
// Determines if we are soliciting the connection or not.
var solicited bool
var sendSysConnectEvent bool
var acc *Account
c.mu.Lock()
c.initClient()
@@ -625,7 +626,8 @@ func (s *Server) createLeafNode(conn net.Conn, remote *leafNodeCfg) *client {
// TODO: Decide what should be the optimal behavior here.
// For now, if lookup fails, we will constantly try
// to recreate this LN connection.
acc, err := s.LookupAccount(remote.LocalAccount)
var err error
acc, err = s.LookupAccount(remote.LocalAccount)
if err != nil {
c.Errorf("No local account %q for leafnode: %v", remote.LocalAccount, err)
c.closeConnection(MissingAccount)
@@ -726,18 +728,16 @@ func (s *Server) createLeafNode(conn net.Conn, remote *leafNodeCfg) *client {
// Force handshake
c.mu.Unlock()
if err = conn.Handshake(); err != nil {
if solicited {
// If we overrode and used the saved tlsName but that failed
// we will clear that here. This is for the case that another server
// does not have the same tlsName, maybe only IPs.
// https://github.com/nats-io/nats-server/issues/1256
if _, ok := err.(x509.HostnameError); ok {
remote.Lock()
if host == remote.tlsName {
remote.tlsName = ""
}
remote.Unlock()
// If we overrode and used the saved tlsName but that failed
// we will clear that here. This is for the case that another server
// does not have the same tlsName, maybe only IPs.
// https://github.com/nats-io/nats-server/issues/1256
if _, ok := err.(x509.HostnameError); ok {
remote.Lock()
if host == remote.tlsName {
remote.tlsName = ""
}
remote.Unlock()
}
c.Errorf("TLS handshake error: %v", err)
c.closeConnection(TLSHandshakeError)
@@ -829,11 +829,28 @@ func (s *Server) createLeafNode(conn net.Conn, remote *leafNodeCfg) *client {
// Also send our local subs.
if solicited {
// Make sure we register with the account here.
c.registerWithAccount(c.acc)
c.registerWithAccount(acc)
s.addLeafNodeConnection(c)
s.initLeafNodeSmapAndSendSubs(c)
if sendSysConnectEvent {
s.sendLeafNodeConnect(c.acc)
s.sendLeafNodeConnect(acc)
}
// The above operations are not performed atomically under the
// client lock. It is possible - since we
// have started the read/write loops - that the connection
// is closed before or in between. This would leave the
// closed LN connection possibly registered with the account
// and/or the server's leafs map. So check if connection
// is closed, and if so, manually cleanup.
c.mu.Lock()
closed := c.isClosed()
c.mu.Unlock()
if closed {
s.removeLeafNodeConnection(c)
if prev := acc.removeClient(c); prev == 1 {
s.decActiveAccounts()
}
}
}

View File

@@ -1566,10 +1566,10 @@ func TestLeafNodeTLSVerifyAndMap(t *testing.T) {
if !test.provideCert {
// Wait a bit and make sure we are not connecting
time.Sleep(100 * time.Millisecond)
checkLeafNodeConnectedCount(t, sl, 0)
checkLeafNodeConnectedCount(t, s, 0)
return
}
checkLeafNodeConnected(t, sl)
checkLeafNodeConnected(t, s)
var uname string
var accname string

View File

@@ -2158,10 +2158,15 @@ func TestLeafNodeConnectionLimitsSingleServer(t *testing.T) {
checkAccConnectionCounts(t, 2)
// Make sure s4 has 0 still.
if nln := s4.NumLeafNodes(); nln != 0 {
t.Fatalf("Expected no leafnodes accounted for in s4, got %d", nln)
}
// Make sure s4 has 0 still. We need checkFor because it is possible
// that when we check we actually have the connection registered for
// a short period before it is closed due to the limit.
checkFor(t, time.Second, 15*time.Millisecond, func() error {
if nln := s4.NumLeafNodes(); nln != 0 {
return fmt.Errorf("Expected no leafnodes accounted for in s4, got %d", nln)
}
return nil
})
// Make sure this is still 2.
checkLeafNodeConnections(t, s, 2)