Merge pull request #799 from nats-io/update_lame_duck_mode

Introduce some delay before closing clients in LameDuck mode.
2026-04-02 11:48:43 -07:00 · 2018-11-08 19:46:19 -07:00
parent 7b25c0890d eb17950971
commit 0744bb64c3
3 changed files with 72 additions and 3 deletions
--- a/server/route.go
+++ b/server/route.go
@@ -1488,7 +1488,12 @@ func (s *Server) reConnectToRoute(rURL *url.URL, rtype RouteType) {
 	if tryForEver {
 		delay += DEFAULT_ROUTE_RECONNECT
 	}
-	time.Sleep(delay)
+	select {
+	case <-time.After(delay):
+	case <-s.quitCh:
+		s.grWG.Done()
+		return
+	}
 	s.connectToRoute(rURL, tryForEver)
 }

--- a/server/server.go
+++ b/server/server.go
@@ -38,6 +38,12 @@ import (
 	"github.com/nats-io/gnatsd/logger"
 )

+// Time to wait before starting to close clients when in lame duck (LD) mode.
+const lameDuckModeDefaultInitialDelay = int64(time.Second)
+
+// Make this a variable so that we can change it during tests.
+var lameDuckModeInitialDelay = int64(lameDuckModeDefaultInitialDelay)
+
 // Info is the information sent to clients to help them understand information
 // about this server.
 type Info struct {
@@ -1626,8 +1632,16 @@ func (s *Server) lameDuckMode() {
 	}
 	s.mu.Unlock()

-	t := time.NewTimer(10 * time.Second)
-	s.Noticef("Closing existing clients")
+	t := time.NewTimer(time.Duration(atomic.LoadInt64(&lameDuckModeInitialDelay)))
+	// Delay the start of closing client connections in case several
+	// servers are being signaled to enter LD mode, so that their clients
+	// do not reconnect to a server that is itself about to enter LD mode.
+	select {
+	case <-t.C:
+		s.Noticef("Closing existing clients")
+	case <-s.quitCh:
+		return
+	}
 	for i, client := range clients {
 		client.closeConnection(ServerShutdown)
 		if batch == 1 || i%batch == 0 {
--- a/server/server_test.go
+++ b/server/server_test.go
@@ -19,6 +19,7 @@ import (
 	"net"
 	"os"
 	"strings"
+	"sync/atomic"
 	"testing"
 	"time"

@@ -649,6 +650,9 @@ func TestProfilingNoTimeout(t *testing.T) {
 }

 func TestLameDuckMode(t *testing.T) {
+	atomic.StoreInt64(&lameDuckModeInitialDelay, 0)
+	defer atomic.StoreInt64(&lameDuckModeInitialDelay, lameDuckModeDefaultInitialDelay)
+
 	optsA := DefaultOptions()
 	optsA.Cluster.Host = "127.0.0.1"
 	srvA := RunServer(optsA)
@@ -796,4 +800,50 @@ func TestLameDuckMode(t *testing.T) {
 	checkClientsCount(t, srvB, total)

 	stopClientsAndSrvB(ncs)
+
+	// Now test that we introduce a delay before starting to close client connections.
+	// This allows us to "signal" multiple servers and prevents their clients from
+	// reconnecting to a server that is about to go into LD mode.
+	atomic.StoreInt64(&lameDuckModeInitialDelay, int64(100*time.Millisecond))
+
+	optsA.LameDuckDuration = 10 * time.Millisecond
+	srvA = RunServer(optsA)
+	defer srvA.Shutdown()
+
+	optsB.Routes = RoutesFromStr(fmt.Sprintf("nats://127.0.0.1:%d", srvA.ClusterAddr().Port))
+	optsB.LameDuckDuration = 10 * time.Millisecond
+	srvB = RunServer(optsB)
+	defer srvB.Shutdown()
+
+	optsC := DefaultOptions()
+	optsC.Routes = RoutesFromStr(fmt.Sprintf("nats://127.0.0.1:%d", srvA.ClusterAddr().Port))
+	optsC.LameDuckDuration = 10 * time.Millisecond
+	srvC := RunServer(optsC)
+	defer srvC.Shutdown()
+
+	checkClusterFormed(t, srvA, srvB, srvC)
+
+	rt := int32(0)
+	nc, err := nats.Connect(fmt.Sprintf("nats://127.0.0.1:%d", optsA.Port),
+		nats.ReconnectWait(15*time.Millisecond),
+		nats.ReconnectHandler(func(*nats.Conn) {
+			atomic.AddInt32(&rt, 1)
+		}))
+	if err != nil {
+		t.Fatalf("Error on connect: %v", err)
+	}
+	defer nc.Close()
+
+	go srvA.lameDuckMode()
+	// Wait a bit, but less than lameDuckModeInitialDelay that we set in this
+	// test to 100ms.
+	time.Sleep(30 * time.Millisecond)
+	go srvB.lameDuckMode()
+
+	srvA.grWG.Wait()
+	srvB.grWG.Wait()
+	checkClientsCount(t, srvC, 1)
+	if n := atomic.LoadInt32(&rt); n != 1 {
+		t.Fatalf("Expected client to reconnect only once, got %v", n)
+	}
 }