Merge pull request #799 from nats-io/update_lame_duck_mode

Introduce some delay before closing clients in LameDuck mode.
This commit is contained in:
Ivan Kozlovic
2018-11-08 19:46:19 -07:00
committed by GitHub
3 changed files with 72 additions and 3 deletions

View File

@@ -1488,7 +1488,12 @@ func (s *Server) reConnectToRoute(rURL *url.URL, rtype RouteType) {
if tryForEver {
delay += DEFAULT_ROUTE_RECONNECT
}
time.Sleep(delay)
select {
case <-time.After(delay):
case <-s.quitCh:
s.grWG.Done()
return
}
s.connectToRoute(rURL, tryForEver)
}

View File

@@ -38,6 +38,12 @@ import (
"github.com/nats-io/gnatsd/logger"
)
// lameDuckModeDefaultInitialDelay is the default pause, expressed as an int64
// count of nanoseconds, observed before client connections start being closed
// once the server has entered lame duck (LD) mode.
const lameDuckModeDefaultInitialDelay = int64(time.Second)

// lameDuckModeInitialDelay is the delay actually in effect. It is kept as a
// variable (rather than a constant) so that tests can override it; it is read
// and written through sync/atomic and converted back to a time.Duration where
// it is used.
var lameDuckModeInitialDelay = lameDuckModeDefaultInitialDelay
// Info is the information sent to clients to help them understand information
// about this server.
type Info struct {
@@ -1626,8 +1632,16 @@ func (s *Server) lameDuckMode() {
}
s.mu.Unlock()
t := time.NewTimer(10 * time.Second)
s.Noticef("Closing existing clients")
t := time.NewTimer(time.Duration(atomic.LoadInt64(&lameDuckModeInitialDelay)))
// Delay the start of closing client connections in case several
// servers are being signaled to enter LD mode, so that their clients
// do not reconnect to one another.
select {
case <-t.C:
s.Noticef("Closing existing clients")
case <-s.quitCh:
return
}
for i, client := range clients {
client.closeConnection(ServerShutdown)
if batch == 1 || i%batch == 0 {

View File

@@ -19,6 +19,7 @@ import (
"net"
"os"
"strings"
"sync/atomic"
"testing"
"time"
@@ -649,6 +650,9 @@ func TestProfilingNoTimeout(t *testing.T) {
}
func TestLameDuckMode(t *testing.T) {
atomic.StoreInt64(&lameDuckModeInitialDelay, 0)
defer atomic.StoreInt64(&lameDuckModeInitialDelay, lameDuckModeDefaultInitialDelay)
optsA := DefaultOptions()
optsA.Cluster.Host = "127.0.0.1"
srvA := RunServer(optsA)
@@ -796,4 +800,50 @@ func TestLameDuckMode(t *testing.T) {
checkClientsCount(t, srvB, total)
stopClientsAndSrvB(ncs)
// Now test that we introduce a delay before starting to close client connections.
// This allows us to "signal" multiple servers and prevents their clients from
// reconnecting to a server that is itself about to enter LD mode.
atomic.StoreInt64(&lameDuckModeInitialDelay, int64(100*time.Millisecond))
optsA.LameDuckDuration = 10 * time.Millisecond
srvA = RunServer(optsA)
defer srvA.Shutdown()
optsB.Routes = RoutesFromStr(fmt.Sprintf("nats://127.0.0.1:%d", srvA.ClusterAddr().Port))
optsB.LameDuckDuration = 10 * time.Millisecond
srvB = RunServer(optsB)
defer srvB.Shutdown()
optsC := DefaultOptions()
optsC.Routes = RoutesFromStr(fmt.Sprintf("nats://127.0.0.1:%d", srvA.ClusterAddr().Port))
optsC.LameDuckDuration = 10 * time.Millisecond
srvC := RunServer(optsC)
defer srvC.Shutdown()
checkClusterFormed(t, srvA, srvB, srvC)
rt := int32(0)
nc, err := nats.Connect(fmt.Sprintf("nats://127.0.0.1:%d", optsA.Port),
nats.ReconnectWait(15*time.Millisecond),
nats.ReconnectHandler(func(*nats.Conn) {
atomic.AddInt32(&rt, 1)
}))
if err != nil {
t.Fatalf("Error on connect: %v", err)
}
defer nc.Close()
go srvA.lameDuckMode()
// Wait a bit, but less than lameDuckModeInitialDelay that we set in this
// test to 100ms.
time.Sleep(30 * time.Millisecond)
go srvB.lameDuckMode()
srvA.grWG.Wait()
srvB.grWG.Wait()
checkClientsCount(t, srvC, 1)
if n := atomic.LoadInt32(&rt); n != 1 {
t.Fatalf("Expected client to reconnect only once, got %v", n)
}
}