Inhibit Go's default TCP keepalive settings for NATS (#1562)

Inhibit Go's default TCP keepalive settings for NATS

Go 1.13 changed the semantics of the tuning parameters for TCP keepalives, including the default value.  This affects all TCP listeners.  The NATS protocol has its own L7 keepalive system (PING/PONG) and the Go defaults are not a good fit for some valid deployment scenarios, while Go doesn't directly expose a working API for tuning these.

Rather than add a configuration knob and pull in another dependency (with portability issues), just disable TCP keepalives for all listeners used for speaking the NATS protocol.

Change the tests so we test the same logic.  Do not change HTTP monitoring, profiling, or the websocket API listeners.

Disable KeepAlive on outbound connections too, i.e. those where this server is the dialing TCP client (routes, gateways and leafnodes).
This commit is contained in:
Phil Pennock
2020-08-14 13:37:59 -04:00
committed by GitHub
parent 400b044ea0
commit 3c680eceb9
7 changed files with 42 additions and 9 deletions

View File

@@ -425,7 +425,7 @@ func (s *Server) startGatewayAcceptLoop() {
return
}
hp := net.JoinHostPort(opts.Gateway.Host, strconv.Itoa(port))
l, e := net.Listen("tcp", hp)
l, e := natsListen("tcp", hp)
if e != nil {
s.mu.Unlock()
s.Fatalf("Error listening on gateway port: %d - %v", opts.Gateway.Port, e)
@@ -630,7 +630,7 @@ func (s *Server) solicitGateway(cfg *gatewayCfg, firstConnect bool) {
} else {
s.Debugf(connFmt, typeStr, cfg.Name, u.Host, address, attempts)
}
conn, err := net.DialTimeout("tcp", address, DEFAULT_ROUTE_DIAL)
conn, err := natsDialTimeout("tcp", address, DEFAULT_ROUTE_DIAL)
if err == nil {
// We could connect, create the gateway connection and return.
s.createGateway(cfg, u, conn)

View File

@@ -5790,7 +5790,7 @@ func TestGatewayAccountInterestModeSwitchOnlyOncePerAccount(t *testing.T) {
}
func TestGatewaySingleOutbound(t *testing.T) {
l, err := net.Listen("tcp", "127.0.0.1:0")
l, err := natsListen("tcp", "127.0.0.1:0")
if err != nil {
t.Fatalf("Error on listen: %v", err)
}

View File

@@ -290,7 +290,7 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg, firstConnect bool)
ipStr = fmt.Sprintf(" (%s)", url)
}
s.Debugf("Trying to connect as leafnode to remote server on %q%s", rURL.Host, ipStr)
conn, err = net.DialTimeout("tcp", url, dialTimeout)
conn, err = natsDialTimeout("tcp", url, dialTimeout)
}
if err != nil {
attempts++
@@ -359,7 +359,7 @@ func (s *Server) startLeafNodeAcceptLoop() {
return
}
hp := net.JoinHostPort(opts.LeafNode.Host, strconv.Itoa(port))
l, e := net.Listen("tcp", hp)
l, e := natsListen("tcp", hp)
if e != nil {
s.mu.Unlock()
s.Fatalf("Error listening on leafnode port: %d - %v", opts.LeafNode.Port, e)

View File

@@ -1675,7 +1675,7 @@ func (s *Server) startRouteAcceptLoop() {
}
hp := net.JoinHostPort(opts.Cluster.Host, strconv.Itoa(port))
l, e := net.Listen("tcp", hp)
l, e := natsListen("tcp", hp)
if e != nil {
s.mu.Unlock()
s.Fatalf("Error listening on router port: %d - %v", opts.Cluster.Port, e)
@@ -1842,7 +1842,7 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver, firstConnect bool) {
return
}
s.Debugf("Trying to connect to route on %s", rURL.Host)
conn, err := net.DialTimeout("tcp", rURL.Host, DEFAULT_ROUTE_DIAL)
conn, err := natsDialTimeout("tcp", rURL.Host, DEFAULT_ROUTE_DIAL)
if err != nil {
attempts++
if s.shouldReportConnectErr(firstConnect, attempts) {

View File

@@ -92,7 +92,7 @@ func TestRouteConfig(t *testing.T) {
}
func TestClusterAdvertise(t *testing.T) {
lst, err := net.Listen("tcp", "127.0.0.1:0")
lst, err := natsListen("tcp", "127.0.0.1:0")
if err != nil {
t.Fatalf("Error starting listener: %v", err)
}

View File

@@ -1632,7 +1632,7 @@ func (s *Server) AcceptLoop(clr chan struct{}) {
return
}
hp := net.JoinHostPort(opts.Host, strconv.Itoa(opts.Port))
l, e := net.Listen("tcp", hp)
l, e := natsListen("tcp", hp)
if e != nil {
s.mu.Unlock()
s.Fatalf("Error listening on port: %s, %q", hp, e)

View File

@@ -14,6 +14,7 @@
package server
import (
"context"
"errors"
"fmt"
"math"
@@ -188,3 +189,35 @@ func (m refCountedUrlSet) getAsStringSlice() []string {
}
return a
}
// natsListenConfig is the shared listener configuration for every listener
// that speaks the NATS protocol. It mirrors what net.Listen() uses, except
// that TCP keepalives are switched off.
//
// Background: Go 1.13 turned TCP keepalives on by default with aggressive
// timings, and the stdlib offers no sane portable way to tune the initial
// timer separately from the retry timer. Linux/BSD default to 2hrs/75s,
// whereas Go sets both to 15s; the issue about making them independently
// tunable has been open since 2014 (this code was written in 2020).
//
// The NATS protocol already has its own L7 PING/PONG keepalive system, and
// the Go defaults are inappropriate for IoT deployment scenarios.
//
// Usage: replace NATS-protocol calls to net.Listen(...) with
// natsListenConfig.Listen(ctx, ...) or natsListen(). Listeners for HTTP
// monitoring, etc, stay on the Go default.
var natsListenConfig = &net.ListenConfig{KeepAlive: -1} // negative value disables keepalives
// natsListen is a drop-in replacement for net.Listen() for listeners that
// speak the NATS protocol. It goes through natsListenConfig, so TCP
// keepalives are disabled (matching Go's behavior before Go 1.13).
func natsListen(network, address string) (net.Listener, error) {
	// net.Listen() takes no context either, so a background context keeps
	// the two calls equivalent.
	ctx := context.Background()
	return natsListenConfig.Listen(ctx, network, address)
}
// natsDialTimeout is a drop-in replacement for net.DialTimeout() for
// connections that speak the NATS protocol. It dials network/address with
// the given timeout, but with TCP keepalives disabled (matching Go's
// behavior before Go 1.13).
func natsDialTimeout(network, address string, timeout time.Duration) (net.Conn, error) {
	// A negative KeepAlive turns keepalive probes off for the dialed connection.
	dialer := &net.Dialer{Timeout: timeout, KeepAlive: -1}
	return dialer.Dial(network, address)
}