mirror of
https://github.com/gogrlx/nats-server.git
synced 2026-04-02 03:38:42 -07:00
[UPDATED] Reduce report of failed connection attempts
This applies to routes, gateways and leaf node connections. The failed attempts will be printed at the first, after the first minute and then every hour. The connect/error statements now include the attempt number. Note that in debug mode, all attempts are traced, so you may get double trace (one for debug, one for info/error). Resolves #969 Signed-off-by: Ivan Kozlovic <ivan@synadia.com>
This commit is contained in:
@@ -556,7 +556,13 @@ func (s *Server) solicitGateway(cfg *gatewayCfg) {
|
||||
} else {
|
||||
typeStr = "explicit"
|
||||
}
|
||||
|
||||
const connFmt = "Connecting to %s gateway %q (%s) at %s (attempt %v)"
|
||||
const connErrFmt = "Error connecting to %s gateway %q (%s) at %s (attempt %v): %v"
|
||||
|
||||
for s.isRunning() && len(urls) > 0 {
|
||||
attempts++
|
||||
report := shouldReportConnectErr(attempts)
|
||||
// Iteration is random
|
||||
for _, u := range urls {
|
||||
address, err := s.getRandomIP(s.gateway.resolver, u.Host)
|
||||
@@ -564,21 +570,26 @@ func (s *Server) solicitGateway(cfg *gatewayCfg) {
|
||||
s.Errorf("Error getting IP for %s gateway %q (%s): %v", typeStr, cfg.Name, u.Host, err)
|
||||
continue
|
||||
}
|
||||
s.Noticef("Connecting to %s gateway %q (%s) at %s", typeStr, cfg.Name, u.Host, address)
|
||||
s.Debugf(connFmt, typeStr, cfg.Name, u.Host, address, attempts)
|
||||
if report {
|
||||
s.Noticef(connFmt, typeStr, cfg.Name, u.Host, address, attempts)
|
||||
}
|
||||
conn, err := net.DialTimeout("tcp", address, DEFAULT_ROUTE_DIAL)
|
||||
if err == nil {
|
||||
// We could connect, create the gateway connection and return.
|
||||
s.createGateway(cfg, u, conn)
|
||||
return
|
||||
}
|
||||
s.Errorf("Error connecting to %s gateway %q (%s) at %s: %v", typeStr, cfg.Name, u.Host, address, err)
|
||||
s.Debugf(connErrFmt, typeStr, cfg.Name, u.Host, address, attempts, err)
|
||||
if report {
|
||||
s.Errorf(connErrFmt, typeStr, cfg.Name, u.Host, address, attempts, err)
|
||||
}
|
||||
// Break this loop if server is being shutdown...
|
||||
if !s.isRunning() {
|
||||
break
|
||||
}
|
||||
}
|
||||
if isImplicit {
|
||||
attempts++
|
||||
if opts.Gateway.ConnectRetries == 0 || attempts > opts.Gateway.ConnectRetries {
|
||||
s.gateway.Lock()
|
||||
delete(s.gateway.remotes, cfg.Name)
|
||||
|
||||
@@ -165,6 +165,9 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg) {
|
||||
|
||||
var conn net.Conn
|
||||
|
||||
const connErrFmt = "Error trying to connect as leaf node to remote server (attempt %v): %v"
|
||||
|
||||
attempts := 0
|
||||
for s.isRunning() && s.remoteLeafNodeStillValid(remote) {
|
||||
rURL := remote.pickNextURL()
|
||||
url, err := s.getRandomIP(resolver, rURL.Host)
|
||||
@@ -177,7 +180,11 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg) {
|
||||
conn, err = net.DialTimeout("tcp", url, dialTimeout)
|
||||
}
|
||||
if err != nil {
|
||||
s.Errorf("Error trying to connect as leaf node to remote server: %v", err)
|
||||
attempts++
|
||||
s.Debugf(connErrFmt, attempts, err)
|
||||
if shouldReportConnectErr(attempts) {
|
||||
s.Errorf(connErrFmt, attempts, err)
|
||||
}
|
||||
select {
|
||||
case <-s.quitCh:
|
||||
return
|
||||
|
||||
@@ -99,6 +99,9 @@ const sendRouteSubsInGoRoutineThreshold = 1024 * 1024 // 1MB
|
||||
// Warning when user configures cluster TLS insecure
|
||||
const clusterTLSInsecureWarning = "TLS Hostname verification disabled, system will not verify identity of the solicited route"
|
||||
|
||||
// Can be changed for tests
|
||||
var routeConnectDelay = DEFAULT_ROUTE_CONNECT
|
||||
|
||||
// This will add a timer to watch over remote reply subjects in case
|
||||
// they fail to receive a response. The duration will be taken from the
|
||||
// accounts map timeout to match.
|
||||
@@ -1587,6 +1590,8 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
|
||||
|
||||
defer s.grWG.Done()
|
||||
|
||||
const connErrFmt = "Error trying to connect to route (attempt %v): %v"
|
||||
|
||||
attempts := 0
|
||||
for s.isRunning() && rURL != nil {
|
||||
if tryForEver && !s.routeStillValid(rURL) {
|
||||
@@ -1595,12 +1600,15 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
|
||||
s.Debugf("Trying to connect to route on %s", rURL.Host)
|
||||
conn, err := net.DialTimeout("tcp", rURL.Host, DEFAULT_ROUTE_DIAL)
|
||||
if err != nil {
|
||||
s.Errorf("Error trying to connect to route: %v", err)
|
||||
attempts++
|
||||
s.Debugf(connErrFmt, attempts, err)
|
||||
if shouldReportConnectErr(attempts) {
|
||||
s.Errorf(connErrFmt, attempts, err)
|
||||
}
|
||||
if !tryForEver {
|
||||
if opts.Cluster.ConnectRetries <= 0 {
|
||||
return
|
||||
}
|
||||
attempts++
|
||||
if attempts > opts.Cluster.ConnectRetries {
|
||||
return
|
||||
}
|
||||
@@ -1608,7 +1616,7 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
|
||||
select {
|
||||
case <-s.quitCh:
|
||||
return
|
||||
case <-time.After(DEFAULT_ROUTE_CONNECT):
|
||||
case <-time.After(routeConnectDelay):
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,6 +27,10 @@ import (
|
||||
"github.com/nats-io/nats.go"
|
||||
)
|
||||
|
||||
func init() {
|
||||
routeConnectDelay = 15 * time.Millisecond
|
||||
}
|
||||
|
||||
func checkNumRoutes(t *testing.T, s *Server, expected int) {
|
||||
t.Helper()
|
||||
checkFor(t, 5*time.Second, 15*time.Millisecond, func() error {
|
||||
|
||||
@@ -14,9 +14,11 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net"
|
||||
"net/url"
|
||||
"os"
|
||||
@@ -1139,3 +1141,140 @@ func TestInsecureSkipVerifyNotSupportedForClientAndGateways(t *testing.T) {
|
||||
}
|
||||
checkServerFails(t, o)
|
||||
}
|
||||
|
||||
func TestConnErrorReports(t *testing.T) {
|
||||
tmpFile, err := ioutil.TempFile("", "")
|
||||
if err != nil {
|
||||
t.Fatalf("Error creating temp file: %v", err)
|
||||
}
|
||||
log := tmpFile.Name()
|
||||
tmpFile.Close()
|
||||
defer os.Remove(log)
|
||||
|
||||
remoteURLs := RoutesFromStr("nats://127.0.0.1:1234")
|
||||
|
||||
opts := DefaultOptions()
|
||||
opts.Cluster.Port = -1
|
||||
opts.Routes = remoteURLs
|
||||
opts.NoLog = false
|
||||
opts.LogFile = log
|
||||
opts.Logtime = true
|
||||
opts.Debug = true
|
||||
s := RunServer(opts)
|
||||
defer s.Shutdown()
|
||||
|
||||
// Wait for more than 60 connect attempts. In tests, routeConnectDelay is set to 15ms.
|
||||
time.Sleep(80 * routeConnectDelay)
|
||||
s.Shutdown()
|
||||
|
||||
content, err := ioutil.ReadFile(log)
|
||||
if err != nil {
|
||||
t.Fatalf("Error reading log file: %v", err)
|
||||
}
|
||||
|
||||
checkContent := func(t *testing.T, txt string, attempt int, shouldBeThere bool) {
|
||||
t.Helper()
|
||||
present := bytes.Contains(content, []byte(fmt.Sprintf("%s (attempt %d)", txt, attempt)))
|
||||
if shouldBeThere && !present {
|
||||
t.Fatalf("Did not find expected log statement (%s) for attempt %d: %s", txt, attempt, content)
|
||||
} else if !shouldBeThere && present {
|
||||
t.Fatalf("Log statement (%s) for attempt %d should not be present: %s", txt, attempt, content)
|
||||
}
|
||||
}
|
||||
|
||||
// We should find both [DBG] and [ERR] for the failed first attempt
|
||||
checkContent(t, "[DBG] Error trying to connect to route", 1, true)
|
||||
checkContent(t, "[ERR] Error trying to connect to route", 1, true)
|
||||
// We should find only [DBG] for second attempt...
|
||||
checkContent(t, "[DBG] Error trying to connect to route", 2, true)
|
||||
checkContent(t, "[ERR] Error trying to connect to route", 2, false)
|
||||
// All for 60th
|
||||
checkContent(t, "[DBG] Error trying to connect to route", 60, true)
|
||||
checkContent(t, "[ERR] Error trying to connect to route", 60, true)
|
||||
// Only DBG again for 61
|
||||
checkContent(t, "[DBG] Error trying to connect to route", 61, true)
|
||||
checkContent(t, "[ERR] Error trying to connect to route", 61, false)
|
||||
|
||||
os.Remove(log)
|
||||
|
||||
// Now try with leaf nodes
|
||||
opts.Cluster.Port = 0
|
||||
opts.Routes = nil
|
||||
opts.LeafNode.Remotes = []*RemoteLeafOpts{&RemoteLeafOpts{URL: remoteURLs[0]}}
|
||||
opts.LeafNode.ReconnectInterval = 15 * time.Millisecond
|
||||
s = RunServer(opts)
|
||||
defer s.Shutdown()
|
||||
|
||||
// Wait for more than 60 connect attempts. Between each attempt, we will wait
|
||||
// for `opts.LeafNode.ReconnectInterval`.
|
||||
time.Sleep(80 * opts.LeafNode.ReconnectInterval)
|
||||
s.Shutdown()
|
||||
|
||||
content, err = ioutil.ReadFile(log)
|
||||
if err != nil {
|
||||
t.Fatalf("Error reading log file: %v", err)
|
||||
}
|
||||
|
||||
// We should find both [DBG] and [ERR] for the failed first attempt
|
||||
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 1, true)
|
||||
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 1, true)
|
||||
// For second attempt, only debug
|
||||
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 2, true)
|
||||
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 2, false)
|
||||
// But 60, we should get both again
|
||||
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 60, true)
|
||||
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 60, true)
|
||||
// 61, only debug, etc..
|
||||
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 61, true)
|
||||
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 61, false)
|
||||
|
||||
os.Remove(log)
|
||||
|
||||
// Now try with gateways
|
||||
opts.LeafNode.Remotes = nil
|
||||
opts.Gateway.Name = "A"
|
||||
opts.Gateway.Port = -1
|
||||
opts.Gateway.Gateways = []*RemoteGatewayOpts{
|
||||
&RemoteGatewayOpts{
|
||||
Name: "B",
|
||||
URLs: remoteURLs,
|
||||
},
|
||||
}
|
||||
opts.gatewaysSolicitDelay = 15 * time.Millisecond
|
||||
s = RunServer(opts)
|
||||
defer s.Shutdown()
|
||||
|
||||
// Wait for more than 60 connect attempts. In tests, gatewayConnectDelay is set to 15ms.
|
||||
time.Sleep(80 * gatewayConnectDelay)
|
||||
s.Shutdown()
|
||||
|
||||
content, err = ioutil.ReadFile(log)
|
||||
if err != nil {
|
||||
t.Fatalf("Error reading log file: %v", err)
|
||||
}
|
||||
|
||||
// For gateways, we also check our notice that we attempt to connect
|
||||
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
|
||||
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
|
||||
// Should get failed attempt 1 for both DBG and ERR
|
||||
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
|
||||
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
|
||||
|
||||
// For second attempt, just DBG for both connect and connect error
|
||||
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, true)
|
||||
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, false)
|
||||
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, true)
|
||||
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, false)
|
||||
|
||||
// Then all for 60th attempt
|
||||
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
|
||||
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
|
||||
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
|
||||
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
|
||||
|
||||
// Then again only debug for 61
|
||||
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, true)
|
||||
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, false)
|
||||
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, true)
|
||||
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, false)
|
||||
}
|
||||
|
||||
@@ -110,3 +110,12 @@ func parseHostPort(hostPort string, defaultPort int) (host string, port int, err
|
||||
func urlsAreEqual(u1, u2 *url.URL) bool {
|
||||
return reflect.DeepEqual(u1, u2)
|
||||
}
|
||||
|
||||
// Returns true for the first attempt, after a minute and then an hour.
|
||||
// It is assumed that each failed attempt is done every second.
|
||||
func shouldReportConnectErr(attempts int) bool {
|
||||
if attempts == 1 || attempts == 60 || attempts%3600 == 0 {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user