diff --git a/server/gateway.go b/server/gateway.go index da8aeae2..10888d1d 100644 --- a/server/gateway.go +++ b/server/gateway.go @@ -556,7 +556,13 @@ func (s *Server) solicitGateway(cfg *gatewayCfg) { } else { typeStr = "explicit" } + + const connFmt = "Connecting to %s gateway %q (%s) at %s (attempt %v)" + const connErrFmt = "Error connecting to %s gateway %q (%s) at %s (attempt %v): %v" + for s.isRunning() && len(urls) > 0 { + attempts++ + report := shouldReportConnectErr(attempts) // Iteration is random for _, u := range urls { address, err := s.getRandomIP(s.gateway.resolver, u.Host) @@ -564,21 +570,26 @@ func (s *Server) solicitGateway(cfg *gatewayCfg) { s.Errorf("Error getting IP for %s gateway %q (%s): %v", typeStr, cfg.Name, u.Host, err) continue } - s.Noticef("Connecting to %s gateway %q (%s) at %s", typeStr, cfg.Name, u.Host, address) + s.Debugf(connFmt, typeStr, cfg.Name, u.Host, address, attempts) + if report { + s.Noticef(connFmt, typeStr, cfg.Name, u.Host, address, attempts) + } conn, err := net.DialTimeout("tcp", address, DEFAULT_ROUTE_DIAL) if err == nil { // We could connect, create the gateway connection and return. s.createGateway(cfg, u, conn) return } - s.Errorf("Error connecting to %s gateway %q (%s) at %s: %v", typeStr, cfg.Name, u.Host, address, err) + s.Debugf(connErrFmt, typeStr, cfg.Name, u.Host, address, attempts, err) + if report { + s.Errorf(connErrFmt, typeStr, cfg.Name, u.Host, address, attempts, err) + } // Break this loop if server is being shutdown... if !s.isRunning() { break } } if isImplicit { - attempts++ if opts.Gateway.ConnectRetries == 0 || attempts > opts.Gateway.ConnectRetries { s.gateway.Lock() delete(s.gateway.remotes, cfg.Name) diff --git a/server/leafnode.go b/server/leafnode.go index 4819fdb5..c7a7b9b2 100644 --- a/server/leafnode.go +++ b/server/leafnode.go @@ -165,6 +165,9 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg) { var conn net.Conn + const connErrFmt = "Error trying to connect as leaf node to remote server (attempt %v): %v" + + attempts := 0 for s.isRunning() && s.remoteLeafNodeStillValid(remote) { rURL := remote.pickNextURL() url, err := s.getRandomIP(resolver, rURL.Host) @@ -177,7 +180,11 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg) { conn, err = net.DialTimeout("tcp", url, dialTimeout) } if err != nil { - s.Errorf("Error trying to connect as leaf node to remote server: %v", err) + attempts++ + s.Debugf(connErrFmt, attempts, err) + if shouldReportConnectErr(attempts) { + s.Errorf(connErrFmt, attempts, err) + } select { case <-s.quitCh: return diff --git a/server/route.go b/server/route.go index 228a293d..c33159cb 100644 --- a/server/route.go +++ b/server/route.go @@ -99,6 +99,9 @@ const sendRouteSubsInGoRoutineThreshold = 1024 * 1024 // 1MB // Warning when user configures cluster TLS insecure const clusterTLSInsecureWarning = "TLS Hostname verification disabled, system will not verify identity of the solicited route" +// Can be changed for tests +var routeConnectDelay = DEFAULT_ROUTE_CONNECT + // This will add a timer to watch over remote reply subjects in case // they fail to receive a response. The duration will be taken from the // accounts map timeout to match. @@ -1587,6 +1590,8 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) { defer s.grWG.Done() + const connErrFmt = "Error trying to connect to route (attempt %v): %v" + attempts := 0 for s.isRunning() && rURL != nil { if tryForEver && !s.routeStillValid(rURL) { @@ -1595,12 +1600,15 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) { s.Debugf("Trying to connect to route on %s", rURL.Host) conn, err := net.DialTimeout("tcp", rURL.Host, DEFAULT_ROUTE_DIAL) if err != nil { - s.Errorf("Error trying to connect to route: %v", err) + attempts++ + s.Debugf(connErrFmt, attempts, err) + if shouldReportConnectErr(attempts) { + s.Errorf(connErrFmt, attempts, err) + } if !tryForEver { if opts.Cluster.ConnectRetries <= 0 { return } - attempts++ if attempts > opts.Cluster.ConnectRetries { return } @@ -1608,7 +1616,7 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) { select { case <-s.quitCh: return - case <-time.After(DEFAULT_ROUTE_CONNECT): + case <-time.After(routeConnectDelay): continue } } diff --git a/server/routes_test.go b/server/routes_test.go index 29990bc2..9bb6a4ea 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -27,6 +27,10 @@ import ( "github.com/nats-io/nats.go" ) +func init() { + routeConnectDelay = 15 * time.Millisecond +} + func checkNumRoutes(t *testing.T, s *Server, expected int) { t.Helper() checkFor(t, 5*time.Second, 15*time.Millisecond, func() error { diff --git a/server/server_test.go b/server/server_test.go index 8c8de54f..305e6b5f 100644 --- a/server/server_test.go +++ b/server/server_test.go @@ -14,9 +14,11 @@ package server import ( + "bytes" "context" "flag" "fmt" + "io/ioutil" "net" "net/url" "os" @@ -1139,3 +1141,140 @@ func TestInsecureSkipVerifyNotSupportedForClientAndGateways(t *testing.T) { } checkServerFails(t, o) } + +func TestConnErrorReports(t *testing.T) { + tmpFile, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("Error creating temp file: %v", err) + } + log := tmpFile.Name() + tmpFile.Close() + defer os.Remove(log) + + remoteURLs := RoutesFromStr("nats://127.0.0.1:1234") + + opts := DefaultOptions() + opts.Cluster.Port = -1 + opts.Routes = remoteURLs + opts.NoLog = false + opts.LogFile = log + opts.Logtime = true + opts.Debug = true + s := RunServer(opts) + defer s.Shutdown() + + // Wait for more than 60 connect attempts. In tests, routeConnectDelay is set to 15ms. + time.Sleep(80 * routeConnectDelay) + s.Shutdown() + + content, err := ioutil.ReadFile(log) + if err != nil { + t.Fatalf("Error reading log file: %v", err) + } + + checkContent := func(t *testing.T, txt string, attempt int, shouldBeThere bool) { + t.Helper() + present := bytes.Contains(content, []byte(fmt.Sprintf("%s (attempt %d)", txt, attempt))) + if shouldBeThere && !present { + t.Fatalf("Did not find expected log statement (%s) for attempt %d: %s", txt, attempt, content) + } else if !shouldBeThere && present { + t.Fatalf("Log statement (%s) for attempt %d should not be present: %s", txt, attempt, content) + } + } + + // We should find both [DBG] and [ERR] for the failed first attempt + checkContent(t, "[DBG] Error trying to connect to route", 1, true) + checkContent(t, "[ERR] Error trying to connect to route", 1, true) + // We should find only [DBG] for second attempt... + checkContent(t, "[DBG] Error trying to connect to route", 2, true) + checkContent(t, "[ERR] Error trying to connect to route", 2, false) + // All for 60th + checkContent(t, "[DBG] Error trying to connect to route", 60, true) + checkContent(t, "[ERR] Error trying to connect to route", 60, true) + // Only DBG again for 61 + checkContent(t, "[DBG] Error trying to connect to route", 61, true) + checkContent(t, "[ERR] Error trying to connect to route", 61, false) + + os.Remove(log) + + // Now try with leaf nodes + opts.Cluster.Port = 0 + opts.Routes = nil + opts.LeafNode.Remotes = []*RemoteLeafOpts{&RemoteLeafOpts{URL: remoteURLs[0]}} + opts.LeafNode.ReconnectInterval = 15 * time.Millisecond + s = RunServer(opts) + defer s.Shutdown() + + // Wait for more than 60 connect attempts. Between each attempt, we will wait + // for `opts.LeafNode.ReconnectInterval`. + time.Sleep(80 * opts.LeafNode.ReconnectInterval) + s.Shutdown() + + content, err = ioutil.ReadFile(log) + if err != nil { + t.Fatalf("Error reading log file: %v", err) + } + + // We should find both [DBG] and [ERR] for the failed first attempt + checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 1, true) + checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 1, true) + // For second attempt, only debug + checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 2, true) + checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 2, false) + // But 60, we should get both again + checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 60, true) + checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 60, true) + // 61, only debug, etc.. + checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 61, true) + checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 61, false) + + os.Remove(log) + + // Now try with gateways + opts.LeafNode.Remotes = nil + opts.Gateway.Name = "A" + opts.Gateway.Port = -1 + opts.Gateway.Gateways = []*RemoteGatewayOpts{ + &RemoteGatewayOpts{ + Name: "B", + URLs: remoteURLs, + }, + } + opts.gatewaysSolicitDelay = 15 * time.Millisecond + s = RunServer(opts) + defer s.Shutdown() + + // Wait for more than 60 connect attempts. In tests, gatewayConnectDelay is set to 15ms. + time.Sleep(80 * gatewayConnectDelay) + s.Shutdown() + + content, err = ioutil.ReadFile(log) + if err != nil { + t.Fatalf("Error reading log file: %v", err) + } + + // For gateways, we also check our notice that we attempt to connect + checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true) + checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true) + // Should get failed attempt 1 for both DBG and ERR + checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true) + checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true) + + // For second attempt, just DBG for both connect and connect error + checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, true) + checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, false) + checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, true) + checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, false) + + // Then all for 60th attempt + checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true) + checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true) + checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true) + checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true) + + // Then again only debug for 61 + checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, true) + checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, false) + checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, true) + checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, false) +} diff --git a/server/util.go b/server/util.go index 0035154e..47e9d382 100644 --- a/server/util.go +++ b/server/util.go @@ -110,3 +110,12 @@ func parseHostPort(hostPort string, defaultPort int) (host string, port int, err func urlsAreEqual(u1, u2 *url.URL) bool { return reflect.DeepEqual(u1, u2) } + +// Returns true for the first attempt, after a minute and then an hour. +// It is assumed that each failed attempt is done every second. +func shouldReportConnectErr(attempts int) bool { + if attempts == 1 || attempts == 60 || attempts%3600 == 0 { + return true + } + return false +}