[UPDATED] Reduce report of failed connection attempts

This applies to routes, gateways and leaf node connections.
The failed attempts will be printed at the first, after the first
minute and then every hour.
The connect/error statements now include the attempt number.

Note that in debug mode, all attempts are traced, so you may get
double trace (one for debug, one for info/error).

Resolves #969

Signed-off-by: Ivan Kozlovic <ivan@synadia.com>
This commit is contained in:
Ivan Kozlovic
2019-05-20 10:13:56 -06:00
parent 25cd64b891
commit 03930ba0e4
6 changed files with 185 additions and 7 deletions

View File

@@ -556,7 +556,13 @@ func (s *Server) solicitGateway(cfg *gatewayCfg) {
} else {
typeStr = "explicit"
}
const connFmt = "Connecting to %s gateway %q (%s) at %s (attempt %v)"
const connErrFmt = "Error connecting to %s gateway %q (%s) at %s (attempt %v): %v"
for s.isRunning() && len(urls) > 0 {
attempts++
report := shouldReportConnectErr(attempts)
// Iteration is random
for _, u := range urls {
address, err := s.getRandomIP(s.gateway.resolver, u.Host)
@@ -564,21 +570,26 @@ func (s *Server) solicitGateway(cfg *gatewayCfg) {
s.Errorf("Error getting IP for %s gateway %q (%s): %v", typeStr, cfg.Name, u.Host, err)
continue
}
s.Noticef("Connecting to %s gateway %q (%s) at %s", typeStr, cfg.Name, u.Host, address)
s.Debugf(connFmt, typeStr, cfg.Name, u.Host, address, attempts)
if report {
s.Noticef(connFmt, typeStr, cfg.Name, u.Host, address, attempts)
}
conn, err := net.DialTimeout("tcp", address, DEFAULT_ROUTE_DIAL)
if err == nil {
// We could connect, create the gateway connection and return.
s.createGateway(cfg, u, conn)
return
}
s.Errorf("Error connecting to %s gateway %q (%s) at %s: %v", typeStr, cfg.Name, u.Host, address, err)
s.Debugf(connErrFmt, typeStr, cfg.Name, u.Host, address, attempts, err)
if report {
s.Errorf(connErrFmt, typeStr, cfg.Name, u.Host, address, attempts, err)
}
// Break this loop if server is being shutdown...
if !s.isRunning() {
break
}
}
if isImplicit {
attempts++
if opts.Gateway.ConnectRetries == 0 || attempts > opts.Gateway.ConnectRetries {
s.gateway.Lock()
delete(s.gateway.remotes, cfg.Name)

View File

@@ -165,6 +165,9 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg) {
var conn net.Conn
const connErrFmt = "Error trying to connect as leaf node to remote server (attempt %v): %v"
attempts := 0
for s.isRunning() && s.remoteLeafNodeStillValid(remote) {
rURL := remote.pickNextURL()
url, err := s.getRandomIP(resolver, rURL.Host)
@@ -177,7 +180,11 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg) {
conn, err = net.DialTimeout("tcp", url, dialTimeout)
}
if err != nil {
s.Errorf("Error trying to connect as leaf node to remote server: %v", err)
attempts++
s.Debugf(connErrFmt, attempts, err)
if shouldReportConnectErr(attempts) {
s.Errorf(connErrFmt, attempts, err)
}
select {
case <-s.quitCh:
return

View File

@@ -99,6 +99,9 @@ const sendRouteSubsInGoRoutineThreshold = 1024 * 1024 // 1MB
// Warning when user configures cluster TLS insecure
const clusterTLSInsecureWarning = "TLS Hostname verification disabled, system will not verify identity of the solicited route"
// Can be changed for tests
var routeConnectDelay = DEFAULT_ROUTE_CONNECT
// This will add a timer to watch over remote reply subjects in case
// they fail to receive a response. The duration will be taken from the
// accounts map timeout to match.
@@ -1587,6 +1590,8 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
defer s.grWG.Done()
const connErrFmt = "Error trying to connect to route (attempt %v): %v"
attempts := 0
for s.isRunning() && rURL != nil {
if tryForEver && !s.routeStillValid(rURL) {
@@ -1595,12 +1600,15 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
s.Debugf("Trying to connect to route on %s", rURL.Host)
conn, err := net.DialTimeout("tcp", rURL.Host, DEFAULT_ROUTE_DIAL)
if err != nil {
s.Errorf("Error trying to connect to route: %v", err)
attempts++
s.Debugf(connErrFmt, attempts, err)
if shouldReportConnectErr(attempts) {
s.Errorf(connErrFmt, attempts, err)
}
if !tryForEver {
if opts.Cluster.ConnectRetries <= 0 {
return
}
attempts++
if attempts > opts.Cluster.ConnectRetries {
return
}
@@ -1608,7 +1616,7 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
select {
case <-s.quitCh:
return
case <-time.After(DEFAULT_ROUTE_CONNECT):
case <-time.After(routeConnectDelay):
continue
}
}

View File

@@ -27,6 +27,10 @@ import (
"github.com/nats-io/nats.go"
)
func init() {
routeConnectDelay = 15 * time.Millisecond
}
func checkNumRoutes(t *testing.T, s *Server, expected int) {
t.Helper()
checkFor(t, 5*time.Second, 15*time.Millisecond, func() error {

View File

@@ -14,9 +14,11 @@
package server
import (
"bytes"
"context"
"flag"
"fmt"
"io/ioutil"
"net"
"net/url"
"os"
@@ -1139,3 +1141,140 @@ func TestInsecureSkipVerifyNotSupportedForClientAndGateways(t *testing.T) {
}
checkServerFails(t, o)
}
func TestConnErrorReports(t *testing.T) {
tmpFile, err := ioutil.TempFile("", "")
if err != nil {
t.Fatalf("Error creating temp file: %v", err)
}
log := tmpFile.Name()
tmpFile.Close()
defer os.Remove(log)
remoteURLs := RoutesFromStr("nats://127.0.0.1:1234")
opts := DefaultOptions()
opts.Cluster.Port = -1
opts.Routes = remoteURLs
opts.NoLog = false
opts.LogFile = log
opts.Logtime = true
opts.Debug = true
s := RunServer(opts)
defer s.Shutdown()
// Wait for more than 60 connect attempts. In tests, routeConnectDelay is set to 15ms.
time.Sleep(80 * routeConnectDelay)
s.Shutdown()
content, err := ioutil.ReadFile(log)
if err != nil {
t.Fatalf("Error reading log file: %v", err)
}
checkContent := func(t *testing.T, txt string, attempt int, shouldBeThere bool) {
t.Helper()
present := bytes.Contains(content, []byte(fmt.Sprintf("%s (attempt %d)", txt, attempt)))
if shouldBeThere && !present {
t.Fatalf("Did not find expected log statement (%s) for attempt %d: %s", txt, attempt, content)
} else if !shouldBeThere && present {
t.Fatalf("Log statement (%s) for attempt %d should not be present: %s", txt, attempt, content)
}
}
// We should find both [DBG] and [ERR] for the failed first attempt
checkContent(t, "[DBG] Error trying to connect to route", 1, true)
checkContent(t, "[ERR] Error trying to connect to route", 1, true)
// We should find only [DBG] for second attempt...
checkContent(t, "[DBG] Error trying to connect to route", 2, true)
checkContent(t, "[ERR] Error trying to connect to route", 2, false)
// All for 60th
checkContent(t, "[DBG] Error trying to connect to route", 60, true)
checkContent(t, "[ERR] Error trying to connect to route", 60, true)
// Only DBG again for 61
checkContent(t, "[DBG] Error trying to connect to route", 61, true)
checkContent(t, "[ERR] Error trying to connect to route", 61, false)
os.Remove(log)
// Now try with leaf nodes
opts.Cluster.Port = 0
opts.Routes = nil
opts.LeafNode.Remotes = []*RemoteLeafOpts{&RemoteLeafOpts{URL: remoteURLs[0]}}
opts.LeafNode.ReconnectInterval = 15 * time.Millisecond
s = RunServer(opts)
defer s.Shutdown()
// Wait for more than 60 connect attempts. Between each attempt, we will wait
// for `opts.LeafNode.ReconnectInterval`.
time.Sleep(80 * opts.LeafNode.ReconnectInterval)
s.Shutdown()
content, err = ioutil.ReadFile(log)
if err != nil {
t.Fatalf("Error reading log file: %v", err)
}
// We should find both [DBG] and [ERR] for the failed first attempt
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 1, true)
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 1, true)
// For second attempt, only debug
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 2, true)
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 2, false)
// But 60, we should get both again
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 60, true)
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 60, true)
// 61, only debug, etc..
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 61, true)
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 61, false)
os.Remove(log)
// Now try with gateways
opts.LeafNode.Remotes = nil
opts.Gateway.Name = "A"
opts.Gateway.Port = -1
opts.Gateway.Gateways = []*RemoteGatewayOpts{
&RemoteGatewayOpts{
Name: "B",
URLs: remoteURLs,
},
}
opts.gatewaysSolicitDelay = 15 * time.Millisecond
s = RunServer(opts)
defer s.Shutdown()
// Wait for more than 60 connect attempts. In tests, gatewayConnectDelay is set to 15ms.
time.Sleep(80 * gatewayConnectDelay)
s.Shutdown()
content, err = ioutil.ReadFile(log)
if err != nil {
t.Fatalf("Error reading log file: %v", err)
}
// For gateways, we also check our notice that we attempt to connect
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
// Should get failed attempt 1 for both DBG and ERR
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
// For second attempt, just DBG for both connect and connect error
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, true)
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, false)
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, true)
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, false)
// Then all for 60th attempt
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
// Then again only debug for 61
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, true)
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, false)
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, true)
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, false)
}

View File

@@ -110,3 +110,12 @@ func parseHostPort(hostPort string, defaultPort int) (host string, port int, err
func urlsAreEqual(u1, u2 *url.URL) bool {
return reflect.DeepEqual(u1, u2)
}
// Returns true for the first attempt, after a minute and then an hour.
// It is assumed that each failed attempt is done every second.
func shouldReportConnectErr(attempts int) bool {
if attempts == 1 || attempts == 60 || attempts%3600 == 0 {
return true
}
return false
}