Merge pull request #1000 from nats-io/reduce_connect_err_report_frequency

[UPDATED] Reduce report of failed connection attempts
This commit is contained in:
Ivan Kozlovic
2019-05-20 10:52:52 -06:00
committed by GitHub
6 changed files with 185 additions and 7 deletions

View File

@@ -556,7 +556,13 @@ func (s *Server) solicitGateway(cfg *gatewayCfg) {
} else {
typeStr = "explicit"
}
const connFmt = "Connecting to %s gateway %q (%s) at %s (attempt %v)"
const connErrFmt = "Error connecting to %s gateway %q (%s) at %s (attempt %v): %v"
for s.isRunning() && len(urls) > 0 {
attempts++
report := shouldReportConnectErr(attempts)
// Iteration is random
for _, u := range urls {
address, err := s.getRandomIP(s.gateway.resolver, u.Host)
@@ -564,21 +570,26 @@ func (s *Server) solicitGateway(cfg *gatewayCfg) {
s.Errorf("Error getting IP for %s gateway %q (%s): %v", typeStr, cfg.Name, u.Host, err)
continue
}
s.Noticef("Connecting to %s gateway %q (%s) at %s", typeStr, cfg.Name, u.Host, address)
s.Debugf(connFmt, typeStr, cfg.Name, u.Host, address, attempts)
if report {
s.Noticef(connFmt, typeStr, cfg.Name, u.Host, address, attempts)
}
conn, err := net.DialTimeout("tcp", address, DEFAULT_ROUTE_DIAL)
if err == nil {
// We could connect, create the gateway connection and return.
s.createGateway(cfg, u, conn)
return
}
s.Errorf("Error connecting to %s gateway %q (%s) at %s: %v", typeStr, cfg.Name, u.Host, address, err)
s.Debugf(connErrFmt, typeStr, cfg.Name, u.Host, address, attempts, err)
if report {
s.Errorf(connErrFmt, typeStr, cfg.Name, u.Host, address, attempts, err)
}
// Break this loop if server is being shutdown...
if !s.isRunning() {
break
}
}
if isImplicit {
attempts++
if opts.Gateway.ConnectRetries == 0 || attempts > opts.Gateway.ConnectRetries {
s.gateway.Lock()
delete(s.gateway.remotes, cfg.Name)

View File

@@ -165,6 +165,9 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg) {
var conn net.Conn
const connErrFmt = "Error trying to connect as leaf node to remote server (attempt %v): %v"
attempts := 0
for s.isRunning() && s.remoteLeafNodeStillValid(remote) {
rURL := remote.pickNextURL()
url, err := s.getRandomIP(resolver, rURL.Host)
@@ -177,7 +180,11 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg) {
conn, err = net.DialTimeout("tcp", url, dialTimeout)
}
if err != nil {
s.Errorf("Error trying to connect as leaf node to remote server: %v", err)
attempts++
s.Debugf(connErrFmt, attempts, err)
if shouldReportConnectErr(attempts) {
s.Errorf(connErrFmt, attempts, err)
}
select {
case <-s.quitCh:
return

View File

@@ -99,6 +99,9 @@ const sendRouteSubsInGoRoutineThreshold = 1024 * 1024 // 1MB
// Warning when user configures cluster TLS insecure
const clusterTLSInsecureWarning = "TLS Hostname verification disabled, system will not verify identity of the solicited route"
// Can be changed for tests
var routeConnectDelay = DEFAULT_ROUTE_CONNECT
// This will add a timer to watch over remote reply subjects in case
// they fail to receive a response. The duration will be taken from the
// accounts map timeout to match.
@@ -1587,6 +1590,8 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
defer s.grWG.Done()
const connErrFmt = "Error trying to connect to route (attempt %v): %v"
attempts := 0
for s.isRunning() && rURL != nil {
if tryForEver && !s.routeStillValid(rURL) {
@@ -1595,12 +1600,15 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
s.Debugf("Trying to connect to route on %s", rURL.Host)
conn, err := net.DialTimeout("tcp", rURL.Host, DEFAULT_ROUTE_DIAL)
if err != nil {
s.Errorf("Error trying to connect to route: %v", err)
attempts++
s.Debugf(connErrFmt, attempts, err)
if shouldReportConnectErr(attempts) {
s.Errorf(connErrFmt, attempts, err)
}
if !tryForEver {
if opts.Cluster.ConnectRetries <= 0 {
return
}
attempts++
if attempts > opts.Cluster.ConnectRetries {
return
}
@@ -1608,7 +1616,7 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
select {
case <-s.quitCh:
return
case <-time.After(DEFAULT_ROUTE_CONNECT):
case <-time.After(routeConnectDelay):
continue
}
}

View File

@@ -27,6 +27,10 @@ import (
"github.com/nats-io/nats.go"
)
func init() {
routeConnectDelay = 15 * time.Millisecond
}
func checkNumRoutes(t *testing.T, s *Server, expected int) {
t.Helper()
checkFor(t, 5*time.Second, 15*time.Millisecond, func() error {

View File

@@ -14,9 +14,11 @@
package server
import (
"bytes"
"context"
"flag"
"fmt"
"io/ioutil"
"net"
"net/url"
"os"
@@ -1139,3 +1141,140 @@ func TestInsecureSkipVerifyNotSupportedForClientAndGateways(t *testing.T) {
}
checkServerFails(t, o)
}
func TestConnErrorReports(t *testing.T) {
tmpFile, err := ioutil.TempFile("", "")
if err != nil {
t.Fatalf("Error creating temp file: %v", err)
}
log := tmpFile.Name()
tmpFile.Close()
defer os.Remove(log)
remoteURLs := RoutesFromStr("nats://127.0.0.1:1234")
opts := DefaultOptions()
opts.Cluster.Port = -1
opts.Routes = remoteURLs
opts.NoLog = false
opts.LogFile = log
opts.Logtime = true
opts.Debug = true
s := RunServer(opts)
defer s.Shutdown()
// Wait for more than 60 connect attempts. In tests, routeConnectDelay is set to 15ms.
time.Sleep(80 * routeConnectDelay)
s.Shutdown()
content, err := ioutil.ReadFile(log)
if err != nil {
t.Fatalf("Error reading log file: %v", err)
}
checkContent := func(t *testing.T, txt string, attempt int, shouldBeThere bool) {
t.Helper()
present := bytes.Contains(content, []byte(fmt.Sprintf("%s (attempt %d)", txt, attempt)))
if shouldBeThere && !present {
t.Fatalf("Did not find expected log statement (%s) for attempt %d: %s", txt, attempt, content)
} else if !shouldBeThere && present {
t.Fatalf("Log statement (%s) for attempt %d should not be present: %s", txt, attempt, content)
}
}
// We should find both [DBG] and [ERR] for the failed first attempt
checkContent(t, "[DBG] Error trying to connect to route", 1, true)
checkContent(t, "[ERR] Error trying to connect to route", 1, true)
// We should find only [DBG] for second attempt...
checkContent(t, "[DBG] Error trying to connect to route", 2, true)
checkContent(t, "[ERR] Error trying to connect to route", 2, false)
// All for 60th
checkContent(t, "[DBG] Error trying to connect to route", 60, true)
checkContent(t, "[ERR] Error trying to connect to route", 60, true)
// Only DBG again for 61
checkContent(t, "[DBG] Error trying to connect to route", 61, true)
checkContent(t, "[ERR] Error trying to connect to route", 61, false)
os.Remove(log)
// Now try with leaf nodes
opts.Cluster.Port = 0
opts.Routes = nil
opts.LeafNode.Remotes = []*RemoteLeafOpts{&RemoteLeafOpts{URL: remoteURLs[0]}}
opts.LeafNode.ReconnectInterval = 15 * time.Millisecond
s = RunServer(opts)
defer s.Shutdown()
// Wait for more than 60 connect attempts. Between each attempt, we will wait
// for `opts.LeafNode.ReconnectInterval`.
time.Sleep(80 * opts.LeafNode.ReconnectInterval)
s.Shutdown()
content, err = ioutil.ReadFile(log)
if err != nil {
t.Fatalf("Error reading log file: %v", err)
}
// We should find both [DBG] and [ERR] for the failed first attempt
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 1, true)
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 1, true)
// For second attempt, only debug
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 2, true)
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 2, false)
// But 60, we should get both again
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 60, true)
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 60, true)
// 61, only debug, etc..
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 61, true)
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 61, false)
os.Remove(log)
// Now try with gateways
opts.LeafNode.Remotes = nil
opts.Gateway.Name = "A"
opts.Gateway.Port = -1
opts.Gateway.Gateways = []*RemoteGatewayOpts{
&RemoteGatewayOpts{
Name: "B",
URLs: remoteURLs,
},
}
opts.gatewaysSolicitDelay = 15 * time.Millisecond
s = RunServer(opts)
defer s.Shutdown()
// Wait for more than 60 connect attempts. In tests, gatewayConnectDelay is set to 15ms.
time.Sleep(80 * gatewayConnectDelay)
s.Shutdown()
content, err = ioutil.ReadFile(log)
if err != nil {
t.Fatalf("Error reading log file: %v", err)
}
// For gateways, we also check our notice that we attempt to connect
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
// Should get failed attempt 1 for both DBG and ERR
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
// For second attempt, just DBG for both connect and connect error
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, true)
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, false)
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, true)
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, false)
// Then all for 60th attempt
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
// Then again only debug for 61
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, true)
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, false)
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, true)
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, false)
}

View File

@@ -110,3 +110,12 @@ func parseHostPort(hostPort string, defaultPort int) (host string, port int, err
func urlsAreEqual(u1, u2 *url.URL) bool {
return reflect.DeepEqual(u1, u2)
}
// Returns true for the first attempt, after a minute and then an hour.
// It is assumed that each failed attempt is done every second.
func shouldReportConnectErr(attempts int) bool {
if attempts == 1 || attempts == 60 || attempts%3600 == 0 {
return true
}
return false
}