mirror of
https://github.com/gogrlx/nats-server.git
synced 2026-04-15 02:30:40 -07:00
Merge pull request #1000 from nats-io/reduce_connect_err_report_frequency
[UPDATED] Reduce report of failed connection attempts
This commit is contained in:
@@ -556,7 +556,13 @@ func (s *Server) solicitGateway(cfg *gatewayCfg) {
|
||||
} else {
|
||||
typeStr = "explicit"
|
||||
}
|
||||
|
||||
const connFmt = "Connecting to %s gateway %q (%s) at %s (attempt %v)"
|
||||
const connErrFmt = "Error connecting to %s gateway %q (%s) at %s (attempt %v): %v"
|
||||
|
||||
for s.isRunning() && len(urls) > 0 {
|
||||
attempts++
|
||||
report := shouldReportConnectErr(attempts)
|
||||
// Iteration is random
|
||||
for _, u := range urls {
|
||||
address, err := s.getRandomIP(s.gateway.resolver, u.Host)
|
||||
@@ -564,21 +570,26 @@ func (s *Server) solicitGateway(cfg *gatewayCfg) {
|
||||
s.Errorf("Error getting IP for %s gateway %q (%s): %v", typeStr, cfg.Name, u.Host, err)
|
||||
continue
|
||||
}
|
||||
s.Noticef("Connecting to %s gateway %q (%s) at %s", typeStr, cfg.Name, u.Host, address)
|
||||
s.Debugf(connFmt, typeStr, cfg.Name, u.Host, address, attempts)
|
||||
if report {
|
||||
s.Noticef(connFmt, typeStr, cfg.Name, u.Host, address, attempts)
|
||||
}
|
||||
conn, err := net.DialTimeout("tcp", address, DEFAULT_ROUTE_DIAL)
|
||||
if err == nil {
|
||||
// We could connect, create the gateway connection and return.
|
||||
s.createGateway(cfg, u, conn)
|
||||
return
|
||||
}
|
||||
s.Errorf("Error connecting to %s gateway %q (%s) at %s: %v", typeStr, cfg.Name, u.Host, address, err)
|
||||
s.Debugf(connErrFmt, typeStr, cfg.Name, u.Host, address, attempts, err)
|
||||
if report {
|
||||
s.Errorf(connErrFmt, typeStr, cfg.Name, u.Host, address, attempts, err)
|
||||
}
|
||||
// Break this loop if server is being shutdown...
|
||||
if !s.isRunning() {
|
||||
break
|
||||
}
|
||||
}
|
||||
if isImplicit {
|
||||
attempts++
|
||||
if opts.Gateway.ConnectRetries == 0 || attempts > opts.Gateway.ConnectRetries {
|
||||
s.gateway.Lock()
|
||||
delete(s.gateway.remotes, cfg.Name)
|
||||
|
||||
@@ -165,6 +165,9 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg) {
|
||||
|
||||
var conn net.Conn
|
||||
|
||||
const connErrFmt = "Error trying to connect as leaf node to remote server (attempt %v): %v"
|
||||
|
||||
attempts := 0
|
||||
for s.isRunning() && s.remoteLeafNodeStillValid(remote) {
|
||||
rURL := remote.pickNextURL()
|
||||
url, err := s.getRandomIP(resolver, rURL.Host)
|
||||
@@ -177,7 +180,11 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg) {
|
||||
conn, err = net.DialTimeout("tcp", url, dialTimeout)
|
||||
}
|
||||
if err != nil {
|
||||
s.Errorf("Error trying to connect as leaf node to remote server: %v", err)
|
||||
attempts++
|
||||
s.Debugf(connErrFmt, attempts, err)
|
||||
if shouldReportConnectErr(attempts) {
|
||||
s.Errorf(connErrFmt, attempts, err)
|
||||
}
|
||||
select {
|
||||
case <-s.quitCh:
|
||||
return
|
||||
|
||||
@@ -99,6 +99,9 @@ const sendRouteSubsInGoRoutineThreshold = 1024 * 1024 // 1MB
|
||||
// Warning when user configures cluster TLS insecure
|
||||
const clusterTLSInsecureWarning = "TLS Hostname verification disabled, system will not verify identity of the solicited route"
|
||||
|
||||
// Can be changed for tests
|
||||
var routeConnectDelay = DEFAULT_ROUTE_CONNECT
|
||||
|
||||
// This will add a timer to watch over remote reply subjects in case
|
||||
// they fail to receive a response. The duration will be taken from the
|
||||
// accounts map timeout to match.
|
||||
@@ -1587,6 +1590,8 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
|
||||
|
||||
defer s.grWG.Done()
|
||||
|
||||
const connErrFmt = "Error trying to connect to route (attempt %v): %v"
|
||||
|
||||
attempts := 0
|
||||
for s.isRunning() && rURL != nil {
|
||||
if tryForEver && !s.routeStillValid(rURL) {
|
||||
@@ -1595,12 +1600,15 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
|
||||
s.Debugf("Trying to connect to route on %s", rURL.Host)
|
||||
conn, err := net.DialTimeout("tcp", rURL.Host, DEFAULT_ROUTE_DIAL)
|
||||
if err != nil {
|
||||
s.Errorf("Error trying to connect to route: %v", err)
|
||||
attempts++
|
||||
s.Debugf(connErrFmt, attempts, err)
|
||||
if shouldReportConnectErr(attempts) {
|
||||
s.Errorf(connErrFmt, attempts, err)
|
||||
}
|
||||
if !tryForEver {
|
||||
if opts.Cluster.ConnectRetries <= 0 {
|
||||
return
|
||||
}
|
||||
attempts++
|
||||
if attempts > opts.Cluster.ConnectRetries {
|
||||
return
|
||||
}
|
||||
@@ -1608,7 +1616,7 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver bool) {
|
||||
select {
|
||||
case <-s.quitCh:
|
||||
return
|
||||
case <-time.After(DEFAULT_ROUTE_CONNECT):
|
||||
case <-time.After(routeConnectDelay):
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,6 +27,10 @@ import (
|
||||
"github.com/nats-io/nats.go"
|
||||
)
|
||||
|
||||
func init() {
|
||||
routeConnectDelay = 15 * time.Millisecond
|
||||
}
|
||||
|
||||
func checkNumRoutes(t *testing.T, s *Server, expected int) {
|
||||
t.Helper()
|
||||
checkFor(t, 5*time.Second, 15*time.Millisecond, func() error {
|
||||
|
||||
@@ -14,9 +14,11 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net"
|
||||
"net/url"
|
||||
"os"
|
||||
@@ -1139,3 +1141,140 @@ func TestInsecureSkipVerifyNotSupportedForClientAndGateways(t *testing.T) {
|
||||
}
|
||||
checkServerFails(t, o)
|
||||
}
|
||||
|
||||
func TestConnErrorReports(t *testing.T) {
|
||||
tmpFile, err := ioutil.TempFile("", "")
|
||||
if err != nil {
|
||||
t.Fatalf("Error creating temp file: %v", err)
|
||||
}
|
||||
log := tmpFile.Name()
|
||||
tmpFile.Close()
|
||||
defer os.Remove(log)
|
||||
|
||||
remoteURLs := RoutesFromStr("nats://127.0.0.1:1234")
|
||||
|
||||
opts := DefaultOptions()
|
||||
opts.Cluster.Port = -1
|
||||
opts.Routes = remoteURLs
|
||||
opts.NoLog = false
|
||||
opts.LogFile = log
|
||||
opts.Logtime = true
|
||||
opts.Debug = true
|
||||
s := RunServer(opts)
|
||||
defer s.Shutdown()
|
||||
|
||||
// Wait for more than 60 connect attempts. In tests, routeConnectDelay is set to 15ms.
|
||||
time.Sleep(80 * routeConnectDelay)
|
||||
s.Shutdown()
|
||||
|
||||
content, err := ioutil.ReadFile(log)
|
||||
if err != nil {
|
||||
t.Fatalf("Error reading log file: %v", err)
|
||||
}
|
||||
|
||||
checkContent := func(t *testing.T, txt string, attempt int, shouldBeThere bool) {
|
||||
t.Helper()
|
||||
present := bytes.Contains(content, []byte(fmt.Sprintf("%s (attempt %d)", txt, attempt)))
|
||||
if shouldBeThere && !present {
|
||||
t.Fatalf("Did not find expected log statement (%s) for attempt %d: %s", txt, attempt, content)
|
||||
} else if !shouldBeThere && present {
|
||||
t.Fatalf("Log statement (%s) for attempt %d should not be present: %s", txt, attempt, content)
|
||||
}
|
||||
}
|
||||
|
||||
// We should find both [DBG] and [ERR] for the failed first attempt
|
||||
checkContent(t, "[DBG] Error trying to connect to route", 1, true)
|
||||
checkContent(t, "[ERR] Error trying to connect to route", 1, true)
|
||||
// We should find only [DBG] for second attempt...
|
||||
checkContent(t, "[DBG] Error trying to connect to route", 2, true)
|
||||
checkContent(t, "[ERR] Error trying to connect to route", 2, false)
|
||||
// All for 60th
|
||||
checkContent(t, "[DBG] Error trying to connect to route", 60, true)
|
||||
checkContent(t, "[ERR] Error trying to connect to route", 60, true)
|
||||
// Only DBG again for 61
|
||||
checkContent(t, "[DBG] Error trying to connect to route", 61, true)
|
||||
checkContent(t, "[ERR] Error trying to connect to route", 61, false)
|
||||
|
||||
os.Remove(log)
|
||||
|
||||
// Now try with leaf nodes
|
||||
opts.Cluster.Port = 0
|
||||
opts.Routes = nil
|
||||
opts.LeafNode.Remotes = []*RemoteLeafOpts{&RemoteLeafOpts{URL: remoteURLs[0]}}
|
||||
opts.LeafNode.ReconnectInterval = 15 * time.Millisecond
|
||||
s = RunServer(opts)
|
||||
defer s.Shutdown()
|
||||
|
||||
// Wait for more than 60 connect attempts. Between each attempt, we will wait
|
||||
// for `opts.LeafNode.ReconnectInterval`.
|
||||
time.Sleep(80 * opts.LeafNode.ReconnectInterval)
|
||||
s.Shutdown()
|
||||
|
||||
content, err = ioutil.ReadFile(log)
|
||||
if err != nil {
|
||||
t.Fatalf("Error reading log file: %v", err)
|
||||
}
|
||||
|
||||
// We should find both [DBG] and [ERR] for the failed first attempt
|
||||
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 1, true)
|
||||
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 1, true)
|
||||
// For second attempt, only debug
|
||||
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 2, true)
|
||||
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 2, false)
|
||||
// But 60, we should get both again
|
||||
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 60, true)
|
||||
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 60, true)
|
||||
// 61, only debug, etc..
|
||||
checkContent(t, "[DBG] Error trying to connect as leaf node to remote server", 61, true)
|
||||
checkContent(t, "[ERR] Error trying to connect as leaf node to remote server", 61, false)
|
||||
|
||||
os.Remove(log)
|
||||
|
||||
// Now try with gateways
|
||||
opts.LeafNode.Remotes = nil
|
||||
opts.Gateway.Name = "A"
|
||||
opts.Gateway.Port = -1
|
||||
opts.Gateway.Gateways = []*RemoteGatewayOpts{
|
||||
&RemoteGatewayOpts{
|
||||
Name: "B",
|
||||
URLs: remoteURLs,
|
||||
},
|
||||
}
|
||||
opts.gatewaysSolicitDelay = 15 * time.Millisecond
|
||||
s = RunServer(opts)
|
||||
defer s.Shutdown()
|
||||
|
||||
// Wait for more than 60 connect attempts. In tests, gatewayConnectDelay is set to 15ms.
|
||||
time.Sleep(80 * gatewayConnectDelay)
|
||||
s.Shutdown()
|
||||
|
||||
content, err = ioutil.ReadFile(log)
|
||||
if err != nil {
|
||||
t.Fatalf("Error reading log file: %v", err)
|
||||
}
|
||||
|
||||
// For gateways, we also check our notice that we attempt to connect
|
||||
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
|
||||
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
|
||||
// Should get failed attempt 1 for both DBG and ERR
|
||||
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
|
||||
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 1, true)
|
||||
|
||||
// For second attempt, just DBG for both connect and connect error
|
||||
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, true)
|
||||
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, false)
|
||||
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, true)
|
||||
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 2, false)
|
||||
|
||||
// Then all for 60th attempt
|
||||
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
|
||||
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
|
||||
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
|
||||
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 60, true)
|
||||
|
||||
// Then again only debug for 61
|
||||
checkContent(t, "[DBG] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, true)
|
||||
checkContent(t, "[INF] Connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, false)
|
||||
checkContent(t, "[DBG] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, true)
|
||||
checkContent(t, "[ERR] Error connecting to explicit gateway \"B\" (127.0.0.1:1234) at 127.0.0.1:1234", 61, false)
|
||||
}
|
||||
|
||||
@@ -110,3 +110,12 @@ func parseHostPort(hostPort string, defaultPort int) (host string, port int, err
|
||||
func urlsAreEqual(u1, u2 *url.URL) bool {
|
||||
return reflect.DeepEqual(u1, u2)
|
||||
}
|
||||
|
||||
// Returns true for the first attempt, after a minute and then an hour.
|
||||
// It is assumed that each failed attempt is done every second.
|
||||
func shouldReportConnectErr(attempts int) bool {
|
||||
if attempts == 1 || attempts == 60 || attempts%3600 == 0 {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user