diff --git a/server/accounts.go b/server/accounts.go index 6f023b81..9def3124 100644 --- a/server/accounts.go +++ b/server/accounts.go @@ -91,8 +91,14 @@ type Account struct { tags jwt.TagList nameTag string lastLimErr int64 + routePoolIdx int } +const ( + accDedicatedRoute = -1 + accTransitioningToDedicatedRoute = -2 +) + // Account based limits. type limits struct { mpay int32 @@ -1331,11 +1337,11 @@ func (a *Account) sendTrackingLatency(si *serviceImport, responder *client) bool // FIXME(dlc) - We need to clean these up but this should happen // already with the auto-expire logic. if responder != nil && responder.kind != CLIENT { - si.acc.mu.Lock() + a.mu.Lock() if si.m1 != nil { m1, m2 := sl, si.m1 m1.merge(m2) - si.acc.mu.Unlock() + a.mu.Unlock() a.srv.sendInternalAccountMsg(a, si.latency.subject, m1) a.mu.Lock() si.rc = nil @@ -1343,7 +1349,7 @@ func (a *Account) sendTrackingLatency(si *serviceImport, responder *client) bool return true } si.m1 = sl - si.acc.mu.Unlock() + a.mu.Unlock() return false } else { a.srv.sendInternalAccountMsg(a, si.latency.subject, sl) diff --git a/server/auth.go b/server/auth.go index eeebd632..b2ec8363 100644 --- a/server/auth.go +++ b/server/auth.go @@ -1198,8 +1198,8 @@ func (s *Server) isRouterAuthorized(c *client) bool { // Check custom auth first, then TLS map if enabled // then single user/pass. - if s.opts.CustomRouterAuthentication != nil { - return s.opts.CustomRouterAuthentication.Check(c) + if opts.CustomRouterAuthentication != nil { + return opts.CustomRouterAuthentication.Check(c) } if opts.Cluster.TLSMap || opts.Cluster.TLSCheckKnownURLs { diff --git a/server/client.go b/server/client.go index a02a8072..b26565c1 100644 --- a/server/client.go +++ b/server/client.go @@ -732,7 +732,13 @@ func (c *client) Kind() int { // registerWithAccount will register the given user with a specific // account. This will change the subject namespace. 
func (c *client) registerWithAccount(acc *Account) error { - if acc == nil || acc.sl == nil { + if acc == nil { + return ErrBadAccount + } + acc.mu.RLock() + bad := acc.sl == nil + acc.mu.RUnlock() + if bad { return ErrBadAccount } // If we were previously registered, usually to $G, do accounting here to remove. @@ -2219,10 +2225,7 @@ func (c *client) generateClientInfoJSON(info Info) []byte { info.ClientConnectURLs = info.WSConnectURLs } info.WSConnectURLs = nil - // Generate the info json - b, _ := json.Marshal(info) - pcs := [][]byte{[]byte("INFO"), b, []byte(CR_LF)} - return bytes.Join(pcs, []byte(" ")) + return generateInfoJSON(&info) } func (c *client) sendErr(err string) { @@ -2869,10 +2872,6 @@ func (c *client) unsubscribe(acc *Account, sub *subscription, force, remove bool c.traceOp("<-> %s", "DELSUB", sub.sid) } - if c.kind != CLIENT && c.kind != SYSTEM { - c.removeReplySubTimeout(sub) - } - // Remove accounting if requested. This will be false when we close a connection // with open subscriptions. if remove { @@ -3025,8 +3024,10 @@ func (c *client) msgHeaderForRouteOrLeaf(subj, reply []byte, rt *routeTarget, ac // Router (and Gateway) nodes are RMSG. Set here since leafnodes may rewrite. mh[0] = 'R' } - mh = append(mh, acc.Name...) - mh = append(mh, ' ') + if len(subclient.route.accName) == 0 { + mh = append(mh, acc.Name...) + mh = append(mh, ' ') + } } else { // Leaf nodes are LMSG mh[0] = 'L' @@ -4904,13 +4905,11 @@ func (c *client) closeConnection(reason ClosedState) { } var ( - connectURLs []string - wsConnectURLs []string - kind = c.kind - srv = c.srv - noReconnect = c.flags.isSet(noReconnect) - acc = c.acc - spoke bool + kind = c.kind + srv = c.srv + noReconnect = c.flags.isSet(noReconnect) + acc = c.acc + spoke bool ) // Snapshot for use if we are a client connection. 
@@ -4929,11 +4928,6 @@ func (c *client) closeConnection(reason ClosedState) { spoke = c.isSpokeLeafNode() } - if c.route != nil { - connectURLs = c.route.connectURLs - wsConnectURLs = c.route.wsConnURLs - } - // If we have remote latency tracking running shut that down. if c.rrTracking != nil { c.rrTracking.ptmr.Stop() @@ -4946,17 +4940,10 @@ func (c *client) closeConnection(reason ClosedState) { if acc != nil && (kind == CLIENT || kind == LEAF || kind == JETSTREAM) { acc.sl.RemoveBatch(subs) } else if kind == ROUTER { - go c.removeRemoteSubs() + c.removeRemoteSubs() } if srv != nil { - // If this is a route that disconnected, possibly send an INFO with - // the updated list of connect URLs to clients that know how to - // handle async INFOs. - if (len(connectURLs) > 0 || len(wsConnectURLs) > 0) && !srv.getOpts().Cluster.NoAdvertise { - srv.removeConnectURLsAndSendINFOToClients(connectURLs, wsConnectURLs) - } - // Unregister srv.removeClient(c) @@ -5051,33 +5038,35 @@ func (c *client) reconnect() { // Check for a solicited route. If it was, start up a reconnect unless // we are already connected to the other end. if c.isSolicitedRoute() || retryImplicit { + srv.mu.Lock() + defer srv.mu.Unlock() + // Capture these under lock c.mu.Lock() rid := c.route.remoteID rtype := c.route.routeType rurl := c.route.url + accName := string(c.route.accName) + checkRID := accName == _EMPTY_ && srv.routesPoolSize <= 1 && rid != _EMPTY_ c.mu.Unlock() - srv.mu.Lock() - defer srv.mu.Unlock() - // It is possible that the server is being shutdown. 
// If so, don't try to reconnect if !srv.running { return } - if rid != "" && srv.remotes[rid] != nil { - srv.Debugf("Not attempting reconnect for solicited route, already connected to \"%s\"", rid) + if checkRID && srv.routes[rid] != nil { + srv.Debugf("Not attempting reconnect for solicited route, already connected to %q", rid) return } else if rid == srv.info.ID { srv.Debugf("Detected route to self, ignoring %q", rurl.Redacted()) return } else if rtype != Implicit || retryImplicit { - srv.Debugf("Attempting reconnect for solicited route \"%s\"", rurl.Redacted()) + srv.Debugf("Attempting reconnect for solicited route %q", rurl.Redacted()) // Keep track of this go-routine so we can wait for it on // server shutdown. - srv.startGoRoutine(func() { srv.reConnectToRoute(rurl, rtype) }) + srv.startGoRoutine(func() { srv.reConnectToRoute(rurl, rtype, accName) }) } } else if srv != nil && kind == GATEWAY && gwIsOutbound { if gwCfg != nil { diff --git a/server/config_check_test.go b/server/config_check_test.go index 0ba2c77f..f1d9ec39 100644 --- a/server/config_check_test.go +++ b/server/config_check_test.go @@ -1579,6 +1579,30 @@ func TestConfigCheck(t *testing.T) { errorLine: 5, errorPos: 6, }, + { + name: "wrong type for cluster pool size", + config: ` + cluster { + port: -1 + pool_size: "abc" + } + `, + err: fmt.Errorf("interface conversion: interface {} is string, not int64"), + errorLine: 4, + errorPos: 6, + }, + { + name: "wrong type for cluster accounts", + config: ` + cluster { + port: -1 + accounts: 123 + } + `, + err: fmt.Errorf("error parsing accounts: unsupported type int64"), + errorLine: 4, + errorPos: 6, + }, } checkConfig := func(config string) error { diff --git a/server/const.go b/server/const.go index 21511957..b40cecde 100644 --- a/server/const.go +++ b/server/const.go @@ -121,6 +121,9 @@ const ( // DEFAULT_ROUTE_DIAL Route dial timeout.
DEFAULT_ROUTE_DIAL = 1 * time.Second + // DEFAULT_ROUTE_POOL_SIZE Route default pool size + DEFAULT_ROUTE_POOL_SIZE = 3 + // DEFAULT_LEAF_NODE_RECONNECT LeafNode reconnect interval. DEFAULT_LEAF_NODE_RECONNECT = time.Second diff --git a/server/events.go b/server/events.go index 939291a1..e5f5c71d 100644 --- a/server/events.go +++ b/server/events.go @@ -679,9 +679,9 @@ func (s *Server) sendStatsz(subj string) { m.Stats.SlowConsumers = atomic.LoadInt64(&s.slowConsumers) m.Stats.NumSubs = s.numSubscriptions() // Routes - for _, r := range s.routes { + s.forEachRoute(func(r *client) { m.Stats.Routes = append(m.Stats.Routes, routeStat(r)) - } + }) // Gateways if s.gateway.enabled { gw := s.gateway @@ -2270,14 +2270,14 @@ func (s *Server) remoteLatencyUpdate(sub *subscription, _ *client, _ *Account, s // So we have not processed the response tracking measurement yet. if m1 == nil { - si.acc.mu.Lock() + acc.mu.Lock() // Double check since could have slipped in. m1 = si.m1 if m1 == nil { // Store our value there for them to pick up. 
si.m1 = &m2 } - si.acc.mu.Unlock() + acc.mu.Unlock() if m1 == nil { return } diff --git a/server/events_test.go b/server/events_test.go index 904fc7ab..a26933c9 100644 --- a/server/events_test.go +++ b/server/events_test.go @@ -1883,8 +1883,10 @@ func TestServerEventsStatsZ(t *testing.T) { if m.Stats.Received.Msgs < 1 { t.Fatalf("Did not match received msgs of >=1, got %d", m.Stats.Received.Msgs) } - if lr := len(m.Stats.Routes); lr != 1 { - t.Fatalf("Expected a route, but got %d", lr) + // Default pool size + 1 for system account + expectedRoutes := DEFAULT_ROUTE_POOL_SIZE + 1 + if lr := len(m.Stats.Routes); lr != expectedRoutes { + t.Fatalf("Expected %d routes, but got %d", expectedRoutes, lr) } // Now let's prompt this server to send us the statsz @@ -1912,8 +1914,8 @@ func TestServerEventsStatsZ(t *testing.T) { if m2.Stats.Received.Msgs < 1 { t.Fatalf("Did not match received msgs of >= 1, got %d", m2.Stats.Received.Msgs) } - if lr := len(m2.Stats.Routes); lr != 1 { - t.Fatalf("Expected a route, but got %d", lr) + if lr := len(m2.Stats.Routes); lr != expectedRoutes { + t.Fatalf("Expected %d routes, but got %d", expectedRoutes, lr) } msg, err = ncs.Request(subj, nil, time.Second) @@ -1939,11 +1941,13 @@ func TestServerEventsStatsZ(t *testing.T) { if m3.Stats.Received.Msgs < 2 { t.Fatalf("Did not match received msgs of >= 2, got %d", m3.Stats.Received.Msgs) } - if lr := len(m3.Stats.Routes); lr != 1 { - t.Fatalf("Expected a route, but got %d", lr) + if lr := len(m3.Stats.Routes); lr != expectedRoutes { + t.Fatalf("Expected %d routes, but got %d", expectedRoutes, lr) } - if sr := m3.Stats.Routes[0]; sr.Name != "B_SRV" { - t.Fatalf("Expected server A's route to B to have Name set to %q, got %q", "B", sr.Name) + for _, sr := range m3.Stats.Routes { + if sr.Name != "B_SRV" { + t.Fatalf("Expected server A's route to B to have Name set to %q, got %q", "B", sr.Name) + } } // Now query B and check that route's name is "A" @@ -1957,11 +1961,13 @@ func 
TestServerEventsStatsZ(t *testing.T) { if err := json.Unmarshal(msg.Data, &m); err != nil { t.Fatalf("Error unmarshalling the statz json: %v", err) } - if lr := len(m.Stats.Routes); lr != 1 { - t.Fatalf("Expected a route, but got %d", lr) + if lr := len(m.Stats.Routes); lr != expectedRoutes { + t.Fatalf("Expected %d routes, but got %d", expectedRoutes, lr) } - if sr := m.Stats.Routes[0]; sr.Name != "A_SRV" { - t.Fatalf("Expected server B's route to A to have Name set to %q, got %q", "A_SRV", sr.Name) + for _, sr := range m.Stats.Routes { + if sr.Name != "A_SRV" { + t.Fatalf("Expected server B's route to A to have Name set to %q, got %q", "A_SRV", sr.Name) + } } } diff --git a/server/gateway.go b/server/gateway.go index 35b50a12..b5979279 100644 --- a/server/gateway.go +++ b/server/gateway.go @@ -1215,11 +1215,11 @@ func (s *Server) forwardNewGatewayToLocalCluster(oinfo *Info) { b, _ := json.Marshal(info) infoJSON := []byte(fmt.Sprintf(InfoProto, b)) - for _, r := range s.routes { + s.forEachRemote(func(r *client) { r.mu.Lock() r.enqueueProto(infoJSON) r.mu.Unlock() - } + }) } // Sends queue subscriptions interest to remote gateway. @@ -2745,8 +2745,7 @@ func (g *srvGateway) getClusterHash() []byte { // Store this route in map with the key being the remote server's name hash // and the remote server's ID hash used by gateway replies mapping routing. -func (s *Server) storeRouteByHash(srvNameHash, srvIDHash string, c *client) { - s.routesByHash.Store(srvNameHash, c) +func (s *Server) storeRouteByHash(srvIDHash string, c *client) { if !s.gateway.enabled { return } @@ -2754,8 +2753,7 @@ func (s *Server) storeRouteByHash(srvNameHash, srvIDHash string, c *client) { } // Remove the route with the given keys from the map. 
-func (s *Server) removeRouteByHash(srvNameHash, srvIDHash string) { - s.routesByHash.Delete(srvNameHash) +func (s *Server) removeRouteByHash(srvIDHash string) { if !s.gateway.enabled { return } @@ -2764,11 +2762,33 @@ func (s *Server) removeRouteByHash(srvNameHash, srvIDHash string) { // Returns the route with given hash or nil if not found. // This is for gateways only. -func (g *srvGateway) getRouteByHash(hash []byte) *client { - if v, ok := g.routesIDByHash.Load(string(hash)); ok { - return v.(*client) +func (s *Server) getRouteByHash(hash, accName []byte) (*client, bool) { + id := string(hash) + var perAccount bool + if v, ok := s.accRouteByHash.Load(string(accName)); ok { + if v == nil { + id += string(accName) + perAccount = true + } else { + id += strconv.Itoa(v.(int)) + } } - return nil + if v, ok := s.gateway.routesIDByHash.Load(id); ok { + return v.(*client), perAccount + } else if !perAccount { + // Check if we have a "no pool" connection at index 0. + if v, ok := s.gateway.routesIDByHash.Load(string(hash) + "0"); ok { + if r := v.(*client); r != nil { + r.mu.Lock() + noPool := r.route.noPool + r.mu.Unlock() + if noPool { + return r, false + } + } + } + } + return nil, perAccount } // Returns the subject from the routed reply @@ -2824,10 +2844,11 @@ func (c *client) handleGatewayReply(msg []byte) (processed bool) { } var route *client + var perAccount bool // If the origin is not this server, get the route this should be sent to. if c.kind == GATEWAY && srvHash != nil && !bytes.Equal(srvHash, c.srv.gateway.sIDHash) { - route = c.srv.gateway.getRouteByHash(srvHash) + route, perAccount = c.srv.getRouteByHash(srvHash, c.pa.account) // This will be possibly nil, and in this case we will try to process // the interest from this server. } @@ -2839,8 +2860,12 @@ func (c *client) handleGatewayReply(msg []byte) (processed bool) { // getAccAndResultFromCache() var _pacache [256]byte pacache := _pacache[:0] - pacache = append(pacache, c.pa.account...) 
- pacache = append(pacache, ' ') + // For routes that are dedicated to an account, do not put the account + // name in the pacache. + if c.kind == GATEWAY || (c.kind == ROUTER && c.route != nil && len(c.route.accName) == 0) { + pacache = append(pacache, c.pa.account...) + pacache = append(pacache, ' ') + } pacache = append(pacache, c.pa.subject...) c.pa.pacache = pacache @@ -2885,8 +2910,10 @@ func (c *client) handleGatewayReply(msg []byte) (processed bool) { var bufa [256]byte var buf = bufa[:0] buf = append(buf, msgHeadProto...) - buf = append(buf, acc.Name...) - buf = append(buf, ' ') + if !perAccount { + buf = append(buf, acc.Name...) + buf = append(buf, ' ') + } buf = append(buf, orgSubject...) buf = append(buf, ' ') if len(c.pa.reply) > 0 { diff --git a/server/gateway_test.go b/server/gateway_test.go index 05a810a3..156db2a7 100644 --- a/server/gateway_test.go +++ b/server/gateway_test.go @@ -3042,6 +3042,7 @@ func TestGatewayRoutedServerWithoutGatewayConfigured(t *testing.T) { waitForOutboundGateways(t, s2, 1, time.Second) o3 := DefaultOptions() + o3.NoSystemAccount = true o3.Cluster.Name = "B" o3.Routes = RoutesFromStr(fmt.Sprintf("nats://127.0.0.1:%d", s2.ClusterAddr().Port)) s3 := New(o3) @@ -3159,9 +3160,8 @@ func TestGatewayUnknownGatewayCommand(t *testing.T) { var route *client s2.mu.Lock() - for _, r := range s2.routes { + if r := getFirstRoute(s2); r != nil { route = r - break } s2.mu.Unlock() @@ -5094,141 +5094,180 @@ func (c *delayedWriteConn) Write(b []byte) (int, error) { // there, but also to subscribers on that reply subject // on the other cluster. 
func TestGatewaySendReplyAcrossGateways(t *testing.T) { - ob := testDefaultOptionsForGateway("B") - ob.Accounts = []*Account{NewAccount("ACC")} - ob.Users = []*User{{Username: "user", Password: "pwd", Account: ob.Accounts[0]}} - sb := runGatewayServer(ob) - defer sb.Shutdown() + for _, test := range []struct { + name string + poolSize int + peracc bool + }{ + {"no pooling", -1, false}, + {"pooling", 5, false}, + {"per account", 0, true}, + } { + t.Run(test.name, func(t *testing.T) { + ob := testDefaultOptionsForGateway("B") + ob.Accounts = []*Account{NewAccount("ACC")} + ob.Users = []*User{{Username: "user", Password: "pwd", Account: ob.Accounts[0]}} + sb := runGatewayServer(ob) + defer sb.Shutdown() - oa1 := testGatewayOptionsFromToWithServers(t, "A", "B", sb) - oa1.Accounts = []*Account{NewAccount("ACC")} - oa1.Users = []*User{{Username: "user", Password: "pwd", Account: oa1.Accounts[0]}} - sa1 := runGatewayServer(oa1) - defer sa1.Shutdown() - - waitForOutboundGateways(t, sb, 1, time.Second) - waitForInboundGateways(t, sb, 1, time.Second) - waitForOutboundGateways(t, sa1, 1, time.Second) - waitForInboundGateways(t, sa1, 1, time.Second) - - // Now start another server in cluster "A". This will allow us - // to test the reply from cluster "B" coming back directly to - // the server where the request originates, and indirectly through - // route. 
- oa2 := testGatewayOptionsFromToWithServers(t, "A", "B", sb) - oa2.Accounts = []*Account{NewAccount("ACC")} - oa2.Users = []*User{{Username: "user", Password: "pwd", Account: oa2.Accounts[0]}} - oa2.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa1.Cluster.Host, oa1.Cluster.Port)) - sa2 := runGatewayServer(oa2) - defer sa2.Shutdown() - - waitForOutboundGateways(t, sa2, 1, time.Second) - waitForInboundGateways(t, sb, 2, time.Second) - checkClusterFormed(t, sa1, sa2) - - replySubj := "bar" - - // Setup a responder on sb - ncb := natsConnect(t, fmt.Sprintf("nats://user:pwd@%s:%d", ob.Host, ob.Port)) - defer ncb.Close() - natsSub(t, ncb, "foo", func(m *nats.Msg) { - m.Respond([]byte("reply")) - }) - // Set a subscription on the reply subject on sb - subSB := natsSubSync(t, ncb, replySubj) - natsFlush(t, ncb) - checkExpectedSubs(t, 2, sb) - - testReqReply := func(t *testing.T, host string, port int, createSubOnA bool) { - t.Helper() - nca := natsConnect(t, fmt.Sprintf("nats://user:pwd@%s:%d", host, port)) - defer nca.Close() - if createSubOnA { - subSA := natsSubSync(t, nca, replySubj) - natsPubReq(t, nca, "foo", replySubj, []byte("hello")) - natsNexMsg(t, subSA, time.Second) - // Check for duplicates - if _, err := subSA.NextMsg(50 * time.Millisecond); err == nil { - t.Fatalf("Received duplicate message on subSA!") + oa1 := testGatewayOptionsFromToWithServers(t, "A", "B", sb) + oa1.Cluster.PoolSize = test.poolSize + if test.peracc { + oa1.Cluster.Accounts = []string{"ACC"} } - } else { - natsPubReq(t, nca, "foo", replySubj, []byte("hello")) - } - natsNexMsg(t, subSB, time.Second) - // Check for duplicates - if _, err := subSB.NextMsg(50 * time.Millisecond); err == nil { - t.Fatalf("Received duplicate message on subSB!") - } + oa1.Accounts = []*Account{NewAccount("ACC")} + oa1.Users = []*User{{Username: "user", Password: "pwd", Account: oa1.Accounts[0]}} + sa1 := runGatewayServer(oa1) + defer sa1.Shutdown() + + waitForOutboundGateways(t, sb, 1, time.Second) + 
waitForInboundGateways(t, sb, 1, time.Second) + waitForOutboundGateways(t, sa1, 1, time.Second) + waitForInboundGateways(t, sa1, 1, time.Second) + + // Now start another server in cluster "A". This will allow us + // to test the reply from cluster "B" coming back directly to + // the server where the request originates, and indirectly through + // route. + oa2 := testGatewayOptionsFromToWithServers(t, "A", "B", sb) + oa2.Cluster.PoolSize = test.poolSize + if test.peracc { + oa2.Cluster.Accounts = []string{"ACC"} + } + oa2.Accounts = []*Account{NewAccount("ACC")} + oa2.Users = []*User{{Username: "user", Password: "pwd", Account: oa2.Accounts[0]}} + oa2.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa1.Cluster.Host, oa1.Cluster.Port)) + sa2 := runGatewayServer(oa2) + defer sa2.Shutdown() + + waitForOutboundGateways(t, sa2, 1, time.Second) + waitForInboundGateways(t, sb, 2, time.Second) + checkClusterFormed(t, sa1, sa2) + + replySubj := "bar" + + // Setup a responder on sb + ncb := natsConnect(t, fmt.Sprintf("nats://user:pwd@%s:%d", ob.Host, ob.Port)) + defer ncb.Close() + natsSub(t, ncb, "foo", func(m *nats.Msg) { + m.Respond([]byte("reply")) + }) + // Set a subscription on the reply subject on sb + subSB := natsSubSync(t, ncb, replySubj) + natsFlush(t, ncb) + checkExpectedSubs(t, 2, sb) + + testReqReply := func(t *testing.T, host string, port int, createSubOnA bool) { + t.Helper() + nca := natsConnect(t, fmt.Sprintf("nats://user:pwd@%s:%d", host, port)) + defer nca.Close() + if createSubOnA { + subSA := natsSubSync(t, nca, replySubj) + natsPubReq(t, nca, "foo", replySubj, []byte("hello")) + natsNexMsg(t, subSA, time.Second) + // Check for duplicates + if _, err := subSA.NextMsg(50 * time.Millisecond); err == nil { + t.Fatalf("Received duplicate message on subSA!") + } + } else { + natsPubReq(t, nca, "foo", replySubj, []byte("hello")) + } + natsNexMsg(t, subSB, time.Second) + // Check for duplicates + if _, err := subSB.NextMsg(50 * time.Millisecond); err == nil 
{ + t.Fatalf("Received duplicate message on subSB!") + } + } + // Create requestor on sa1 to check for direct reply from GW: + testReqReply(t, oa1.Host, oa1.Port, true) + // Wait for subscription to be gone... + checkExpectedSubs(t, 0, sa1) + // Now create requestor on sa2, it will receive reply through sa1. + testReqReply(t, oa2.Host, oa2.Port, true) + checkExpectedSubs(t, 0, sa1) + checkExpectedSubs(t, 0, sa2) + + // Now issue requests but without any interest in the requestor's + // origin cluster and make sure the other cluster gets the reply. + testReqReply(t, oa1.Host, oa1.Port, false) + testReqReply(t, oa2.Host, oa2.Port, false) + + // There is a possible race between sa2 sending the RS+ for the + // subscription on the reply subject, and the GW reply making it + // to sa1 before the RS+ is processed there. + // We are going to force this race by making the route connection + // block as needed. + + acc, _ := sa2.LookupAccount("ACC") + acc.mu.RLock() + api := acc.routePoolIdx + acc.mu.RUnlock() + + var route *client + sa2.mu.Lock() + if test.peracc { + if conns, ok := sa2.accRoutes["ACC"]; ok { + for _, r := range conns { + route = r + break + } + } + } else if test.poolSize > 0 { + sa2.forEachRoute(func(r *client) { + r.mu.Lock() + if r.route.poolIdx == api { + route = r + } + r.mu.Unlock() + }) + } else if r := getFirstRoute(sa2); r != nil { + route = r + } + sa2.mu.Unlock() + route.mu.Lock() + routeConn := &delayedWriteConn{ + Conn: route.nc, + wg: sync.WaitGroup{}, + } + route.nc = routeConn + route.mu.Unlock() + + delayRoute := func() { + routeConn.Lock() + routeConn.delay = true + routeConn.Unlock() + } + stopDelayRoute := func() { + routeConn.Lock() + routeConn.delay = false + wg := &routeConn.wg + routeConn.Unlock() + wg.Wait() + } + + delayRoute() + testReqReply(t, oa2.Host, oa2.Port, true) + stopDelayRoute() + + // Same test but now we have a local interest on the reply subject + // on sa1 to make sure that interest there does not prevent sending + 
// the RMSG to sa2, which is the origin of the request. + checkExpectedSubs(t, 0, sa1) + checkExpectedSubs(t, 0, sa2) + nca1 := natsConnect(t, fmt.Sprintf("nats://user:pwd@%s:%d", oa1.Host, oa1.Port)) + defer nca1.Close() + subSA1 := natsSubSync(t, nca1, replySubj) + natsFlush(t, nca1) + checkExpectedSubs(t, 1, sa1) + checkExpectedSubs(t, 1, sa2) + + delayRoute() + testReqReply(t, oa2.Host, oa2.Port, true) + stopDelayRoute() + + natsNexMsg(t, subSA1, time.Second) + }) } - // Create requestor on sa1 to check for direct reply from GW: - testReqReply(t, oa1.Host, oa1.Port, true) - // Wait for subscription to be gone... - checkExpectedSubs(t, 0, sa1) - // Now create requestor on sa2, it will receive reply through sa1. - testReqReply(t, oa2.Host, oa2.Port, true) - checkExpectedSubs(t, 0, sa1) - checkExpectedSubs(t, 0, sa2) - - // Now issue requests but without any interest in the requestor's - // origin cluster and make sure the other cluster gets the reply. - testReqReply(t, oa1.Host, oa1.Port, false) - testReqReply(t, oa2.Host, oa2.Port, false) - - // There is a possible race between sa2 sending the RS+ for the - // subscription on the reply subject, and the GW reply making it - // to sa1 before the RS+ is processed there. - // We are going to force this race by making the route connection - // block as needed. 
- - var route *client - sa2.mu.Lock() - for _, r := range sa2.routes { - route = r - break - } - sa2.mu.Unlock() - route.mu.Lock() - routeConn := &delayedWriteConn{ - Conn: route.nc, - wg: sync.WaitGroup{}, - } - route.nc = routeConn - route.mu.Unlock() - - delayRoute := func() { - routeConn.Lock() - routeConn.delay = true - routeConn.Unlock() - } - stopDelayRoute := func() { - routeConn.Lock() - routeConn.delay = false - wg := &routeConn.wg - routeConn.Unlock() - wg.Wait() - } - - delayRoute() - testReqReply(t, oa2.Host, oa2.Port, true) - stopDelayRoute() - - // Same test but now we have a local interest on the reply subject - // on sa1 to make sure that interest there does not prevent sending - // the RMSG to sa2, which is the origin of the request. - checkExpectedSubs(t, 0, sa1) - checkExpectedSubs(t, 0, sa2) - nca1 := natsConnect(t, fmt.Sprintf("nats://user:pwd@%s:%d", oa1.Host, oa1.Port)) - defer nca1.Close() - subSA1 := natsSubSync(t, nca1, replySubj) - natsFlush(t, nca1) - checkExpectedSubs(t, 1, sa1) - checkExpectedSubs(t, 1, sa2) - - delayRoute() - testReqReply(t, oa2.Host, oa2.Port, true) - stopDelayRoute() - - natsNexMsg(t, subSA1, time.Second) } // This test will have a requestor on cluster A and responder @@ -5316,6 +5355,7 @@ func TestGatewaySendReplyAcrossGatewaysServiceImport(t *testing.T) { defer sb.Shutdown() oa1 := testGatewayOptionsFromToWithServers(t, "A", "B", sb) + oa1.Cluster.PoolSize = 1 setAccountUserPassInOptions(oa1, "$foo", "clientAFoo", "password") setAccountUserPassInOptions(oa1, "$bar", "clientABar", "password") sa1 := runGatewayServer(oa1) @@ -5327,6 +5367,7 @@ func TestGatewaySendReplyAcrossGatewaysServiceImport(t *testing.T) { waitForInboundGateways(t, sa1, 1, time.Second) oa2 := testGatewayOptionsFromToWithServers(t, "A", "B", sb) + oa2.Cluster.PoolSize = 1 setAccountUserPassInOptions(oa2, "$foo", "clientAFoo", "password") setAccountUserPassInOptions(oa2, "$bar", "clientABar", "password") oa2.Routes = 
RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa1.Cluster.Host, oa1.Cluster.Port)) @@ -5334,6 +5375,7 @@ func TestGatewaySendReplyAcrossGatewaysServiceImport(t *testing.T) { defer sa2.Shutdown() oa3 := testGatewayOptionsFromToWithServers(t, "A", "B", sb) + oa3.Cluster.PoolSize = 1 setAccountUserPassInOptions(oa3, "$foo", "clientAFoo", "password") setAccountUserPassInOptions(oa3, "$bar", "clientABar", "password") oa3.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa1.Cluster.Host, oa1.Cluster.Port)) @@ -5445,18 +5487,18 @@ func TestGatewaySendReplyAcrossGatewaysServiceImport(t *testing.T) { t.Helper() s.mu.Lock() defer s.mu.Unlock() - for _, r := range s.routes { + s.forEachRoute(func(r *client) { r.mu.Lock() if r.route.remoteID != sa1.ID() { r.mu.Unlock() - continue + return } inMsgs := atomic.LoadInt64(&r.inMsgs) r.mu.Unlock() if inMsgs != expected { t.Fatalf("Expected %v incoming msgs, got %v", expected, inMsgs) } - } + }) } // Wait a bit to make sure that we don't have a loop that // cause messages to be routed more than needed. 
@@ -6696,7 +6738,6 @@ func TestGatewayDuplicateServerName(t *testing.T) { ob3 := testDefaultOptionsForGateway("B") ob3.ServerName = "a_nats2" ob3.Accounts = []*Account{NewAccount("sys")} - ob3.SystemAccount = "sys" ob3.Routes = RoutesFromStr(fmt.Sprintf("nats://127.0.0.1:%d", ob2.Cluster.Port)) sb3 := RunServer(ob3) defer sb3.Shutdown() @@ -6721,7 +6762,6 @@ func TestGatewayDuplicateServerName(t *testing.T) { oc := testGatewayOptionsFromToWithServers(t, "C", "B", sb1) oc.ServerName = "a_nats2" oc.Accounts = []*Account{NewAccount("sys")} - oc.SystemAccount = "sys" sc := RunServer(oc) defer sc.Shutdown() scl := &captureErrorLogger{errCh: make(chan string, 100)} diff --git a/server/jetstream.go b/server/jetstream.go index 6b435214..67c3566c 100644 --- a/server/jetstream.go +++ b/server/jetstream.go @@ -1030,9 +1030,12 @@ func (a *Account) EnableJetStream(limits map[string]JetStreamAccountLimits) erro } js.mu.Lock() - if _, ok := js.accounts[a.Name]; ok && a.JetStreamEnabled() { + if jsa, ok := js.accounts[a.Name]; ok { + a.mu.Lock() + a.js = jsa + a.mu.Unlock() js.mu.Unlock() - return fmt.Errorf("jetstream already enabled for account") + return a.enableAllJetStreamServiceImportsAndMappings() } // Check the limits against existing reservations. @@ -1054,12 +1057,11 @@ func (a *Account) EnableJetStream(limits map[string]JetStreamAccountLimits) erro jsa.usageMu.Unlock() js.accounts[a.Name] = jsa - js.mu.Unlock() - - // Stamp inside account as well. + // Stamp inside account as well. Needs to be done under js's lock. a.mu.Lock() a.js = jsa a.mu.Unlock() + js.mu.Unlock() // Create the proper imports here. 
if err := a.enableAllJetStreamServiceImportsAndMappings(); err != nil { diff --git a/server/jetstream_cluster_3_test.go b/server/jetstream_cluster_3_test.go index 88f55b64..46138816 100644 --- a/server/jetstream_cluster_3_test.go +++ b/server/jetstream_cluster_3_test.go @@ -1136,13 +1136,13 @@ func TestJetStreamClusterScaleDownWhileNoQuorum(t *testing.T) { } sl.mu.Lock() - for _, r := range sl.routes { + sl.forEachRoute(func(r *client) { r.mu.Lock() ncu := &networkCableUnplugged{Conn: r.nc, unplugged: true} ncu.wg.Add(1) r.nc = ncu r.mu.Unlock() - } + }) sl.mu.Unlock() // Wait for the stream info to fail @@ -1170,7 +1170,7 @@ func TestJetStreamClusterScaleDownWhileNoQuorum(t *testing.T) { }, nats.MaxWait(5*time.Second)) sl.mu.Lock() - for _, r := range sl.routes { + sl.forEachRoute(func(r *client) { r.mu.Lock() ncu := r.nc.(*networkCableUnplugged) ncu.Lock() @@ -1178,7 +1178,7 @@ func TestJetStreamClusterScaleDownWhileNoQuorum(t *testing.T) { ncu.wg.Done() ncu.Unlock() r.mu.Unlock() - } + }) sl.mu.Unlock() checkClusterFormed(t, c.servers...) @@ -1391,12 +1391,11 @@ func TestJetStreamParallelStreamCreation(t *testing.T) { // Make them all fire at once. 
<-startCh - _, err := js.AddStream(&nats.StreamConfig{ + if _, err := js.AddStream(&nats.StreamConfig{ Name: "TEST", Subjects: []string{"common.*.*"}, Replicas: 3, - }) - if err != nil { + }); err != nil { errCh <- err } }() @@ -1486,6 +1485,7 @@ func TestJetStreamParallelConsumerCreation(t *testing.T) { Replicas: 3, }) require_NoError(t, err) + c.waitOnStreamLeader(globalAccountName, "TEST") np := 50 diff --git a/server/leafnode.go b/server/leafnode.go index 9a5b7639..eb5d7ce6 100644 --- a/server/leafnode.go +++ b/server/leafnode.go @@ -860,9 +860,7 @@ func (s *Server) generateLeafNodeInfoJSON() { s.leafNodeInfo.Cluster = s.cachedClusterName() s.leafNodeInfo.LeafNodeURLs = s.leafURLsMap.getAsStringSlice() s.leafNodeInfo.WSConnectURLs = s.websocket.connectURLsMap.getAsStringSlice() - b, _ := json.Marshal(s.leafNodeInfo) - pcs := [][]byte{[]byte("INFO"), b, []byte(CR_LF)} - s.leafNodeInfoJSON = bytes.Join(pcs, []byte(" ")) + s.leafNodeInfoJSON = generateInfoJSON(&s.leafNodeInfo) } // Sends an async INFO protocol so that the connected servers can update @@ -1003,14 +1001,12 @@ func (s *Server) createLeafNode(conn net.Conn, rURL *url.URL, remote *leafNodeCf copy(c.nonce, nonce[:]) info.Nonce = string(c.nonce) info.CID = c.cid - b, _ := json.Marshal(info) - - pcs := [][]byte{[]byte("INFO"), b, []byte(CR_LF)} + proto := generateInfoJSON(info) // We have to send from this go routine because we may // have to block for TLS handshake before we start our // writeLoop go routine. The other side needs to receive // this before it can initiate the TLS handshake.. - c.sendProtoNow(bytes.Join(pcs, []byte(" "))) + c.sendProtoNow(proto) // The above call could have marked the connection as closed (due to TCP error). 
if c.isClosed() { @@ -1616,9 +1612,7 @@ func (s *Server) sendPermsAndAccountInfo(c *client) { info.Export = c.opts.Export info.RemoteAccount = c.acc.Name info.ConnectInfo = true - b, _ := json.Marshal(info) - pcs := [][]byte{[]byte("INFO"), b, []byte(CR_LF)} - c.enqueueProto(bytes.Join(pcs, []byte(" "))) + c.enqueueProto(generateInfoJSON(info)) c.mu.Unlock() } diff --git a/server/monitor.go b/server/monitor.go index b06ab22e..6986ebed 100644 --- a/server/monitor.go +++ b/server/monitor.go @@ -778,6 +778,7 @@ type RouteInfo struct { NumSubs uint32 `json:"subscriptions"` Subs []string `json:"subscriptions_list,omitempty"` SubsDetail []SubDetail `json:"subscriptions_list_detail,omitempty"` + Account string `json:"account,omitempty"` } // Routez returns a Routez struct containing information about routes. @@ -790,7 +791,7 @@ func (s *Server) Routez(routezOpts *RoutezOptions) (*Routez, error) { } s.mu.Lock() - rs.NumRoutes = len(s.routes) + rs.NumRoutes = s.numRoutes() // copy the server id for monitoring rs.ID = s.info.ID @@ -801,8 +802,7 @@ func (s *Server) Routez(routezOpts *RoutezOptions) (*Routez, error) { rs.Export = perms.Export } - // Walk the list - for _, r := range s.routes { + addRoute := func(r *client) { r.mu.Lock() ri := &RouteInfo{ Rid: r.cid, @@ -821,6 +821,7 @@ func (s *Server) Routez(routezOpts *RoutezOptions) (*Routez, error) { LastActivity: r.last, Uptime: myUptime(rs.Now.Sub(r.start)), Idle: myUptime(rs.Now.Sub(r.last)), + Account: string(r.route.accName), } if len(r.subs) > 0 { @@ -840,6 +841,11 @@ func (s *Server) Routez(routezOpts *RoutezOptions) (*Routez, error) { r.mu.Unlock() rs.Routes = append(rs.Routes, ri) } + + // Walk the list + s.forEachRoute(func(r *client) { + addRoute(r) + }) s.mu.Unlock() return rs, nil } @@ -1214,6 +1220,7 @@ type ClusterOptsVarz struct { TLSTimeout float64 `json:"tls_timeout,omitempty"` TLSRequired bool `json:"tls_required,omitempty"` TLSVerify bool `json:"tls_verify,omitempty"` + PoolSize int 
`json:"pool_size,omitempty"` } // GatewayOptsVarz contains monitoring gateway information @@ -1467,6 +1474,7 @@ func (s *Server) createVarz(pcpu float64, rss int64) *Varz { TLSTimeout: c.TLSTimeout, TLSRequired: clustTlsReq, TLSVerify: clustTlsReq, + PoolSize: opts.Cluster.PoolSize, }, Gateway: GatewayOptsVarz{ Name: gw.Name, @@ -1634,8 +1642,8 @@ func (s *Server) updateVarzRuntimeFields(v *Varz, forceUpdate bool, pcpu float64 } v.Connections = len(s.clients) v.TotalConnections = s.totalClients - v.Routes = len(s.routes) - v.Remotes = len(s.remotes) + v.Routes = s.numRoutes() + v.Remotes = s.numRemotes() v.Leafs = len(s.leafs) v.InMsgs = atomic.LoadInt64(&s.inMsgs) v.InBytes = atomic.LoadInt64(&s.inBytes) diff --git a/server/monitor_test.go b/server/monitor_test.go index 49eb86f4..4aab06ce 100644 --- a/server/monitor_test.go +++ b/server/monitor_test.go @@ -1346,12 +1346,12 @@ func TestConnzWithRoutes(t *testing.T) { for mode := 0; mode < 2; mode++ { rz := pollRoutez(t, s, mode, url+urlSuffix, &RoutezOptions{Subscriptions: subs == 1, SubscriptionsDetail: subs == 2}) - if rz.NumRoutes != 1 { - t.Fatalf("Expected 1 route, got %d", rz.NumRoutes) + if rz.NumRoutes != DEFAULT_ROUTE_POOL_SIZE { + t.Fatalf("Expected %d route, got %d", DEFAULT_ROUTE_POOL_SIZE, rz.NumRoutes) } - if len(rz.Routes) != 1 { - t.Fatalf("Expected route array of 1, got %v", len(rz.Routes)) + if len(rz.Routes) != DEFAULT_ROUTE_POOL_SIZE { + t.Fatalf("Expected route array of %d, got %v", DEFAULT_ROUTE_POOL_SIZE, len(rz.Routes)) } route := rz.Routes[0] @@ -2328,6 +2328,7 @@ func TestRoutezPermissions(t *testing.T) { defer s1.Shutdown() opts = DefaultMonitorOptions() + opts.NoSystemAccount = true opts.ServerName = "monitor_server_2" opts.Cluster.Host = "127.0.0.1" opts.Cluster.Name = "A" @@ -2371,8 +2372,8 @@ func TestRoutezPermissions(t *testing.T) { t.Fatal("Routez body should NOT contain \"export\" information.") } // We do expect to see them show up for the information we have on Server A though. 
- if len(rz.Routes) != 1 { - t.Fatalf("Expected route array of 1, got %v\n", len(rz.Routes)) + if len(rz.Routes) != DEFAULT_ROUTE_POOL_SIZE { + t.Fatalf("Expected route array of %d, got %v\n", DEFAULT_ROUTE_POOL_SIZE, len(rz.Routes)) } route := rz.Routes[0] if route.Import == nil || route.Import.Allow == nil || @@ -2577,6 +2578,7 @@ func TestMonitorCluster(t *testing.T) { opts.Cluster.TLSTimeout, opts.Cluster.TLSConfig != nil, opts.Cluster.TLSConfig != nil, + DEFAULT_ROUTE_POOL_SIZE, } varzURL := fmt.Sprintf("http://127.0.0.1:%d/varz", s.MonitorAddr().Port) @@ -2592,7 +2594,7 @@ func TestMonitorCluster(t *testing.T) { // Having this here to make sure that if fields are added in ClusterOptsVarz, // we make sure to update this test (compiler will report an error if we don't) - _ = ClusterOptsVarz{"", "", 0, 0, nil, 2, false, false} + _ = ClusterOptsVarz{"", "", 0, 0, nil, 2, false, false, 0} // Alter the fields to make sure that we have a proper deep copy // of what may be stored in the server. 
Anything we change here @@ -3608,12 +3610,14 @@ func TestMonitorRouteRTT(t *testing.T) { for pollMode := 0; pollMode < 2; pollMode++ { checkFor(t, 2*firstPingInterval, 15*time.Millisecond, func() error { rz := pollRoutez(t, s, pollMode, routezURL, nil) - if len(rz.Routes) != 1 { - return fmt.Errorf("Expected 1 route, got %v", len(rz.Routes)) + // Pool size + 1 for system account + if len(rz.Routes) != DEFAULT_ROUTE_POOL_SIZE+1 { + return fmt.Errorf("Expected %d route, got %v", DEFAULT_ROUTE_POOL_SIZE+1, len(rz.Routes)) } - ri := rz.Routes[0] - if ri.RTT == _EMPTY_ { - return fmt.Errorf("Route's RTT not reported") + for _, ri := range rz.Routes { + if ri.RTT == _EMPTY_ { + return fmt.Errorf("Route's RTT not reported") + } } return nil }) @@ -4672,3 +4676,178 @@ func TestMonitorProfilez(t *testing.T) { } } } + +func TestMonitorRoutePoolSize(t *testing.T) { + conf1 := createConfFile(t, []byte(` + port: -1 + http: -1 + cluster { + port: -1 + name: "local" + pool_size: 5 + } + no_sys_acc: true + `)) + defer removeFile(t, conf1) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf23 := createConfFile(t, []byte(fmt.Sprintf(` + port: -1 + http: -1 + cluster { + port: -1 + name: "local" + routes: ["nats://127.0.0.1:%d"] + pool_size: 5 + } + no_sys_acc: true + `, o1.Cluster.Port))) + defer removeFile(t, conf23) + + s2, _ := RunServerWithConfig(conf23) + defer s2.Shutdown() + s3, _ := RunServerWithConfig(conf23) + defer s3.Shutdown() + + checkClusterFormed(t, s1, s2, s3) + + for i, s := range []*Server{s1, s2, s3} { + url := fmt.Sprintf("http://127.0.0.1:%d/varz", s.MonitorAddr().Port) + for mode := 0; mode < 2; mode++ { + v := pollVarz(t, s, mode, url, nil) + if v.Cluster.PoolSize != 5 { + t.Fatalf("Expected Cluster.PoolSize==5, got %v", v.Cluster.PoolSize) + } + if v.Remotes != 2 { + t.Fatalf("Expected Remotes==2, got %v", v.Remotes) + } + if v.Routes != 10 { + t.Fatalf("Expected NumRoutes==10, got %v", v.Routes) + } + } + + url = 
fmt.Sprintf("http://127.0.0.1:%d/routez", s.MonitorAddr().Port) + for mode := 0; mode < 2; mode++ { + v := pollRoutez(t, s, mode, url, nil) + if v.NumRoutes != 10 { + t.Fatalf("Expected NumRoutes==10, got %v", v.NumRoutes) + } + if n := len(v.Routes); n != 10 { + t.Fatalf("Expected len(Routes)==10, got %v", n) + } + remotes := make(map[string]int) + for _, r := range v.Routes { + remotes[r.RemoteID]++ + } + if n := len(remotes); n != 2 { + t.Fatalf("Expected routes for 2 different servers, got %v", n) + } + switch i { + case 0: + if n := remotes[s2.ID()]; n != 5 { + t.Fatalf("Expected 5 routes from S1 to S2, got %v", n) + } + if n := remotes[s3.ID()]; n != 5 { + t.Fatalf("Expected 5 routes from S1 to S3, got %v", n) + } + case 1: + if n := remotes[s1.ID()]; n != 5 { + t.Fatalf("Expected 5 routes from S2 to S1, got %v", n) + } + if n := remotes[s3.ID()]; n != 5 { + t.Fatalf("Expected 5 routes from S2 to S3, got %v", n) + } + case 2: + if n := remotes[s1.ID()]; n != 5 { + t.Fatalf("Expected 5 routes from S3 to S1, got %v", n) + } + if n := remotes[s2.ID()]; n != 5 { + t.Fatalf("Expected 5 routes from S3 to S2, got %v", n) + } + } + } + } +} + +func TestMonitorRoutePerAccount(t *testing.T) { + conf1 := createConfFile(t, []byte(` + port: -1 + http: -1 + accounts { + A { users: [{user: "a", password: "pwd"}] } + } + cluster { + port: -1 + name: "local" + accounts: ["A"] + } + `)) + defer removeFile(t, conf1) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf23 := createConfFile(t, []byte(fmt.Sprintf(` + port: -1 + http: -1 + accounts { + A { users: [{user: "a", password: "pwd"}] } + } + cluster { + port: -1 + name: "local" + routes: ["nats://127.0.0.1:%d"] + accounts: ["A"] + } + `, o1.Cluster.Port))) + defer removeFile(t, conf23) + + s2, _ := RunServerWithConfig(conf23) + defer s2.Shutdown() + s3, _ := RunServerWithConfig(conf23) + defer s3.Shutdown() + + checkClusterFormed(t, s1, s2, s3) + + for _, s := range []*Server{s1, s2, s3} { + // Default 
pool size + account "A" + system account (added by default) + enr := 2 * (DEFAULT_ROUTE_POOL_SIZE + 1 + 1) + url := fmt.Sprintf("http://127.0.0.1:%d/varz", s.MonitorAddr().Port) + for mode := 0; mode < 2; mode++ { + v := pollVarz(t, s, mode, url, nil) + if v.Remotes != 2 { + t.Fatalf("Expected Remotes==2, got %v", v.Remotes) + } + if v.Routes != enr { + t.Fatalf("Expected NumRoutes==%d, got %v", enr, v.Routes) + } + } + + url = fmt.Sprintf("http://127.0.0.1:%d/routez", s.MonitorAddr().Port) + for mode := 0; mode < 2; mode++ { + v := pollRoutez(t, s, mode, url, nil) + if v.NumRoutes != enr { + t.Fatalf("Expected NumRoutes==%d, got %v", enr, v.NumRoutes) + } + if n := len(v.Routes); n != enr { + t.Fatalf("Expected len(Routes)==%d, got %v", enr, n) + } + remotes := make(map[string]int) + for _, r := range v.Routes { + var acc int + if r.Account == "A" { + acc = 1 + } + remotes[r.RemoteID] += acc + } + if n := len(remotes); n != 2 { + t.Fatalf("Expected routes for 2 different servers, got %v", n) + } + for remoteID, v := range remotes { + if v != 1 { + t.Fatalf("Expected one and only one connection for account A for remote %q, got %v", remoteID, v) + } + } + } + } +} diff --git a/server/norace_test.go b/server/norace_test.go index 06da28f8..ae5044c2 100644 --- a/server/norace_test.go +++ b/server/norace_test.go @@ -648,11 +648,13 @@ func TestNoRaceRouteCache(t *testing.T) { oa := DefaultOptions() oa.NoSystemAccount = true + oa.Cluster.PoolSize = -1 sa := RunServer(oa) defer sa.Shutdown() ob := DefaultOptions() ob.NoSystemAccount = true + ob.Cluster.PoolSize = -1 ob.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa.Cluster.Host, oa.Cluster.Port)) sb := RunServer(ob) defer sb.Shutdown() @@ -706,10 +708,7 @@ func TestNoRaceRouteCache(t *testing.T) { var route *client sb.mu.Lock() - for _, r := range sb.routes { - route = r - break - } + route = getFirstRoute(sb) sb.mu.Unlock() checkExpected := func(t *testing.T, expected int) { @@ -7808,3 +7807,361 @@ func 
TestNoRaceParallelStreamAndConsumerCreation(t *testing.T) { t.Fatalf("Expected only one consumer to be really created, got %d out of %d attempts", numConsumers, np) } } + +func TestNoRaceRoutePool(t *testing.T) { + var dur1 time.Duration + var dur2 time.Duration + + total := 1_000_000 + + for _, test := range []struct { + name string + poolSize int + }{ + {"no pooling", 0}, + {"pooling", 5}, + } { + t.Run(test.name, func(t *testing.T) { + tmpl := ` + port: -1 + accounts { + A { users: [{user: "A", password: "A"}] } + B { users: [{user: "B", password: "B"}] } + C { users: [{user: "C", password: "C"}] } + D { users: [{user: "D", password: "D"}] } + E { users: [{user: "E", password: "E"}] } + } + cluster { + port: -1 + name: "local" + %s + pool_size: %d + } + ` + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, _EMPTY_, test.poolSize))) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port), + test.poolSize))) + s2, _ := RunServerWithConfig(conf2) + defer s2.Shutdown() + + checkClusterFormed(t, s1, s2) + + wg := sync.WaitGroup{} + wg.Add(5) + + sendAndRecv := func(acc string) (*nats.Conn, *nats.Conn) { + t.Helper() + + s2nc := natsConnect(t, s2.ClientURL(), nats.UserInfo(acc, acc)) + count := 0 + natsSub(t, s2nc, "foo", func(_ *nats.Msg) { + if count++; count == total { + wg.Done() + } + }) + natsFlush(t, s2nc) + + s1nc := natsConnect(t, s1.ClientURL(), nats.UserInfo(acc, acc)) + + checkSubInterest(t, s1, acc, "foo", time.Second) + return s2nc, s1nc + } + + var rcv = [5]*nats.Conn{} + var snd = [5]*nats.Conn{} + accs := []string{"A", "B", "C", "D", "E"} + + for i := 0; i < 5; i++ { + rcv[i], snd[i] = sendAndRecv(accs[i]) + defer rcv[i].Close() + defer snd[i].Close() + } + + payload := []byte("some message") + start := time.Now() + for i := 0; i < 5; i++ { + go func(idx int) { + for i := 0; i < total; i++ { + snd[idx].Publish("foo", 
payload) + } + }(i) + } + + wg.Wait() + dur := time.Since(start) + if test.poolSize == 0 { + dur1 = dur + } else { + dur2 = dur + } + }) + } + perf1 := float64(total*5) / dur1.Seconds() + t.Logf("No pooling: %.0f msgs/sec", perf1) + perf2 := float64(total*5) / dur2.Seconds() + t.Logf("Pooling : %.0f msgs/sec", perf2) + t.Logf("Gain : %.2fx", perf2/perf1) +} + +func TestNoRaceRoutePerAccount(t *testing.T) { + var dur1 time.Duration + var dur2 time.Duration + + accounts := make([]string, 5) + for i := 0; i < 5; i++ { + akp, _ := nkeys.CreateAccount() + pub, _ := akp.PublicKey() + accounts[i] = pub + } + routeAccs := fmt.Sprintf("accounts: [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"]", + accounts[0], accounts[1], accounts[2], accounts[3], accounts[4]) + + total := 1_000_000 + + for _, test := range []struct { + name string + dedicated bool + }{ + {"route for all accounts", false}, + {"route per account", true}, + } { + t.Run(test.name, func(t *testing.T) { + tmpl := ` + port: -1 + accounts { + %s { users: [{user: "0", password: "0"}] } + %s { users: [{user: "1", password: "1"}] } + %s { users: [{user: "2", password: "2"}] } + %s { users: [{user: "3", password: "3"}] } + %s { users: [{user: "4", password: "4"}] } + } + cluster { + port: -1 + name: "local" + %s + %s + } + ` + var racc string + if test.dedicated { + racc = routeAccs + } else { + racc = _EMPTY_ + } + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, + accounts[0], accounts[1], accounts[2], accounts[3], + accounts[4], _EMPTY_, racc))) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, + accounts[0], accounts[1], accounts[2], accounts[3], accounts[4], + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port), + racc))) + s2, _ := RunServerWithConfig(conf2) + defer s2.Shutdown() + + checkClusterFormed(t, s1, s2) + + wg := sync.WaitGroup{} + wg.Add(5) + + sendAndRecv := func(acc string, user string) (*nats.Conn, *nats.Conn) { + 
t.Helper() + + s2nc := natsConnect(t, s2.ClientURL(), nats.UserInfo(user, user)) + count := 0 + natsSub(t, s2nc, "foo", func(_ *nats.Msg) { + if count++; count == total { + wg.Done() + } + }) + natsFlush(t, s2nc) + + s1nc := natsConnect(t, s1.ClientURL(), nats.UserInfo(user, user)) + + checkSubInterest(t, s1, acc, "foo", time.Second) + return s2nc, s1nc + } + + var rcv = [5]*nats.Conn{} + var snd = [5]*nats.Conn{} + users := []string{"0", "1", "2", "3", "4"} + + for i := 0; i < 5; i++ { + rcv[i], snd[i] = sendAndRecv(accounts[i], users[i]) + defer rcv[i].Close() + defer snd[i].Close() + } + + payload := []byte("some message") + start := time.Now() + for i := 0; i < 5; i++ { + go func(idx int) { + for i := 0; i < total; i++ { + snd[idx].Publish("foo", payload) + } + }(i) + } + + wg.Wait() + dur := time.Since(start) + if !test.dedicated { + dur1 = dur + } else { + dur2 = dur + } + }) + } + perf1 := float64(total*5) / dur1.Seconds() + t.Logf("Route for all accounts: %.0f msgs/sec", perf1) + perf2 := float64(total*5) / dur2.Seconds() + t.Logf("Route per account : %.0f msgs/sec", perf2) + t.Logf("Gain : %.2fx", perf2/perf1) +} + +// This test, which checks that messages are not duplicated when pooling or +// per-account routes are reloaded, would cause a DATA RACE that is not +// specific to the changes for pooling/per_account. For this reason, this +// test is located in the norace_test.go file. 
+func TestNoRaceRoutePoolAndPerAccountConfigReload(t *testing.T) { + for _, test := range []struct { + name string + poolSizeBefore string + poolSizeAfter string + accountsBefore string + accountsAfter string + }{ + {"from no pool to pool", _EMPTY_, "pool_size: 2", _EMPTY_, _EMPTY_}, + {"increase pool size", "pool_size: 2", "pool_size: 5", _EMPTY_, _EMPTY_}, + {"decrease pool size", "pool_size: 5", "pool_size: 2", _EMPTY_, _EMPTY_}, + {"from pool to no pool", "pool_size: 5", _EMPTY_, _EMPTY_, _EMPTY_}, + {"from no account to account", _EMPTY_, _EMPTY_, _EMPTY_, "accounts: [\"A\"]"}, + {"add account", _EMPTY_, _EMPTY_, "accounts: [\"B\"]", "accounts: [\"A\",\"B\"]"}, + {"remove account", _EMPTY_, _EMPTY_, "accounts: [\"A\",\"B\"]", "accounts: [\"B\"]"}, + {"from account to no account", _EMPTY_, _EMPTY_, "accounts: [\"A\"]", _EMPTY_}, + {"increase pool size and add account", "pool_size: 2", "pool_size: 3", "accounts: [\"B\"]", "accounts: [\"B\",\"A\"]"}, + {"decrease pool size and remove account", "pool_size: 3", "pool_size: 2", "accounts: [\"A\",\"B\"]", "accounts: [\"B\"]"}, + } { + t.Run(test.name, func(t *testing.T) { + tmplA := ` + port: -1 + server_name: "A" + accounts { + A { users: [{user: a, password: pwd}] } + B { users: [{user: b, password: pwd}] } + } + cluster: { + port: -1 + name: "local" + %s + %s + } + ` + confA := createConfFile(t, []byte(fmt.Sprintf(tmplA, test.poolSizeBefore, test.accountsBefore))) + srva, optsA := RunServerWithConfig(confA) + defer srva.Shutdown() + + tmplB := ` + port: -1 + server_name: "B" + accounts { + A { users: [{user: a, password: pwd}] } + B { users: [{user: b, password: pwd}] } + } + cluster: { + port: -1 + name: "local" + %s + %s + routes: ["nats://127.0.0.1:%d"] + } + ` + confB := createConfFile(t, []byte(fmt.Sprintf(tmplB, test.poolSizeBefore, test.accountsBefore, optsA.Cluster.Port))) + srvb, _ := RunServerWithConfig(confB) + defer srvb.Shutdown() + + checkClusterFormed(t, srva, srvb) + + ncA := natsConnect(t, 
srva.ClientURL(), nats.UserInfo("a", "pwd")) + defer ncA.Close() + + sub := natsSubSync(t, ncA, "foo") + sub.SetPendingLimits(-1, -1) + checkSubInterest(t, srvb, "A", "foo", time.Second) + + ncB := natsConnect(t, srvb.ClientURL(), nats.UserInfo("a", "pwd")) + defer ncB.Close() + + wg := sync.WaitGroup{} + wg.Add(1) + ch := make(chan struct{}) + go func() { + defer wg.Done() + + for i := 0; ; i++ { + ncB.Publish("foo", []byte(fmt.Sprintf("%d", i))) + select { + case <-ch: + return + default: + } + if i%300 == 0 { + time.Sleep(time.Duration(rand.Intn(5)) * time.Millisecond) + } + } + }() + + var l *captureErrorLogger + if test.accountsBefore != _EMPTY_ && test.accountsAfter == _EMPTY_ { + l = &captureErrorLogger{errCh: make(chan string, 100)} + srva.SetLogger(l, false, false) + } + + time.Sleep(250 * time.Millisecond) + reloadUpdateConfig(t, srva, confA, fmt.Sprintf(tmplA, test.poolSizeAfter, test.accountsAfter)) + time.Sleep(125 * time.Millisecond) + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(tmplB, test.poolSizeAfter, test.accountsAfter, optsA.Cluster.Port)) + + checkClusterFormed(t, srva, srvb) + checkSubInterest(t, srvb, "A", "foo", time.Second) + + if l != nil { + // Errors regarding "No route for account" should stop + var ok bool + for numErrs := 0; !ok && numErrs < 10; { + select { + case e := <-l.errCh: + if strings.Contains(e, "No route for account") { + numErrs++ + } + case <-time.After(DEFAULT_ROUTE_RECONNECT + 250*time.Millisecond): + ok = true + } + } + if !ok { + t.Fatalf("Still report of no route for account") + } + } + + close(ch) + wg.Wait() + + for prev := -1; ; { + msg, err := sub.NextMsg(50 * time.Millisecond) + if err != nil { + break + } + cur, _ := strconv.Atoi(string(msg.Data)) + if cur <= prev { + t.Fatalf("Previous was %d, got %d", prev, cur) + } + prev = cur + } + }) + } +} diff --git a/server/opts.go b/server/opts.go index 74211dee..4a487212 100644 --- a/server/opts.go +++ b/server/opts.go @@ -76,6 +76,8 @@ type ClusterOpts struct { 
Advertise string `json:"-"` NoAdvertise bool `json:"-"` ConnectRetries int `json:"-"` + PoolSize int `json:"-"` + Accounts []string `json:"-"` // Not exported (used in tests) resolver netResolver @@ -1595,6 +1597,10 @@ func parseCluster(v interface{}, opts *Options, errors *[]error, warnings *[]err } // This will possibly override permissions that were define in auth block setClusterPermissions(&opts.Cluster, perms) + case "pool_size": + opts.Cluster.PoolSize = int(mv.(int64)) + case "accounts": + opts.Cluster.Accounts, _ = parseStringArray("accounts", tk, &lt, mv, errors, warnings) default: if !tk.IsUsedVariable() { err := &unknownConfigFieldErr{ @@ -4663,7 +4669,7 @@ func setBaselineOptions(opts *Options) { if opts.AuthTimeout == 0 { opts.AuthTimeout = getDefaultAuthTimeout(opts.TLSConfig, opts.TLSTimeout) } - if opts.Cluster.Port != 0 { + if opts.Cluster.Port != 0 || opts.Cluster.ListenStr != _EMPTY_ { if opts.Cluster.Host == _EMPTY_ { opts.Cluster.Host = DEFAULT_HOST } @@ -4673,6 +4679,30 @@ func setBaselineOptions(opts *Options) { if opts.Cluster.AuthTimeout == 0 { opts.Cluster.AuthTimeout = getDefaultAuthTimeout(opts.Cluster.TLSConfig, opts.Cluster.TLSTimeout) } + if opts.Cluster.PoolSize == 0 { + opts.Cluster.PoolSize = DEFAULT_ROUTE_POOL_SIZE + } + // Unless pooling/accounts are disabled (by PoolSize being set to -1), + // check for Cluster.Accounts. Add the system account if not present and + // unless we have a configuration that disabled it. 
+ if opts.Cluster.PoolSize > 0 { + sysAccName := opts.SystemAccount + if sysAccName == _EMPTY_ && !opts.NoSystemAccount { + sysAccName = DEFAULT_SYSTEM_ACCOUNT + } + if sysAccName != _EMPTY_ { + var found bool + for _, acc := range opts.Cluster.Accounts { + if acc == sysAccName { + found = true + break + } + } + if !found { + opts.Cluster.Accounts = append(opts.Cluster.Accounts, sysAccName) + } + } + } } if opts.LeafNode.Port != 0 { if opts.LeafNode.Host == _EMPTY_ { diff --git a/server/opts_test.go b/server/opts_test.go index d52ae4c4..c618b260 100644 --- a/server/opts_test.go +++ b/server/opts_test.go @@ -1131,6 +1131,7 @@ func TestBadNkeyConfig(t *testing.T) { if err := os.WriteFile(confFileName, []byte(content), 0666); err != nil { t.Fatalf("Error writing config file: %v", err) } + defer removeFile(t, confFileName) if _, err := ProcessConfigFile(confFileName); err == nil { t.Fatalf("Expected an error from nkey entry with password") } @@ -1147,6 +1148,7 @@ func TestNkeyWithPassConfig(t *testing.T) { if err := os.WriteFile(confFileName, []byte(content), 0666); err != nil { t.Fatalf("Error writing config file: %v", err) } + defer removeFile(t, confFileName) if _, err := ProcessConfigFile(confFileName); err == nil { t.Fatalf("Expected an error from bad nkey entry") } @@ -1163,6 +1165,7 @@ func TestTokenWithUserPass(t *testing.T) { if err := os.WriteFile(confFileName, []byte(content), 0666); err != nil { t.Fatalf("Error writing config file: %v", err) } + defer removeFile(t, confFileName) _, err := ProcessConfigFile(confFileName) if err == nil { t.Fatal("Expected error, got none") @@ -1184,6 +1187,7 @@ func TestTokenWithUsers(t *testing.T) { if err := os.WriteFile(confFileName, []byte(content), 0666); err != nil { t.Fatalf("Error writing config file: %v", err) } + defer removeFile(t, confFileName) _, err := ProcessConfigFile(confFileName) if err == nil { t.Fatal("Expected error, got none") @@ -2084,6 +2088,7 @@ func TestParsingGateways(t *testing.T) { if err := 
os.WriteFile(file, []byte(content), 0600); err != nil { t.Fatalf("Error writing config file: %v", err) } + defer removeFile(t, file) opts, err := ProcessConfigFile(file) if err != nil { t.Fatalf("Error processing file: %v", err) @@ -2337,6 +2342,7 @@ func TestParsingGatewaysErrors(t *testing.T) { if err := os.WriteFile(file, []byte(test.content), 0600); err != nil { t.Fatalf("Error writing config file: %v", err) } + defer removeFile(t, file) _, err := ProcessConfigFile(file) if err == nil { t.Fatalf("Expected to fail, did not. Content:\n%s", test.content) diff --git a/server/parser_test.go b/server/parser_test.go index c92350bc..14a5f94d 100644 --- a/server/parser_test.go +++ b/server/parser_test.go @@ -448,6 +448,7 @@ func TestParseHeaderPubArg(t *testing.T) { func TestParseRoutedHeaderMsg(t *testing.T) { c := dummyRouteClient() + c.route = &route{} pub := []byte("HMSG $foo foo 10 8\r\nXXXhello\r") if err := c.parse(pub); err == nil { @@ -565,6 +566,7 @@ func TestParseRoutedHeaderMsg(t *testing.T) { func TestParseRouteMsg(t *testing.T) { c := dummyRouteClient() + c.route = &route{} pub := []byte("MSG $foo foo 5\r\nhello\r") err := c.parse(pub) diff --git a/server/raft.go b/server/raft.go index 9bdbe5c3..10454d31 100644 --- a/server/raft.go +++ b/server/raft.go @@ -1679,7 +1679,7 @@ func (n *raft) run() { gw := s.gateway for { s.mu.Lock() - ready := len(s.routes)+len(s.leafs) > 0 + ready := s.numRemotes()+len(s.leafs) > 0 if !ready && gw.enabled { gw.RLock() ready = len(gw.out)+len(gw.in) > 0 diff --git a/server/reload.go b/server/reload.go index 7f083066..665cc90e 100644 --- a/server/reload.go +++ b/server/reload.go @@ -25,6 +25,7 @@ import ( "time" "github.com/nats-io/jwt/v2" + "github.com/nats-io/nuid" ) // FlagSnapshot captures the server options as specified by CLI flags at @@ -57,6 +58,10 @@ type option interface { // cluster permissions. 
IsClusterPermsChange() bool + // IsClusterPoolSizeOrAccountsChange indicates if this option requires + // special handling for changes in cluster's pool size or accounts list. + IsClusterPoolSizeOrAccountsChange() bool + // IsJetStreamChange inidicates a change in the servers config for JetStream. // Account changes will be handled separately in reloadAuthorization. IsJetStreamChange() bool @@ -88,6 +93,10 @@ func (n noopOption) IsClusterPermsChange() bool { return false } +func (n noopOption) IsClusterPoolSizeOrAccountsChange() bool { + return false +} + func (n noopOption) IsJetStreamChange() bool { return false } @@ -347,8 +356,11 @@ func (u *nkeysOption) Apply(server *Server) { // clusterOption implements the option interface for the `cluster` setting. type clusterOption struct { authOption - newValue ClusterOpts - permsChanged bool + newValue ClusterOpts + permsChanged bool + accsAdded []string + accsRemoved []string + poolSizeChanged bool } // Apply the cluster change. @@ -381,6 +393,32 @@ func (c *clusterOption) IsClusterPermsChange() bool { return c.permsChanged } +func (c *clusterOption) IsClusterPoolSizeOrAccountsChange() bool { + return c.poolSizeChanged || len(c.accsAdded) > 0 || len(c.accsRemoved) > 0 +} + +func (c *clusterOption) diffPoolAndAccounts(old *ClusterOpts) { + c.poolSizeChanged = c.newValue.PoolSize != old.PoolSize +addLoop: + for _, na := range c.newValue.Accounts { + for _, oa := range old.Accounts { + if na == oa { + continue addLoop + } + } + c.accsAdded = append(c.accsAdded, na) + } +removeLoop: + for _, oa := range old.Accounts { + for _, na := range c.newValue.Accounts { + if oa == na { + continue removeLoop + } + } + c.accsRemoved = append(c.accsRemoved, oa) + } +} + // routesOption implements the option interface for the cluster `routes` // setting. type routesOption struct { @@ -392,12 +430,12 @@ type routesOption struct { // Apply the route changes by adding and removing the necessary routes. 
func (r *routesOption) Apply(server *Server) { server.mu.Lock() - routes := make([]*client, len(server.routes)) + routes := make([]*client, server.numRoutes()) i := 0 - for _, client := range server.routes { - routes[i] = client + server.forEachRoute(func(r *client) { + routes[i] = r i++ - } + }) // If there was a change, notify monitoring code that it should // update the route URLs if /varz endpoint is inspected. if len(r.add)+len(r.remove) > 0 { @@ -425,7 +463,7 @@ func (r *routesOption) Apply(server *Server) { // Add routes. server.mu.Lock() - server.solicitRoutes(r.add) + server.solicitRoutes(r.add, server.getOpts().Cluster.Accounts) server.mu.Unlock() server.Noticef("Reloaded: cluster routes") @@ -756,14 +794,22 @@ func (s *Server) recheckPinnedCerts(curOpts *Options, newOpts *Options) { checkClients(LEAF, s.leafs, newOpts.LeafNode.TLSPinnedCerts) } if !reflect.DeepEqual(newOpts.Cluster.TLSPinnedCerts, curOpts.Cluster.TLSPinnedCerts) { - checkClients(ROUTER, s.routes, newOpts.Cluster.TLSPinnedCerts) + s.forEachRoute(func(c *client) { + if !c.matchesPinnedCert(newOpts.Cluster.TLSPinnedCerts) { + disconnectClients = append(disconnectClients, c) + } + }) } - if reflect.DeepEqual(newOpts.Gateway.TLSPinnedCerts, curOpts.Gateway.TLSPinnedCerts) { - for _, c := range s.remotes { + if s.gateway.enabled && reflect.DeepEqual(newOpts.Gateway.TLSPinnedCerts, curOpts.Gateway.TLSPinnedCerts) { + gw := s.gateway + gw.RLock() + for _, c := range gw.out { if !c.matchesPinnedCert(newOpts.Gateway.TLSPinnedCerts) { disconnectClients = append(disconnectClients, c) } } + checkClients(GATEWAY, gw.in, newOpts.Gateway.TLSPinnedCerts) + gw.RUnlock() } s.mu.Unlock() if len(disconnectClients) > 0 { @@ -822,6 +868,13 @@ func (s *Server) ReloadOptions(newOpts *Options) error { s.mu.Unlock() + // In case "-cluster ..." was provided through the command line, this will + // properly set the Cluster.Host/Port etc... 
+ if l := curOpts.Cluster.ListenStr; l != _EMPTY_ { + newOpts.Cluster.ListenStr = l + overrideCluster(newOpts) + } + // Apply flags over config file settings. newOpts = MergeOptions(newOpts, FlagSnapshot) @@ -1062,8 +1115,27 @@ func (s *Server) diffOptions(newOpts *Options) ([]option, error) { if err := validateClusterOpts(oldClusterOpts, newClusterOpts); err != nil { return nil, err } - permsChanged := !reflect.DeepEqual(newClusterOpts.Permissions, oldClusterOpts.Permissions) - diffOpts = append(diffOpts, &clusterOption{newValue: newClusterOpts, permsChanged: permsChanged}) + co := &clusterOption{ + newValue: newClusterOpts, + permsChanged: !reflect.DeepEqual(newClusterOpts.Permissions, oldClusterOpts.Permissions), + } + co.diffPoolAndAccounts(&oldClusterOpts) + // If there are added accounts, first make sure that we can look them up. + // If we can't let's fail the reload. + for _, acc := range co.accsAdded { + if _, err := s.LookupAccount(acc); err != nil { + return nil, fmt.Errorf("unable to add account %q to the list of dedicated routes: %v", acc, err) + } + } + // If pool_size has been set to negative (but was not before), then let's + // add the system account to the list of removed accounts (we don't have + // to check if already there, duplicates are ok in that case). 
+ if newClusterOpts.PoolSize < 0 && oldClusterOpts.PoolSize >= 0 { + if sys := s.SystemAccount(); sys != nil { + co.accsRemoved = append(co.accsRemoved, sys.GetName()) + } + } + diffOpts = append(diffOpts, co) case "routes": add, remove := diffRoutes(oldValue.([]*url.URL), newValue.([]*url.URL)) diffOpts = append(diffOpts, &routesOption{add: add, remove: remove}) @@ -1431,6 +1503,7 @@ func (s *Server) applyOptions(ctx *reloadContext, opts []option) { jsEnabled = false reloadTLS = false isStatszChange = false + co *clusterOption ) for _, opt := range opts { opt.Apply(s) @@ -1446,6 +1519,9 @@ func (s *Server) applyOptions(ctx *reloadContext, opts []option) { if opt.IsTLSChange() { reloadTLS = true } + if opt.IsClusterPoolSizeOrAccountsChange() { + co = opt.(*clusterOption) + } if opt.IsClusterPermsChange() { reloadClusterPerms = true } @@ -1470,7 +1546,11 @@ func (s *Server) applyOptions(ctx *reloadContext, opts []option) { if reloadClusterPerms { s.reloadClusterPermissions(ctx.oldClusterPerms) } - + newOpts := s.getOpts() + // If we need to reload cluster pool/per-account, then co will be not nil + if co != nil { + s.reloadClusterPoolAndAccounts(co, newOpts) + } if reloadJetstream { if !jsEnabled { s.DisableJetStream() @@ -1489,7 +1569,6 @@ func (s *Server) applyOptions(ctx *reloadContext, opts []option) { // For remote gateways and leafnodes, make sure that their TLS configuration // is updated (since the config is "captured" early and changes would otherwise // not be visible). 
- newOpts := s.getOpts() if s.gateway.enabled { s.gateway.updateRemotesTLSConfig(newOpts) } @@ -1534,7 +1613,7 @@ func (s *Server) reloadClientTraceLevel() { // Update their trace level when not holding server or gateway lock s.mu.Lock() - clientCnt := 1 + len(s.clients) + len(s.grTmpClients) + len(s.routes) + len(s.leafs) + clientCnt := 1 + len(s.clients) + len(s.grTmpClients) + s.numRoutes() + len(s.leafs) s.mu.Unlock() s.gateway.RLock() @@ -1548,12 +1627,15 @@ func (s *Server) reloadClientTraceLevel() { clients = append(clients, s.sys.client) } - cMaps := []map[uint64]*client{s.clients, s.grTmpClients, s.routes, s.leafs} + cMaps := []map[uint64]*client{s.clients, s.grTmpClients, s.leafs} for _, m := range cMaps { for _, c := range m { clients = append(clients, c) } } + s.forEachRoute(func(c *client) { + clients = append(clients, c) + }) s.mu.Unlock() s.gateway.RLock() @@ -1720,9 +1802,9 @@ func (s *Server) reloadAuthorization() { clients = append(clients, client) } } - for _, route := range s.routes { + s.forEachRoute(func(route *client) { routes = append(routes, route) - } + }) // Check here for any system/internal clients which will not be in the servers map of normal clients. if s.sys != nil && s.sys.account != nil && !s.opts.NoSystemAccount { s.accounts.Store(s.sys.account.Name, s.sys.account) @@ -1842,21 +1924,16 @@ func (s *Server) clientHasMovedToDifferentAccount(c *client) bool { func (s *Server) reloadClusterPermissions(oldPerms *RoutePermissions) { s.mu.Lock() var ( - infoJSON []byte - newPerms = s.opts.Cluster.Permissions - routes = make(map[uint64]*client, len(s.routes)) - withNewProto int + infoJSON []byte + newPerms = s.getOpts().Cluster.Permissions + routes = make(map[uint64]*client, s.numRoutes()) ) // Get all connected routes - for i, route := range s.routes { - // Count the number of routes that can understand receiving INFO updates. 
+ s.forEachRoute(func(route *client) { route.mu.Lock() - if route.opts.Protocol >= RouteProtoInfo { - withNewProto++ - } + routes[route.cid] = route route.mu.Unlock() - routes[i] = route - } + }) // If new permissions is nil, then clear routeInfo import/export if newPerms == nil { s.routeInfo.Import = nil @@ -1865,23 +1942,24 @@ func (s *Server) reloadClusterPermissions(oldPerms *RoutePermissions) { s.routeInfo.Import = newPerms.Import s.routeInfo.Export = newPerms.Export } - // Regenerate route INFO - s.generateRouteInfoJSON() - infoJSON = s.routeInfoJSON - gacc := s.gacc + // Copy the current route's INFO struct. We will need to modify it per-account + routeInfo := s.routeInfo s.mu.Unlock() - // If there were no route, we are done - if len(routes) == 0 { - return + // Close connections for routes that don't understand async INFO. + for _, route := range routes { + route.mu.Lock() + close := route.opts.Protocol < RouteProtoInfo + cid := route.cid + route.mu.Unlock() + if close { + route.closeConnection(RouteRemoved) + delete(routes, cid) + } } - // If only older servers, simply close all routes and they will do the right - // thing on reconnect. - if withNewProto == 0 { - for _, route := range routes { - route.closeConnection(RouteRemoved) - } + // If there are no route left, we are done + if len(routes) == 0 { return } @@ -1891,64 +1969,275 @@ func (s *Server) reloadClusterPermissions(oldPerms *RoutePermissions) { newPermsTester := &client{} newPermsTester.setRoutePermissions(newPerms) - var ( - _localSubs [4096]*subscription - localSubs = _localSubs[:0] - subsNeedSUB []*subscription - subsNeedUNSUB []*subscription - deleteRoutedSubs []*subscription - ) - // FIXME(dlc) - Change for accounts. - gacc.sl.localSubs(&localSubs, false) + // For a given account and list of remotes, will send an INFO protocol so + // that remote updates its route permissions and will also send the RS+ or + // RS- for subscriptions that become or are no longer permitted. 
+ update := func(accName string, sl *Sublist, remotes []*client) { + var ( + _localSubs [4096]*subscription + localSubs = _localSubs[:0] + subsNeedSUB []*subscription + subsNeedUNSUB []*subscription + deleteRoutedSubs []*subscription + ) + sl.localSubs(&localSubs, false) - // Go through all local subscriptions - for _, sub := range localSubs { - // Get all subs that can now be imported - subj := string(sub.subject) - couldImportThen := oldPermsTester.canImport(subj) - canImportNow := newPermsTester.canImport(subj) - if canImportNow { - // If we could not before, then will need to send a SUB protocol. - if !couldImportThen { - subsNeedSUB = append(subsNeedSUB, sub) - } - } else if couldImportThen { - // We were previously able to import this sub, but now - // we can't so we need to send an UNSUB protocol - subsNeedUNSUB = append(subsNeedUNSUB, sub) - } - } - - for _, route := range routes { - route.mu.Lock() - // If route is to older server, simply close connection. - if route.opts.Protocol < RouteProtoInfo { - route.mu.Unlock() - route.closeConnection(RouteRemoved) - continue - } - route.setRoutePermissions(newPerms) - for _, sub := range route.subs { - // If we can't export, we need to drop the subscriptions that - // we have on behalf of this route. + // Go through all local subscriptions + for _, sub := range localSubs { + // Get all subs that can now be imported subj := string(sub.subject) - if !route.canExport(subj) { - delete(route.subs, string(sub.sid)) - deleteRoutedSubs = append(deleteRoutedSubs, sub) + couldImportThen := oldPermsTester.canImport(subj) + canImportNow := newPermsTester.canImport(subj) + if canImportNow { + // If we could not before, then will need to send a SUB protocol. 
+ if !couldImportThen { + subsNeedSUB = append(subsNeedSUB, sub) + } + } else if couldImportThen { + // We were previously able to import this sub, but now + // we can't so we need to send an UNSUB protocol + subsNeedUNSUB = append(subsNeedUNSUB, sub) } } - // Send an update INFO, which will allow remote server to show - // our current route config in monitoring and resend subscriptions - // that we now possibly allow with a change of Export permissions. - route.enqueueProto(infoJSON) - // Now send SUB and UNSUB protocols as needed. - route.sendRouteSubProtos(subsNeedSUB, false, nil) - route.sendRouteUnSubProtos(subsNeedUNSUB, false, nil) - route.mu.Unlock() + + for _, route := range remotes { + // We do this manually here, not invoke generateRouteInfoJSON() + routeInfo.RemoteAccount = accName + infoJSON = generateInfoJSON(&routeInfo) + + route.mu.Lock() + route.setRoutePermissions(newPerms) + for _, sub := range route.subs { + // If we can't export, we need to drop the subscriptions that + // we have on behalf of this route. + subj := string(sub.subject) + if !route.canExport(subj) { + delete(route.subs, string(sub.sid)) + deleteRoutedSubs = append(deleteRoutedSubs, sub) + } + } + // Send an update INFO, which will allow remote server to show + // our current route config in monitoring and resend subscriptions + // that we now possibly allow with a change of Export permissions. + route.enqueueProto(infoJSON) + // Now send SUB and UNSUB protocols as needed. + route.sendRouteSubProtos(subsNeedSUB, false, nil) + route.sendRouteUnSubProtos(subsNeedUNSUB, false, nil) + route.mu.Unlock() + } + // Remove as a batch all the subs that we have removed from each route. + sl.RemoveBatch(deleteRoutedSubs) } - // Remove as a batch all the subs that we have removed from each route. - // FIXME(dlc) - Change for accounts. - gacc.sl.RemoveBatch(deleteRoutedSubs) + + // Now go over all accounts and invoke the update function defined above. 
+ s.accounts.Range(func(_, v interface{}) bool { + acc := v.(*Account) + acc.mu.RLock() + accName, sl, poolIdx := acc.Name, acc.sl, acc.routePoolIdx + acc.mu.RUnlock() + if sl == nil { + return true + } + var remotes []*client + for _, r := range routes { + r.mu.Lock() + if (poolIdx >= 0 && poolIdx == r.route.poolIdx) || (string(r.route.accName) == accName) { + remotes = append(remotes, r) + } + r.mu.Unlock() + } + update(accName, sl, remotes) + return true + }) +} + +func (s *Server) reloadClusterPoolAndAccounts(co *clusterOption, opts *Options) { + s.mu.Lock() + // Prevent adding new routes until we are ready to do so. + s.routesReject = true + var ch chan struct{} + // For accounts that have been added to the list of dedicated routes, + // send a protocol to their current assigned routes to allow the + // other side to prepare for the changes. + if len(co.accsAdded) > 0 { + protosSent := 0 + s.accAddedReqID = nuid.Next() + for _, an := range co.accsAdded { + if s.accRoutes == nil { + s.accRoutes = make(map[string]map[string]*client) + } + // In case a config reload was first done on another server, + // we may have already switched this account to a dedicated route. + // But we still want to send the protocol over the routes that + // would have otherwise handled it. + if _, ok := s.accRoutes[an]; !ok { + s.accRoutes[an] = make(map[string]*client) + } + if a, ok := s.accounts.Load(an); ok { + acc := a.(*Account) + acc.mu.Lock() + sl := acc.sl + // Get the current route pool index before calling setRouteInfo. + rpi := acc.routePoolIdx + // Switch to per-account route if not already done. + if rpi >= 0 { + s.setRouteInfo(acc) + } else { + // If it was transitioning, make sure we set it to the state + // that indicates that it has a dedicated route + if rpi == accTransitioningToDedicatedRoute { + acc.routePoolIdx = accDedicatedRoute + } + // Otherwise get the route pool index it would have been before + // the move so we can send the protocol to those routes. 
+ rpi = s.computeRoutePoolIdx(acc) + } + acc.mu.Unlock() + // Generate the INFO protocol to send indicating that this account + // is being moved to a dedicated route. + ri := Info{ + RoutePoolSize: s.routesPoolSize, + RouteAccount: an, + RouteAccReqID: s.accAddedReqID, + } + proto := generateInfoJSON(&ri) + // Go over each remote's route at pool index `rpi` and remove + // remote subs for this account and send the protocol. + s.forEachRouteIdx(rpi, func(r *client) bool { + r.mu.Lock() + // Exclude routes to servers that don't support pooling. + if !r.route.noPool { + if subs := r.removeRemoteSubsForAcc(an); len(subs) > 0 { + sl.RemoveBatch(subs) + } + r.enqueueProto(proto) + protosSent++ + } + r.mu.Unlock() + return true + }) + } + } + if protosSent > 0 { + s.accAddedCh = make(chan struct{}, protosSent) + ch = s.accAddedCh + } + } + // Collect routes that need to be closed. + routes := make(map[*client]struct{}) + // Collect the per-account routes that need to be closed. + if len(co.accsRemoved) > 0 { + for _, an := range co.accsRemoved { + if remotes, ok := s.accRoutes[an]; ok && remotes != nil { + for _, r := range remotes { + if r != nil { + r.setNoReconnect() + routes[r] = struct{}{} + } + } + } + } + } + // If the pool size has changed, we need to close all pooled routes. + if co.poolSizeChanged { + s.forEachNonPerAccountRoute(func(r *client) { + routes[r] = struct{}{} + }) + } + // If there are routes to close, we need to release the server lock. + // Same if we need to wait on responses from the remotes when + // processing new per-account routes. 
+ if len(routes) > 0 || len(ch) > 0 { + s.mu.Unlock() + + for done := false; !done && len(ch) > 0; { + select { + case <-ch: + case <-time.After(2 * time.Second): + s.Warnf("Timed out waiting for confirmation from all routes regarding per-account routes changes") + done = true + } + } + + for r := range routes { + r.closeConnection(RouteRemoved) + } + + s.mu.Lock() + } + // Clear the accAddedCh/ReqID fields in case they were set. + s.accAddedReqID, s.accAddedCh = _EMPTY_, nil + // Now that per-account routes that needed to be closed are closed, + // remove them from s.accRoutes. Doing so before would prevent + // removeRoute() to do proper cleanup because the route would not + // be found in s.accRoutes. + for _, an := range co.accsRemoved { + delete(s.accRoutes, an) + // Do not lookup and call setRouteInfo() on the accounts here. + // We need first to set the new s.routesPoolSize value and + // anyway, there is no need to do here if the pool size has + // changed (since it will be called for all accounts). + } + // We have already added the accounts to s.accRoutes that needed to + // be added. + + // We should always have at least the system account with a dedicated route, + // but in case we have a configuration that disables pooling and without + // a system account, possibly set the accRoutes to nil. + if len(opts.Cluster.Accounts) == 0 { + s.accRoutes = nil + } + // Now deal with pool size updates. + if ps := opts.Cluster.PoolSize; ps > 0 { + s.routesPoolSize = ps + s.routeInfo.RoutePoolSize = ps + } else { + s.routesPoolSize = 1 + s.routeInfo.RoutePoolSize = 0 + } + // If the pool size has changed, we need to recompute all accounts' route + // pool index. Note that the added/removed accounts will be reset there + // too, but that's ok (we could use a map to exclude them, but not worth it). 
+ if co.poolSizeChanged { + s.accounts.Range(func(_, v interface{}) bool { + acc := v.(*Account) + acc.mu.Lock() + s.setRouteInfo(acc) + acc.mu.Unlock() + return true + }) + } else if len(co.accsRemoved) > 0 { + // For accounts that no longer have a dedicated route, we need to send + // the subsriptions on the existing pooled routes for those accounts. + for _, an := range co.accsRemoved { + if a, ok := s.accounts.Load(an); ok { + acc := a.(*Account) + acc.mu.Lock() + // First call this which will assign a new route pool index. + s.setRouteInfo(acc) + // Get the value so we can send the subscriptions interest + // on all routes with this pool index. + rpi := acc.routePoolIdx + acc.mu.Unlock() + s.forEachRouteIdx(rpi, func(r *client) bool { + // We have the guarantee that if the route exists, it + // is not a new one that would have been created when + // we released the server lock if some routes needed + // to be closed, because we have set s.routesReject + // to `true` at the top of this function. + s.sendSubsToRoute(r, rpi, an) + return true + }) + } + } + } + // Allow routes to be accepted now. + s.routesReject = false + // If there is a pool size change or added accounts, solicit routes now. 
+ if co.poolSizeChanged || len(co.accsAdded) > 0 { + s.solicitRoutes(opts.Routes, co.accsAdded) + } + s.mu.Unlock() } // validateClusterOpts ensures the new ClusterOpts does not change host or diff --git a/server/reload_test.go b/server/reload_test.go index 06762483..07131e99 100644 --- a/server/reload_test.go +++ b/server/reload_test.go @@ -30,6 +30,7 @@ import ( "reflect" "runtime" "strings" + "sync" "testing" "time" @@ -1278,8 +1279,8 @@ func TestConfigReloadEnableClusterAuthorization(t *testing.T) { } defer srvbConn.Close() - if numRoutes := srvb.NumRoutes(); numRoutes != 1 { - t.Fatalf("Expected 1 route, got %d", numRoutes) + if numRoutes := srvb.NumRoutes(); numRoutes != DEFAULT_ROUTE_POOL_SIZE { + t.Fatalf("Expected %d route, got %d", DEFAULT_ROUTE_POOL_SIZE, numRoutes) } // Ensure messages flow through the cluster as a sanity check. @@ -1321,8 +1322,8 @@ func TestConfigReloadEnableClusterAuthorization(t *testing.T) { } checkClusterFormed(t, srva, srvb) - if numRoutes := srvb.NumRoutes(); numRoutes != 1 { - t.Fatalf("Expected 1 route, got %d", numRoutes) + if numRoutes := srvb.NumRoutes(); numRoutes != DEFAULT_ROUTE_POOL_SIZE { + t.Fatalf("Expected %d route, got %d", DEFAULT_ROUTE_POOL_SIZE, numRoutes) } // Ensure messages flow through the cluster now. @@ -1375,8 +1376,8 @@ func TestConfigReloadDisableClusterAuthorization(t *testing.T) { } defer srvbConn.Close() - if numRoutes := srvb.NumRoutes(); numRoutes != 1 { - t.Fatalf("Expected 1 route, got %d", numRoutes) + if numRoutes := srvb.NumRoutes(); numRoutes != DEFAULT_ROUTE_POOL_SIZE { + t.Fatalf("Expected %d route, got %d", DEFAULT_ROUTE_POOL_SIZE, numRoutes) } // Ensure messages flow through the cluster as a sanity check. 
@@ -1400,8 +1401,8 @@ func TestConfigReloadDisableClusterAuthorization(t *testing.T) { checkClusterFormed(t, srva, srvb) - if numRoutes := srvb.NumRoutes(); numRoutes != 1 { - t.Fatalf("Expected 1 route, got %d", numRoutes) + if numRoutes := srvb.NumRoutes(); numRoutes != DEFAULT_ROUTE_POOL_SIZE { + t.Fatalf("Expected %d route, got %d", DEFAULT_ROUTE_POOL_SIZE, numRoutes) } // Ensure messages still flow through the cluster. @@ -1464,8 +1465,8 @@ func TestConfigReloadClusterRoutes(t *testing.T) { } defer srvbConn.Close() - if numRoutes := srvb.NumRoutes(); numRoutes != 1 { - t.Fatalf("Expected 1 route, got %d", numRoutes) + if numRoutes := srvb.NumRoutes(); numRoutes != DEFAULT_ROUTE_POOL_SIZE { + t.Fatalf("Expected %d route, got %d", DEFAULT_ROUTE_POOL_SIZE, numRoutes) } // Ensure consumer on srvA is propagated to srvB @@ -1622,16 +1623,17 @@ func TestConfigReloadClusterAdvertise(t *testing.T) { verify := func(expectedHost string, expectedPort int, expectedIP string) { s.mu.Lock() routeInfo := s.routeInfo - routeInfoJSON := Info{} - err := json.Unmarshal(s.routeInfoJSON[5:], &routeInfoJSON) // Skip "INFO " + rij := generateInfoJSON(&routeInfo) s.mu.Unlock() - if err != nil { - t.Fatalf("Error on Unmarshal: %v", err) - } if routeInfo.Host != expectedHost || routeInfo.Port != expectedPort || routeInfo.IP != expectedIP { t.Fatalf("Expected host/port/IP to be %s:%v, %q, got %s:%d, %q", expectedHost, expectedPort, expectedIP, routeInfo.Host, routeInfo.Port, routeInfo.IP) } + routeInfoJSON := Info{} + err := json.Unmarshal(rij[5:], &routeInfoJSON) // Skip "INFO " + if err != nil { + t.Fatalf("Error on Unmarshal: %v", err) + } // Check that server routeInfoJSON was updated too if !reflect.DeepEqual(routeInfo, routeInfoJSON) { t.Fatalf("Expected routeInfoJSON to be %+v, got %+v", routeInfo, routeInfoJSON) @@ -2013,7 +2015,7 @@ func TestConfigReloadClusterWorks(t *testing.T) { getCID := func(s *Server) uint64 { s.mu.Lock() defer s.mu.Unlock() - for _, r := range s.routes { 
+ if r := getFirstRoute(s); r != nil { return r.cid } return 0 @@ -2462,11 +2464,10 @@ func TestConfigReloadClusterPermsOldServer(t *testing.T) { getRouteRID := func() uint64 { rid := uint64(0) srvb.mu.Lock() - for _, r := range srvb.routes { + if r := getFirstRoute(srvb); r != nil { r.mu.Lock() rid = r.cid r.mu.Unlock() - break } srvb.mu.Unlock() return rid @@ -3044,7 +3045,6 @@ func TestConfigReloadAccountServicesImportExport(t *testing.T) { ] } } - no_sys_acc: true cluster { name: "abc" port: -1 @@ -3157,7 +3157,6 @@ func TestConfigReloadAccountServicesImportExport(t *testing.T) { ] } } - no_sys_acc: true cluster { name: "abc" port: -1 @@ -4511,3 +4510,778 @@ func TestConfigReloadWithSysAccountOnly(t *testing.T) { // ok } } + +func TestReloadRouteImportPermissionsWithAccounts(t *testing.T) { + for _, test := range []struct { + name string + poolSize string + accounts string + }{ + {"regular", _EMPTY_, _EMPTY_}, + {"pooling", "pool_size: 5", _EMPTY_}, + {"per-account", _EMPTY_, "accounts: [\"A\"]"}, + {"pool and per-account", "pool_size: 3", "accounts: [\"A\"]"}, + } { + t.Run(test.name, func(t *testing.T) { + confATemplate := ` + server_name: "A" + port: -1 + accounts { + A { users: [{user: "user1", password: "pwd"}] } + B { users: [{user: "user2", password: "pwd"}] } + C { users: [{user: "user3", password: "pwd"}] } + D { users: [{user: "user4", password: "pwd"}] } + } + cluster { + name: "local" + listen: 127.0.0.1:-1 + permissions { + import { + allow: %s + } + export { + allow: ">" + } + } + %s + %s + } + ` + confA := createConfFile(t, []byte(fmt.Sprintf(confATemplate, `"foo"`, test.poolSize, test.accounts))) + srva, optsA := RunServerWithConfig(confA) + defer srva.Shutdown() + + confBTemplate := ` + server_name: "B" + port: -1 + accounts { + A { users: [{user: "user1", password: "pwd"}] } + B { users: [{user: "user2", password: "pwd"}] } + C { users: [{user: "user3", password: "pwd"}] } + D { users: [{user: "user4", password: "pwd"}] } + } + cluster { + 
listen: 127.0.0.1:-1 + name: "local" + permissions { + import { + allow: %s + } + export { + allow: ">" + } + } + routes = [ + "nats://127.0.0.1:%d" + ] + %s + %s + } + ` + confB := createConfFile(t, []byte(fmt.Sprintf(confBTemplate, `"foo"`, optsA.Cluster.Port, test.poolSize, test.accounts))) + srvb, _ := RunServerWithConfig(confB) + defer srvb.Shutdown() + + checkClusterFormed(t, srva, srvb) + + ncA := natsConnect(t, srva.ClientURL(), nats.UserInfo("user1", "pwd")) + defer ncA.Close() + + sub1Foo := natsSubSync(t, ncA, "foo") + sub2Foo := natsSubSync(t, ncA, "foo") + + sub1Bar := natsSubSync(t, ncA, "bar") + sub2Bar := natsSubSync(t, ncA, "bar") + + natsFlush(t, ncA) + + checkSubInterest(t, srvb, "A", "foo", 2*time.Second) + + ncB := natsConnect(t, srvb.ClientURL(), nats.UserInfo("user1", "pwd")) + defer ncB.Close() + + check := func(sub *nats.Subscription, expected bool) { + t.Helper() + if expected { + natsNexMsg(t, sub, time.Second) + } else { + if msg, err := sub.NextMsg(250 * time.Millisecond); err == nil { + t.Fatalf("Should not have gotten the message, got %s/%s", msg.Subject, msg.Data) + } + } + } + + // Should receive on "foo" + natsPub(t, ncB, "foo", []byte("foo1")) + check(sub1Foo, true) + check(sub2Foo, true) + + // But not on "bar" + natsPub(t, ncB, "bar", []byte("bar1")) + check(sub1Bar, false) + check(sub2Bar, false) + + reloadUpdateConfig(t, srva, confA, fmt.Sprintf(confATemplate, `"bar"`, test.poolSize, test.accounts)) + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(confBTemplate, `"bar"`, optsA.Cluster.Port, test.poolSize, test.accounts)) + + checkClusterFormed(t, srva, srvb) + + checkSubInterest(t, srvb, "A", "bar", 2*time.Second) + + // Should not receive on foo + natsPub(t, ncB, "foo", []byte("foo2")) + check(sub1Foo, false) + check(sub2Foo, false) + + // Should be able to receive on bar + natsPub(t, ncB, "bar", []byte("bar2")) + check(sub1Bar, true) + check(sub2Bar, true) + + // Restore "foo" + reloadUpdateConfig(t, srva, confA, 
fmt.Sprintf(confATemplate, `"foo"`, test.poolSize, test.accounts)) + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(confBTemplate, `"foo"`, optsA.Cluster.Port, test.poolSize, test.accounts)) + + checkClusterFormed(t, srva, srvb) + + checkSubInterest(t, srvb, "A", "foo", 2*time.Second) + + // Should receive on "foo" + natsPub(t, ncB, "foo", []byte("foo3")) + check(sub1Foo, true) + check(sub2Foo, true) + // But make sure there are no more than what we expect + check(sub1Foo, false) + check(sub2Foo, false) + + // And now "bar" should fail + natsPub(t, ncB, "bar", []byte("bar3")) + check(sub1Bar, false) + check(sub2Bar, false) + }) + } +} + +func TestReloadRoutePoolAndPerAccount(t *testing.T) { + confATemplate := ` + port: -1 + server_name: "A" + accounts { + A { users: [{user: "user1", password: "pwd"}] } + B { users: [{user: "user2", password: "pwd"}] } + C { users: [{user: "user3", password: "pwd"}] } + D { users: [{user: "user4", password: "pwd"}] } + } + cluster { + name: "local" + listen: 127.0.0.1:-1 + %s + %s + } + ` + confA := createConfFile(t, []byte(fmt.Sprintf(confATemplate, "pool_size: 3", "accounts: [\"A\"]"))) + srva, optsA := RunServerWithConfig(confA) + defer srva.Shutdown() + + confBCTemplate := ` + port: -1 + server_name: "%s" + accounts { + A { users: [{user: "user1", password: "pwd"}] } + B { users: [{user: "user2", password: "pwd"}] } + C { users: [{user: "user3", password: "pwd"}] } + D { users: [{user: "user4", password: "pwd"}] } + } + cluster { + listen: 127.0.0.1:-1 + name: "local" + routes = [ + "nats://127.0.0.1:%d" + ] + %s + %s + } + ` + confB := createConfFile(t, []byte(fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, "pool_size: 3", "accounts: [\"A\"]"))) + srvb, _ := RunServerWithConfig(confB) + defer srvb.Shutdown() + confC := createConfFile(t, []byte(fmt.Sprintf(confBCTemplate, "C", optsA.Cluster.Port, "pool_size: 3", "accounts: [\"A\"]"))) + srvc, _ := RunServerWithConfig(confC) + defer srvc.Shutdown() + + checkClusterFormed(t, 
srva, srvb, srvc) + + // We will also create subscriptions for accounts A, B and C on all sides + // just to make sure that interest is properly propagated after a reload. + // The conns slices will contain connections for accounts A on srva, srvb, + // srvc, then B on srva, srvb, etc.. and the subs slices will contain + // subscriptions for account A on foo on srva, bar on srvb, baz on srvc, + // then for account B on foo on srva, etc... + var conns []*nats.Conn + var subs []*nats.Subscription + for _, user := range []string{"user1", "user2", "user3"} { + nc := natsConnect(t, srva.ClientURL(), nats.UserInfo(user, "pwd")) + defer nc.Close() + conns = append(conns, nc) + sub := natsSubSync(t, nc, "foo") + subs = append(subs, sub) + nc = natsConnect(t, srvb.ClientURL(), nats.UserInfo(user, "pwd")) + defer nc.Close() + conns = append(conns, nc) + sub = natsSubSync(t, nc, "bar") + subs = append(subs, sub) + nc = natsConnect(t, srvc.ClientURL(), nats.UserInfo(user, "pwd")) + defer nc.Close() + conns = append(conns, nc) + sub = natsSubSync(t, nc, "baz") + subs = append(subs, sub) + } + + checkCluster := func() { + t.Helper() + checkClusterFormed(t, srva, srvb, srvc) + + for _, acc := range []string{"A", "B", "C"} { + // On server A, there should be interest for bar/baz + checkSubInterest(t, srva, acc, "bar", 2*time.Second) + checkSubInterest(t, srva, acc, "baz", 2*time.Second) + // On serer B, there should be interest on foo/baz + checkSubInterest(t, srvb, acc, "foo", 2*time.Second) + checkSubInterest(t, srvb, acc, "baz", 2*time.Second) + // And on server C, interest on foo/bar + checkSubInterest(t, srvc, acc, "foo", 2*time.Second) + checkSubInterest(t, srvc, acc, "bar", 2*time.Second) + } + } + checkCluster() + + getAccRouteID := func(acc string) uint64 { + s := srva + var id uint64 + srvbId := srvb.ID() + s.mu.RLock() + if remotes, ok := s.accRoutes[acc]; ok { + // For this test, we will take a single remote, say srvb + if r := remotes[srvbId]; r != nil { + r.mu.Lock() 
+ if string(r.route.accName) == acc { + id = r.cid + } + r.mu.Unlock() + } + } + s.mu.RUnlock() + return id + } + // Capture the route for account "A" + raid := getAccRouteID("A") + if raid == 0 { + t.Fatal("Did not find route for account A") + } + + getRouteIDForAcc := func(acc string) uint64 { + s := srva + a, _ := s.LookupAccount(acc) + if a == nil { + return 0 + } + a.mu.RLock() + pidx := a.routePoolIdx + a.mu.RUnlock() + var id uint64 + s.mu.RLock() + // For this test, we will take a single remote, say srvb + srvbId := srvb.ID() + if conns, ok := s.routes[srvbId]; ok { + if r := conns[pidx]; r != nil { + r.mu.Lock() + id = r.cid + r.mu.Unlock() + } + } + s.mu.RUnlock() + return id + } + rbid := getRouteIDForAcc("B") + if rbid == 0 { + t.Fatal("Did not find route for account B") + } + rcid := getRouteIDForAcc("C") + if rcid == 0 { + t.Fatal("Did not find route for account C") + } + rdid := getRouteIDForAcc("D") + if rdid == 0 { + t.Fatal("Did not find route for account D") + } + + sendAndRecv := func(msg string) { + t.Helper() + for accIdx := 0; accIdx < 9; accIdx += 3 { + natsPub(t, conns[accIdx], "bar", []byte(msg)) + m := natsNexMsg(t, subs[accIdx+1], time.Second) + checkMsg := func(m *nats.Msg, subj string) { + t.Helper() + if string(m.Data) != msg { + t.Fatalf("For accIdx=%v, subject %q, expected message %q, got %q", accIdx, subj, msg, m.Data) + } + } + checkMsg(m, "bar") + natsPub(t, conns[accIdx+1], "baz", []byte(msg)) + m = natsNexMsg(t, subs[accIdx+2], time.Second) + checkMsg(m, "baz") + natsPub(t, conns[accIdx+2], "foo", []byte(msg)) + m = natsNexMsg(t, subs[accIdx], time.Second) + checkMsg(m, "foo") + } + } + sendAndRecv("0") + + // Now add accounts "B" and "D" and do a config reload. + reloadUpdateConfig(t, srva, confA, fmt.Sprintf(confATemplate, "pool_size: 3", "accounts: [\"A\",\"B\",\"D\"]")) + + // Even before reloading srvb and srvc, we should already have per-account + // routes for accounts B and D being established. 
The accounts routePoolIdx + // should be marked as transitioning. + checkAccPoolIdx := func(s *Server, acc string, expected int) { + t.Helper() + checkFor(t, 2*time.Second, 50*time.Millisecond, func() error { + s.mu.RLock() + defer s.mu.RUnlock() + if a, ok := s.accounts.Load(acc); ok { + acc := a.(*Account) + acc.mu.RLock() + rpi := acc.routePoolIdx + acc.mu.RUnlock() + if rpi != expected { + return fmt.Errorf("Server %q - Account %q routePoolIdx should be %v, but is %v", s, acc, expected, rpi) + } + return nil + } + return fmt.Errorf("Server %q - Account %q not found", s, acc) + }) + } + checkRoutePerAccAlreadyEstablished := func(s *Server, acc string) { + t.Helper() + checkFor(t, 2*time.Second, 50*time.Millisecond, func() error { + s.mu.RLock() + defer s.mu.RUnlock() + if _, ok := s.accRoutes[acc]; !ok { + return fmt.Errorf("Route for account %q still not established", acc) + } + return nil + }) + checkAccPoolIdx(s, acc, accTransitioningToDedicatedRoute) + } + // Check srvb and srvc for both accounts. + for _, s := range []*Server{srvb, srvc} { + for _, acc := range []string{"B", "D"} { + checkRoutePerAccAlreadyEstablished(s, acc) + } + } + // On srva, the accounts should already have their routePoolIdx set to + // the accDedicatedRoute value. 
+ for _, acc := range []string{"B", "D"} { + checkAccPoolIdx(srva, acc, accDedicatedRoute) + } + // Now reload the other servers + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, "pool_size: 3", "accounts: [\"A\",\"B\",\"D\"]")) + reloadUpdateConfig(t, srvc, confC, fmt.Sprintf(confBCTemplate, "C", optsA.Cluster.Port, "pool_size: 3", "accounts: [\"A\",\"B\",\"D\"]")) + + checkCluster() + // Now check that the accounts B and D are no longer transitioning + for _, s := range []*Server{srva, srvb, srvc} { + for _, acc := range []string{"B", "D"} { + checkAccPoolIdx(s, acc, accDedicatedRoute) + } + } + + checkRouteForADidNotChange := func() { + t.Helper() + if id := getAccRouteID("A"); id != raid { + t.Fatalf("Route id for account 'A' was %d, is now %d", raid, id) + } + } + // Verify that the route for account "A" did not change. + checkRouteForADidNotChange() + + // Verify that account "B" has now its own route + if id := getAccRouteID("B"); id == 0 { + t.Fatal("Did not find route for account B") + } + // Same for "D". + if id := getAccRouteID("D"); id == 0 { + t.Fatal("Did not find route for account D") + } + + checkRouteStillPresent := func(id uint64) { + t.Helper() + srva.mu.RLock() + defer srva.mu.RUnlock() + srvbId := srvb.ID() + for _, r := range srva.routes[srvbId] { + if r != nil { + r.mu.Lock() + found := r.cid == id + r.mu.Unlock() + if found { + return + } + } + } + t.Fatalf("Route id %v has been disconnected", id) + } + // Verify that routes that were dealing with "B", and "D" were not disconnected. + // Of course, since "C" was not involved, that route should still be present too. + checkRouteStillPresent(rbid) + checkRouteStillPresent(rcid) + checkRouteStillPresent(rdid) + + sendAndRecv("1") + + // Now remove "B" and "D" and verify that route for "A" did not change. 
+ reloadUpdateConfig(t, srva, confA, fmt.Sprintf(confATemplate, "pool_size: 3", "accounts: [\"A\"]")) + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, "pool_size: 3", "accounts: [\"A\"]")) + reloadUpdateConfig(t, srvc, confC, fmt.Sprintf(confBCTemplate, "C", optsA.Cluster.Port, "pool_size: 3", "accounts: [\"A\"]")) + + checkCluster() + + // Verify that the route for account "A" did not change. + checkRouteForADidNotChange() + + // Verify that there is no dedicated route for account "B" + if id := getAccRouteID("B"); id != 0 { + t.Fatal("Should not have found a route for account B") + } + // It should instead be in one of the pooled route, and same + // than it was before. + if id := getRouteIDForAcc("B"); id != rbid { + t.Fatalf("Account B's route was %d, it is now %d", rbid, id) + } + // Same for "D" + if id := getAccRouteID("D"); id != 0 { + t.Fatal("Should not have found a route for account D") + } + if id := getRouteIDForAcc("D"); id != rdid { + t.Fatalf("Account D's route was %d, it is now %d", rdid, id) + } + + sendAndRecv("2") + + // Finally, change pool size and make sure that routes handling B, C and D + // were disconnected/reconnected, and that A did not change. 
+ reloadUpdateConfig(t, srva, confA, fmt.Sprintf(confATemplate, "pool_size: 5", "accounts: [\"A\"]")) + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, "pool_size: 5", "accounts: [\"A\"]")) + reloadUpdateConfig(t, srvc, confC, fmt.Sprintf(confBCTemplate, "C", optsA.Cluster.Port, "pool_size: 5", "accounts: [\"A\"]")) + + checkCluster() + + checkRouteForADidNotChange() + + checkRouteDisconnected := func(acc string, oldID uint64) { + t.Helper() + if id := getRouteIDForAcc(acc); id == oldID { + t.Fatalf("Route that was handling account %q did not change", acc) + } + } + checkRouteDisconnected("B", rbid) + checkRouteDisconnected("C", rcid) + checkRouteDisconnected("D", rdid) + + sendAndRecv("3") + + // Now check that there were no duplicates and that all subs have 0 pending messages. + for i, sub := range subs { + if n, _, _ := sub.Pending(); n != 0 { + t.Fatalf("Expected 0 pending messages, got %v for accIdx=%d sub=%q", n, i, sub.Subject) + } + } +} + +func TestConfigReloadRoutePoolCannotBeDisabledIfAccountsPresent(t *testing.T) { + tmpl := ` + port: -1 + server_name: "%s" + accounts { + A { users: [{user: "user1", password: "pwd"}] } + B { users: [{user: "user2", password: "pwd"}] } + } + cluster { + name: "local" + listen: 127.0.0.1:-1 + %s + %s + %s + } + ` + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, "A", "accounts: [\"A\"]", _EMPTY_, _EMPTY_))) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, "B", "accounts: [\"A\"]", _EMPTY_, + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port)))) + s2, _ := RunServerWithConfig(conf2) + defer s2.Shutdown() + + checkClusterFormed(t, s1, s2) + + err := os.WriteFile(conf1, []byte(fmt.Sprintf(tmpl, "A", "accounts: [\"A\"]", "pool_size: -1", _EMPTY_)), 0666) + require_NoError(t, err) + if err := s1.Reload(); err == nil || !strings.Contains(err.Error(), "accounts") { + t.Fatalf("Expected error regarding 
presence of accounts, got %v", err) + } + + // Now remove the accounts too and reload, this should work + reloadUpdateConfig(t, s1, conf1, fmt.Sprintf(tmpl, "A", _EMPTY_, "pool_size: -1", _EMPTY_)) + reloadUpdateConfig(t, s2, conf2, fmt.Sprintf(tmpl, "B", _EMPTY_, "pool_size: -1", fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port))) + checkClusterFormed(t, s1, s2) + + ncs2 := natsConnect(t, s2.ClientURL(), nats.UserInfo("user1", "pwd")) + defer ncs2.Close() + sub := natsSubSync(t, ncs2, "foo") + checkSubInterest(t, s1, "A", "foo", time.Second) + + ncs1 := natsConnect(t, s1.ClientURL(), nats.UserInfo("user1", "pwd")) + defer ncs1.Close() + natsPub(t, ncs1, "foo", []byte("hello")) + natsNexMsg(t, sub, time.Second) + + // Wait a bit and make sure there are no duplicates + time.Sleep(50 * time.Millisecond) + if n, _, _ := sub.Pending(); n != 0 { + t.Fatalf("Expected no pending messages, got %v", n) + } + + // Finally, verify that the system account is no longer bound to + // a dedicated route. For that matter, s.accRoutes should be nil. 
+ for _, s := range []*Server{s1, s2} { + sys := s.SystemAccount() + if sys == nil { + t.Fatal("No system account found") + } + sys.mu.RLock() + rpi := sys.routePoolIdx + sys.mu.RUnlock() + if rpi != 0 { + t.Fatalf("Server %q - expected account's routePoolIdx to be 0, got %v", s, rpi) + } + s.mu.RLock() + arNil := s.accRoutes == nil + s.mu.RUnlock() + if !arNil { + t.Fatalf("Server %q - accRoutes expected to be nil, it was not", s) + } + } +} + +func TestReloadRoutePoolAndPerAccountWithOlderServer(t *testing.T) { + confATemplate := ` + port: -1 + server_name: "A" + accounts { + A { users: [{user: "user1", password: "pwd"}] } + } + cluster { + name: "local" + listen: 127.0.0.1:-1 + %s + %s + } + ` + confA := createConfFile(t, []byte(fmt.Sprintf(confATemplate, "pool_size: 3", _EMPTY_))) + srva, optsA := RunServerWithConfig(confA) + defer srva.Shutdown() + + confBCTemplate := ` + port: -1 + server_name: "%s" + accounts { + A { users: [{user: "user1", password: "pwd"}] } + } + cluster { + listen: 127.0.0.1:-1 + name: "local" + routes = [ + "nats://127.0.0.1:%d" + ] + %s + %s + } + ` + confB := createConfFile(t, []byte(fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, "pool_size: 3", _EMPTY_))) + srvb, _ := RunServerWithConfig(confB) + defer srvb.Shutdown() + confC := createConfFile(t, []byte(fmt.Sprintf(confBCTemplate, "C", optsA.Cluster.Port, "pool_size: -1", _EMPTY_))) + srvc, _ := RunServerWithConfig(confC) + defer srvc.Shutdown() + + checkClusterFormed(t, srva, srvb, srvc) + + // Create a connection and sub on B and C + ncB := natsConnect(t, srvb.ClientURL(), nats.UserInfo("user1", "pwd")) + defer ncB.Close() + subB := natsSubSync(t, ncB, "foo") + + ncC := natsConnect(t, srvc.ClientURL(), nats.UserInfo("user1", "pwd")) + defer ncC.Close() + subC := natsSubSync(t, ncC, "bar") + + // Check that on server B, there is interest on "bar" for account A + // (coming from server C), and on server C, there is interest on "foo" + // for account A (coming from server B). 
+ checkCluster := func() { + t.Helper() + checkClusterFormed(t, srva, srvb, srvc) + checkSubInterest(t, srvb, "A", "bar", 2*time.Second) + checkSubInterest(t, srvc, "A", "foo", 2*time.Second) + } + checkCluster() + + sendAndRecv := func(msg string) { + t.Helper() + natsPub(t, ncB, "bar", []byte(msg)) + if m := natsNexMsg(t, subC, time.Second); string(m.Data) != msg { + t.Fatalf("Expected message %q on %q, got %q", msg, "bar", m.Data) + } + natsPub(t, ncC, "foo", []byte(msg)) + if m := natsNexMsg(t, subB, time.Second); string(m.Data) != msg { + t.Fatalf("Expected message %q on %q, got %q", msg, "foo", m.Data) + } + } + sendAndRecv("0") + + // Now add account "A" and do a config reload. We do this only on + // server srva and srb since server C really does not change. + reloadUpdateConfig(t, srva, confA, fmt.Sprintf(confATemplate, "pool_size: 3", "accounts: [\"A\"]")) + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, "pool_size: 3", "accounts: [\"A\"]")) + checkCluster() + sendAndRecv("1") + + // Remove "A" from the accounts list + reloadUpdateConfig(t, srva, confA, fmt.Sprintf(confATemplate, "pool_size: 3", _EMPTY_)) + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, "pool_size: 3", _EMPTY_)) + checkCluster() + sendAndRecv("2") + + // Change the pool size + reloadUpdateConfig(t, srva, confA, fmt.Sprintf(confATemplate, "pool_size: 5", _EMPTY_)) + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, "pool_size: 5", _EMPTY_)) + checkCluster() + sendAndRecv("3") + + // Add account "A" and change the pool size + reloadUpdateConfig(t, srva, confA, fmt.Sprintf(confATemplate, "pool_size: 4", "accounts: [\"A\"]")) + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, "pool_size: 4", "accounts: [\"A\"]")) + checkCluster() + sendAndRecv("4") + + // Remove account "A" and change the pool size + reloadUpdateConfig(t, srva, confA, 
fmt.Sprintf(confATemplate, "pool_size: 3", _EMPTY_)) + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, "pool_size: 3", _EMPTY_)) + checkCluster() + sendAndRecv("5") +} + +func TestReloadRoutePoolAndPerAccountNoDuplicateSub(t *testing.T) { + confATemplate := ` + port: -1 + server_name: "A" + accounts { + A { users: [{user: "user1", password: "pwd"}] } + } + cluster { + name: "local" + listen: 127.0.0.1:-1 + pool_size: 3 + %s + } + ` + confA := createConfFile(t, []byte(fmt.Sprintf(confATemplate, _EMPTY_))) + srva, optsA := RunServerWithConfig(confA) + defer srva.Shutdown() + + confBCTemplate := ` + port: -1 + server_name: "%s" + accounts { + A { users: [{user: "user1", password: "pwd"}] } + } + cluster { + listen: 127.0.0.1:-1 + name: "local" + routes = [ + "nats://127.0.0.1:%d" + ] + pool_size: 3 + %s + } + ` + confB := createConfFile(t, []byte(fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, _EMPTY_))) + srvb, _ := RunServerWithConfig(confB) + defer srvb.Shutdown() + confC := createConfFile(t, []byte(fmt.Sprintf(confBCTemplate, "C", optsA.Cluster.Port, _EMPTY_))) + srvc, _ := RunServerWithConfig(confC) + defer srvc.Shutdown() + + checkClusterFormed(t, srva, srvb, srvc) + + ncC := natsConnect(t, srvc.ClientURL(), nats.UserInfo("user1", "pwd")) + defer ncC.Close() + + ch := make(chan struct{}) + wg := sync.WaitGroup{} + wg.Add(1) + var subs []*nats.Subscription + go func() { + defer wg.Done() + // Limit the number of subscriptions. From experimentation, the issue would + // arise around subscriptions ~700. + for i := 0; i < 1000; i++ { + if sub, err := ncC.SubscribeSync(fmt.Sprintf("foo.%d", i)); err == nil { + subs = append(subs, sub) + } + select { + case <-ch: + return + default: + if i%100 == 0 { + time.Sleep(5 * time.Millisecond) + } + } + } + }() + + // Wait a tiny bit before doing the configuration reload. 
+ time.Sleep(100 * time.Millisecond) + + reloadUpdateConfig(t, srva, confA, fmt.Sprintf(confATemplate, "accounts: [\"A\"]")) + reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(confBCTemplate, "B", optsA.Cluster.Port, "accounts: [\"A\"]")) + reloadUpdateConfig(t, srvc, confC, fmt.Sprintf(confBCTemplate, "C", optsA.Cluster.Port, "accounts: [\"A\"]")) + + checkClusterFormed(t, srva, srvb, srvc) + + close(ch) + wg.Wait() + + for _, sub := range subs { + checkSubInterest(t, srvb, "A", sub.Subject, 500*time.Millisecond) + } + + ncB := natsConnect(t, srvb.ClientURL(), nats.UserInfo("user1", "pwd")) + defer ncB.Close() + + for _, sub := range subs { + natsPub(t, ncB, sub.Subject, []byte("hello")) + } + + // Now make sure that there is only 1 pending message for each sub. + // Wait a bit to give a chance to duplicate messages to arrive if + // there was a bug that would lead to a sub on each route (the pooled + // and the per-account) + time.Sleep(250 * time.Millisecond) + for _, sub := range subs { + if n, _, _ := sub.Pending(); n != 1 { + t.Fatalf("Expected only 1 message for subscription on %q, got %v", sub.Subject, n) + } + } +} diff --git a/server/route.go b/server/route.go index 568d0df7..835fa1b2 100644 --- a/server/route.go +++ b/server/route.go @@ -21,6 +21,7 @@ import ( "math/rand" "net" "net/url" + "reflect" "runtime" "strconv" "strings" @@ -78,11 +79,20 @@ type route struct { jetstream bool connectURLs []string wsConnURLs []string - replySubs map[*subscription]*time.Timer gatewayURL string leafnodeURL string hash string idHash string + // Location of the route in the slice: s.routes[remoteID][]*client. + // Initialized to -1 on creation, as to indicate that it is not + // added to the list. + poolIdx int + // If this is set, it means that the route is dedicated for this + // account and the account name will not be included in protocols. 
+ accName []byte + // This is set to true if this is a route connection to an old + // server or a server that has pooling completely disabled. + noPool bool } type connectInfo struct { @@ -126,25 +136,11 @@ func (c *client) removeReplySub(sub *subscription) { (v.(*Account)).sl.Remove(sub) } c.mu.Lock() - c.removeReplySubTimeout(sub) delete(c.subs, string(sub.sid)) c.mu.Unlock() } } -// removeReplySubTimeout will remove a timer if it exists. -// Lock should be held upon entering. -func (c *client) removeReplySubTimeout(sub *subscription) { - // Remove any reply sub timer if it exists. - if c.route == nil || c.route.replySubs == nil { - return - } - if t, ok := c.route.replySubs[sub]; ok { - t.Stop() - delete(c.route.replySubs, sub) - } -} - func (c *client) processAccountSub(arg []byte) error { accName := string(arg) if c.kind == GATEWAY { @@ -184,6 +180,13 @@ func (c *client) processRoutedOriginClusterMsgArgs(arg []byte) error { args = append(args, arg[start:]) } + var an []byte + if c.kind == ROUTER { + if an = c.route.accName; len(an) > 0 && len(args) > 2 { + args = append(args[:2], args[1:]...) 
+ args[1] = an + } + } c.pa.arg = arg switch len(args) { case 0, 1, 2, 3, 4: @@ -245,8 +248,11 @@ func (c *client) processRoutedOriginClusterMsgArgs(arg []byte) error { c.pa.origin = args[0] c.pa.account = args[1] c.pa.subject = args[2] - c.pa.pacache = arg[len(args[0])+1 : len(args[0])+len(args[1])+len(args[2])+2] - + if len(an) > 0 { + c.pa.pacache = c.pa.subject + } else { + c.pa.pacache = arg[len(args[0])+1 : len(args[0])+len(args[1])+len(args[2])+2] + } return nil } @@ -255,6 +261,12 @@ func (c *client) processRoutedHeaderMsgArgs(arg []byte) error { // Unroll splitArgs to avoid runtime/heap issues a := [MAX_HMSG_ARGS][]byte{} args := a[:0] + var an []byte + if c.kind == ROUTER { + if an = c.route.accName; len(an) > 0 { + args = append(args, an) + } + } start := -1 for i, b := range arg { switch b { @@ -333,7 +345,11 @@ func (c *client) processRoutedHeaderMsgArgs(arg []byte) error { // Common ones processed after check for arg length c.pa.account = args[0] c.pa.subject = args[1] - c.pa.pacache = arg[:len(args[0])+len(args[1])+1] + if len(an) > 0 { + c.pa.pacache = c.pa.subject + } else { + c.pa.pacache = arg[:len(args[0])+len(args[1])+1] + } return nil } @@ -342,6 +358,12 @@ func (c *client) processRoutedMsgArgs(arg []byte) error { // Unroll splitArgs to avoid runtime/heap issues a := [MAX_RMSG_ARGS][]byte{} args := a[:0] + var an []byte + if c.kind == ROUTER { + if an = c.route.accName; len(an) > 0 { + args = append(args, an) + } + } start := -1 for i, b := range arg { switch b { @@ -405,7 +427,11 @@ func (c *client) processRoutedMsgArgs(arg []byte) error { // Common ones processed after check for arg length c.pa.account = args[0] c.pa.subject = args[1] - c.pa.pacache = arg[:len(args[0])+len(args[1])+1] + if len(an) > 0 { + c.pa.pacache = c.pa.subject + } else { + c.pa.pacache = arg[:len(args[0])+len(args[1])+1] + } return nil } @@ -477,15 +503,6 @@ func (c *client) sendRouteConnect(clusterName string, tlsRequired bool) error { // Process the info message if 
we are a route. func (c *client) processRouteInfo(info *Info) { - // We may need to update route permissions and will need the account - // sublist. Since getting the account requires server lock, do the - // lookup now. - - // FIXME(dlc) - Add account scoping. - gacc := c.srv.globalAccount() - gacc.mu.RLock() - sl := gacc.sl - gacc.mu.RUnlock() supportsHeaders := c.srv.supportsHeaders() clusterName := c.srv.ClusterName() @@ -517,7 +534,7 @@ func (c *client) processRouteInfo(info *Info) { // Use other if remote is non dynamic or their name is "bigger" if s.isClusterNameDynamic() && (!info.Dynamic || (strings.Compare(clusterName, info.Cluster) < 0)) { s.setClusterName(info.Cluster) - s.removeAllRoutesExcept(c) + s.removeAllRoutesExcept(info.ID) c.mu.Lock() } else { c.closeConnection(ClusterNameConflict) @@ -525,12 +542,86 @@ func (c *client) processRouteInfo(info *Info) { } } + opts := s.getOpts() + // If this is an async INFO from an existing route... if c.flags.isSet(infoReceived) { remoteID := c.route.remoteID + // Check if this is an INFO about adding a per-account route during + // a configuration reload. + if info.RouteAccReqID != _EMPTY_ { + c.mu.Unlock() + + // If there is an account name, then the remote server is telling + // us that this account will now have its dedicated route. + if an := info.RouteAccount; an != _EMPTY_ { + acc, err := s.LookupAccount(an) + if err != nil { + s.Errorf("Error looking up account %q: %v", an, err) + return + } + s.mu.Lock() + if _, ok := s.accRoutes[an]; !ok { + s.accRoutes[an] = make(map[string]*client) + } + acc.mu.Lock() + sl := acc.sl + rpi := acc.routePoolIdx + // Make sure that the account was not already switched. + if rpi >= 0 { + s.setRouteInfo(acc) + // Change the route pool index to indicate that this + // account is actually transitioning. This will be used + // to suppress possible remote subscription interest coming + // in while the transition is happening. 
+ acc.routePoolIdx = accTransitioningToDedicatedRoute + } else if info.RoutePoolSize == s.routesPoolSize { + // Otherwise, and if the other side's pool size matches + // ours, get the route pool index that was handling this + // account. + rpi = s.computeRoutePoolIdx(acc) + } + acc.mu.Unlock() + // Go over each remote's route at pool index `rpi` and remove + // remote subs for this account. + s.forEachRouteIdx(rpi, func(r *client) bool { + r.mu.Lock() + // Exclude routes to servers that don't support pooling. + if !r.route.noPool { + if subs := r.removeRemoteSubsForAcc(an); len(subs) > 0 { + sl.RemoveBatch(subs) + } + } + r.mu.Unlock() + return true + }) + // Respond to the remote by clearing the RouteAccount field. + info.RouteAccount = _EMPTY_ + proto := generateInfoJSON(info) + c.mu.Lock() + c.enqueueProto(proto) + c.mu.Unlock() + s.mu.Unlock() + } else { + // If no account name is specified, this is a response from the + // remote. Simply send to the communication channel, if the + // request ID matches the current one. + s.mu.Lock() + if info.RouteAccReqID == s.accAddedReqID && s.accAddedCh != nil { + select { + case s.accAddedCh <- struct{}{}: + default: + } + } + s.mu.Unlock() + } + // In both cases, we are done here. + return + } + // Check if this is an INFO for gateways... - if info.Gateway != "" { + if info.Gateway != _EMPTY_ { c.mu.Unlock() // If this server has no gateway configured, report error and return. if !s.gateway.enabled { @@ -545,7 +636,7 @@ func (c *client) processRouteInfo(info *Info) { // We receive an INFO from a server that informs us about another server, // so the info.ID in the INFO protocol does not match the ID of this route. - if remoteID != "" && remoteID != info.ID { + if remoteID != _EMPTY_ && remoteID != info.ID { c.mu.Unlock() // Process this implicit route. 
We will check that it is not an explicit @@ -556,23 +647,29 @@ func (c *client) processRouteInfo(info *Info) { var connectURLs []string var wsConnectURLs []string + var updateRoutePerms bool // If we are notified that the remote is going into LDM mode, capture route's connectURLs. if info.LameDuckMode { connectURLs = c.route.connectURLs wsConnectURLs = c.route.wsConnURLs } else { - // If this is an update due to config reload on the remote server, - // need to possibly send local subs to the remote server. - c.updateRemoteRoutePerms(sl, info) + // Update only if we detect a difference + updateRoutePerms = !reflect.DeepEqual(c.opts.Import, info.Import) || !reflect.DeepEqual(c.opts.Export, info.Export) } c.mu.Unlock() + if updateRoutePerms { + s.updateRemoteRoutePerms(c, info) + } + // If the remote is going into LDM and there are client connect URLs // associated with this route and we are allowed to advertise, remove // those URLs and update our clients. - if (len(connectURLs) > 0 || len(wsConnectURLs) > 0) && !s.getOpts().Cluster.NoAdvertise { + if (len(connectURLs) > 0 || len(wsConnectURLs) > 0) && !opts.Cluster.NoAdvertise { + s.mu.Lock() s.removeConnectURLsAndSendINFOToClients(connectURLs, wsConnectURLs) + s.mu.Unlock() } return } @@ -633,55 +730,40 @@ func (c *client) processRouteInfo(info *Info) { } c.route.url = url } + // The incoming INFO from the route will have IP set + // if it has Cluster.Advertise. In that case, use that + // otherwise construct it from the remote TCP address. + if info.IP == _EMPTY_ { + // Need to get the remote IP address. + switch conn := c.nc.(type) { + case *net.TCPConn, *tls.Conn: + addr := conn.RemoteAddr().(*net.TCPAddr) + info.IP = fmt.Sprintf("nats-route://%s/", net.JoinHostPort(addr.IP.String(), + strconv.Itoa(info.Port))) + default: + info.IP = c.route.url.String() + } + } + // For accounts that are configured to have their own route: + // If this is a solicit route, we already have c.route.accName set in createRoute. 
+ // For non solicited route (the accept side), we will set the account name that + // is present in the INFO protocol. + didSolicit := c.route.didSolicit + if !didSolicit { + c.route.accName = []byte(info.RouteAccount) + } + accName := string(c.route.accName) // Check to see if we have this remote already registered. // This can happen when both servers have routes to each other. c.mu.Unlock() - if added, sendInfo := s.addRoute(c, info); added { - c.Debugf("Registering remote route %q", info.ID) - - // Send our subs to the other side. - s.sendSubsToRoute(c) - - // Send info about the known gateways to this route. - s.sendGatewayConfigsToRoute(c) - - // sendInfo will be false if the route that we just accepted - // is the only route there is. - if sendInfo { - // The incoming INFO from the route will have IP set - // if it has Cluster.Advertise. In that case, use that - // otherwise construct it from the remote TCP address. - if info.IP == "" { - // Need to get the remote IP address. - c.mu.Lock() - switch conn := c.nc.(type) { - case *net.TCPConn, *tls.Conn: - addr := conn.RemoteAddr().(*net.TCPAddr) - info.IP = fmt.Sprintf("nats-route://%s/", net.JoinHostPort(addr.IP.String(), - strconv.Itoa(info.Port))) - default: - info.IP = c.route.url.String() - } - c.mu.Unlock() - } - // Now let the known servers know about this new route - s.forwardNewRouteInfoToKnownServers(info) + if added := s.addRoute(c, didSolicit, info, accName); added { + if accName != _EMPTY_ { + c.Debugf("Registering remote route %q for account %q", info.ID, accName) + } else { + c.Debugf("Registering remote route %q", info.ID) } - // Unless disabled, possibly update the server's INFO protocol - // and send to clients that know how to handle async INFOs. - if !s.getOpts().Cluster.NoAdvertise { - s.addConnectURLsAndSendINFOToClients(info.ClientConnectURLs, info.WSConnectURLs) - } - // Add the remote's leafnodeURL to our list of URLs and send the update - // to all LN connections. 
(Note that when coming from a route, LeafNodeURLs - // is an array of size 1 max). - s.mu.Lock() - if len(info.LeafNodeURLs) == 1 && s.addLeafNodeURL(info.LeafNodeURLs[0]) { - s.sendAsyncLeafNodeInfo() - } - s.mu.Unlock() } else { c.Debugf("Detected duplicate remote route %q", info.ID) c.closeConnection(DuplicateRoute) @@ -690,8 +772,8 @@ func (c *client) processRouteInfo(info *Info) { // Possibly sends local subscriptions interest to this route // based on changes in the remote's Export permissions. -// Lock assumed held on entry -func (c *client) updateRemoteRoutePerms(sl *Sublist, info *Info) { +func (s *Server) updateRemoteRoutePerms(c *client, info *Info) { + c.mu.Lock() // Interested only on Export permissions for the remote server. // Create "fake" clients that we will use to check permissions // using the old permissions... @@ -705,13 +787,30 @@ func (c *client) updateRemoteRoutePerms(sl *Sublist, info *Info) { c.opts.Import = info.Import c.opts.Export = info.Export + c.mu.Unlock() + var acc *Account + var err error + if an := info.RouteAccount; an == _EMPTY_ { + acc = s.globalAccount() + } else if acc, err = s.LookupAccount(an); err != nil { + c.Errorf("Unable to lookup account %q to update remote route permissions: %v", an, err) + return + } + + acc.mu.RLock() + sl := acc.sl + acc.mu.RUnlock() + if sl == nil { + return + } var ( _localSubs [4096]*subscription localSubs = _localSubs[:0] ) sl.localSubs(&localSubs, false) + c.mu.Lock() c.sendRouteSubProtos(localSubs, false, func(sub *subscription) bool { subj := string(sub.subject) // If the remote can now export but could not before, and this server can import this @@ -721,6 +820,7 @@ func (c *client) updateRemoteRoutePerms(sl *Sublist, info *Info) { } return false }) + c.mu.Unlock() } // sendAsyncInfoToClients sends an INFO protocol to all @@ -765,7 +865,13 @@ func (s *Server) processImplicitRoute(info *Info) { return } // Check if this route already exists - if _, exists := s.remotes[remoteID]; exists { + 
if accName := info.RouteAccount; accName != _EMPTY_ { + if remotes, ok := s.accRoutes[accName]; ok { + if r := remotes[remoteID]; r != nil { + return + } + } + } else if _, exists := s.routes[remoteID]; exists { return } // Check if we have this route as a configured route @@ -786,7 +892,7 @@ func (s *Server) processImplicitRoute(info *Info) { if info.AuthRequired { r.User = url.UserPassword(opts.Cluster.Username, opts.Cluster.Password) } - s.startGoRoutine(func() { s.connectToRoute(r, false, true) }) + s.startGoRoutine(func() { s.connectToRoute(r, false, true, info.RouteAccount) }) } // hasThisRouteConfigured returns true if info.Host:info.Port is present @@ -805,10 +911,8 @@ func (s *Server) hasThisRouteConfigured(info *Info) bool { // forwardNewRouteInfoToKnownServers sends the INFO protocol of the new route // to all routes known by this server. In turn, each server will contact this // new route. +// Server lock held on entry. func (s *Server) forwardNewRouteInfoToKnownServers(info *Info) { - s.mu.Lock() - defer s.mu.Unlock() - // Note: nonce is not used in routes. // That being said, the info we get is the initial INFO which // contains a nonce, but we now forward this to existing routes, @@ -817,13 +921,13 @@ func (s *Server) forwardNewRouteInfoToKnownServers(info *Info) { b, _ := json.Marshal(info) infoJSON := []byte(fmt.Sprintf(InfoProto, b)) - for _, r := range s.routes { + s.forEachRemote(func(r *client) { r.mu.Lock() if r.route.remoteID != info.ID { r.enqueueProto(infoJSON) } r.mu.Unlock() - } + }) } // canImport is whether or not we will send a SUB for interest to the other side. @@ -908,10 +1012,28 @@ func (c *client) removeRemoteSubs() { // Now remove the subs by batch for each account sublist. 
for _, ase := range as { c.Debugf("Removing %d subscriptions for account %q", len(ase.subs), ase.acc.Name) + ase.acc.mu.Lock() ase.acc.sl.RemoveBatch(ase.subs) + ase.acc.mu.Unlock() } } +// Removes (and returns) the subscriptions from this route's subscriptions map +// that belong to the given account. +// Lock is held on entry +func (c *client) removeRemoteSubsForAcc(name string) []*subscription { + var subs []*subscription + for key, sub := range c.subs { + an := strings.Fields(key)[0] + if an == name { + sub.max = 0 + subs = append(subs, sub) + delete(c.subs, key) + } + } + return subs +} + func (c *client) parseUnsubProto(arg []byte) (string, []byte, []byte, error) { // Indicate any activity, so pub and sub or unsubs. c.in.subs++ @@ -919,14 +1041,27 @@ func (c *client) parseUnsubProto(arg []byte) (string, []byte, []byte, error) { args := splitArg(arg) var queue []byte + var accountName string + subjIdx := 1 + c.mu.Lock() + if c.kind == ROUTER && c.route != nil { + if accountName = string(c.route.accName); accountName != _EMPTY_ { + subjIdx = 0 + } + } + c.mu.Unlock() + switch len(args) { - case 2: - case 3: - queue = args[2] + case subjIdx + 1: + case subjIdx + 2: + queue = args[subjIdx+1] default: return "", nil, nil, fmt.Errorf("parse error: '%s'", arg) } - return string(args[0]), args[1], queue, nil + if accountName == _EMPTY_ { + accountName = string(args[0]) + } + return accountName, args[subjIdx], queue, nil } // Indicates no more interest in the given account/subject for the remote side. @@ -957,12 +1092,16 @@ func (c *client) processRemoteUnsub(arg []byte) (err error) { updateGWs := false // We store local subs by account and subject and optionally queue name. // RS- will have the arg exactly as the key. 
- key := string(arg) + var key string + if c.kind == ROUTER && c.route != nil && len(c.route.accName) > 0 { + key = accountName + " " + string(arg) + } else { + key = string(arg) + } sub, ok := c.subs[key] if ok { delete(c.subs, key) acc.sl.Remove(sub) - c.removeReplySubTimeout(sub) updateGWs = srv.gateway.enabled } c.mu.Unlock() @@ -996,26 +1135,44 @@ func (c *client) processRemoteSub(argo []byte, hasOrigin bool) (err error) { args := splitArg(arg) sub := &subscription{client: c} - var off int + // This value indicate what is the mandatory subject offset in the args + // slice. It varies based on the optional presence of origin or account name + // fields (tha latter would not be present for "per-account" routes). + var subjIdx int + // If account is present, this is its "char" position in arg slice. + var accPos int if hasOrigin { - off = 1 + // Set to 1, will be adjusted if the account is also expected. + subjIdx = 1 sub.origin = args[0] + // The account would start after the origin and trailing space. + accPos = len(sub.origin) + 1 + } + c.mu.Lock() + accountName := string(c.route.accName) + c.mu.Unlock() + // If the route is dedicated to an account, accountName will not + // be empty. If it is, then the account must be in the protocol. + var accInProto bool + if accountName == _EMPTY_ { + subjIdx++ + accInProto = true } - switch len(args) { - case 2 + off: + case subjIdx + 1: sub.queue = nil - case 4 + off: - sub.queue = args[2+off] - sub.qw = int32(parseSize(args[3+off])) + case subjIdx + 3: + sub.queue = args[subjIdx+1] + sub.qw = int32(parseSize(args[subjIdx+2])) default: return fmt.Errorf("processRemoteSub Parse Error: '%s'", arg) } - sub.subject = args[1+off] - - // Lookup the account - // FIXME(dlc) - This may start having lots of contention? - accountName := string(args[0+off]) + sub.subject = args[subjIdx] + // If the account name is empty (not a "per-account" route), the account + // is at the index prior to the subject. 
+ if accountName == _EMPTY_ { + accountName = string(args[subjIdx-1]) + } // Lookup account while avoiding fetch. // A slow fetch delays subsequent remote messages. It also avoids the expired check (see below). // With all but memory resolver lookup can be delayed or fail. @@ -1076,19 +1233,47 @@ func (c *client) processRemoteSub(argo []byte, hasOrigin bool) (err error) { // We store local subs by account and subject and optionally queue name. // If we have a queue it will have a trailing weight which we do not want. if sub.queue != nil { - sub.sid = arg[len(sub.origin)+off : len(arg)-len(args[3+off])-1] + // if the account is in the protocol, we can reference directly "arg", + // otherwise, we need to allocate/construct the sid. + if accInProto { + sub.sid = arg[accPos : accPos+len(accountName)+1+len(sub.subject)+1+len(sub.queue)] + } else { + sub.sid = append(sub.sid, accountName...) + sub.sid = append(sub.sid, ' ') + sub.sid = append(sub.sid, sub.subject...) + sub.sid = append(sub.sid, ' ') + sub.sid = append(sub.sid, sub.queue...) + } + } else if accInProto { + sub.sid = arg[accPos:] } else { - sub.sid = arg[len(sub.origin)+off:] + sub.sid = append(sub.sid, accountName...) + sub.sid = append(sub.sid, ' ') + sub.sid = append(sub.sid, sub.subject...) } key := string(sub.sid) + acc.mu.RLock() + // For routes (this can be called by leafnodes), check if the account is + // transitioning (from pool to dedicated route) and this route is not a + // per-account route (route.poolIdx >= 0). If so, ignore this subscription. + // Exclude "no pool" routes from this check. + if c.kind == ROUTER && !c.route.noPool && + acc.routePoolIdx == accTransitioningToDedicatedRoute && c.route.poolIdx >= 0 { + acc.mu.RUnlock() + c.mu.Unlock() + // Do not return an error, which would cause the connection to be closed. 
+ return nil + } + sl := acc.sl + acc.mu.RUnlock() osub := c.subs[key] updateGWs := false delta := int32(1) if osub == nil { c.subs[key] = sub // Now place into the account sl. - if err = acc.sl.Insert(sub); err != nil { + if err = sl.Insert(sub); err != nil { delete(c.subs, key) c.mu.Unlock() c.Errorf("Could not insert subscription: %v", err) @@ -1100,7 +1285,7 @@ func (c *client) processRemoteSub(argo []byte, hasOrigin bool) (err error) { // For a queue we need to update the weight. delta = sub.qw - atomic.LoadInt32(&osub.qw) atomic.StoreInt32(&osub.qw, sub.qw) - acc.sl.UpdateRemoteQSub(osub) + sl.UpdateRemoteQSub(osub) } c.mu.Unlock() @@ -1118,6 +1303,7 @@ func (c *client) processRemoteSub(argo []byte, hasOrigin bool) (err error) { return nil } +// Lock is held on entry func (c *client) addRouteSubOrUnsubProtoToBuf(buf []byte, accName string, sub *subscription, isSubProto bool) []byte { // If we have an origin cluster and the other side supports leafnode origin clusters // send an LS+/LS- version instead. @@ -1136,8 +1322,10 @@ func (c *client) addRouteSubOrUnsubProtoToBuf(buf []byte, accName string, sub *s buf = append(buf, rUnsubBytes...) } } - buf = append(buf, accName...) - buf = append(buf, ' ') + if len(c.route.accName) == 0 { + buf = append(buf, accName...) + buf = append(buf, ' ') + } buf = append(buf, sub.subject...) if len(sub.queue) > 0 { buf = append(buf, ' ') @@ -1162,20 +1350,27 @@ func (c *client) addRouteSubOrUnsubProtoToBuf(buf []byte, accName string, sub *s // the remote side. For each account we will send the // complete interest for all subjects, both normal as a binary // and queue group weights. -func (s *Server) sendSubsToRoute(route *client) { - s.mu.Lock() +// +// Server lock held on entry. +func (s *Server) sendSubsToRoute(route *client, idx int, account string) { + var noPool bool + if idx >= 0 { + // We need to check if this route is "no_pool" in which case we + // need to select all accounts. 
+ route.mu.Lock() + noPool = route.route.noPool + route.mu.Unlock() + } // Estimated size of all protocols. It does not have to be accurate at all. eSize := 0 - // Send over our account subscriptions. - // copy accounts into array first - accs := make([]*Account, 0, 32) - s.accounts.Range(func(k, v interface{}) bool { - a := v.(*Account) - accs = append(accs, a) - a.mu.RLock() + estimateProtosSize := func(a *Account, addAccountName bool) { if ns := len(a.rm); ns > 0 { - // Proto looks like: "RS+ [ ]\r\n" - eSize += ns * (len(rSubBytes) + len(a.Name) + 1 + 2) + var accSize int + if addAccountName { + accSize = len(a.Name) + 1 + } + // Proto looks like: "RS+ [ ][ ]\r\n" + eSize += ns * (len(rSubBytes) + 1 + accSize) for key := range a.rm { // Key contains "[ ]" eSize += len(key) @@ -1185,10 +1380,34 @@ func (s *Server) sendSubsToRoute(route *client) { eSize += 5 } } - a.mu.RUnlock() - return true - }) - s.mu.Unlock() + } + // Send over our account subscriptions. + accs := make([]*Account, 0, 32) + if idx < 0 || account != _EMPTY_ { + if ai, ok := s.accounts.Load(account); ok { + a := ai.(*Account) + a.mu.RLock() + // Estimate size and add account name in protocol if idx is not -1 + estimateProtosSize(a, idx >= 0) + accs = append(accs, a) + a.mu.RUnlock() + } + } else { + s.accounts.Range(func(k, v interface{}) bool { + a := v.(*Account) + a.mu.RLock() + // We are here for regular or pooled routes (not per-account). + // So we collect all accounts whose routePoolIdx matches the + // one for this route, or only the account provided, or all + // accounts if dealing with a "no pool" route. 
+ if a.routePoolIdx == idx || noPool { + estimateProtosSize(a, true) + accs = append(accs, a) + } + a.mu.RUnlock() + return true + }) + } buf := make([]byte, 0, eSize) @@ -1212,9 +1431,11 @@ func (s *Server) sendSubsToRoute(route *client) { } a.mu.RUnlock() } - route.enqueueProto(buf) + if len(buf) > 0 { + route.enqueueProto(buf) + route.Debugf("Sent local subscriptions to route") + } route.mu.Unlock() - route.Debugf("Sent local subscriptions to route") } // Sends SUBs protocols for the given subscriptions. If a filter is specified, it is @@ -1273,16 +1494,15 @@ func (c *client) sendRouteSubOrUnSubProtos(subs []*subscription, isSubProto, tra c.traceOutOp("", buf[as:len(buf)-LEN_CR_LF]) } } - c.enqueueProto(buf) } -func (s *Server) createRoute(conn net.Conn, rURL *url.URL) *client { +func (s *Server) createRoute(conn net.Conn, rURL *url.URL, accName string) *client { // Snapshot server options. opts := s.getOpts() didSolicit := rURL != nil - r := &route{didSolicit: didSolicit} + r := &route{didSolicit: didSolicit, poolIdx: -1} for _, route := range opts.Routes { if rURL != nil && (strings.EqualFold(rURL.Host, route.Host)) { r.routeType = Explicit @@ -1291,17 +1511,16 @@ func (s *Server) createRoute(conn net.Conn, rURL *url.URL) *client { c := &client{srv: s, nc: conn, opts: ClientOpts{}, kind: ROUTER, msubs: -1, mpay: -1, route: r, start: time.Now()} + var infoJSON []byte // Grab server variables s.mu.Lock() - // New proto wants a nonce (although not used in routes, that is, not signed in CONNECT) - var raw [nonceLen]byte - nonce := raw[:] - s.generateNonce(nonce) - s.routeInfo.Nonce = string(nonce) - s.generateRouteInfoJSON() - // Clear now that it has been serialized. Will prevent nonce to be included in async INFO that we may send. 
- s.routeInfo.Nonce = _EMPTY_ - infoJSON := s.routeInfoJSON + // If we are creating a pooled connection and this is the server soliciting + // the connection, we will delay sending the INFO after we have processed + // the incoming INFO from the remote. + delayInfo := didSolicit && accName == _EMPTY_ && opts.Cluster.PoolSize >= 1 + if !delayInfo { + infoJSON = s.generateRouteInitialInfoJSON(accName, 0) + } authRequired := s.routeInfo.AuthRequired tlsRequired := s.routeInfo.TLSRequired clusterName := s.info.Cluster @@ -1318,6 +1537,7 @@ func (s *Server) createRoute(conn net.Conn, rURL *url.URL) *client { // Do this before the TLS code, otherwise, in case of failure // and if route is explicit, it would try to reconnect to 'nil'... r.url = rURL + r.accName = []byte(accName) } else { c.flags.set(expectConnect) } @@ -1396,98 +1616,311 @@ func (s *Server) createRoute(conn net.Conn, rURL *url.URL) *client { } } - // Send our info to the other side. - // Our new version requires dynamic information for accounts and a nonce. - c.enqueueProto(infoJSON) + if !delayInfo { + // Send our info to the other side. + // Our new version requires dynamic information for accounts and a nonce. + c.enqueueProto(infoJSON) + } c.mu.Unlock() c.Noticef("Route connection created") return c } +// Generates a nonce and set some route info's fields before marshal'ing into JSON. +// To be used only when a route is created (to send the initial INFO protocol). +// +// Server lock held on entry. +func (s *Server) generateRouteInitialInfoJSON(accName string, poolIdx int) []byte { + // New proto wants a nonce (although not used in routes, that is, not signed in CONNECT) + var raw [nonceLen]byte + nonce := raw[:] + s.generateNonce(nonce) + ri := &s.routeInfo + ri.Nonce, ri.RouteAccount, ri.RoutePoolIdx = string(nonce), accName, poolIdx + infoJSON := generateInfoJSON(&s.routeInfo) + // Clear now that it has been serialized. Will prevent nonce to be included in async INFO that we may send. 
+	// Same for some other fields.
+	ri.Nonce, ri.RouteAccount, ri.RoutePoolIdx = _EMPTY_, _EMPTY_, 0
+	return infoJSON
+}
+
 const (
 	_CRLF_  = "\r\n"
 	_EMPTY_ = ""
 )
 
-func (s *Server) addRoute(c *client, info *Info) (bool, bool) {
-	id := c.route.remoteID
-	sendInfo := false
+func (s *Server) addRoute(c *client, didSolicit bool, info *Info, accName string) bool {
+	id := info.ID
 
 	s.mu.Lock()
-	if !s.running {
+	if !s.running || s.routesReject {
 		s.mu.Unlock()
-		return false, false
+		return false
 	}
-	remote, exists := s.remotes[id]
-	if !exists {
-		s.routes[c.cid] = c
-		s.remotes[id] = c
-		// check to be consistent and future proof. but will be same domain
-		if s.sameDomain(info.Domain) {
-			s.nodeToInfo.Store(c.route.hash,
-				nodeInfo{c.route.remoteName, s.info.Version, s.info.Cluster, info.Domain, id, nil, nil, nil, false, info.JetStream})
+
+	var invProtoErr string
+
+	opts := s.getOpts()
+
+	// Assume we are in pool mode if info.RoutePoolSize is set. We may disable
+	// in some cases.
+	pool := info.RoutePoolSize > 0
+	// This is used to prevent a server with pooling to constantly trying
+	// to connect to a server with no pooling (for instance old server) after
+	// the first connection is established.
+	var noReconnectForOldServer bool
+
+	// If the remote is an old server, info.RoutePoolSize will be 0, or if
+	// this server's Cluster.PoolSize is negative, we will behave as an old
+	// server and need to handle things differently.
+	if info.RoutePoolSize <= 0 || opts.Cluster.PoolSize < 0 {
+		if accName != _EMPTY_ {
+			invProtoErr = fmt.Sprintf("Not possible to have a dedicated route for account %q between those servers", accName)
+			// In this case, make sure this route does not attempt to reconnect
+			c.setNoReconnect()
+		} else {
+			// We will accept, but treat this remote as "no pool"
+			pool, noReconnectForOldServer = false, true
+			c.mu.Lock()
+			c.route.poolIdx = 0
+			c.route.noPool = true
+			c.mu.Unlock()
+			// Keep track of number of routes like that.
We will use that when + // sending subscriptions over routes. + s.routesNoPool++ } + } else if s.routesPoolSize != info.RoutePoolSize { + // The cluster's PoolSize configuration must be an exact match with the remote server. + invProtoErr = fmt.Sprintf("Mismatch route pool size: %v vs %v", s.routesPoolSize, info.RoutePoolSize) + } else if didSolicit { + // For solicited route, the incoming's RoutePoolIdx should not be set. + if info.RoutePoolIdx != 0 { + invProtoErr = fmt.Sprintf("Route pool index should not be set but is set to %v", info.RoutePoolIdx) + } + } else if info.RoutePoolIdx < 0 || info.RoutePoolIdx >= s.routesPoolSize { + // For non solicited routes, if the remote sends a RoutePoolIdx, make + // sure it is a valid one (in range of the pool size). + invProtoErr = fmt.Sprintf("Invalid route pool index: %v - pool size is %v", info.RoutePoolIdx, info.RoutePoolSize) + } + if invProtoErr != _EMPTY_ { + s.mu.Unlock() + c.sendErrAndErr(invProtoErr) + c.closeConnection(ProtocolViolation) + return false + } + // If accName is set, we are dealing with a per-account connection. + if accName != _EMPTY_ { + // When an account has its own route, it will be an error if the given + // account name is not found in s.accRoutes map. + conns, exists := s.accRoutes[accName] + if !exists { + s.mu.Unlock() + c.sendErrAndErr(fmt.Sprintf("No route for account %q", accName)) + c.closeConnection(ProtocolViolation) + return false + } + remote, exists := conns[id] + if !exists { + conns[id] = c + c.mu.Lock() + idHash := c.route.idHash + cid := c.cid + c.mu.Unlock() + + // Store this route with key being the route id hash + account name + s.storeRouteByHash(idHash+accName, c) + + // Now that we have registered the route, we can remove from the temp map. 
+ s.removeFromTempClients(cid) + + // Notify other routes about this new route + s.forwardNewRouteInfoToKnownServers(info) + + // Send subscription interest + s.sendSubsToRoute(c, -1, accName) + } else { + handleDuplicateRoute(remote, c, true) + } + s.mu.Unlock() + return !exists + } + var remote *client + // That will be the position of the connection in the slice, we initialize + // to -1 to indicate that no space was found. + idx := -1 + // This will be the size (or number of connections) in a given slice. + sz := 0 + // Check if we know about the remote server + conns, exists := s.routes[id] + if !exists { + // No, create a slice for route connections of the size of the pool + // or 1 when not in pool mode. + conns = make([]*client, s.routesPoolSize) + // Track this slice for this remote server. + s.routes[id] = conns + // Set the index to info.RoutePoolIdx because if this is a solicited + // route, this value will be 0, which is what we want, otherwise, we + // will use whatever index the remote has chosen. + idx = info.RoutePoolIdx + } else if pool { + // The remote was found. If this is a non solicited route, we will place + // the connection in the pool at the index given by info.RoutePoolIdx. + // But if there is already one, close this incoming connection as a + // duplicate. + if !didSolicit { + idx = info.RoutePoolIdx + if remote = conns[idx]; remote != nil { + handleDuplicateRoute(remote, c, false) + s.mu.Unlock() + return false + } + } + // For all cases (solicited and not) we need to count how many connections + // we already have, and for solicited route, we will find a free spot in + // the slice. + for i, r := range conns { + if idx == -1 && r == nil { + idx = i + } else if r != nil { + sz++ + } + } + } else { + remote = conns[0] + } + // If there is a spot, idx will be greater or equal to 0. 
+ if idx >= 0 { c.mu.Lock() c.route.connectURLs = info.ClientConnectURLs c.route.wsConnURLs = info.WSConnectURLs + c.route.poolIdx = idx + rtype := c.route.routeType cid := c.cid - hash := c.route.hash idHash := c.route.idHash + rHash := c.route.hash + rn := c.route.remoteName + url := c.route.url + // For solicited routes, we need now to send the INFO protocol + if didSolicit { + c.enqueueProto(s.generateRouteInitialInfoJSON(_EMPTY_, idx)) + } c.mu.Unlock() + // Add to the slice and bump the count of connections for this remote + conns[idx] = c + sz++ + // This boolean will indicate that we are registering the only + // connection in non pooled situation or we stored the very first + // connection for a given remote server. + doOnce := !pool || sz == 1 + if doOnce { + // check to be consistent and future proof. but will be same domain + if s.sameDomain(info.Domain) { + s.nodeToInfo.Store(rHash, + nodeInfo{rn, s.info.Version, s.info.Cluster, info.Domain, id, nil, nil, nil, false, info.JetStream}) + } + } + // Store this route using the hash as the key - s.storeRouteByHash(hash, idHash, c) + if pool { + idHash += strconv.Itoa(idx) + } + s.storeRouteByHash(idHash, c) // Now that we have registered the route, we can remove from the temp map. s.removeFromTempClients(cid) - // we don't need to send if the only route is the one we just accepted. - sendInfo = len(s.routes) > 1 + if doOnce { + // If the INFO contains a Gateway URL, add it to the list for our cluster. + if info.GatewayURL != _EMPTY_ && s.addGatewayURL(info.GatewayURL) { + s.sendAsyncGatewayInfo() + } - // If the INFO contains a Gateway URL, add it to the list for our cluster. - if info.GatewayURL != "" && s.addGatewayURL(info.GatewayURL) { - s.sendAsyncGatewayInfo() + // we don't need to send if the only route is the one we just accepted. 
+ if len(s.routes) > 1 { + // Now let the known servers know about this new route + s.forwardNewRouteInfoToKnownServers(info) + } + + // Send info about the known gateways to this route. + s.sendGatewayConfigsToRoute(c) + + // Unless disabled, possibly update the server's INFO protocol + // and send to clients that know how to handle async INFOs. + if !opts.Cluster.NoAdvertise { + s.addConnectURLsAndSendINFOToClients(info.ClientConnectURLs, info.WSConnectURLs) + } + + // Add the remote's leafnodeURL to our list of URLs and send the update + // to all LN connections. (Note that when coming from a route, LeafNodeURLs + // is an array of size 1 max). + if len(info.LeafNodeURLs) == 1 && s.addLeafNodeURL(info.LeafNodeURLs[0]) { + s.sendAsyncLeafNodeInfo() + } + } + + // Send the subscriptions interest. + s.sendSubsToRoute(c, idx, _EMPTY_) + + // In pool mode, if we did not yet reach the cap, try to connect a new connection + if pool && didSolicit && sz != s.routesPoolSize { + s.startGoRoutine(func() { + select { + case <-time.After(time.Duration(rand.Intn(100)) * time.Millisecond): + case <-s.quitCh: + s.grWG.Done() + return + } + s.connectToRoute(url, rtype == Explicit, true, _EMPTY_) + }) } } s.mu.Unlock() - + if pool { + if idx == -1 { + // Was full, so need to close connection + c.Debugf("Route pool size reached, closing extra connection to %q", id) + handleDuplicateRoute(nil, c, true) + return false + } + return true + } + // This is for non-pool mode at this point. if exists { - var r *route - - c.mu.Lock() - // upgrade to solicited? - if c.route.didSolicit { - // Make a copy - rs := *c.route - r = &rs - } - // Since this duplicate route is going to be removed, make sure we clear - // c.route.leafnodeURL, otherwise, when processing the disconnect, this - // would cause the leafnode URL for that remote server to be removed - // from our list. Same for gateway... 
-		c.route.leafnodeURL, c.route.gatewayURL = _EMPTY_, _EMPTY_
-		// Same for the route hash otherwise it would be removed from s.routesByHash.
-		c.route.hash, c.route.idHash = _EMPTY_, _EMPTY_
-		c.mu.Unlock()
-
-		remote.mu.Lock()
-		// r will be not nil if c.route.didSolicit was true
-		if r != nil {
-			// If we upgrade to solicited, we still want to keep the remote's
-			// connectURLs. So transfer those.
-			r.connectURLs = remote.route.connectURLs
-			r.wsConnURLs = remote.route.wsConnURLs
-			remote.route = r
-		}
-		// This is to mitigate the issue where both sides add the route
-		// on the opposite connection, and therefore end-up with both
-		// connections being dropped.
-		remote.route.retry = true
-		remote.mu.Unlock()
+		handleDuplicateRoute(remote, c, noReconnectForOldServer)
 	}
-	return !exists, sendInfo
+	return !exists
+}
+
+func handleDuplicateRoute(remote, c *client, setNoReconnect bool) {
+	// We used to clear some fields when closing a duplicate connection
+	// to prevent sending INFO protocols for the remotes to update
+	// their leafnode/gateway URLs. This is no longer needed since
+	// removeRoute() now does the right thing of doing that only when
+	// the closed connection was an added route connection.
+	c.mu.Lock()
+	didSolicit := c.route.didSolicit
+	url := c.route.url
+	if setNoReconnect {
+		c.flags.set(noReconnect)
+	}
+	c.mu.Unlock()
+
+	if remote == nil {
+		return
+	}
+
+	remote.mu.Lock()
+	if didSolicit && !remote.route.didSolicit {
+		remote.route.didSolicit = true
+		remote.route.url = url
+	}
+	// This is to mitigate the issue where both sides add the route
+	// on the opposite connection, and therefore end-up with both
+	// connections being dropped.
+	remote.route.retry = true
+	remote.mu.Unlock()
+}
 
 // Import filter check.
@@ -1590,8 +2023,45 @@ func (s *Server) updateRouteSubscriptionMap(acc *Account, sub *subscription, del routes := _routes[:0] s.mu.Lock() - for _, route := range s.routes { - routes = append(routes, route) + // The account's routePoolIdx field is set/updated under the server lock + // (but also the account's lock). So we don't need to acquire the account's + // lock here to get the value. + if poolIdx := acc.routePoolIdx; poolIdx < 0 { + if conns, ok := s.accRoutes[acc.Name]; ok { + for _, r := range conns { + routes = append(routes, r) + } + } + if s.routesNoPool > 0 { + // We also need to look for "no pool" remotes (that is, routes to older + // servers or servers that have explicitly disabled pooling). + s.forEachRemote(func(r *client) { + r.mu.Lock() + if r.route.noPool { + routes = append(routes, r) + } + r.mu.Unlock() + }) + } + } else { + // We can't use s.forEachRouteIdx here since we want to check/get the + // "no pool" route ONLY if we don't find a route at the given `poolIdx`. + for _, conns := range s.routes { + if r := conns[poolIdx]; r != nil { + routes = append(routes, r) + } else if s.routesNoPool > 0 { + // Check if we have a "no pool" route at index 0, and if so, it + // means that for this remote, we have a single connection because + // that server does not have pooling. + if r := conns[0]; r != nil { + r.mu.Lock() + if r.route.noPool { + routes = append(routes, r) + } + r.mu.Unlock() + } + } + } } trace := atomic.LoadInt32(&s.logging.trace) == 1 s.mu.Unlock() @@ -1702,6 +2172,9 @@ func (s *Server) startRouteAcceptLoop() { Dynamic: s.isClusterNameDynamic(), LNOC: true, } + if ps := opts.Cluster.PoolSize; ps > 0 { + info.RoutePoolSize = ps + } // Set this if only if advertise is not disabled if !opts.Cluster.NoAdvertise { info.ClientConnectURLs = s.clientConnectURLs @@ -1762,10 +2235,10 @@ func (s *Server) startRouteAcceptLoop() { } // Start the accept loop in a different go routine. 
- go s.acceptConnections(l, "Route", func(conn net.Conn) { s.createRoute(conn, nil) }, nil) + go s.acceptConnections(l, "Route", func(conn net.Conn) { s.createRoute(conn, nil, _EMPTY_) }, nil) // Solicit Routes if applicable. This will not block. - s.solicitRoutes(opts.Routes) + s.solicitRoutes(opts.Routes, opts.Cluster.Accounts) s.mu.Unlock() } @@ -1785,8 +2258,6 @@ func (s *Server) setRouteInfoHostPortAndIP() error { s.routeInfo.Port = s.opts.Cluster.Port s.routeInfo.IP = "" } - // (re)generate the routeInfoJSON byte array - s.generateRouteInfoJSON() return nil } @@ -1804,7 +2275,7 @@ func (s *Server) StartRouting(clientListenReady chan struct{}) { } -func (s *Server) reConnectToRoute(rURL *url.URL, rtype RouteType) { +func (s *Server) reConnectToRoute(rURL *url.URL, rtype RouteType, accName string) { tryForEver := rtype == Explicit // If A connects to B, and B to A (regardless if explicit or // implicit - due to auto-discovery), and if each server first @@ -1821,7 +2292,7 @@ func (s *Server) reConnectToRoute(rURL *url.URL, rtype RouteType) { s.grWG.Done() return } - s.connectToRoute(rURL, tryForEver, false) + s.connectToRoute(rURL, tryForEver, false, accName) } // Checks to make sure the route is still valid. @@ -1834,7 +2305,7 @@ func (s *Server) routeStillValid(rURL *url.URL) bool { return false } -func (s *Server) connectToRoute(rURL *url.URL, tryForEver, firstConnect bool) { +func (s *Server) connectToRoute(rURL *url.URL, tryForEver, firstConnect bool, accName string) { // Snapshot server options. 
opts := s.getOpts() @@ -1849,8 +2320,18 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver, firstConnect bool) { attempts := 0 for s.isRunning() && rURL != nil { - if tryForEver && !s.routeStillValid(rURL) { - return + if tryForEver { + if !s.routeStillValid(rURL) { + return + } + if accName != _EMPTY_ { + s.mu.RLock() + _, valid := s.accRoutes[accName] + s.mu.RUnlock() + if !valid { + return + } + } } var conn net.Conn address, err := s.getRandomIP(resolver, rURL.Host, excludedAddresses) @@ -1892,7 +2373,7 @@ func (s *Server) connectToRoute(rURL *url.URL, tryForEver, firstConnect bool) { // We have a route connection here. // Go ahead and create it and exit this func. - s.createRoute(conn, rURL) + s.createRoute(conn, rURL, accName) return } } @@ -1917,11 +2398,18 @@ func (s *Server) saveRouteTLSName(routes []*url.URL) { // Start connection process to provided routes. Each route connection will // be started in a dedicated go routine. // Lock is held on entry -func (s *Server) solicitRoutes(routes []*url.URL) { +func (s *Server) solicitRoutes(routes []*url.URL, accounts []string) { s.saveRouteTLSName(routes) for _, r := range routes { route := r - s.startGoRoutine(func() { s.connectToRoute(route, true, true) }) + s.startGoRoutine(func() { s.connectToRoute(route, true, true, _EMPTY_) }) + } + // Now go over possible per-account routes and create them. 
+ for _, an := range accounts { + for _, r := range routes { + route, accName := r, an + s.startGoRoutine(func() { s.connectToRoute(route, true, true, accName) }) + } } } @@ -1967,7 +2455,10 @@ func (c *client) processRouteConnect(srv *Server, arg []byte, lang string) error if !proto.Dynamic { srv.getOpts().Cluster.Name = proto.Cluster } - srv.removeAllRoutesExcept(c) + c.mu.Lock() + remoteID := c.opts.Name + c.mu.Unlock() + srv.removeAllRoutesExcept(remoteID) shouldReject = false } } @@ -1993,11 +2484,24 @@ func (c *client) processRouteConnect(srv *Server, arg []byte, lang string) error } // Called when we update our cluster name during negotiations with remotes. -func (s *Server) removeAllRoutesExcept(c *client) { +func (s *Server) removeAllRoutesExcept(remoteID string) { s.mu.Lock() - routes := make([]*client, 0, len(s.routes)) - for _, r := range s.routes { - if r != c { + routes := make([]*client, 0, s.numRoutes()) + for rID, conns := range s.routes { + if rID == remoteID { + continue + } + for _, r := range conns { + if r != nil { + routes = append(routes, r) + } + } + } + for _, conns := range s.accRoutes { + for rID, r := range conns { + if rID == remoteID { + continue + } routes = append(routes, r) } } @@ -2009,63 +2513,192 @@ func (s *Server) removeAllRoutesExcept(c *client) { } func (s *Server) removeRoute(c *client) { - var rID string - var lnURL string - var gwURL string - var hash string - var idHash string + s.mu.Lock() + defer s.mu.Unlock() + + var ( + rID string + lnURL string + gwURL string + idHash string + accName string + poolIdx = -1 + connectURLs []string + wsConnectURLs []string + opts = s.getOpts() + rURL *url.URL + noPool bool + ) c.mu.Lock() cid := c.cid r := c.route if r != nil { rID = r.remoteID lnURL = r.leafnodeURL - hash = r.hash idHash = r.idHash gwURL = r.gatewayURL + poolIdx = r.poolIdx + accName = string(r.accName) + if r.noPool { + s.routesNoPool-- + noPool = true + } + connectURLs = r.connectURLs + wsConnectURLs = r.wsConnURLs 
+ rURL = r.url } c.mu.Unlock() - s.mu.Lock() - delete(s.routes, cid) - if r != nil { - rc, ok := s.remotes[rID] - // Only delete it if it is us.. - if ok && c == rc { - delete(s.remotes, rID) + if accName != _EMPTY_ { + if conns, ok := s.accRoutes[accName]; ok { + if r := conns[rID]; r == c { + s.removeRouteByHash(idHash + accName) + delete(conns, rID) + // Do not remove or set to nil when all remotes have been + // removed from the map. The configured accounts must always + // be in the accRoutes map and addRoute expects "conns" map + // to be created. + } } - // Remove the remote's gateway URL from our list and - // send update to inbound Gateway connections. - if gwURL != _EMPTY_ && s.removeGatewayURL(gwURL) { - s.sendAsyncGatewayInfo() + } + // If this is still -1, it means that it was not added to the routes + // so simply remove from temp clients and we are done. + if poolIdx == -1 || accName != _EMPTY_ { + s.removeFromTempClients(cid) + return + } + if conns, ok := s.routes[rID]; ok { + // If this route was not the one stored, simply remove from the + // temporary map and be done. + if conns[poolIdx] != c { + s.removeFromTempClients(cid) + return } - // Remove the remote's leafNode URL from - // our list and send update to LN connections. - if lnURL != _EMPTY_ && s.removeLeafNodeURL(lnURL) { - s.sendAsyncLeafNodeInfo() + conns[poolIdx] = nil + // Now check if this was the last connection to be removed. + empty := true + for _, c := range conns { + if c != nil { + empty = false + break + } } - s.removeRouteByHash(hash, idHash) + // This was the last route for this remote. Remove the remote entry + // and possibly send some async INFO protocols regarding gateway + // and leafnode URLs. + if empty { + delete(s.routes, rID) + + // Since this is the last route for this remote, possibly update + // the client connect URLs and send an update to connected + // clients. 
+ if (len(connectURLs) > 0 || len(wsConnectURLs) > 0) && !opts.Cluster.NoAdvertise { + s.removeConnectURLsAndSendINFOToClients(connectURLs, wsConnectURLs) + } + // Remove the remote's gateway URL from our list and + // send update to inbound Gateway connections. + if gwURL != _EMPTY_ && s.removeGatewayURL(gwURL) { + s.sendAsyncGatewayInfo() + } + // Remove the remote's leafNode URL from + // our list and send update to LN connections. + if lnURL != _EMPTY_ && s.removeLeafNodeURL(lnURL) { + s.sendAsyncLeafNodeInfo() + } + // If this server has pooling and the route for this remote + // was a "no pool" route, attempt to reconnect. + if s.routesPoolSize > 1 && noPool { + s.startGoRoutine(func() { s.connectToRoute(rURL, true, true, _EMPTY_) }) + } + } + // This is for gateway code. Remove this route from a map that uses + // the route hash in combination with the pool index as the key. + if s.routesPoolSize > 1 { + idHash += strconv.Itoa(poolIdx) + } + s.removeRouteByHash(idHash) } s.removeFromTempClients(cid) - s.mu.Unlock() } func (s *Server) isDuplicateServerName(name string) bool { if name == _EMPTY_ { return false } - s.mu.Lock() - defer s.mu.Unlock() + s.mu.RLock() + defer s.mu.RUnlock() if s.info.Name == name { return true } - for _, r := range s.routes { - r.mu.Lock() - duplicate := r.route.remoteName == name - r.mu.Unlock() - if duplicate { - return true + for _, conns := range s.routes { + for _, r := range conns { + if r != nil { + r.mu.Lock() + duplicate := r.route.remoteName == name + r.mu.Unlock() + if duplicate { + return true + } + break + } } } return false } + +// Goes over each non-nil route connection for all remote servers +// and invokes the function `f`. It does not go over per-account +// routes. +// Server lock is held on entry. 
+func (s *Server) forEachNonPerAccountRoute(f func(r *client)) {
+	for _, conns := range s.routes {
+		for _, r := range conns {
+			if r != nil {
+				f(r)
+			}
+		}
+	}
+}
+
+// Goes over each non-nil route connection for all remote servers
+// and invokes the function `f`. This also includes the per-account
+// routes.
+// Server lock is held on entry.
+func (s *Server) forEachRoute(f func(r *client)) {
+	s.forEachNonPerAccountRoute(f)
+	for _, conns := range s.accRoutes {
+		for _, r := range conns {
+			f(r)
+		}
+	}
+}
+
+// Goes over each non-nil route connection at the given pool index
+// location in the slice and invokes the function `f`. If the
+// callback returns `true`, this function moves to the next remote.
+// Otherwise, the iteration over remotes stops.
+// This does not include per-account routes.
+// Server lock is held on entry.
+func (s *Server) forEachRouteIdx(idx int, f func(r *client) bool) {
+	for _, conns := range s.routes {
+		if r := conns[idx]; r != nil {
+			if !f(r) {
+				return
+			}
+		}
+	}
+}
+
+// Goes over each remote and for the first non-nil route connection,
+// invokes the function `f`.
+// Server lock is held on entry.
+func (s *Server) forEachRemote(f func(r *client)) { + for _, conns := range s.routes { + for _, r := range conns { + if r != nil { + f(r) + break + } + } + } +} diff --git a/server/routes_test.go b/server/routes_test.go index 51eec7c9..676a97e4 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -19,7 +19,11 @@ import ( "crypto/tls" "fmt" "net" + "net/http" + "net/http/httptest" "net/url" + "os" + "reflect" "runtime" "strconv" "strings" @@ -28,7 +32,9 @@ import ( "testing" "time" + "github.com/nats-io/jwt/v2" "github.com/nats-io/nats.go" + "github.com/nats-io/nkeys" ) func init() { @@ -287,11 +293,33 @@ func TestServerRoutesWithAuthAndBCrypt(t *testing.T) { // Helper function to check that a cluster is formed func checkClusterFormed(t testing.TB, servers ...*Server) { t.Helper() - expectedNumRoutes := len(servers) - 1 + var _enr [8]int + enr := _enr[:0] + for _, a := range servers { + if a.getOpts().Cluster.PoolSize < 0 { + enr = append(enr, len(servers)-1) + } else { + a.mu.RLock() + nr := a.routesPoolSize + len(a.accRoutes) + a.mu.RUnlock() + total := 0 + for _, b := range servers { + if a == b { + continue + } + if b.getOpts().Cluster.PoolSize < 0 { + total++ + } else { + total += nr + } + } + enr = append(enr, total) + } + } checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { - for _, s := range servers { - if numRoutes := s.NumRoutes(); numRoutes != expectedNumRoutes { - return fmt.Errorf("Expected %d routes for server %q, got %d", expectedNumRoutes, s.ID(), numRoutes) + for i, s := range servers { + if numRoutes := s.NumRoutes(); numRoutes != enr[i] { + return fmt.Errorf("Expected %d routes for server %q, got %d", enr[i], s, numRoutes) } } return nil @@ -888,53 +916,77 @@ func TestServerPoolUpdatedWhenRouteGoesAway(t *testing.T) { } func TestRouteFailedConnRemovedFromTmpMap(t *testing.T) { - optsA, err := ProcessConfigFile("./configs/srv_a.conf") - require_NoError(t, err) - optsA.NoSigs, optsA.NoLog = true, true + for _, test := 
range []struct { + name string + poolSize int + }{ + {"no pooling", -1}, + {"pool 1", 1}, + {"pool 3", 3}, + } { + t.Run(test.name, func(t *testing.T) { + optsA, err := ProcessConfigFile("./configs/srv_a.conf") + require_NoError(t, err) + optsA.NoSigs, optsA.NoLog = true, true + optsA.Cluster.PoolSize = test.poolSize - optsB, err := ProcessConfigFile("./configs/srv_b.conf") - require_NoError(t, err) - optsB.NoSigs, optsB.NoLog = true, true + optsB, err := ProcessConfigFile("./configs/srv_b.conf") + require_NoError(t, err) + optsB.NoSigs, optsB.NoLog = true, true + optsB.Cluster.PoolSize = test.poolSize - srvA := New(optsA) - defer srvA.Shutdown() - srvB := New(optsB) - defer srvB.Shutdown() + srvA := New(optsA) + defer srvA.Shutdown() + srvB := New(optsB) + defer srvB.Shutdown() - // Start this way to increase chance of having the two connect - // to each other at the same time. This will cause one of the - // route to be dropped. - wg := &sync.WaitGroup{} - wg.Add(2) - go func() { - srvA.Start() - wg.Done() - }() - go func() { - srvB.Start() - wg.Done() - }() + // Start this way to increase chance of having the two connect + // to each other at the same time. This will cause one of the + // route to be dropped. 
+ wg := &sync.WaitGroup{} + wg.Add(2) + go func() { + srvA.Start() + wg.Done() + }() + go func() { + srvB.Start() + wg.Done() + }() - checkClusterFormed(t, srvA, srvB) + checkClusterFormed(t, srvA, srvB) - // Ensure that maps are empty - checkMap := func(s *Server) { - checkFor(t, 2*time.Second, 15*time.Millisecond, func() error { - s.grMu.Lock() - l := len(s.grTmpClients) - s.grMu.Unlock() - if l != 0 { - return fmt.Errorf("grTmpClients map should be empty, got %v", l) + // Ensure that maps are empty + checkMap := func(s *Server) { + checkFor(t, 2*time.Second, 15*time.Millisecond, func() error { + s.grMu.Lock() + l := len(s.grTmpClients) + s.grMu.Unlock() + if l != 0 { + return fmt.Errorf("grTmpClients map should be empty, got %v", l) + } + return nil + }) } - return nil + checkMap(srvA) + checkMap(srvB) + + srvB.Shutdown() + srvA.Shutdown() + wg.Wait() }) } - checkMap(srvA) - checkMap(srvB) +} - srvB.Shutdown() - srvA.Shutdown() - wg.Wait() +func getFirstRoute(s *Server) *client { + for _, conns := range s.routes { + for _, r := range conns { + if r != nil { + return r + } + } + } + return nil } func TestRoutePermsAppliedOnInboundAndOutboundRoute(t *testing.T) { @@ -970,10 +1022,7 @@ func TestRoutePermsAppliedOnInboundAndOutboundRoute(t *testing.T) { t.Helper() var route *client s.mu.Lock() - for _, r := range s.routes { - route = r - break - } + route = getFirstRoute(s) s.mu.Unlock() route.mu.Lock() perms := route.perms @@ -1095,8 +1144,21 @@ func TestRouteNoCrashOnAddingSubToRoute(t *testing.T) { checkClusterFormed(t, servers...) // Make sure all subs are registered in s. 
+ gacc := s.globalAccount() + gacc.mu.RLock() + sl := gacc.sl + gacc.mu.RUnlock() checkFor(t, time.Second, 15*time.Millisecond, func() error { - if ts := s.globalAccount().TotalSubs() - 4; ts < int(numRoutes) { + var _subs [64]*subscription + subs := _subs[:0] + sl.All(&subs) + var ts int + for _, sub := range subs { + if string(sub.subject) == "foo" { + ts++ + } + } + if ts != int(numRoutes) { return fmt.Errorf("Not all %d routed subs were registered: %d", numRoutes, ts) } return nil @@ -1127,10 +1189,7 @@ func TestRouteRTT(t *testing.T) { t.Helper() var route *client s.mu.Lock() - for _, r := range s.routes { - route = r - break - } + route = getFirstRoute(s) s.mu.Unlock() var rtt time.Duration @@ -1164,11 +1223,10 @@ func TestRouteRTT(t *testing.T) { // the route's RTT to 0 to see if it gets // updated. s.mu.Lock() - for _, r := range s.routes { + if r := getFirstRoute(s); r != nil { r.mu.Lock() r.rtt = 0 r.mu.Unlock() - break } s.mu.Unlock() } @@ -1256,10 +1314,7 @@ func TestRouteCloseTLSConnection(t *testing.T) { // Get route connection var route *client s.mu.Lock() - for _, r := range s.routes { - route = r - break - } + route = getFirstRoute(s) s.mu.Unlock() // Fill the buffer. We want to timeout on write so that nc.Close() // would block due to a write that cannot complete. 
@@ -1497,8 +1552,8 @@ func testTLSRoutesCertificateImplicitAllow(t *testing.T, pass bool) { defer srvB.Shutdown() if pass { - checkNumRoutes(t, srvA, 1) - checkNumRoutes(t, srvB, 1) + checkNumRoutes(t, srvA, DEFAULT_ROUTE_POOL_SIZE) + checkNumRoutes(t, srvB, DEFAULT_ROUTE_POOL_SIZE) } else { time.Sleep(1 * time.Second) // the fail case uses the IP, so a short wait is sufficient checkFor(t, 2*time.Second, 15*time.Millisecond, func() error { @@ -1634,18 +1689,18 @@ func TestRouteSolicitedReconnectsEvenIfImplicit(t *testing.T) { checkClusterFormed(t, s1, s2, s3) s2.mu.Lock() - for _, r := range s2.routes { + s2.forEachRoute(func(r *client) { r.mu.Lock() // Close the route between S2 and S3 (that do not have explicit route to each other) if r.route.remoteID == s3.ID() { r.nc.Close() } r.mu.Unlock() - } + }) s2.mu.Unlock() // Wait a bit to make sure that we don't check for cluster formed too soon (need to make // sure that connection is really removed and reconnect mechanism starts). - time.Sleep(100 * time.Millisecond) + time.Sleep(500 * time.Millisecond) checkClusterFormed(t, s1, s2, s3) // Now shutdown server 3 and make sure that s2 stops trying to reconnect to s3 at one point @@ -1653,7 +1708,8 @@ func TestRouteSolicitedReconnectsEvenIfImplicit(t *testing.T) { s2.SetLogger(l, true, false) s3.Shutdown() // S2 should retry ConnectRetries+1 times and then stop - for i := 0; i < o2.Cluster.ConnectRetries+1; i++ { + // Take into account default route pool size and system account dedicated route + for i := 0; i < (DEFAULT_ROUTE_POOL_SIZE+1)*(o2.Cluster.ConnectRetries+1); i++ { select { case <-l.ch: case <-time.After(2 * time.Second): @@ -1676,6 +1732,7 @@ func TestRouteSaveTLSName(t *testing.T) { cluster { name: "abc" port: -1 + pool_size: -1 tls { cert_file: '../test/configs/certs/server-noip.pem' key_file: '../test/configs/certs/server-key-noip.pem' @@ -1691,6 +1748,7 @@ func TestRouteSaveTLSName(t *testing.T) { cluster { name: "abc" port: -1 + pool_size: -1 routes: 
["nats://%s:%d"] tls { cert_file: '../test/configs/certs/server-noip.pem' @@ -1713,13 +1771,13 @@ func TestRouteSaveTLSName(t *testing.T) { reloadUpdateConfig(t, s2, c2And3Conf, fmt.Sprintf(tmpl, "127.0.0.1", o1.Cluster.Port)) s2.mu.RLock() - for _, r := range s2.routes { + s2.forEachRoute(func(r *client) { r.mu.Lock() if r.route.routeType == Implicit { r.nc.Close() } r.mu.Unlock() - } + }) s2.mu.RUnlock() checkClusterFormed(t, s1, s2, s3) @@ -1733,13 +1791,13 @@ func TestRouteSaveTLSName(t *testing.T) { for i := 0; !gotIt && i < 5; i++ { s2.mu.Lock() s2.routeTLSName = _EMPTY_ - for _, r := range s2.routes { + s2.forEachRoute(func(r *client) { r.mu.Lock() if r.route.routeType == Implicit { r.nc.Close() } r.mu.Unlock() - } + }) s2.mu.Unlock() select { case <-l.errCh: @@ -1757,3 +1815,1324 @@ func TestRouteSaveTLSName(t *testing.T) { reloadUpdateConfig(t, s2, c2And3Conf, fmt.Sprintf(tmpl, "localhost", o1.Cluster.Port)) checkClusterFormed(t, s1, s2, s3) } + +func TestRoutePoolAndPerAccountErrors(t *testing.T) { + conf := createConfFile(t, []byte(` + port: -1 + cluster { + port: -1 + accounts: ["abc", "def", "abc"] + } + `)) + o := LoadConfig(conf) + if _, err := NewServer(o); err == nil || !strings.Contains(err.Error(), "duplicate") { + t.Fatalf("Expected error about duplicate, got %v", err) + } + + conf1 := createConfFile(t, []byte(` + port: -1 + cluster { + port: -1 + name: "local" + accounts: ["abc"] + } + `)) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + l := &captureErrorLogger{errCh: make(chan string, 10)} + s1.SetLogger(l, false, false) + + conf2 := createConfFile(t, []byte(fmt.Sprintf(` + port: -1 + cluster { + port: -1 + name: "local" + routes: ["nats://127.0.0.1:%d"] + accounts: ["def"] + } + `, o1.Cluster.Port))) + s2, _ := RunServerWithConfig(conf2) + defer s2.Shutdown() + + for i := 0; i < 2; i++ { + select { + case e := <-l.errCh: + if !strings.Contains(e, "No route for account \"def\"") { + t.Fatalf("Expected error about no route for 
account, got %v", e) + } + case <-time.After(2 * time.Second): + t.Fatalf("Did not get expected error regarding no route for account") + } + time.Sleep(DEFAULT_ROUTE_RECONNECT + 100*time.Millisecond) + } + + s2.Shutdown() + s1.Shutdown() + + conf1 = createConfFile(t, []byte(` + port: -1 + cluster { + port: -1 + name: "local" + pool_size: 5 + } + `)) + s1, o1 = RunServerWithConfig(conf1) + defer s1.Shutdown() + + l = &captureErrorLogger{errCh: make(chan string, 10)} + s1.SetLogger(l, false, false) + + conf2 = createConfFile(t, []byte(fmt.Sprintf(` + port: -1 + cluster { + port: -1 + name: "local" + routes: ["nats://127.0.0.1:%d"] + pool_size: 3 + } + `, o1.Cluster.Port))) + s2, _ = RunServerWithConfig(conf2) + defer s2.Shutdown() + + for i := 0; i < 2; i++ { + select { + case e := <-l.errCh: + if !strings.Contains(e, "Mismatch route pool size") { + t.Fatalf("Expected error about pool size mismatch, got %v", e) + } + case <-time.After(2 * time.Second): + t.Fatalf("Did not get expected error regarding mismatch pool size") + } + time.Sleep(DEFAULT_ROUTE_RECONNECT + 100*time.Millisecond) + } +} + +func TestRoutePool(t *testing.T) { + tmpl := ` + port: -1 + accounts { + A { users: [{user: "a", password: "a"}] } + B { users: [{user: "b", password: "b"}] } + } + cluster { + port: -1 + name: "local" + %s + pool_size: 2 + } + ` + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, _EMPTY_))) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port)))) + s2, _ := RunServerWithConfig(conf2) + defer s2.Shutdown() + + checkClusterFormed(t, s1, s2) + + checkRoutePoolIdx := func(s *Server, accName string, expected int) { + t.Helper() + a, err := s.LookupAccount(accName) + require_NoError(t, err) + require_True(t, a != nil) + a.mu.RLock() + rpi := a.routePoolIdx + a.mu.RUnlock() + require_True(t, rpi == expected) + } + checkRoutePoolIdx(s1, "A", 0) + 
checkRoutePoolIdx(s2, "A", 0) + checkRoutePoolIdx(s2, "B", 1) + checkRoutePoolIdx(s2, "B", 1) + + sendAndRecv := func(acc, user, pwd string) { + t.Helper() + s2nc := natsConnect(t, s2.ClientURL(), nats.UserInfo(user, pwd)) + defer s2nc.Close() + + sub := natsSubSync(t, s2nc, "foo") + natsFlush(t, s2nc) + + s1nc := natsConnect(t, s1.ClientURL(), nats.UserInfo(user, pwd)) + defer s1nc.Close() + + checkSubInterest(t, s1, acc, "foo", time.Second) + + for i := 0; i < 1000; i++ { + natsPub(t, s1nc, "foo", []byte("hello")) + } + for i := 0; i < 1000; i++ { + natsNexMsg(t, sub, time.Second) + } + // Make sure we don't receive more + if msg, err := sub.NextMsg(150 * time.Millisecond); err == nil { + t.Fatalf("Unexpected message: %+v", msg) + } + } + + sendAndRecv("A", "a", "a") + sendAndRecv("B", "b", "b") + + checkStats := func(s *Server, isOut bool) { + t.Helper() + s.mu.RLock() + defer s.mu.RUnlock() + for _, conns := range s.routes { + for i, r := range conns { + r.mu.Lock() + if isOut { + if v := r.stats.outMsgs; v < 1000 { + r.mu.Unlock() + t.Fatalf("Expected at least 1000 in out msgs for route %v, got %v", i+1, v) + } + } else { + if v := r.stats.inMsgs; v < 1000 { + r.mu.Unlock() + t.Fatalf("Expected at least 1000 in in msgs for route %v, got %v", i+1, v) + } + } + r.mu.Unlock() + } + } + } + checkStats(s1, true) + checkStats(s2, false) + + disconnectRoute := func(s *Server, idx int) { + t.Helper() + attempts := 0 + TRY_AGAIN: + s.mu.RLock() + for _, conns := range s.routes { + for i, r := range conns { + if i != idx { + continue + } + if r != nil { + r.mu.Lock() + nc := r.nc + r.mu.Unlock() + if nc == nil { + s.mu.RUnlock() + if attempts++; attempts < 10 { + time.Sleep(250 * time.Millisecond) + goto TRY_AGAIN + } + t.Fatalf("Route %v net.Conn is nil", i) + } + nc.Close() + } else { + s.mu.RUnlock() + if attempts++; attempts < 10 { + time.Sleep(250 * time.Millisecond) + goto TRY_AGAIN + } + t.Fatalf("Route %v connection is nil", i) + } + } + } + s.mu.RUnlock() + 
time.Sleep(250 * time.Millisecond) + checkClusterFormed(t, s1, s2) + } + disconnectRoute(s1, 0) + disconnectRoute(s2, 1) +} + +func TestRoutePoolConnectRace(t *testing.T) { + // This test will have each server point to each other and that is causing + // each one to attempt to connect routes to each other which should lead + // to connections needing to be dropped. We make sure that there is still + // resolution and there is the expected number of routes. + createSrv := func(name string, port int) *Server { + o := DefaultOptions() + o.Port = -1 + o.ServerName = name + o.Cluster.PoolSize = 5 + o.Cluster.Name = "local" + o.Cluster.Port = port + o.Routes = RoutesFromStr("nats://127.0.0.1:1234,nats://127.0.0.1:1235,nats://127.0.0.1:1236") + s, err := NewServer(o) + if err != nil { + t.Fatalf("Error creating server: %v", err) + } + return s + } + s1 := createSrv("A", 1234) + s2 := createSrv("B", 1235) + s3 := createSrv("C", 1236) + + l := &captureDebugLogger{dbgCh: make(chan string, 100)} + s1.SetLogger(l, true, false) + + servers := []*Server{s1, s2, s3} + + for _, s := range servers { + go s.Start() + defer s.Shutdown() + } + + checkClusterFormed(t, s1, s2, s3) + + for done, duplicate := false, 0; !done; { + select { + case e := <-l.dbgCh: + if strings.Contains(e, "duplicate") { + if duplicate++; duplicate > 10 { + t.Fatalf("Routes are constantly reconnecting: %v", e) + } + } + case <-time.After(DEFAULT_ROUTE_RECONNECT + 250*time.Millisecond): + // More than reconnect and some, and no reconnect, so we are good. 
+ done = true + } + } + + for _, s := range servers { + s.Shutdown() + s.WaitForShutdown() + } +} + +func TestRoutePoolRouteStoredSameIndexBothSides(t *testing.T) { + tmpl := ` + port: -1 + accounts { + A { users: [{user: "a", password: "a"}] } + B { users: [{user: "b", password: "b"}] } + C { users: [{user: "c", password: "c"}] } + D { users: [{user: "d", password: "d"}] } + } + cluster { + port: -1 + name: "local" + %s + pool_size: 4 + } + no_sys_acc: true + ` + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, _EMPTY_))) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port)))) + s2, _ := RunServerWithConfig(conf2) + defer s2.Shutdown() + + for i := 0; i < 20; i++ { + checkClusterFormed(t, s1, s2) + + collect := func(s *Server, checkRemoteAddr bool) []string { + addrs := make([]string, 0, 4) + s.mu.RLock() + s.forEachRoute(func(r *client) { + var addr string + r.mu.Lock() + if r.nc != nil { + if checkRemoteAddr { + addr = r.nc.RemoteAddr().String() + } else { + addr = r.nc.LocalAddr().String() + } + addrs = append(addrs, addr) + } + r.mu.Unlock() + }) + s.mu.RUnlock() + return addrs + } + + addrsS1 := collect(s1, true) + addrsS2 := collect(s2, false) + if len(addrsS1) != 4 || len(addrsS2) != 4 { + // It could be that connections were not ready (r.nc is nil in collect()) + // if that is the case, try again. 
+ i-- + continue + } + + if !reflect.DeepEqual(addrsS1, addrsS2) { + t.Fatalf("Connections not stored at same index:\ns1=%v\ns2=%v", addrsS1, addrsS2) + } + + s1.mu.RLock() + s1.forEachRoute(func(r *client) { + r.mu.Lock() + if r.nc != nil { + r.nc.Close() + } + r.mu.Unlock() + }) + s1.mu.RUnlock() + } +} + +type captureRMsgTrace struct { + DummyLogger + sync.Mutex + traces *bytes.Buffer + out []string +} + +func (l *captureRMsgTrace) Tracef(format string, args ...interface{}) { + l.Lock() + defer l.Unlock() + msg := fmt.Sprintf(format, args...) + if strings.Contains(msg, "[RMSG ") { + l.traces.WriteString(msg) + l.out = append(l.out, msg) + } +} + +func TestRoutePerAccount(t *testing.T) { + + akp1, _ := nkeys.CreateAccount() + acc1, _ := akp1.PublicKey() + + akp2, _ := nkeys.CreateAccount() + acc2, _ := akp2.PublicKey() + + tmpl := ` + port: -1 + accounts { + %s { users: [{user: "a", password: "a"}] } + %s { users: [{user: "b", password: "b"}] } + } + cluster { + port: -1 + name: "local" + %s + accounts: ["%s"] + } + ` + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, acc1, acc2, _EMPTY_, acc2))) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, + acc1, acc2, + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port), + acc2))) + s2, _ := RunServerWithConfig(conf2) + defer s2.Shutdown() + + l := &captureRMsgTrace{traces: &bytes.Buffer{}} + s2.SetLogger(l, false, true) + + checkClusterFormed(t, s1, s2) + + disconnectRoute := func(s *Server) { + t.Helper() + attempts := 0 + TRY_AGAIN: + s.mu.RLock() + if conns, ok := s.accRoutes[acc2]; ok { + for _, r := range conns { + if r != nil { + r.mu.Lock() + nc := r.nc + r.mu.Unlock() + if nc == nil { + s.mu.RUnlock() + if attempts++; attempts < 10 { + time.Sleep(250 * time.Millisecond) + goto TRY_AGAIN + } + t.Fatal("Route net.Conn is nil") + } + nc.Close() + } else { + s.mu.RUnlock() + if attempts++; attempts < 10 { + time.Sleep(250 * 
time.Millisecond) + goto TRY_AGAIN + } + t.Fatal("Route connection is nil") + } + } + } + s.mu.RUnlock() + time.Sleep(250 * time.Millisecond) + checkClusterFormed(t, s1, s2) + } + disconnectRoute(s1) + disconnectRoute(s2) + + sendAndRecv := func(acc, user, pwd string) { + t.Helper() + s2nc := natsConnect(t, s2.ClientURL(), nats.UserInfo(user, pwd)) + defer s2nc.Close() + + sub := natsSubSync(t, s2nc, "foo") + natsFlush(t, s2nc) + + s1nc := natsConnect(t, s1.ClientURL(), nats.UserInfo(user, pwd)) + defer s1nc.Close() + + checkSubInterest(t, s1, acc, "foo", time.Second) + + for i := 0; i < 10; i++ { + natsPub(t, s1nc, "foo", []byte("hello")) + } + for i := 0; i < 10; i++ { + natsNexMsg(t, sub, time.Second) + } + // Make sure we don't receive more + if msg, err := sub.NextMsg(150 * time.Millisecond); err == nil { + t.Fatalf("Unexpected message: %+v", msg) + } + } + + sendAndRecv(acc1, "a", "a") + sendAndRecv(acc2, "b", "b") + + l.Lock() + traces := l.traces.String() + out := append([]string(nil), l.out...) 
+ l.Unlock() + // We should not have any "[RMSG " + if strings.Contains(traces, fmt.Sprintf("[RMSG %s", acc2)) { + var outStr string + for _, l := range out { + outStr += l + "\r\n" + } + t.Fatalf("Should not have included account %q in protocol, got:\n%s", acc2, outStr) + } +} + +func TestRoutePerAccountImplicit(t *testing.T) { + tmpl := ` + port: -1 + accounts { + A { users: [{user: "a", password: "a"}] } + B { users: [{user: "b", password: "b"}] } + } + cluster { + port: -1 + name: "local" + accounts: ["A"] + %s + } + ` + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, _EMPTY_))) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf2And3 := createConfFile(t, []byte(fmt.Sprintf(tmpl, + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port)))) + s2, _ := RunServerWithConfig(conf2And3) + defer s2.Shutdown() + + checkClusterFormed(t, s1, s2) + + s3, _ := RunServerWithConfig(conf2And3) + defer s3.Shutdown() + + checkClusterFormed(t, s1, s2, s3) + + // On s3, close the per-account routes from s2 + s3.mu.RLock() + for _, conns := range s3.accRoutes { + for rem, r := range conns { + if rem != s2.ID() { + continue + } + r.mu.Lock() + if r.nc != nil { + r.nc.Close() + } + r.mu.Unlock() + } + } + s3.mu.RUnlock() + // Wait a bit to make sure there is a disconnect, then check the cluster is ok + time.Sleep(250 * time.Millisecond) + checkClusterFormed(t, s1, s2, s3) +} + +func TestRoutePerAccountDefaultForSysAccount(t *testing.T) { + tmpl := ` + port: -1 + accounts { + A { users: [{user: "a", password: "a"}] } + B { users: [{user: "b", password: "b"}] } + } + cluster { + port: -1 + name: "local" + %s + %s + %s + } + %s + ` + for _, test := range []struct { + name string + accounts string + sysAcc string + noSysAcc bool + }{ + {"default sys no accounts", _EMPTY_, _EMPTY_, false}, + {"default sys in accounts", "accounts: [\"$SYS\"]", _EMPTY_, false}, + {"default sys with other accounts", "accounts: [\"A\",\"$SYS\"]", _EMPTY_, false}, + {"explicit 
sys no accounts", _EMPTY_, "system_account: B", false}, + {"explicit sys in accounts", "accounts: [\"B\"]", "system_account: B", false}, + {"explicit sys with other accounts", "accounts: [\"B\",\"A\"]", "system_account: B", false}, + {"no system account no accounts", _EMPTY_, "no_sys_acc: true", true}, + {"no system account with accounts", "accounts: [\"A\"]", "no_sys_acc: true", true}, + } { + t.Run(test.name, func(t *testing.T) { + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, _EMPTY_, test.accounts, + _EMPTY_, test.sysAcc))) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, _EMPTY_, test.accounts, + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port), test.sysAcc))) + s2, _ := RunServerWithConfig(conf2) + defer s2.Shutdown() + + checkClusterFormed(t, s1, s2) + + checkSysAccRoute := func(s *Server) { + t.Helper() + var name string + acc := s.SystemAccount() + if test.noSysAcc { + if acc != nil { + t.Fatalf("Should not be any system account, got %q", acc.GetName()) + } + // We will check that there is no accRoutes for the default + // system account name + name = DEFAULT_SYSTEM_ACCOUNT + } else { + acc.mu.RLock() + pi := acc.routePoolIdx + name = acc.Name + acc.mu.RUnlock() + if pi != -1 { + t.Fatalf("System account %q should have route pool index==-1, got %v", name, pi) + } + } + s.mu.RLock() + _, ok := s.accRoutes[name] + s.mu.RUnlock() + if test.noSysAcc { + if ok { + t.Fatalf("System account %q should not have its own route, since NoSystemAccount was specified", name) + } + } else if !ok { + t.Fatalf("System account %q should be present in accRoutes, it was not", name) + } + } + checkSysAccRoute(s1) + checkSysAccRoute(s2) + + // Check that this is still the case after a config reload + reloadUpdateConfig(t, s1, conf1, fmt.Sprintf(tmpl, "pool_size: 4", test.accounts, + _EMPTY_, test.sysAcc)) + reloadUpdateConfig(t, s2, conf2, fmt.Sprintf(tmpl, "pool_size: 4", test.accounts, + 
fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port), test.sysAcc)) + + checkSysAccRoute(s1) + checkSysAccRoute(s2) + }) + } +} + +func TestRoutePerAccountConnectRace(t *testing.T) { + // This test will have each server point to each other and that is causing + // each one to attempt to connect routes to each other which should lead + // to connections needing to be dropped. We make sure that there is still + // resolution and there is the expected number of routes. + createSrv := func(name string, port int) *Server { + o := DefaultOptions() + o.Port = -1 + o.ServerName = name + o.Accounts = []*Account{NewAccount("A")} + o.NoSystemAccount = true + o.Cluster.PoolSize = 1 + o.Cluster.Accounts = []string{"A"} + o.Cluster.Name = "local" + o.Cluster.Port = port + o.Routes = RoutesFromStr("nats://127.0.0.1:1234,nats://127.0.0.1:1235,nats://127.0.0.1:1236") + s, err := NewServer(o) + if err != nil { + t.Fatalf("Error creating server: %v", err) + } + return s + } + s1 := createSrv("A", 1234) + s2 := createSrv("B", 1235) + s3 := createSrv("C", 1236) + + l := &captureDebugLogger{dbgCh: make(chan string, 100)} + s1.SetLogger(l, true, false) + + servers := []*Server{s1, s2, s3} + + for _, s := range servers { + go s.Start() + defer s.Shutdown() + } + + checkClusterFormed(t, s1, s2, s3) + + for done, duplicate := false, 0; !done; { + select { + case e := <-l.dbgCh: + if strings.Contains(e, "duplicate") { + if duplicate++; duplicate > 10 { + t.Fatalf("Routes are constantly reconnecting: %v", e) + } + } + case <-time.After(DEFAULT_ROUTE_RECONNECT + 250*time.Millisecond): + // More than reconnect and some, and no reconnect, so we are good. 
+ done = true + } + } + + for _, s := range servers { + s.Shutdown() + s.WaitForShutdown() + } +} + +func TestRoutePoolPerAccountSubUnsubProtoParsing(t *testing.T) { + for _, test := range []struct { + name string + extra string + }{ + {"regular", _EMPTY_}, + {"pooling", "pool_size: 5"}, + {"per-account", "accounts: [\"A\"]"}, + } { + t.Run(test.name, func(t *testing.T) { + confATemplate := ` + port: -1 + accounts { + A { users: [{user: "user1", password: "pwd"}] } + } + cluster { + listen: 127.0.0.1:-1 + %s + } + ` + confA := createConfFile(t, []byte(fmt.Sprintf(confATemplate, test.extra))) + srva, optsA := RunServerWithConfig(confA) + defer srva.Shutdown() + + confBTemplate := ` + port: -1 + accounts { + A { users: [{user: "user1", password: "pwd"}] } + } + cluster { + listen: 127.0.0.1:-1 + routes = [ + "nats://127.0.0.1:%d" + ] + %s + } + ` + confB := createConfFile(t, []byte(fmt.Sprintf(confBTemplate, optsA.Cluster.Port, test.extra))) + srvb, _ := RunServerWithConfig(confB) + defer srvb.Shutdown() + + checkClusterFormed(t, srva, srvb) + + ncA := natsConnect(t, srva.ClientURL(), nats.UserInfo("user1", "pwd")) + defer ncA.Close() + + for i := 0; i < 2; i++ { + var sub *nats.Subscription + if i == 0 { + sub = natsSubSync(t, ncA, "foo") + } else { + sub = natsQueueSubSync(t, ncA, "foo", "bar") + } + + checkSubInterest(t, srvb, "A", "foo", 2*time.Second) + + checkSubs := func(s *Server, queue, expected bool) { + t.Helper() + acc, err := s.LookupAccount("A") + if err != nil { + t.Fatalf("Error looking account: %v", err) + } + checkFor(t, time.Second, 15*time.Millisecond, func() error { + acc.mu.RLock() + res := acc.sl.Match("foo") + acc.mu.RUnlock() + if expected { + if queue && (len(res.qsubs) == 0 || len(res.psubs) != 0) { + return fmt.Errorf("Expected queue sub, did not find it") + } else if !queue && (len(res.psubs) == 0 || len(res.qsubs) != 0) { + return fmt.Errorf("Expected psub, did not find it") + } + } else if len(res.psubs)+len(res.qsubs) != 0 { + return 
fmt.Errorf("Unexpected subscription: %+v", res) + } + return nil + }) + } + + checkSubs(srva, i == 1, true) + checkSubs(srvb, i == 1, true) + + sub.Unsubscribe() + natsFlush(t, ncA) + + checkSubs(srva, i == 1, false) + checkSubs(srvb, i == 1, false) + } + }) + } +} + +func TestRoutePoolPerAccountStreamImport(t *testing.T) { + for _, test := range []struct { + name string + route string + }{ + {"regular", _EMPTY_}, + {"pooled", "pool_size: 5"}, + {"one per account", "accounts: [\"A\"]"}, + {"both per account", "accounts: [\"A\", \"B\"]"}, + } { + t.Run(test.name, func(t *testing.T) { + tmplA := ` + server_name: "A" + port: -1 + accounts { + A { + users: [{user: "user1", password: "pwd"}] + exports: [{stream: "foo"}] + } + B { + users: [{user: "user2", password: "pwd"}] + imports: [{stream: {subject: "foo", account: "A"}}] + } + C { users: [{user: "user3", password: "pwd"}] } + D { users: [{user: "user4", password: "pwd"}] } + } + cluster { + name: "local" + listen: 127.0.0.1:-1 + %s + } + ` + confA := createConfFile(t, []byte(fmt.Sprintf(tmplA, test.route))) + srva, optsA := RunServerWithConfig(confA) + defer srva.Shutdown() + + tmplB := ` + server_name: "B" + port: -1 + accounts { + A { + users: [{user: "user1", password: "pwd"}] + exports: [{stream: "foo"}] + } + B { + users: [{user: "user2", password: "pwd"}] + imports: [{stream: {subject: "foo", account: "A"}}] + } + C { users: [{user: "user3", password: "pwd"}] } + D { users: [{user: "user4", password: "pwd"}] } + } + cluster { + name: "local" + listen: 127.0.0.1:-1 + %s + %s + } + ` + confB := createConfFile(t, []byte(fmt.Sprintf(tmplB, + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", optsA.Cluster.Port), + test.route))) + srvb, _ := RunServerWithConfig(confB) + defer srvb.Shutdown() + + checkClusterFormed(t, srva, srvb) + + ncB := natsConnect(t, srvb.ClientURL(), nats.UserInfo("user2", "pwd")) + defer ncB.Close() + + sub := natsSubSync(t, ncB, "foo") + + checkSubInterest(t, srva, "B", "foo", time.Second) + 
checkSubInterest(t, srva, "A", "foo", time.Second) + + ncA := natsConnect(t, srva.ClientURL(), nats.UserInfo("user1", "pwd")) + defer ncA.Close() + + natsPub(t, ncA, "foo", []byte("hello")) + natsNexMsg(t, sub, time.Second) + + natsUnsub(t, sub) + natsFlush(t, ncB) + + checkFor(t, time.Second, 15*time.Millisecond, func() error { + for _, acc := range []string{"A", "B"} { + a, err := srva.LookupAccount(acc) + if err != nil { + return err + } + a.mu.RLock() + r := a.sl.Match("foo") + a.mu.RUnlock() + if len(r.psubs) != 0 { + return fmt.Errorf("Subscription not unsubscribed") + } + } + return nil + }) + }) + } +} + +func TestRoutePoolAndPerAccountWithServiceLatencyNoDataRace(t *testing.T) { + // For this test, we want the system (SYS) and SERVICE accounts to be bound + // to different routes. So the names and pool size have been chosen accordingly. + for _, test := range []struct { + name string + poolStr string + }{ + {"pool", "pool_size: 5"}, + {"per account", "accounts: [\"SYS\", \"SERVICE\", \"REQUESTOR\"]"}, + } { + t.Run(test.name, func(t *testing.T) { + tmpl := ` + port: -1 + accounts { + SYS { + users [{user: "sys", password: "pwd"}] + } + SERVICE { + users [{user: "svc", password: "pwd"}] + exports = [ + {service: "req.*", latency: {subject: "results"}} + ] + } + REQUESTOR { + users [{user: "req", password: "pwd"}] + imports = [ + {service: {account: "SERVICE", subject: "req.echo"}, to: "request"} + ] + } + } + system_account: "SYS" + cluster { + name: "local" + port: -1 + %s + %s + } + ` + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, test.poolStr, _EMPTY_))) + s1, opts1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, test.poolStr, + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", opts1.Cluster.Port)))) + s2, _ := RunServerWithConfig(conf2) + defer s2.Shutdown() + + checkClusterFormed(t, s1, s2) + + // Create service provider. 
+ nc := natsConnect(t, s1.ClientURL(), nats.UserInfo("svc", "pwd")) + defer nc.Close() + + // The service listener. + // serviceTime := 25 * time.Millisecond + natsSub(t, nc, "req.echo", func(msg *nats.Msg) { + // time.Sleep(serviceTime) + msg.Respond(msg.Data) + }) + + // Listen for metrics + rsub := natsSubSync(t, nc, "results") + natsFlush(t, nc) + checkSubInterest(t, s2, "SERVICE", "results", time.Second) + + // Create second client and send request from this one. + nc2 := natsConnect(t, s2.ClientURL(), nats.UserInfo("req", "pwd")) + defer nc2.Close() + + for i := 0; i < 5; i++ { + // Send the request. + _, err := nc2.Request("request", []byte("hello"), time.Second) + require_NoError(t, err) + // Get the latency result + natsNexMsg(t, rsub, time.Second) + } + }) + } +} + +func TestRouteParseOriginClusterMsgArgs(t *testing.T) { + for _, test := range []struct { + racc bool + args string + pacache string + reply string + queues [][]byte + }{ + {true, "ORIGIN foo 12 345\r\n", "MY_ACCOUNT foo", _EMPTY_, nil}, + {true, "ORIGIN foo bar 12 345\r\n", "MY_ACCOUNT foo", "bar", nil}, + {true, "ORIGIN foo + bar queue1 queue2 12 345\r\n", "MY_ACCOUNT foo", "bar", [][]byte{[]byte("queue1"), []byte("queue2")}}, + {true, "ORIGIN foo | queue1 queue2 12 345\r\n", "MY_ACCOUNT foo", _EMPTY_, [][]byte{[]byte("queue1"), []byte("queue2")}}, + + {false, "ORIGIN MY_ACCOUNT foo 12 345\r\n", "MY_ACCOUNT foo", _EMPTY_, nil}, + {false, "ORIGIN MY_ACCOUNT foo bar 12 345\r\n", "MY_ACCOUNT foo", "bar", nil}, + {false, "ORIGIN MY_ACCOUNT foo + bar queue1 queue2 12 345\r\n", "MY_ACCOUNT foo", "bar", [][]byte{[]byte("queue1"), []byte("queue2")}}, + {false, "ORIGIN MY_ACCOUNT foo | queue1 queue2 12 345\r\n", "MY_ACCOUNT foo", _EMPTY_, [][]byte{[]byte("queue1"), []byte("queue2")}}, + } { + t.Run(test.args, func(t *testing.T) { + c := &client{kind: ROUTER, route: &route{}} + if test.racc { + c.route.accName = []byte("MY_ACCOUNT") + } + if err := 
c.processRoutedOriginClusterMsgArgs([]byte(test.args)); err != nil { + t.Fatalf("Error processing: %v", err) + } + if string(c.pa.origin) != "ORIGIN" { + t.Fatalf("Invalid origin: %q", c.pa.origin) + } + if string(c.pa.account) != "MY_ACCOUNT" { + t.Fatalf("Invalid account: %q", c.pa.account) + } + if string(c.pa.subject) != "foo" { + t.Fatalf("Invalid subject: %q", c.pa.subject) + } + if string(c.pa.reply) != test.reply { + t.Fatalf("Invalid reply: %q", c.pa.reply) + } + if !reflect.DeepEqual(c.pa.queues, test.queues) { + t.Fatalf("Invalid queues: %v", c.pa.queues) + } + if c.pa.hdr != 12 { + t.Fatalf("Invalid header size: %v", c.pa.hdr) + } + if c.pa.size != 345 { + t.Fatalf("Invalid size: %v", c.pa.size) + } + }) + } +} + +func TestRoutePoolAndPerAccountOperatorMode(t *testing.T) { + _, spub := createKey(t) + sysClaim := jwt.NewAccountClaims(spub) + sysClaim.Name = "SYS" + sysJwt := encodeClaim(t, sysClaim, spub) + + akp, apub := createKey(t) + claima := jwt.NewAccountClaims(apub) + ajwt := encodeClaim(t, claima, apub) + + bkp, bpub := createKey(t) + claimb := jwt.NewAccountClaims(bpub) + bjwt := encodeClaim(t, claimb, bpub) + + ckp, cpub := createKey(t) + claimc := jwt.NewAccountClaims(cpub) + cjwt := encodeClaim(t, claimc, cpub) + + _, dpub := createKey(t) + + basePath := "/ngs/v1/accounts/jwt/" + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == basePath { + w.Write([]byte("ok")) + } else if strings.HasSuffix(r.URL.Path, spub) { + w.Write([]byte(sysJwt)) + } else if strings.HasSuffix(r.URL.Path, apub) { + w.Write([]byte(ajwt)) + } else if strings.HasSuffix(r.URL.Path, bpub) { + w.Write([]byte(bjwt)) + } else if strings.HasSuffix(r.URL.Path, cpub) { + w.Write([]byte(cjwt)) + } + })) + defer ts.Close() + + operator := fmt.Sprintf(` + operator: %s + system_account: %s + resolver: URL("%s%s") + `, ojwt, spub, ts.URL, basePath) + + tmpl := ` + listen: 127.0.0.1:-1 + server_name: %s + cluster { + port: -1 + 
name: "local" + %s + accounts: ["` + apub + `"%s] + } + ` + operator + + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, "A", _EMPTY_, _EMPTY_))) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, "B", fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port), + _EMPTY_))) + s2, _ := RunServerWithConfig(conf2) + defer s2.Shutdown() + + checkClusterFormed(t, s1, s2) + + checkRoute := func(s *Server, acc string, perAccount bool) { + t.Helper() + checkFor(t, 2*time.Second, 50*time.Millisecond, func() error { + s.mu.RLock() + _, ok := s.accRoutes[acc] + s.mu.RUnlock() + if perAccount && !ok { + return fmt.Errorf("No dedicated route for account %q on server %q", acc, s) + } else if !perAccount && ok { + return fmt.Errorf("Dedicated route for account %q on server %q", acc, s) + } + return nil + }) + } + // Route for account "apub" should be a dedicated route + checkRoute(s1, apub, true) + checkRoute(s2, apub, true) + // Route for account "bpub" should not + checkRoute(s1, bpub, false) + checkRoute(s2, bpub, false) + + checkComm := func(acc string, kp nkeys.KeyPair, subj string) { + t.Helper() + usr := createUserCreds(t, nil, kp) + ncAs2 := natsConnect(t, s2.ClientURL(), usr) + defer ncAs2.Close() + sub := natsSubSync(t, ncAs2, subj) + checkSubInterest(t, s1, acc, subj, time.Second) + + ncAs1 := natsConnect(t, s1.ClientURL(), usr) + defer ncAs1.Close() + natsPub(t, ncAs1, subj, nil) + natsNexMsg(t, sub, time.Second) + } + checkComm(apub, akp, "foo") + checkComm(bpub, bkp, "bar") + + // Add account "bpub" in accounts doing a configuration reload + reloadUpdateConfig(t, s1, conf1, fmt.Sprintf(tmpl, "A", _EMPTY_, fmt.Sprintf(",\"%s\"", bpub))) + // Already the route should be moved to a dedicated route, even + // before doing the config reload on s2. 
+	checkRoute(s1, bpub, true)
+	checkRoute(s2, bpub, true)
+	// Account "apub" should still have its dedicated route
+	checkRoute(s1, apub, true)
+	checkRoute(s2, apub, true)
+	// Let's complete the config reload on s2
+	reloadUpdateConfig(t, s2, conf2, fmt.Sprintf(tmpl, "B", fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port),
+		fmt.Sprintf(",\"%s\"", bpub)))
+	checkClusterFormed(t, s1, s2)
+	// Check communication on account bpub again.
+	checkComm(bpub, bkp, "baz")
+
+	// Now add with config reload an account that has not been used yet (cpub).
+	// We will also remove account bpub from the account list.
+	reloadUpdateConfig(t, s1, conf1, fmt.Sprintf(tmpl, "A", _EMPTY_, fmt.Sprintf(",\"%s\"", cpub)))
+	// Again, check before reloading s2.
+	checkRoute(s1, cpub, true)
+	checkRoute(s2, cpub, true)
+	// Now reload s2 and do other checks.
+	reloadUpdateConfig(t, s2, conf2, fmt.Sprintf(tmpl, "B", fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port),
+		fmt.Sprintf(",\"%s\"", cpub)))
+	checkClusterFormed(t, s1, s2)
+	checkRoute(s1, bpub, false)
+	checkRoute(s2, bpub, false)
+	checkComm(cpub, ckp, "bat")
+
+	// Finally, let's try to add an account that the account server rejects.
+ err := os.WriteFile(conf1, []byte(fmt.Sprintf(tmpl, "A", _EMPTY_, fmt.Sprintf(",\"%s\",\"%s\"", cpub, dpub))), 0666) + require_NoError(t, err) + if err := s1.Reload(); err == nil || !strings.Contains(err.Error(), dpub) { + t.Fatalf("Expected error about not being able to lookup this account, got %q", err) + } +} + +func TestRoutePoolAndPerAccountWithOlderServer(t *testing.T) { + tmpl := ` + port: -1 + server_name: "%s" + accounts { + A { users: [{user: "A", password: "pwd"}] } + B { users: [{user: "B", password: "pwd"}] } + } + cluster { + port: -1 + name: "local" + pool_size: 5 + accounts: ["A"] + %s + } + ` + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, "A", _EMPTY_))) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, "B", + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port)))) + s2, _ := RunServerWithConfig(conf2) + defer s2.Shutdown() + + // Have s3 explicitly disable pooling (to behave as an old server) + conf3 := createConfFile(t, []byte(fmt.Sprintf(` + port: -1 + server_name: "C" + accounts { + A { users: [{user: "A", password: "pwd"}] } + B { users: [{user: "B", password: "pwd"}] } + } + cluster { + port: -1 + name: "local" + pool_size: -1 + routes: ["nats://127.0.0.1:%d"] + } + `, o1.Cluster.Port))) + s3, _ := RunServerWithConfig(conf3) + defer s3.Shutdown() + + checkClusterFormed(t, s1, s2, s3) + + check := func(acc, subj string, subSrv, pubSrv1, pubSrv2 *Server) { + t.Helper() + + ncSub := natsConnect(t, subSrv.ClientURL(), nats.UserInfo(acc, "pwd")) + defer ncSub.Close() + sub := natsSubSync(t, ncSub, subj) + + checkSubInterest(t, pubSrv1, acc, subj, time.Second) + checkSubInterest(t, pubSrv2, acc, subj, time.Second) + + pub := func(s *Server) { + t.Helper() + nc := natsConnect(t, s.ClientURL(), nats.UserInfo(acc, "pwd")) + defer nc.Close() + + natsPub(t, nc, subj, []byte("hello")) + natsNexMsg(t, sub, time.Second) + } + pub(pubSrv1) + pub(pubSrv2) + } + 
check("A", "subj1", s1, s2, s3) + check("A", "subj2", s2, s1, s3) + check("A", "subj3", s3, s1, s2) + check("B", "subj4", s1, s2, s3) + check("B", "subj5", s2, s1, s3) + check("B", "subj6", s3, s1, s2) +} + +type testDuplicateRouteLogger struct { + DummyLogger + ch chan struct{} +} + +func (l *testDuplicateRouteLogger) Noticef(format string, args ...interface{}) { + msg := fmt.Sprintf(format, args...) + if !strings.Contains(msg, DuplicateRoute.String()) { + return + } + select { + case l.ch <- struct{}{}: + default: + } +} + +// This test will make sure that a server with pooling does not +// keep trying to connect to a non pooled (for instance old) server. +// Will also make sure that if the old server is simply accepting +// connections, and restarted, the server with pooling will connect. +func TestRoutePoolWithOlderServerConnectAndReconnect(t *testing.T) { + tmplA := ` + port: -1 + server_name: "A" + cluster { + port: -1 + name: "local" + pool_size: 3 + %s + } + ` + conf1 := createConfFile(t, []byte(fmt.Sprintf(tmplA, _EMPTY_))) + s1, o1 := RunServerWithConfig(conf1) + defer s1.Shutdown() + + l := &testDuplicateRouteLogger{ch: make(chan struct{}, 50)} + s1.SetLogger(l, false, false) + + tmplB := ` + port: -1 + server_name: "B" + cluster { + port: %d + name: "local" + pool_size: -1 + %s + } + ` + conf2 := createConfFile(t, []byte(fmt.Sprintf(tmplB, -1, + fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port)))) + s2, o2 := RunServerWithConfig(conf2) + defer s2.Shutdown() + + checkClusterFormed(t, s1, s2) + + // Now reload configuration of s1 to point to s2. + reloadUpdateConfig(t, s1, conf1, fmt.Sprintf(tmplA, fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o2.Cluster.Port))) + checkClusterFormed(t, s1, s2) + + // We could get some, but it should settle. 
+ checkRepeatConnect := func() { + t.Helper() + tm := time.NewTimer(2 * DEFAULT_ROUTE_CONNECT) + var last time.Time + for done := false; !done; { + select { + case <-l.ch: + last = time.Now() + case <-tm.C: + done = true + } + } + if dur := time.Since(last); dur <= DEFAULT_ROUTE_CONNECT { + t.Fatalf("Still attempted to connect %v ago", dur) + } + } + checkRepeatConnect() + + // Now shutdown s2 and restart it without active route to s1. + // Check that cluster can still be formed: that is, s1 is + // still trying to connect to s2. + s2.Shutdown() + // Wait for more than a regular reconnect delay attempt. + // Note that in test it is set to 15ms. + time.Sleep(50 * time.Millisecond) + // Restart the server s2 with the same cluster port otherwise + // s1 would not be able to reconnect. + conf2 = createConfFile(t, []byte(fmt.Sprintf(tmplB, o2.Cluster.Port, _EMPTY_))) + s2, _ = RunServerWithConfig(conf2) + defer s2.Shutdown() + // Make sure reconnect occurs + checkClusterFormed(t, s1, s2) + // And again, make sure there is no repeat-connect + checkRepeatConnect() +} diff --git a/server/server.go b/server/server.go index f73fbfaf..a31d442f 100644 --- a/server/server.go +++ b/server/server.go @@ -21,6 +21,7 @@ import ( "errors" "flag" "fmt" + "hash/fnv" "io" "log" "math/rand" @@ -90,6 +91,10 @@ type Info struct { LNOC bool `json:"lnoc,omitempty"` InfoOnConnect bool `json:"info_on_connect,omitempty"` // When true the server will respond to CONNECT with an INFO ConnectInfo bool `json:"connect_info,omitempty"` // When true this is the server INFO response to CONNECT + RoutePoolSize int `json:"route_pool_size,omitempty"` + RoutePoolIdx int `json:"route_pool_idx,omitempty"` + RouteAccount string `json:"route_account,omitempty"` + RouteAccReqID string `json:"route_acc_add_reqid,omitempty"` // Gateways Specific Gateway string `json:"gateway,omitempty"` // Name of the origin Gateway (sent by gateway's INFO) @@ -135,9 +140,14 @@ type Server struct { activeAccounts int32 accResolver 
AccountResolver clients map[uint64]*client - routes map[uint64]*client - routesByHash sync.Map - remotes map[string]*client + routes map[string][]*client + routesPoolSize int // Configured pool size + routesReject bool // During reload, we may want to reject adding routes until some conditions are met + routesNoPool int // Number of routes that don't use pooling (connecting to older server for instance) + accRoutes map[string]map[string]*client // Key is account name, value is key=remoteID/value=route connection + accRouteByHash sync.Map // Key is account name, value is nil or a pool index + accAddedCh chan struct{} + accAddedReqID string leafs map[uint64]*client users map[string]*User nkeys map[string]*NkeyUser @@ -153,7 +163,6 @@ type Server struct { routeListener net.Listener routeListenerErr error routeInfo Info - routeInfoJSON []byte routeResolver netResolver routesToSelf map[string]struct{} routeTLSName string @@ -503,8 +512,7 @@ func NewServer(opts *Options) (*Server, error) { s.grTmpClients = make(map[uint64]*client) // For tracking routes and their remote ids - s.routes = make(map[uint64]*client) - s.remotes = make(map[string]*client) + s.initRouteStructures(opts) // For tracking leaf nodes. s.leafs = make(map[uint64]*client) @@ -570,6 +578,27 @@ func NewServer(opts *Options) (*Server, error) { return s, nil } +// Initializes route structures based on pooling and/or per-account routes. +// +// Server lock is held on entry +func (s *Server) initRouteStructures(opts *Options) { + s.routes = make(map[string][]*client) + if ps := opts.Cluster.PoolSize; ps > 0 { + s.routesPoolSize = ps + } else { + s.routesPoolSize = 1 + } + // If we have per-account routes, we create accRoutes and initialize it + // with nil values. The presence of an account as the key will allow us + // to know if a given account is supposed to have dedicated routes. 
+ if l := len(opts.Cluster.Accounts); l > 0 { + s.accRoutes = make(map[string]map[string]*client, l) + for _, acc := range opts.Cluster.Accounts { + s.accRoutes[acc] = make(map[string]*client) + } + } +} + func (s *Server) logRejectedTLSConns() { defer s.grWG.Done() t := time.NewTicker(time.Second) @@ -614,8 +643,6 @@ func (s *Server) setClusterName(name string) { s.info.Cluster = name s.routeInfo.Cluster = name - // Regenerate the info byte array - s.generateRouteInfoJSON() // Need to close solicited leaf nodes. The close has to be done outside of the server lock. var leafs []*client for _, c := range s.leafs { @@ -675,6 +702,18 @@ func validateCluster(o *Options) error { // Set this here so we do not consider it dynamic. o.Cluster.Name = o.Gateway.Name } + if l := len(o.Cluster.Accounts); l > 0 { + if o.Cluster.PoolSize < 0 { + return fmt.Errorf("pool_size cannot be negative if accounts are specified") + } + m := make(map[string]struct{}, l) + for _, a := range o.Cluster.Accounts { + if _, exists := m[a]; exists { + return fmt.Errorf("found duplicate account name %q in accounts list %q", a, o.Cluster.Accounts) + } + m[a] = struct{}{} + } + } return nil } @@ -941,12 +980,6 @@ func (s *Server) checkResolvePreloads() { } } -func (s *Server) generateRouteInfoJSON() { - b, _ := json.Marshal(s.routeInfo) - pcs := [][]byte{[]byte("INFO"), b, []byte(CR_LF)} - s.routeInfoJSON = bytes.Join(pcs, []byte(" ")) -} - // Determines if we are in pre NATS 2.0 setup with no accounts. func (s *Server) globalAccountOnly() bool { var hasOthers bool @@ -1385,6 +1418,7 @@ func (s *Server) registerAccountNoLock(acc *Account) *Account { s.setAccountSublist(acc) acc.mu.Lock() + s.setRouteInfo(acc) if acc.clients == nil { acc.clients = make(map[*client]struct{}) } @@ -1439,6 +1473,47 @@ func (s *Server) registerAccountNoLock(acc *Account) *Account { return nil } +// Sets the account's routePoolIdx depending on presence or not of +// pooling or per-account routes. 
Also updates a map used by +// gateway code to retrieve a route based on some route hash. +// +// Both Server and Account lock held on entry. +func (s *Server) setRouteInfo(acc *Account) { + // If there is a dedicated route configured for this account + if _, ok := s.accRoutes[acc.Name]; ok { + // We want the account name to be in the map, but we don't + // need a value (we could store empty string) + s.accRouteByHash.Store(acc.Name, nil) + // Set the route pool index to -1 so that it is easy when + // ranging over accounts to exclude those accounts when + // trying to get accounts for a given pool index. + acc.routePoolIdx = accDedicatedRoute + } else { + // If pool size more than 1, we will compute a hash code and + // use modulo to assign to an index of the pool slice. For 1 + // and below, all accounts will be bound to the single connection + // at index 0. + acc.routePoolIdx = s.computeRoutePoolIdx(acc) + if s.routesPoolSize > 1 { + s.accRouteByHash.Store(acc.Name, acc.routePoolIdx) + } + } +} + +// Returns a route pool index for this account based on the given pool size. +// Account lock is held on entry (account's name is accessed but immutable +// so could be called without account's lock). +// Server lock held on entry. +func (s *Server) computeRoutePoolIdx(acc *Account) int { + if s.routesPoolSize <= 1 { + return 0 + } + h := fnv.New32a() + h.Write([]byte(acc.Name)) + sum32 := h.Sum32() + return int((sum32 % uint32(s.routesPoolSize))) +} + // lookupAccount is a function to return the account structure // associated with an account name. // Lock MUST NOT be held upon entry. 
@@ -1957,9 +2032,11 @@ func (s *Server) Shutdown() { } s.grMu.Unlock() // Copy off the routes - for i, r := range s.routes { - conns[i] = r - } + s.forEachRoute(func(r *client) { + r.mu.Lock() + conns[r.cid] = r + r.mu.Unlock() + }) // Copy off the gateways s.getAllGatewayConnections(conns) @@ -2723,6 +2800,7 @@ func (s *Server) saveClosedClient(c *client, nc net.Conn, reason ClosedState) { // Adds to the list of client and websocket clients connect URLs. // If there was a change, an INFO protocol is sent to registered clients // that support async INFO protocols. +// Server lock held on entry. func (s *Server) addConnectURLsAndSendINFOToClients(curls, wsurls []string) { s.updateServerINFOAndSendINFOToClients(curls, wsurls, true) } @@ -2730,16 +2808,15 @@ func (s *Server) addConnectURLsAndSendINFOToClients(curls, wsurls []string) { // Removes from the list of client and websocket clients connect URLs. // If there was a change, an INFO protocol is sent to registered clients // that support async INFO protocols. +// Server lock held on entry. func (s *Server) removeConnectURLsAndSendINFOToClients(curls, wsurls []string) { s.updateServerINFOAndSendINFOToClients(curls, wsurls, false) } // Updates the list of client and websocket clients connect URLs and if any change // sends an async INFO update to clients that support it. +// Server lock held on entry. func (s *Server) updateServerINFOAndSendINFOToClients(curls, wsurls []string, add bool) { - s.mu.Lock() - defer s.mu.Unlock() - remove := !add // Will return true if we need alter the server's Info object. updateMap := func(urls []string, m refCountedUrlSet) bool { @@ -2873,8 +2950,17 @@ func (s *Server) addToTempClients(cid uint64, c *client) bool { // NumRoutes will report the number of registered routes. func (s *Server) NumRoutes() int { s.mu.RLock() - nr := len(s.routes) - s.mu.RUnlock() + defer s.mu.RUnlock() + return s.numRoutes() +} + +// numRoutes will report the number of registered routes. 
+// Server lock held on entry +func (s *Server) numRoutes() int { + var nr int + s.forEachRoute(func(c *client) { + nr++ + }) return nr } @@ -2882,7 +2968,13 @@ func (s *Server) NumRoutes() int { func (s *Server) NumRemotes() int { s.mu.RLock() defer s.mu.RUnlock() - return len(s.remotes) + return s.numRemotes() +} + +// numRemotes will report number of registered remotes. +// Server lock held on entry +func (s *Server) numRemotes() int { + return len(s.routes) } // NumLeafNodes will report number of leaf node connections. @@ -3558,12 +3650,12 @@ func (s *Server) lameDuckMode() { // Server lock is held on entry. func (s *Server) sendLDMToRoutes() { s.routeInfo.LameDuckMode = true - s.generateRouteInfoJSON() - for _, r := range s.routes { + infoJSON := generateInfoJSON(&s.routeInfo) + s.forEachRemote(func(r *client) { r.mu.Lock() - r.enqueueProto(s.routeInfoJSON) + r.enqueueProto(infoJSON) r.mu.Unlock() - } + }) // Clear now so that we notify only once, should we have to send other INFOs. 
s.routeInfo.LameDuckMode = false } diff --git a/server/server_test.go b/server/server_test.go index f394fe58..3f181a78 100644 --- a/server/server_test.go +++ b/server/server_test.go @@ -419,7 +419,7 @@ func TestClientAdvertiseInCluster(t *testing.T) { checkURLs := func(expected string) { t.Helper() - checkFor(t, time.Second, 15*time.Millisecond, func() error { + checkFor(t, 2*time.Second, 15*time.Millisecond, func() error { srvs := nc.DiscoveredServers() for _, u := range srvs { if u == expected { @@ -441,7 +441,7 @@ func TestClientAdvertiseInCluster(t *testing.T) { checkURLs("nats://srvBC:4222") srvB.Shutdown() - checkNumRoutes(t, srvA, 1) + checkNumRoutes(t, srvA, DEFAULT_ROUTE_POOL_SIZE+1) checkURLs("nats://srvBC:4222") } @@ -691,7 +691,8 @@ func TestCustomRouterAuthentication(t *testing.T) { s3 := RunServer(opts3) defer s3.Shutdown() checkClusterFormed(t, s, s3) - checkNumRoutes(t, s3, 1) + // Default pool size + 1 for system account + checkNumRoutes(t, s3, DEFAULT_ROUTE_POOL_SIZE+1) } func TestMonitoringNoTimeout(t *testing.T) { diff --git a/server/split_test.go b/server/split_test.go index 571e9ff7..88177e7b 100644 --- a/server/split_test.go +++ b/server/split_test.go @@ -364,7 +364,7 @@ func TestSplitDanglingArgBuf(t *testing.T) { } // MSG (the client has to be a ROUTE) - c = &client{msubs: -1, mpay: -1, subs: make(map[string]*subscription), kind: ROUTER} + c = &client{msubs: -1, mpay: -1, subs: make(map[string]*subscription), kind: ROUTER, route: &route{}} msgop := []byte("RMSG $foo foo 5\r\nhello\r\n") c.parse(msgop[:5]) c.parse(msgop[5:10]) @@ -420,6 +420,7 @@ func TestSplitRoutedMsgArg(t *testing.T) { defer c.close() // Allow parser to process RMSG c.kind = ROUTER + c.route = &route{} b := make([]byte, 1024) @@ -445,7 +446,7 @@ func TestSplitRoutedMsgArg(t *testing.T) { } func TestSplitBufferMsgOp(t *testing.T) { - c := &client{msubs: -1, mpay: -1, mcl: 1024, subs: make(map[string]*subscription), kind: ROUTER} + c := &client{msubs: -1, mpay: -1, mcl: 
1024, subs: make(map[string]*subscription), kind: ROUTER, route: &route{}} msg := []byte("RMSG $G foo.bar _INBOX.22 11\r\nhello world\r") msg1 := msg[:2] msg2 := msg[2:9] diff --git a/server/sublist.go b/server/sublist.go index a77f9f98..1a1f353f 100644 --- a/server/sublist.go +++ b/server/sublist.go @@ -820,9 +820,13 @@ func (s *Sublist) RemoveBatch(subs []*subscription) error { // Turn off our cache if enabled. wasEnabled := s.cache != nil s.cache = nil + // We will try to remove all subscriptions but will report the first that caused + // an error. In other words, we don't bail out at the first error which would + // possibly leave a bunch of subscriptions that could have been removed. + var err error for _, sub := range subs { - if err := s.remove(sub, false, false); err != nil { - return err + if lerr := s.remove(sub, false, false); lerr != nil && err == nil { + err = lerr } } // Turn caching back on here. @@ -830,7 +834,7 @@ func (s *Sublist) RemoveBatch(subs []*subscription) error { if wasEnabled { s.cache = make(map[string]*SublistResult) } - return nil + return err } // pruneNode is used to prune an empty node from the tree. diff --git a/server/util.go b/server/util.go index c7f5827e..aea3dcf1 100644 --- a/server/util.go +++ b/server/util.go @@ -14,7 +14,9 @@ package server import ( + "bytes" "context" + "encoding/json" "errors" "fmt" "math" @@ -334,3 +336,10 @@ func copyStrings(src []string) []string { copy(dst, src) return dst } + +// Returns a byte slice for the INFO protocol. 
+func generateInfoJSON(info *Info) []byte { + b, _ := json.Marshal(info) + pcs := [][]byte{[]byte("INFO"), b, []byte(CR_LF)} + return bytes.Join(pcs, []byte(" ")) +} diff --git a/test/cluster_tls_test.go b/test/cluster_tls_test.go index b5ab06a1..c968d50a 100644 --- a/test/cluster_tls_test.go +++ b/test/cluster_tls_test.go @@ -103,6 +103,7 @@ func TestClusterTLSInsecure(t *testing.T) { cluster { name: "xyz" listen: "127.0.0.1:-1" + pool_size: -1 tls { cert_file: "./configs/certs/server-noip.pem" key_file: "./configs/certs/server-key-noip.pem" @@ -122,6 +123,7 @@ func TestClusterTLSInsecure(t *testing.T) { cluster { name: "xyz" listen: "127.0.0.1:-1" + pool_size: -1 tls { cert_file: "./configs/certs/server-noip.pem" key_file: "./configs/certs/server-key-noip.pem" diff --git a/test/norace_test.go b/test/norace_test.go index 2cdf8b53..ffaf45c1 100644 --- a/test/norace_test.go +++ b/test/norace_test.go @@ -46,6 +46,7 @@ func TestNoRaceRouteSendSubs(t *testing.T) { write_deadline: "2s" cluster { port: -1 + pool_size: -1 %s } no_sys_acc: true diff --git a/test/ocsp_test.go b/test/ocsp_test.go index c1d2a542..0205342a 100644 --- a/test/ocsp_test.go +++ b/test/ocsp_test.go @@ -1012,6 +1012,7 @@ func TestOCSPCluster(t *testing.T) { host: "127.0.0.1" advertise: 127.0.0.1 port: -1 + pool_size: -1 tls { cert_file: "configs/certs/ocsp/server-status-request-url-02-cert.pem" @@ -1045,6 +1046,7 @@ func TestOCSPCluster(t *testing.T) { host: "127.0.0.1" advertise: 127.0.0.1 port: -1 + pool_size: -1 routes: [ nats://127.0.0.1:%d ] connect_retries: 30 @@ -1111,6 +1113,7 @@ func TestOCSPCluster(t *testing.T) { host: "127.0.0.1" advertise: 127.0.0.1 port: -1 + pool_size: -1 routes: [ nats://127.0.0.1:%d ] connect_retries: 30 @@ -1202,6 +1205,7 @@ func TestOCSPCluster(t *testing.T) { store_dir: '%s' cluster { port: -1 + pool_size: -1 name: AB host: "127.0.0.1" advertise: 127.0.0.1 diff --git a/test/operator_test.go b/test/operator_test.go index 5fbc4947..0afca441 100644 --- 
a/test/operator_test.go +++ b/test/operator_test.go @@ -341,6 +341,7 @@ func TestReloadDoesNotWipeAccountsWithOperatorMode(t *testing.T) { cluster { name: "A" listen: 127.0.0.1:-1 + pool_size: -1 authorization { timeout: 2.2 } %s @@ -463,6 +464,7 @@ func TestReloadDoesUpdateAccountsWithMemoryResolver(t *testing.T) { cluster { name: "A" listen: 127.0.0.1:-1 + pool_size: -1 authorization { timeout: 2.2 } %s @@ -554,6 +556,7 @@ func TestReloadFailsWithBadAccountsWithMemoryResolver(t *testing.T) { cluster { name: "A" listen: 127.0.0.1:-1 + pool_size: -1 authorization { timeout: 2.2 } %s diff --git a/test/test.go b/test/test.go index 5e7c444e..0a1eea6a 100644 --- a/test/test.go +++ b/test/test.go @@ -79,6 +79,8 @@ func RunServerCallback(opts *server.Options, callback func(*server.Server)) *ser opts.NoLog = !doLog opts.Trace = doTrace opts.Debug = doDebug + // For all tests in the "test" package, we will disable route pooling. + opts.Cluster.PoolSize = -1 s, err := server.NewServer(opts) if err != nil || s == nil { diff --git a/test/tls_test.go b/test/tls_test.go index 19fb800e..bc51f915 100644 --- a/test/tls_test.go +++ b/test/tls_test.go @@ -1921,6 +1921,7 @@ func TestTLSPinnedCertsRoute(t *testing.T) { port: -1 cluster { port: -1 + pool_size: -1 tls { ca_file: "configs/certs/ca.pem" cert_file: "configs/certs/server-cert.pem"