Files
nats-server/server/norace_test.go
Ivan Kozlovic 4bf81420e2 [FIXED] Fast routed JetStream API requests were dropped
If a JS API request is received from a non-client connection, it
is processed in its own goroutine. To reduce the number of
such goroutines, we were limiting the number of outstanding routines
to 4096. However, in some situations, it was possible to issue
many requests at the same time, which would then cause some of those
requests to be dropped.

(An example was an MQTT benchmark tool that would create 5000
sessions, each with one QoS1 R1 consumer (with the use of consumer_replicas=1).
On abrupt exit of the tool, the consumers and their sessions needed
to be deleted. This would cause fast incoming delete consumer requests,
which would cause the original code to drop some of them.)

Signed-off-by: Ivan Kozlovic <ivan@synadia.com>
2022-05-23 11:15:55 -06:00

5308 lines
140 KiB
Go

// Copyright 2018-2022 The NATS Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !race && !skip_no_race_tests
// +build !race,!skip_no_race_tests
package server
import (
"bufio"
"bytes"
"compress/gzip"
"context"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"math/rand"
"net"
"net/http"
"net/url"
"path/filepath"
"runtime"
"runtime/debug"
"strconv"
"strings"
"sync"
"sync/atomic"
"testing"
"time"
"crypto/hmac"
crand "crypto/rand"
"crypto/sha256"
"github.com/klauspost/compress/s2"
"github.com/nats-io/jwt/v2"
"github.com/nats-io/nats.go"
"github.com/nats-io/nkeys"
"golang.org/x/time/rate"
)
// IMPORTANT: Tests in this file are not executed when running with the -race flag.
// The test name should be prefixed with TestNoRace so we can run only
// those tests: go test -run=TestNoRace ...
// TestNoRaceAvoidSlowConsumerBigMessages publishes 500 messages of 1MB each
// from one connection to a subscriber on another connection, against a server
// running with default options. The test fails if the subscriber has not
// received all messages within 10 seconds, reporting separately the case where
// the server has flagged a slow consumer.
func TestNoRaceAvoidSlowConsumerBigMessages(t *testing.T) {
	opts := DefaultOptions() // Use defaults to make sure they avoid pending slow consumer.
	opts.NoSystemAccount = true
	s := RunServer(opts)
	defer s.Shutdown()

	nc1, err := nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port))
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer nc1.Close()

	nc2, err := nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port))
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer nc2.Close()

	data := make([]byte, 1024*1024) // 1MB payload
	rand.Read(data)

	expected := int32(500)
	received := int32(0)
	// Buffered so that the subscription callback can never block on the
	// send, even if the test has already timed out and stopped listening.
	done := make(chan bool, 1)

	// Create Subscription.
	_, err = nc1.Subscribe("slow.consumer", func(m *nats.Msg) {
		// Just eat it so that we are not measuring
		// code time, just delivery.
		// Use the value returned by the atomic add instead of re-reading
		// the counter without synchronization, and signal exactly once.
		if atomic.AddInt32(&received, 1) == expected {
			done <- true
		}
	})
	if err != nil {
		t.Fatalf("Error on subscribe: %v", err)
	}

	// Create Error handler. It is invoked from a goroutine owned by the
	// client library, so it must not call t.Fatalf (FailNow is only valid
	// from the goroutine running the test); t.Errorf marks the test failed.
	nc1.SetErrorHandler(func(_ *nats.Conn, _ *nats.Subscription, err error) {
		t.Errorf("Received an error on the subscription's connection: %v\n", err)
	})

	nc1.Flush()

	for i := 0; i < int(expected); i++ {
		nc2.Publish("slow.consumer", data)
	}
	nc2.Flush()

	select {
	case <-done:
		return
	case <-time.After(10 * time.Second):
		r := atomic.LoadInt32(&received)
		if s.NumSlowConsumers() > 0 {
			t.Fatalf("Did not receive all large messages due to slow consumer status: %d of %d", r, expected)
		}
		t.Fatalf("Failed to receive all large messages: %d of %d\n", r, expected)
	}
}
// TestNoRaceRoutedQueueAutoUnsubscribe starts two routed servers, creates
// queue subscriptions with an auto-unsubscribe limit of 1 in two queue groups
// ("bar" and "baz") on both servers, then publishes as many messages as there
// are members in each group. It verifies that each group's callbacks are
// invoked exactly once per published message.
func TestNoRaceRoutedQueueAutoUnsubscribe(t *testing.T) {
	const (
		// Number of queue subscriptions created per connection and per group.
		subsPerGroup = 100
		// Subscriptions each server is expected to report once everything is
		// set up: the local queue subs of both groups (2*subsPerGroup), one
		// coalesced remote entry per queue group (2), and presumably the
		// TEST.COMPLETE subscription seen both locally and remotely (2).
		expectedSubs = 2*subsPerGroup + 4
	)

	optsA, err := ProcessConfigFile("./configs/seed.conf")
	require_NoError(t, err)
	optsA.NoSigs, optsA.NoLog = true, true
	optsA.NoSystemAccount = true
	srvA := RunServer(optsA)
	defer srvA.Shutdown()

	srvARouteURL := fmt.Sprintf("nats://%s:%d", optsA.Cluster.Host, srvA.ClusterAddr().Port)
	optsB := nextServerOpts(optsA)
	optsB.Routes = RoutesFromStr(srvARouteURL)
	srvB := RunServer(optsB)
	defer srvB.Shutdown()

	// Wait for these 2 to connect to each other
	checkClusterFormed(t, srvA, srvB)

	// Have a client connection to each server
	ncA, err := nats.Connect(fmt.Sprintf("nats://%s:%d", optsA.Host, optsA.Port))
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer ncA.Close()

	ncB, err := nats.Connect(fmt.Sprintf("nats://%s:%d", optsB.Host, optsB.Port))
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer ncB.Close()

	rbar := int32(0)
	barCb := func(m *nats.Msg) {
		atomic.AddInt32(&rbar, 1)
	}
	rbaz := int32(0)
	bazCb := func(m *nats.Msg) {
		atomic.AddInt32(&rbaz, 1)
	}

	// Create subsPerGroup queue subs with auto-unsubscribe on each server
	// for group bar and group baz. So 2*subsPerGroup total per queue group.
	cons := []*nats.Conn{ncA, ncB}
	for _, c := range cons {
		for i := 0; i < subsPerGroup; i++ {
			qsub, err := c.QueueSubscribe("foo", "bar", barCb)
			if err != nil {
				t.Fatalf("Error on subscribe: %v", err)
			}
			if err := qsub.AutoUnsubscribe(1); err != nil {
				t.Fatalf("Error on auto-unsubscribe: %v", err)
			}
			qsub, err = c.QueueSubscribe("foo", "baz", bazCb)
			if err != nil {
				t.Fatalf("Error on subscribe: %v", err)
			}
			if err := qsub.AutoUnsubscribe(1); err != nil {
				t.Fatalf("Error on auto-unsubscribe: %v", err)
			}
		}
		// One extra plain subscription per connection; waiting for it to
		// be accounted for below ensures that everything is set up.
		if _, err := c.Subscribe("TEST.COMPLETE", func(m *nats.Msg) {}); err != nil {
			t.Fatalf("Error on subscribe: %v", err)
		}
	}

	// Remote queue subscriptions are coalesced, so wait until each server
	// reports exactly expectedSubs subscriptions before publishing.
	checkFor(t, 10*time.Second, 100*time.Millisecond, func() error {
		subsA := srvA.NumSubscriptions()
		subsB := srvB.NumSubscriptions()
		if subsA != expectedSubs || subsB != expectedSubs {
			return fmt.Errorf("Not all subs processed yet: %d and %d", subsA, subsB)
		}
		return nil
	})

	// One message per member of each queue group.
	expected := int32(2 * subsPerGroup)
	// Now send messages from each server
	for i := int32(0); i < expected; i++ {
		c := cons[i%2]
		c.Publish("foo", []byte("Don't Drop Me!"))
	}
	for _, c := range cons {
		c.Flush()
	}

	checkFor(t, 10*time.Second, 100*time.Millisecond, func() error {
		nbar := atomic.LoadInt32(&rbar)
		nbaz := atomic.LoadInt32(&rbaz)
		if nbar == expected && nbaz == expected {
			return nil
		}
		// Report the same values that were just tested.
		return fmt.Errorf("Did not receive all %d queue messages, received %d for 'bar' and %d for 'baz'",
			expected, nbar, nbaz)
	})
}
// TestNoRaceClosedSlowConsumerWriteDeadline subscribes a raw TCP client that
// never reads, floods its subject with 100 1MB messages, and expects the
// server to record exactly one closed connection whose reason is
// SlowConsumerWriteDeadline.
func TestNoRaceClosedSlowConsumerWriteDeadline(t *testing.T) {
	const numMsgs = 100

	o := DefaultOptions()
	o.NoSystemAccount = true
	// The write deadline is made very small so that it trips, while the
	// pending limit is set high so that it does not trip here.
	o.WriteDeadline = 10 * time.Millisecond
	o.MaxPending = 500 * 1024 * 1024
	srv := RunServer(o)
	defer srv.Shutdown()

	hostPort := fmt.Sprintf("%s:%d", o.Host, o.Port)

	raw, err := net.DialTimeout("tcp", hostPort, 3*time.Second)
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer raw.Close()

	_, err = raw.Write([]byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n"))
	if err != nil {
		t.Fatalf("Error sending protocols to server: %v", err)
	}
	// A small socket read buffer makes it more likely that data destined
	// for the subscribed client backs up in the server.
	raw.(*net.TCPConn).SetReadBuffer(128)

	pub, err := nats.Connect("nats://" + hostPort)
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer pub.Close()

	body := make([]byte, 1024*1024)
	for sent := 0; sent < numMsgs; sent++ {
		err := pub.Publish("foo", body)
		if err != nil {
			t.Fatalf("Error on publish: %v", err)
		}
	}
	// Make sure everything published has actually been sent.
	err = pub.Flush()
	if err != nil {
		t.Fatalf("Error on flush: %v", err)
	}

	// By now the server should have closed the raw connection.
	checkClosedConns(t, srv, 1, 2*time.Second)
	closed := srv.closedClients()
	if n := len(closed); n != 1 {
		t.Fatalf("len(conns) expected to be %d, got %d\n", 1, n)
	}
	checkReason(t, closed[0].Reason, SlowConsumerWriteDeadline)
}
// TestNoRaceClosedSlowConsumerPendingBytes subscribes a raw TCP client that
// never reads, floods its subject with 100 1MB messages, and expects the
// server to record exactly one closed connection whose reason is
// SlowConsumerPendingBytes.
func TestNoRaceClosedSlowConsumerPendingBytes(t *testing.T) {
	const (
		numMsgs     = 100
		payloadSize = 1024 * 1024
	)

	o := DefaultOptions()
	o.NoSystemAccount = true
	// A long write deadline keeps it from triggering the slow consumer,
	// while a low (1MB) pending limit allows the slow consumer to trip.
	o.WriteDeadline = 30 * time.Second
	o.MaxPending = 1 * 1024 * 1024
	srv := RunServer(o)
	defer srv.Shutdown()

	hostPort := fmt.Sprintf("%s:%d", o.Host, o.Port)

	// Subscriber side: a bare TCP connection speaking the protocol by hand.
	subConn, err := net.DialTimeout("tcp", hostPort, 3*time.Second)
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer subConn.Close()

	proto := []byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n")
	if _, werr := subConn.Write(proto); werr != nil {
		t.Fatalf("Error sending protocols to server: %v", werr)
	}
	// Shrink the socket read buffer so that data for this client is more
	// likely to back up inside the server.
	subConn.(*net.TCPConn).SetReadBuffer(128)

	// Publisher side.
	pubConn, err := nats.Connect("nats://" + hostPort)
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer pubConn.Close()

	msg := make([]byte, payloadSize)
	for n := 0; n < numMsgs; n++ {
		if perr := pubConn.Publish("foo", msg); perr != nil {
			t.Fatalf("Error on publish: %v", perr)
		}
	}
	// Flush so that all published data has been sent.
	if ferr := pubConn.Flush(); ferr != nil {
		t.Fatalf("Error on flush: %v", ferr)
	}

	// The server is expected to have closed the subscriber connection.
	checkClosedConns(t, srv, 1, 2*time.Second)
	closed := srv.closedClients()
	if count := len(closed); count != 1 {
		t.Fatalf("len(conns) expected to be %d, got %d\n", 1, count)
	}
	checkReason(t, closed[0].Reason, SlowConsumerPendingBytes)
}
// TestNoRaceSlowConsumerPendingBytes subscribes a raw TCP client that never
// reads, floods its subject with 100 1MB messages while the server's pending
// limit is 1MB, and then expects a write on the raw connection to eventually
// fail, showing that the connection was closed.
func TestNoRaceSlowConsumerPendingBytes(t *testing.T) {
	o := DefaultOptions()
	o.NoSystemAccount = true
	// Long write deadline so that it is not what triggers the slow
	// consumer; low (1MB) pending limit so that the slow consumer trips.
	o.WriteDeadline = 30 * time.Second
	o.MaxPending = 1 * 1024 * 1024
	srv := RunServer(o)
	defer srv.Shutdown()

	hostPort := fmt.Sprintf("%s:%d", o.Host, o.Port)

	raw, err := net.DialTimeout("tcp", hostPort, 3*time.Second)
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer raw.Close()

	_, err = raw.Write([]byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n"))
	if err != nil {
		t.Fatalf("Error sending protocols to server: %v", err)
	}
	// With a tiny socket read buffer, data destined for this client is
	// more likely to back up in the server.
	raw.(*net.TCPConn).SetReadBuffer(128)

	pub, err := nats.Connect("nats://" + hostPort)
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer pub.Close()

	body := make([]byte, 1024*1024)
	for sent := 0; sent < 100; sent++ {
		err := pub.Publish("foo", body)
		if err != nil {
			t.Fatalf("Error on publish: %v", err)
		}
	}
	// Ensure that all of the published data has been sent.
	err = pub.Flush()
	if err != nil {
		t.Fatalf("Error on flush: %v", err)
	}

	// The server should have closed the raw connection by now. On some
	// platforms more than one write is needed before the error shows up.
	probe := []byte("PUB bar 5\r\nhello\r\n")
	for attempt := 0; attempt < 100; attempt++ {
		_, werr := raw.Write(probe)
		if werr != nil {
			// The write failed, which is what we were waiting for.
			return
		}
	}
	t.Fatal("Connection should have been closed")
}
// TestNoRaceGatewayNoMissingReplies builds two clusters, A (A1, A2) and
// B (B1, B2), connected through gateways, and checks that a requestor on B1
// receives a reply for every request it sends to a responder connected first
// to A1 and then to A2. Each request uses its own queue subscription on an
// explicit reply subject. At the end, the plain subscription on A1 must have
// seen all the replies and the queue subscription on A2 must have seen none.
func TestNoRaceGatewayNoMissingReplies(t *testing.T) {
// This test will have following setup:
//
// responder1                 requestor
//    |                          |
//    v                          v
//   [A1]<-------gw------------[B1]
//    |  \                      |
//    |   \______gw__________   | route
//    |                     _\| |
//   [  ]--------gw----------->[  ]
//   [A2]<-------gw------------[B2]
//   [  ]                      [  ]
//    ^
//    |
// responder2
//
// There is a possible race that when the requestor creates
// a subscription on the reply subject, the subject interest
// being sent from the inbound gateway, and B1 having none,
// the SUB first goes to B2 before being sent to A1 from
// B2's inbound GW. But the request can go from B1 to A1
// right away and the responder1 connecting to A1 may send
// back the reply before the interest on the reply makes it
// to A1 (from B2).
// This test will also verify that if the responder is instead
// connected to A2, the reply is properly received by requestor
// on B1.
// For this test we want to be in interestOnly mode, so
// make it happen quickly
gatewayMaxRUnsubBeforeSwitch = 1
// Restore the package-level default when the test ends.
defer func() { gatewayMaxRUnsubBeforeSwitch = defaultGatewayMaxRUnsubBeforeSwitch }()
// Start with setting up A2 and B2.
ob2 := testDefaultOptionsForGateway("B")
sb2 := runGatewayServer(ob2)
defer sb2.Shutdown()
oa2 := testGatewayOptionsFromToWithServers(t, "A", "B", sb2)
sa2 := runGatewayServer(oa2)
defer sa2.Shutdown()
waitForOutboundGateways(t, sa2, 1, time.Second)
waitForInboundGateways(t, sa2, 1, time.Second)
waitForOutboundGateways(t, sb2, 1, time.Second)
waitForInboundGateways(t, sb2, 1, time.Second)
// Now start A1 which will connect to B2
oa1 := testGatewayOptionsFromToWithServers(t, "A", "B", sb2)
oa1.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa2.Cluster.Host, oa2.Cluster.Port))
sa1 := runGatewayServer(oa1)
defer sa1.Shutdown()
waitForOutboundGateways(t, sa1, 1, time.Second)
// B2 now has inbound gateway connections from both A2 and A1.
waitForInboundGateways(t, sb2, 2, time.Second)
checkClusterFormed(t, sa1, sa2)
// Finally, start B1 that will connect to A1.
ob1 := testGatewayOptionsFromToWithServers(t, "B", "A", sa1)
ob1.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", ob2.Cluster.Host, ob2.Cluster.Port))
sb1 := runGatewayServer(ob1)
defer sb1.Shutdown()
// Check that we have the outbound gateway from B1 to A1
checkFor(t, 3*time.Second, 15*time.Millisecond, func() error {
c := sb1.getOutboundGatewayConnection("A")
if c == nil {
return fmt.Errorf("Outbound connection to A not created yet")
}
c.mu.Lock()
name := c.opts.Name
nc := c.nc
c.mu.Unlock()
// The outbound connection must be to A1 specifically. If it is to
// another server, close it and let checkFor retry.
if name != sa1.ID() {
// Force a disconnect
nc.Close()
return fmt.Errorf("Was unable to have B1 connect to A1")
}
return nil
})
waitForInboundGateways(t, sa1, 1, time.Second)
checkClusterFormed(t, sb1, sb2)
a1URL := fmt.Sprintf("nats://%s:%d", oa1.Host, oa1.Port)
a2URL := fmt.Sprintf("nats://%s:%d", oa2.Host, oa2.Port)
b1URL := fmt.Sprintf("nats://%s:%d", ob1.Host, ob1.Port)
b2URL := fmt.Sprintf("nats://%s:%d", ob2.Host, ob2.Port)
ncb1 := natsConnect(t, b1URL)
defer ncb1.Close()
ncb2 := natsConnect(t, b2URL)
defer ncb2.Close()
natsSubSync(t, ncb1, "just.a.sub")
natsSubSync(t, ncb2, "just.a.sub")
checkExpectedSubs(t, 2, sb1, sb2)
// For this test, we want A to be checking B's interest in order
// to send messages (which would cause replies to be dropped if
// there is no interest registered on A). So from A servers,
// send to various subjects and cause B's to switch to interestOnly
// mode.
nca1 := natsConnect(t, a1URL)
defer nca1.Close()
for i := 0; i < 10; i++ {
natsPub(t, nca1, fmt.Sprintf("reject.%d", i), []byte("hello"))
}
nca2 := natsConnect(t, a2URL)
defer nca2.Close()
for i := 0; i < 10; i++ {
natsPub(t, nca2, fmt.Sprintf("reject.%d", i), []byte("hello"))
}
// checkSwitchedMode waits (up to 2 seconds) until the entry for the global
// account in s's outbound gateway to "B" has a nil `ni` and its mode is
// InterestOnly.
checkSwitchedMode := func(t *testing.T, s *Server) {
t.Helper()
checkFor(t, 2*time.Second, 15*time.Millisecond, func() error {
var switchedMode bool
c := s.getOutboundGatewayConnection("B")
ei, _ := c.gw.outsim.Load(globalAccountName)
if ei != nil {
e := ei.(*outsie)
e.RLock()
switchedMode = e.ni == nil && e.mode == InterestOnly
e.RUnlock()
}
if !switchedMode {
return fmt.Errorf("Still not switched mode")
}
return nil
})
}
checkSwitchedMode(t, sa1)
checkSwitchedMode(t, sa2)
// Setup a plain subscriber on A1 and a queue subscriber (group "bar")
// on A2, both on myreply.>, counting what each of them receives.
total := 1000
expected := int32(total)
rcvOnA := int32(0)
qrcvOnA := int32(0)
natsSub(t, nca1, "myreply.>", func(_ *nats.Msg) {
atomic.AddInt32(&rcvOnA, 1)
})
natsQueueSub(t, nca2, "myreply.>", "bar", func(_ *nats.Msg) {
atomic.AddInt32(&qrcvOnA, 1)
})
checkExpectedSubs(t, 2, sa1, sa2)
// Ok.. so now we will run the actual test where we
// create a responder on A1 and make sure that every
// single request from B1 gets the reply. Will repeat
// test with responder connected to A2.
// sendReqs subscribes a responder on "foo" using subConn, sends total/2
// requests from B1 and fails the test if any reply is not received within
// one second. The responder is removed before returning.
sendReqs := func(t *testing.T, subConn *nats.Conn) {
t.Helper()
responder := natsSub(t, subConn, "foo", func(m *nats.Msg) {
m.Respond([]byte("reply"))
})
natsFlush(t, subConn)
checkExpectedSubs(t, 3, sa1, sa2)
// We are not going to use Request() because this sets
// a wildcard subscription on an INBOX and less likely
// to produce the race. Instead we will explicitly set
// the subscription on the reply subject and create one
// per request.
for i := 0; i < total/2; i++ {
reply := fmt.Sprintf("myreply.%d", i)
replySub := natsQueueSubSync(t, ncb1, reply, "bar")
natsFlush(t, ncb1)
// Let's make sure we have interest on B2.
// Check once directly, and only fall back to polling with checkFor
// if the interest is not there yet.
if r := sb2.globalAccount().sl.Match(reply); len(r.qsubs) == 0 {
checkFor(t, time.Second, time.Millisecond, func() error {
if r := sb2.globalAccount().sl.Match(reply); len(r.qsubs) == 0 {
return fmt.Errorf("B still not registered interest on %s", reply)
}
return nil
})
}
natsPubReq(t, ncb1, "foo", reply, []byte("request"))
if _, err := replySub.NextMsg(time.Second); err != nil {
t.Fatalf("Did not receive reply: %v", err)
}
natsUnsub(t, replySub)
}
responder.Unsubscribe()
natsFlush(t, subConn)
checkExpectedSubs(t, 2, sa1, sa2)
}
sendReqs(t, nca1)
sendReqs(t, nca2)
// The two sendReqs calls send total/2 requests each, so the plain
// subscription on A1 is expected to have counted `total` replies.
checkFor(t, time.Second, 15*time.Millisecond, func() error {
if n := atomic.LoadInt32(&rcvOnA); n != expected {
return fmt.Errorf("Subs on A expected to get %v replies, got %v", expected, n)
}
return nil
})
// We should not have received a single message on the queue sub
// on cluster A because messages will have been delivered to
// the member on cluster B.
if n := atomic.LoadInt32(&qrcvOnA); n != 0 {
t.Fatalf("Queue sub on A should not have received message, got %v", n)
}
}
// TestNoRaceRouteMemUsage runs 100 short-lived requestors on server B against
// 10 echo subscriptions on server A, each request carrying a 50KB payload
// across the route, and fails if the heap in use afterwards (once GC has run
// and memory has been released) is more than 3 times what it was before.
func TestNoRaceRouteMemUsage(t *testing.T) {
	const (
		numResponderSubs = 10
		numRequestors    = 100
	)

	optsA := DefaultOptions()
	srvA := RunServer(optsA)
	defer srvA.Shutdown()

	optsB := DefaultOptions()
	optsB.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", optsA.Cluster.Host, optsA.Cluster.Port))
	srvB := RunServer(optsB)
	defer srvB.Shutdown()

	checkClusterFormed(t, srvA, srvB)

	// All responder subscriptions live on server A and echo the request data.
	responder := natsConnect(t, fmt.Sprintf("nats://%s:%d", optsA.Host, optsA.Port))
	defer responder.Close()
	echo := func(m *nats.Msg) {
		m.Respond(m.Data)
	}
	for n := 0; n < numResponderSubs; n++ {
		natsSub(t, responder, "foo", echo)
	}
	natsFlush(t, responder)

	payload := make([]byte, 50*1024)
	bURL := fmt.Sprintf("nats://%s:%d", optsB.Host, optsB.Port)

	// Baseline heap usage before generating any traffic.
	mem := runtime.MemStats{}
	runtime.ReadMemStats(&mem)
	inUseBefore := mem.HeapInuse

	for n := 0; n < numRequestors; n++ {
		requestor := natsConnect(t, bURL)
		inbox := nats.NewInbox()
		sub := natsSubSync(t, requestor, inbox)
		natsPubReq(t, requestor, "foo", inbox, payload)
		// One reply is expected from each responder subscription.
		for got := 0; got < numResponderSubs; got++ {
			natsNexMsg(t, sub, time.Second)
		}
		requestor.Close()
	}

	runtime.GC()
	debug.FreeOSMemory()
	runtime.ReadMemStats(&mem)
	if inUseNow := mem.HeapInuse; inUseNow > 3*inUseBefore {
		t.Fatalf("Heap in-use before was %v, now %v: too high", inUseBefore, inUseNow)
	}
}
// TestNoRaceRouteCache checks the number of entries in a route's per-account
// cache (route.in.pacache) on server B while requests are sent through the
// route to a responder on server A. It runs once with plain subscriptions and
// once with queue subscriptions on the reply inboxes, and covers both explicit
// unsubscribes and the implicit ones caused by closing the connection.
func TestNoRaceRouteCache(t *testing.T) {
// Shrink the cache limits and the closed subs check interval for the
// duration of the test, and restore the defaults when done.
maxPerAccountCacheSize = 20
prunePerAccountCacheSize = 5
closedSubsCheckInterval = 250 * time.Millisecond
defer func() {
maxPerAccountCacheSize = defaultMaxPerAccountCacheSize
prunePerAccountCacheSize = defaultPrunePerAccountCacheSize
closedSubsCheckInterval = defaultClosedSubsCheckInterval
}()
for _, test := range []struct {
name string
useQueue bool
}{
{"plain_sub", false},
{"queue_sub", true},
} {
t.Run(test.name, func(t *testing.T) {
oa := DefaultOptions()
oa.NoSystemAccount = true
sa := RunServer(oa)
defer sa.Shutdown()
ob := DefaultOptions()
ob.NoSystemAccount = true
ob.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa.Cluster.Host, oa.Cluster.Port))
sb := RunServer(ob)
defer sb.Shutdown()
checkClusterFormed(t, sa, sb)
// The responder is connected to A and echoes back the request data.
responder := natsConnect(t, fmt.Sprintf("nats://%s:%d", oa.Host, oa.Port))
defer responder.Close()
natsSub(t, responder, "foo", func(m *nats.Msg) {
m.Respond(m.Data)
})
natsFlush(t, responder)
checkExpectedSubs(t, 1, sa)
checkExpectedSubs(t, 1, sb)
bURL := fmt.Sprintf("nats://%s:%d", ob.Host, ob.Port)
requestor := natsConnect(t, bURL)
defer requestor.Close()
// cb signals on ch that a reply was received. The send is non-blocking,
// so a signal is dropped if one is already pending.
ch := make(chan struct{}, 1)
cb := func(_ *nats.Msg) {
select {
case ch <- struct{}{}:
default:
}
}
// sendReqs sends count requests on nc, each with a new inbox subscription
// (a queue subscription when test.useQueue is set), and waits up to one
// second for each reply. When unsub is true, the inbox subscription is
// removed after its reply has been received.
sendReqs := func(t *testing.T, nc *nats.Conn, count int, unsub bool) {
t.Helper()
for i := 0; i < count; i++ {
inbox := nats.NewInbox()
var sub *nats.Subscription
if test.useQueue {
sub = natsQueueSub(t, nc, inbox, "queue", cb)
} else {
sub = natsSub(t, nc, inbox, cb)
}
natsPubReq(t, nc, "foo", inbox, []byte("hello"))
select {
case <-ch:
case <-time.After(time.Second):
t.Fatalf("Failed to get reply")
}
if unsub {
natsUnsub(t, sub)
}
}
}
sendReqs(t, requestor, maxPerAccountCacheSize+1, true)
// Grab a route connection from B; its cache is what gets inspected below.
var route *client
sb.mu.Lock()
for _, r := range sb.routes {
route = r
break
}
sb.mu.Unlock()
// checkExpected waits (up to 2 seconds) for the route's cache to contain
// exactly `expected` entries.
checkExpected := func(t *testing.T, expected int) {
t.Helper()
checkFor(t, 2*time.Second, 15*time.Millisecond, func() error {
route.mu.Lock()
n := len(route.in.pacache)
route.mu.Unlock()
if n != expected {
return fmt.Errorf("Expected %v subs in the cache, got %v", expected, n)
}
return nil
})
}
// maxPerAccountCacheSize+1 requests were sent; the expected cache size
// is that number minus prunePerAccountCacheSize+1.
checkExpected(t, (maxPerAccountCacheSize+1)-(prunePerAccountCacheSize+1))
// Wait for more than the orphan check
time.Sleep(2 * closedSubsCheckInterval)
// Add a new subs up to point where new prune would occur
sendReqs(t, requestor, prunePerAccountCacheSize+1, false)
// Now closed subs should have been removed, so expected
// subs in the cache should be the new ones.
checkExpected(t, prunePerAccountCacheSize+1)
// Now try with implicit unsubscribe (due to connection close)
sendReqs(t, requestor, maxPerAccountCacheSize+1, false)
requestor.Close()
checkExpected(t, maxPerAccountCacheSize-prunePerAccountCacheSize)
// Wait for more than the orphan check
time.Sleep(2 * closedSubsCheckInterval)
// Now create new connection and send prunePerAccountCacheSize+1
// and that should cause all subs from previous connection to be
// removed from cache
requestor = natsConnect(t, bURL)
defer requestor.Close()
sendReqs(t, requestor, prunePerAccountCacheSize+1, false)
checkExpected(t, prunePerAccountCacheSize+1)
})
}
}
// TestNoRaceFetchAccountDoesNotRegisterAccountTwice creates a user account
// known to two gateway-connected servers A and B, slows down the fetch of
// that account on B, and then publishes from A while a client connects and
// subscribes on B. It verifies that the subscriber on B ends up receiving
// messages and that neither server is left with entries in tmpAccounts.
func TestNoRaceFetchAccountDoesNotRegisterAccountTwice(t *testing.T) {
	sa, oa, sb, ob, _ := runTrustedGateways(t)
	defer sa.Shutdown()
	defer sb.Shutdown()

	// Let's create a user account.
	okp, err := nkeys.FromSeed(oSeed)
	if err != nil {
		t.Fatalf("Error creating operator key pair: %v", err)
	}
	akp, err := nkeys.CreateAccount()
	if err != nil {
		t.Fatalf("Error creating account key pair: %v", err)
	}
	pub, err := akp.PublicKey()
	if err != nil {
		t.Fatalf("Error getting account public key: %v", err)
	}
	nac := jwt.NewAccountClaims(pub)
	// Named accJWT so that the imported jwt package is not shadowed.
	accJWT, err := nac.Encode(okp)
	if err != nil {
		t.Fatalf("Error encoding account claims: %v", err)
	}
	userAcc := pub

	// Replace B's account resolver with one that introduces
	// delay during the Fetch()
	sac := &slowAccResolver{AccountResolver: sb.AccountResolver()}
	sb.SetAccountResolver(sac)

	// Add the account in sa and sb
	addAccountToMemResolver(sa, userAcc, accJWT)
	addAccountToMemResolver(sb, userAcc, accJWT)

	// Tell the slow account resolver which account to slow down
	sac.Lock()
	sac.acc = userAcc
	sac.Unlock()

	urlA := fmt.Sprintf("nats://%s:%d", oa.Host, oa.Port)
	urlB := fmt.Sprintf("nats://%s:%d", ob.Host, ob.Port)

	nca, err := nats.Connect(urlA, createUserCreds(t, sa, akp))
	if err != nil {
		t.Fatalf("Error connecting to A: %v", err)
	}
	defer nca.Close()

	// Since there is an optimistic send, this message will go to B
	// and on processing this message, B will lookup/fetch this
	// account, which can produce race with the fetch of this
	// account from A's system account that sent a notification
	// about this account, or with the client connect just after
	// that.
	nca.Publish("foo", []byte("hello"))

	// Now connect and create a subscription on B
	ncb, err := nats.Connect(urlB, createUserCreds(t, sb, akp))
	if err != nil {
		t.Fatalf("Error connecting to B: %v", err)
	}
	defer ncb.Close()
	sub, err := ncb.SubscribeSync("foo")
	if err != nil {
		t.Fatalf("Error on subscribe: %v", err)
	}
	ncb.Flush()

	// Now send messages from A and B should ultimately start to receive
	// them (once the subscription has been correctly registered)
	ok := false
	for i := 0; i < 10; i++ {
		nca.Publish("foo", []byte("hello"))
		if _, err := sub.NextMsg(100 * time.Millisecond); err != nil {
			continue
		}
		ok = true
		break
	}
	if !ok {
		t.Fatalf("B should be able to receive messages")
	}

	// checkTmpAccounts fails the test if s.tmpAccounts holds any entry.
	checkTmpAccounts := func(t *testing.T, s *Server) {
		t.Helper()
		empty := true
		s.tmpAccounts.Range(func(_, _ interface{}) bool {
			// Finding a single entry is enough, so stop iterating.
			empty = false
			return false
		})
		if !empty {
			t.Fatalf("tmpAccounts is not empty")
		}
	}
	checkTmpAccounts(t, sa)
	checkTmpAccounts(t, sb)
}
// TestNoRaceWriteDeadline subscribes a raw TCP client that never reads, sends
// 1000 messages of 1,000,000 bytes to its subject while the server's write
// deadline is 30ms, and then expects a write on the raw connection to
// eventually fail, showing that the connection was closed.
func TestNoRaceWriteDeadline(t *testing.T) {
	const (
		numMsgs     = 1000
		payloadSize = 1000000
		maxProbes   = 100
	)

	o := DefaultOptions()
	o.NoSystemAccount = true
	o.WriteDeadline = 30 * time.Millisecond
	srv := RunServer(o)
	defer srv.Shutdown()

	hostPort := fmt.Sprintf("%s:%d", o.Host, o.Port)

	subConn, err := net.DialTimeout("tcp", hostPort, 3*time.Second)
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer subConn.Close()

	proto := []byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n")
	if _, werr := subConn.Write(proto); werr != nil {
		t.Fatalf("Error sending protocols to server: %v", werr)
	}
	// A tiny socket read buffer makes write deadline errors more likely.
	subConn.(*net.TCPConn).SetReadBuffer(4)

	pubConn, err := nats.Connect("nats://" + hostPort)
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer pubConn.Close()

	msg := make([]byte, payloadSize)
	for n := 0; n < numMsgs; n++ {
		if perr := pubConn.Publish("foo", msg); perr != nil {
			t.Fatalf("Error on publish: %v", perr)
		}
	}
	// Flush so that all published data has been sent.
	if ferr := pubConn.Flush(); ferr != nil {
		t.Fatalf("Error on flush: %v", ferr)
	}

	// The server should have closed the subscriber connection by now. On
	// some platforms it takes more than one write to observe the error.
	probe := []byte("PUB bar 5\r\nhello\r\n")
	for attempt := 0; attempt < maxProbes; attempt++ {
		if _, werr := subConn.Write(probe); werr != nil {
			// The write failed, which is what we were waiting for.
			return
		}
	}
	t.Fatal("Connection should have been closed")
}
// TestNoRaceLeafNodeClusterNameConflictDeadlock starts a server accepting leaf
// node connections, then three routed servers (A1, A2, A3) that all create a
// leaf node connection to it. A1 and A2 are configured with cluster name
// "clusterA" while A3 has none. The test passes once all leaf nodes are
// connected and the three servers have formed a cluster.
func TestNoRaceLeafNodeClusterNameConflictDeadlock(t *testing.T) {
	hubOpts := DefaultOptions()
	hubOpts.LeafNode.Port = -1
	hub := RunServer(hubOpts)
	defer hub.Shutdown()

	leafURL, err := url.Parse(fmt.Sprintf("nats://127.0.0.1:%d", hubOpts.LeafNode.Port))
	if err != nil {
		t.Fatalf("Error parsing url: %v", err)
	}
	// Every server gets its own remote configuration pointing to the hub.
	newRemotes := func() []*RemoteLeafOpts {
		return []*RemoteLeafOpts{{URLs: []*url.URL{leafURL}}}
	}

	optsA1 := DefaultOptions()
	optsA1.ServerName = "A1"
	optsA1.Cluster.Name = "clusterA"
	optsA1.LeafNode.Remotes = newRemotes()
	srvA1 := RunServer(optsA1)
	defer srvA1.Shutdown()

	checkLeafNodeConnected(t, srvA1)

	// A2 and A3 both route to A1.
	routeToA1 := fmt.Sprintf("nats://127.0.0.1:%d", optsA1.Cluster.Port)

	optsA2 := DefaultOptions()
	optsA2.ServerName = "A2"
	optsA2.Cluster.Name = "clusterA"
	optsA2.Routes = RoutesFromStr(routeToA1)
	optsA2.LeafNode.Remotes = newRemotes()
	srvA2 := RunServer(optsA2)
	defer srvA2.Shutdown()

	checkLeafNodeConnected(t, srvA2)
	checkClusterFormed(t, srvA1, srvA2)

	optsA3 := DefaultOptions()
	optsA3.ServerName = "A3"
	optsA3.Cluster.Name = "" // intentionally not set
	optsA3.Routes = RoutesFromStr(routeToA1)
	optsA3.LeafNode.Remotes = newRemotes()
	srvA3 := RunServer(optsA3)
	defer srvA3.Shutdown()

	checkLeafNodeConnected(t, srvA3)
	checkClusterFormed(t, srvA1, srvA2, srvA3)
}
// This test is the same as TestAccountAddServiceImportRace, but when run
// without the -race flag it more easily captures the possible duplicate
// sid, which results in fewer than the expected number of subscriptions
// in the account's internal subscriptions map.
func TestNoRaceAccountAddServiceImportRace(t *testing.T) {
// Simply delegate to the regular test.
TestAccountAddServiceImportRace(t)
}
// Single server counterpart of the routed auto-unsubscribe test: creates 1000
// queue subscriptions with an auto-unsubscribe limit of 1 in each of two queue
// groups, publishes 1000 messages and makes sure that each group received all
// of them.
func TestNoRaceQueueAutoUnsubscribe(t *testing.T) {
	o := DefaultOptions()
	srv := RunServer(o)
	defer srv.Shutdown()

	nc, err := nats.Connect(fmt.Sprintf("nats://%s:%d", o.Host, o.Port))
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer nc.Close()

	// One delivery counter per queue group.
	var barCount, bazCount int32
	countInto := func(counter *int32) func(*nats.Msg) {
		return func(*nats.Msg) {
			atomic.AddInt32(counter, 1)
		}
	}
	groups := []struct {
		name string
		cb   func(*nats.Msg)
	}{
		{"bar", countInto(&barCount)},
		{"baz", countInto(&bazCount)},
	}

	// Create 1000 subscriptions with auto-unsubscribe of 1 in each of the
	// two groups, alternating between bar and baz.
	const total = 1000
	for n := 0; n < total; n++ {
		for _, g := range groups {
			qsub, err := nc.QueueSubscribe("foo", g.name, g.cb)
			if err != nil {
				t.Fatalf("Error on subscribe: %v", err)
			}
			if err := qsub.AutoUnsubscribe(1); err != nil {
				t.Fatalf("Error on auto-unsubscribe: %v", err)
			}
		}
	}
	nc.Flush()

	want := int32(total)
	for sent := int32(0); sent < want; sent++ {
		nc.Publish("foo", []byte("Don't Drop Me!"))
	}
	nc.Flush()

	checkFor(t, 5*time.Second, 10*time.Millisecond, func() error {
		gotBar := atomic.LoadInt32(&barCount)
		gotBaz := atomic.LoadInt32(&bazCount)
		if gotBar != want || gotBaz != want {
			return fmt.Errorf("Did not receive all %d queue messages, received %d for 'bar' and %d for 'baz'",
				want, atomic.LoadInt32(&barCount), atomic.LoadInt32(&bazCount))
		}
		return nil
	})
}
// TestNoRaceAcceptLoopsDoNotLeaveOpenedConn keeps dialing one of the server's
// listeners (client, route, gateway, leafnode or websocket) while the server
// is being shut down, and then verifies that none of the connections that
// were established has been left open by the server.
func TestNoRaceAcceptLoopsDoNotLeaveOpenedConn(t *testing.T) {
	// Reports whether err is a network timeout error.
	isTimeout := func(err error) bool {
		ne, ok := err.(net.Error)
		return ok && ne.Timeout()
	}

	type acceptCase struct {
		name string
		addr func(o *Options) (string, int)
	}
	cases := []acceptCase{
		{"client", func(o *Options) (string, int) { return o.Host, o.Port }},
		{"route", func(o *Options) (string, int) { return o.Cluster.Host, o.Cluster.Port }},
		{"gateway", func(o *Options) (string, int) { return o.Gateway.Host, o.Gateway.Port }},
		{"leafnode", func(o *Options) (string, int) { return o.LeafNode.Host, o.LeafNode.Port }},
		{"websocket", func(o *Options) (string, int) { return o.Websocket.Host, o.Websocket.Port }},
	}

	for _, test := range cases {
		t.Run(test.name, func(t *testing.T) {
			o := DefaultOptions()
			o.DisableShortFirstPing = true
			o.Accounts = []*Account{NewAccount("$SYS")}
			o.SystemAccount = "$SYS"
			o.Cluster.Name = "abc"
			o.Cluster.Host = "127.0.0.1"
			o.Cluster.Port = -1
			o.Gateway.Name = "abc"
			o.Gateway.Host = "127.0.0.1"
			o.Gateway.Port = -1
			o.LeafNode.Host = "127.0.0.1"
			o.LeafNode.Port = -1
			o.Websocket.Host = "127.0.0.1"
			o.Websocket.Port = -1
			o.Websocket.HandshakeTimeout = 1
			o.Websocket.NoTLS = true
			srv := RunServer(o)
			defer srv.Shutdown()

			host, port := test.addr(o)
			target := fmt.Sprintf("%s:%d", host, port)

			var (
				conns []net.Conn
				wg    sync.WaitGroup
			)
			stop := make(chan struct{}, 1)
			wg.Add(1)
			go func() {
				defer wg.Done()
				// Dial at most 200 times, stopping early on the first dial
				// error or once told to stop.
				for attempt := 0; attempt < 200; attempt++ {
					conn, err := net.Dial("tcp", target)
					if err != nil {
						return
					}
					conns = append(conns, conn)
					select {
					case <-stop:
						return
					default:
					}
				}
			}()
			time.Sleep(15 * time.Millisecond)
			srv.Shutdown()
			close(stop)
			wg.Wait()

			for _, conn := range conns {
				conn.SetReadDeadline(time.Now().Add(2 * time.Second))
				reader := bufio.NewReader(conn)
				// Connections that were accepted are sent an INFO first...
				_, _, err := reader.ReadLine()
				if err == nil {
					// ...and should then be closed, so this read is
					// expected to fail.
					_, _, err = reader.ReadLine()
				}
				// An io.EOF or any other error indicating the use of a
				// closed connection is what we want. A timeout does not
				// prove anything, so it is treated as no error at all.
				if isTimeout(err) {
					err = nil
				}
				if err == nil {
					// Inconclusive so far: try a write followed by a read.
					var scratch [10]byte
					conn.SetDeadline(time.Now().Add(2 * time.Second))
					conn.Write([]byte("C"))
					_, err = conn.Read(scratch[:])
					if isTimeout(err) {
						err = nil
					}
				}
				if err == nil {
					t.Fatalf("Connection should have been closed")
				}
				conn.Close()
			}
		})
	}
}
// TestNoRaceJetStreamDeleteStreamManyConsumers adds 2000 durable push
// consumers to a file-backed stream and then deletes the stream. The test
// passes when the delete call returns instead of hanging.
func TestNoRaceJetStreamDeleteStreamManyConsumers(t *testing.T) {
	// This number needs to be higher than the internal sendq size to trigger what this test is testing.
	const numConsumers = 2000

	srv := RunBasicJetStreamServer()
	if jsCfg := srv.JetStreamConfig(); jsCfg != nil {
		defer removeDir(t, jsCfg.StoreDir)
	}
	defer srv.Shutdown()

	streamCfg := &StreamConfig{Name: "MYS", Storage: FileStorage}
	mset, err := srv.GlobalAccount().addStream(streamCfg)
	if err != nil {
		t.Fatalf("Unexpected error adding stream: %v", err)
	}

	for n := 0; n < numConsumers; n++ {
		consumerCfg := &ConsumerConfig{
			Durable:        fmt.Sprintf("D-%d", n),
			DeliverSubject: fmt.Sprintf("deliver.%d", n),
		}
		if _, err := mset.addConsumer(consumerCfg); err != nil {
			t.Fatalf("Error creating consumer: %v", err)
		}
	}

	// With bug this would not return and would hang.
	mset.delete()
}
// The server used to swap the account of a client on an inbound message when
// processing service imports. That was more or less fine before JetStream, but
// with JetStream a pull consumer may access the client's account from another
// Go routine, which causes issues. The problem is not limited to that case, it
// is simply the one that exposed it.
// This test reproduces that situation and shows that the fix works, meaning
// that c.acc is no longer swapped.
func TestNoRaceJetStreamServiceImportAccountSwapIssue(t *testing.T) {
	srv := RunBasicJetStreamServer()
	if jsCfg := srv.JetStreamConfig(); jsCfg != nil {
		defer removeDir(t, jsCfg.StoreDir)
	}
	defer srv.Shutdown()

	// Client based API
	nc, js := jsClientConnect(t, srv)
	defer nc.Close()

	streamCfg := &nats.StreamConfig{
		Name:     "TEST",
		Subjects: []string{"foo", "bar"},
	}
	if _, err := js.AddStream(streamCfg); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	sub, err := js.PullSubscribe("foo", "dlc")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	subsBefore := srv.NumSubscriptions()

	// Both the publishing and the pulling side run until this deadline.
	deadline := time.Now().Add(3 * time.Second)
	pubDone := make(chan error, 1)

	// Publishing side. Each publish signals the waiting consumer, which will
	// access c.acc. With the bug, running this concurrently with the pull
	// side would catch c.acc being $SYS some of the time.
	publishLoop := func() error {
		time.Sleep(100 * time.Millisecond)
		for time.Now().Before(deadline) {
			// This will signal the delivery of the pull messages.
			js.Publish("foo", []byte("Hello"))
			// This request goes through the JetStream service import, which
			// is what used to swap the account. It may or may not fail
			// when the bug is present.
			if _, err := js.StreamInfo("TEST"); err != nil {
				return err
			}
		}
		return nil
	}
	go func() {
		pubDone <- publishLoop()
	}()

	// Pull messages flow. Stop at the first fetch error.
	received := 0
	for time.Now().Before(deadline) {
		msgs, err := sub.Fetch(1, nats.MaxWait(200*time.Millisecond))
		if err != nil {
			break
		}
		for _, m := range msgs {
			received++
			m.AckSync()
		}
	}

	// Wait on publisher Go routine and check for errors.
	if err := <-pubDone; err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Double check all received.
	si, err := js.StreamInfo("TEST")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if int(si.State.Msgs) != received {
		t.Fatalf("Expected to receive %d msgs, only got %d", si.State.Msgs, received)
	}

	// Now check for leaked subs from the fetch call above. That is what we first saw from the bug.
	if subsAfter := srv.NumSubscriptions(); subsAfter != subsBefore {
		t.Fatalf("Leaked subscriptions: %d before, %d after", subsBefore, subsAfter)
	}
}
// TestNoRaceJetStreamAPIStreamListPaging creates twice JSApiNamesLimit
// memory streams and checks that the stream names API returns the expected
// page length, total and offset for several request offsets, including
// offsets at and beyond the end of the list.
func TestNoRaceJetStreamAPIStreamListPaging(t *testing.T) {
	s := RunBasicJetStreamServer()
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	// Create 2X limit
	streamsNum := 2 * JSApiNamesLimit
	acc := s.GlobalAccount()
	for i := 1; i <= streamsNum; i++ {
		cfg := &StreamConfig{Name: fmt.Sprintf("STREAM-%06d", i), Storage: MemoryStorage}
		if _, err := acc.addStream(cfg); err != nil {
			t.Fatalf("Unexpected error adding stream: %v", err)
		}
	}

	// Client for API requests.
	nc := clientConnectToServer(t, s)
	defer nc.Close()

	// reqList asks for the stream names starting at offset and returns the
	// raw response. An offset of zero sends an empty request body.
	reqList := func(offset int) []byte {
		t.Helper()
		var payload []byte
		if offset > 0 {
			payload, _ = json.Marshal(&ApiPagedRequest{Offset: offset})
		}
		reply, err := nc.Request(JSApiStreams, payload, time.Second)
		if err != nil {
			t.Fatalf("Unexpected error getting stream list: %v", err)
		}
		return reply.Data
	}

	// checkResp decodes a names response and validates its page size, total
	// and offset. For non-empty pages it also validates the first name.
	checkResp := func(resp []byte, expectedLen, expectedOffset int) {
		t.Helper()
		var names JSApiStreamNamesResponse
		if err := json.Unmarshal(resp, &names); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		switch {
		case len(names.Streams) != expectedLen:
			t.Fatalf("Expected only %d streams but got %d", expectedLen, len(names.Streams))
		case names.Total != streamsNum:
			t.Fatalf("Expected total to be %d but got %d", streamsNum, names.Total)
		case names.Offset != expectedOffset:
			t.Fatalf("Expected offset to be %d but got %d", expectedOffset, names.Offset)
		}
		if expectedLen < 1 {
			return
		}
		// Make sure we get the right stream.
		if want := fmt.Sprintf("STREAM-%06d", expectedOffset+1); names.Streams[0] != want {
			t.Fatalf("Expected stream %q to be first, got %q", want, names.Streams[0])
		}
	}

	checkResp(reqList(0), JSApiNamesLimit, 0)
	checkResp(reqList(JSApiNamesLimit), JSApiNamesLimit, JSApiNamesLimit)
	checkResp(reqList(streamsNum), 0, streamsNum)
	checkResp(reqList(streamsNum-22), 22, streamsNum-22)
	checkResp(reqList(streamsNum+22), 0, streamsNum)
}
// TestNoRaceJetStreamAPIConsumerListPaging creates JSApiNamesLimit consumers
// on a single stream and checks that the consumer names API returns the
// expected page length, total and offset for several request offsets.
func TestNoRaceJetStreamAPIConsumerListPaging(t *testing.T) {
	s := RunBasicJetStreamServer()
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	sname := "MYSTREAM"
	mset, err := s.GlobalAccount().addStream(&StreamConfig{Name: sname})
	if err != nil {
		t.Fatalf("Unexpected error adding stream: %v", err)
	}

	// Client for API requests.
	nc := clientConnectToServer(t, s)
	defer nc.Close()

	consumersNum := JSApiNamesLimit
	for i := 1; i <= consumersNum; i++ {
		dsubj := fmt.Sprintf("d.%d", i)
		// A failed subscribe would leave sub nil and panic in the deferred
		// Unsubscribe, so fail the test explicitly instead.
		sub, err := nc.SubscribeSync(dsubj)
		if err != nil {
			t.Fatalf("Unexpected error subscribing to %q: %v", dsubj, err)
		}
		defer sub.Unsubscribe()
		// Flush so the subscription is sent to the server before the
		// consumer is created on the same deliver subject.
		nc.Flush()
		if _, err := mset.addConsumer(&ConsumerConfig{DeliverSubject: dsubj}); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
	}

	reqListSubject := fmt.Sprintf(JSApiConsumersT, sname)

	// reqList asks for the consumer names starting at offset and returns the
	// raw response. An offset of zero sends an empty request body.
	reqList := func(offset int) []byte {
		t.Helper()
		var req []byte
		if offset > 0 {
			var err error
			req, err = json.Marshal(&JSApiConsumersRequest{ApiPagedRequest: ApiPagedRequest{Offset: offset}})
			if err != nil {
				t.Fatalf("Unexpected error encoding request: %v", err)
			}
		}
		resp, err := nc.Request(reqListSubject, req, time.Second)
		if err != nil {
			t.Fatalf("Unexpected error getting consumer list: %v", err)
		}
		return resp.Data
	}

	// checkResp decodes a names response and validates its page size, total
	// and offset.
	checkResp := func(resp []byte, expectedLen, expectedOffset int) {
		t.Helper()
		var listResponse JSApiConsumerNamesResponse
		if err := json.Unmarshal(resp, &listResponse); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if len(listResponse.Consumers) != expectedLen {
			t.Fatalf("Expected only %d consumers but got %d", expectedLen, len(listResponse.Consumers))
		}
		if listResponse.Total != consumersNum {
			t.Fatalf("Expected total to be %d but got %d", consumersNum, listResponse.Total)
		}
		if listResponse.Offset != expectedOffset {
			t.Fatalf("Expected offset to be %d but got %d", expectedOffset, listResponse.Offset)
		}
	}

	checkResp(reqList(0), JSApiNamesLimit, 0)
	checkResp(reqList(consumersNum-22), 22, consumersNum-22)
	checkResp(reqList(consumersNum+22), 0, consumersNum)
}
// TestNoRaceJetStreamWorkQueueLoadBalance starts a set of workers that each
// request messages from a single explicit-ack durable consumer and checks
// that the number of messages handled by every worker stays within a
// tolerance of the even share.
func TestNoRaceJetStreamWorkQueueLoadBalance(t *testing.T) {
	s := RunBasicJetStreamServer()
	// Remove the store directory on exit, as the other single-server
	// JetStream tests in this file do.
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	mname := "MY_MSG_SET"
	mset, err := s.GlobalAccount().addStream(&StreamConfig{Name: mname, Subjects: []string{"foo", "bar"}})
	if err != nil {
		t.Fatalf("Unexpected error adding message set: %v", err)
	}
	defer mset.delete()

	// Create basic work queue mode consumer.
	oname := "WQ"
	o, err := mset.addConsumer(&ConsumerConfig{Durable: oname, AckPolicy: AckExplicit})
	if err != nil {
		t.Fatalf("Expected no error with durable, got %v", err)
	}
	defer o.delete()

	// To send messages.
	nc := clientConnectToServer(t, s)
	defer nc.Close()

	// For normal work queue semantics, you send requests to the subject with stream and consumer name.
	reqMsgSubj := o.requestNextMsgSubject()

	numWorkers := 25
	counts := make([]int32, numWorkers)
	var received int32

	// rwg tracks workers that have started; wg tracks workers that have finished.
	rwg := &sync.WaitGroup{}
	rwg.Add(numWorkers)

	wg := &sync.WaitGroup{}
	wg.Add(numWorkers)

	// Closed once all workers are ready, releasing them together.
	ch := make(chan bool)

	toSend := 1000

	for i := 0; i < numWorkers; i++ {
		// Each worker gets its own connection, named distinctly so it does
		// not shadow the publisher connection above.
		wnc := clientConnectToServer(t, s)
		defer wnc.Close()

		go func(index int32) {
			rwg.Done()
			defer wg.Done()
			<-ch

			counter := &counts[index]
			for {
				// A request error (e.g. timeout once nothing is left)
				// ends this worker.
				m, err := wnc.Request(reqMsgSubj, nil, 100*time.Millisecond)
				if err != nil {
					return
				}
				m.Respond(nil)
				atomic.AddInt32(counter, 1)
				if total := atomic.AddInt32(&received, 1); total >= int32(toSend) {
					return
				}
			}
		}(int32(i))
	}

	// Wait for requestors to be ready
	rwg.Wait()
	close(ch)

	sendSubj := "bar"
	for i := 0; i < toSend; i++ {
		sendStreamMsg(t, nc, sendSubj, "Hello World!")
	}

	// Wait for test to complete.
	wg.Wait()

	// Each worker must land within target +/- delta messages.
	target := toSend / numWorkers
	delta := target/2 + 5
	low, high := int32(target-delta), int32(target+delta)

	for i := 0; i < numWorkers; i++ {
		if msgs := atomic.LoadInt32(&counts[i]); msgs < low || msgs > high {
			t.Fatalf("Messages received for worker [%d] too far off from target of %d, got %d", i, target, msgs)
		}
	}
}
// TestNoRaceJetStreamClusterLargeStreamInlineCatchup shuts down one replica
// of an R3 stream, publishes messages while it is offline, then shuts down
// the stream leader, restarts the lagging replica and forces it to become
// leader. It then checks that the stream still reports every message.
func TestNoRaceJetStreamClusterLargeStreamInlineCatchup(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "LSS", 3)
	defer c.shutdown()

	// Client based API
	s := c.randomServer()
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:     "TEST",
		Subjects: []string{"foo"},
		Replicas: 3,
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	sr := c.randomNonStreamLeader("$G", "TEST")
	sr.Shutdown()

	// In case sr was meta leader.
	c.waitOnLeader()

	msg, toSend := []byte("Hello JS Clustering"), 5000

	// Now fill up stream.
	for i := 0; i < toSend; i++ {
		if _, err = js.Publish("foo", msg); err != nil {
			t.Fatalf("Unexpected publish error: %v", err)
		}
	}
	si, err := js.StreamInfo("TEST")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	// Check active state as well, shows that the owner answered.
	if si.State.Msgs != uint64(toSend) {
		t.Fatalf("Expected %d msgs, got bad state: %+v", toSend, si.State)
	}

	// Kill our current leader to make just 2.
	c.streamLeader("$G", "TEST").Shutdown()

	// Now restart the shutdown peer and wait for it to be current.
	sr = c.restartServer(sr)
	c.waitOnStreamCurrent(sr, "$G", "TEST")

	// Ask other servers to stepdown as leader so that sr becomes the leader.
	checkFor(t, 20*time.Second, 200*time.Millisecond, func() error {
		c.waitOnStreamLeader("$G", "TEST")
		if sl := c.streamLeader("$G", "TEST"); sl != sr {
			sl.JetStreamStepdownStream("$G", "TEST")
			return fmt.Errorf("Server %s is not leader yet", sr)
		}
		return nil
	})

	// Check that we have all of our messages stored.
	// Wait for a bit for upper layers to process. The stream info must be
	// fetched on every attempt; polling a snapshot taken before the loop
	// could never observe the state changing.
	checkFor(t, 2*time.Second, 100*time.Millisecond, func() error {
		si, err := js.StreamInfo("TEST")
		if err != nil {
			return err
		}
		if si.State.Msgs != uint64(toSend) {
			return fmt.Errorf("Expected %d msgs, got %d", toSend, si.State.Msgs)
		}
		return nil
	})
}
// TestNoRaceJetStreamClusterStreamCreateAndLostQuorum checks that no stream
// quorum-lost advisory is pending on the subscription, both right after an
// R3 stream is created and after the whole cluster has been stopped and
// brought back up.
func TestNoRaceJetStreamClusterStreamCreateAndLostQuorum(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "R5S", 3)
	defer c.shutdown()

	// Client based API
	s := c.randomServer()
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	// Listen for quorum lost advisories for any stream.
	sub, err := nc.SubscribeSync(JSAdvisoryStreamQuorumLostPre + ".*")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	_, err = js.AddStream(&nats.StreamConfig{Name: "NO-LQ-START", Replicas: 3})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	c.waitOnStreamLeader("$G", "NO-LQ-START")
	checkSubsPending(t, sub, 0)

	c.stopAll()

	// Start up the one we were connected to first and wait for it to be connected.
	s = c.restartServer(s)
	if nc, err = nats.Connect(s.ClientURL()); err != nil {
		t.Fatalf("Failed to create client: %v", err)
	}
	defer nc.Close()

	// Subscribe again on the new connection before the rest come back.
	if sub, err = nc.SubscribeSync(JSAdvisoryStreamQuorumLostPre + ".*"); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	nc.Flush()

	c.restartAll()
	c.waitOnStreamLeader("$G", "NO-LQ-START")

	checkSubsPending(t, sub, 0)
}
// TestNoRaceJetStreamSuperClusterMirrors creates a source stream S1 placed in
// cluster C2 and mirrors it into streams placed in other clusters of the
// super cluster. It checks that:
//   - mirror M1 (placed in C1) receives the first 100 messages,
//   - mirror M2 (placed in C3), created after S1 was purged and 50 more
//     messages were published, holds 50 messages starting at sequence 101,
//   - M2 reaches 150 messages after its stream leader is shut down while
//     100 more messages are being published to S1.
func TestNoRaceJetStreamSuperClusterMirrors(t *testing.T) {
	sc := createJetStreamSuperCluster(t, 3, 3)
	defer sc.shutdown()

	// Client based API
	s := sc.clusterForName("C2").randomServer()
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	// Create source stream.
	_, err := js.AddStream(&nats.StreamConfig{Name: "S1", Subjects: []string{"foo", "bar"}, Replicas: 3, Placement: &nats.Placement{Cluster: "C2"}})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Needed while Go client does not have mirror support.
	// Fails the test if the stream cannot be added.
	createStream := func(cfg *nats.StreamConfig) {
		t.Helper()
		if _, err := js.AddStream(cfg); err != nil {
			t.Fatalf("Unexpected error: %+v", err)
		}
	}

	// Send 100 messages.
	for i := 0; i < 100; i++ {
		if _, err := js.Publish("foo", []byte("MIRRORS!")); err != nil {
			t.Fatalf("Unexpected publish error: %v", err)
		}
	}

	// First mirror, placed in C1.
	createStream(&nats.StreamConfig{
		Name:      "M1",
		Mirror:    &nats.StreamSource{Name: "S1"},
		Placement: &nats.Placement{Cluster: "C1"},
	})

	// Wait for M1 to hold all 100 messages.
	checkFor(t, 2*time.Second, 100*time.Millisecond, func() error {
		si, err := js.StreamInfo("M1")
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if si.State.Msgs != 100 {
			return fmt.Errorf("Expected 100 msgs, got state: %+v", si.State)
		}
		return nil
	})

	// Purge the source stream.
	if err := js.PurgeStream("S1"); err != nil {
		t.Fatalf("Unexpected purge error: %v", err)
	}
	// Send 50 more msgs now.
	for i := 0; i < 50; i++ {
		if _, err := js.Publish("bar", []byte("OK")); err != nil {
			t.Fatalf("Unexpected publish error: %v", err)
		}
	}

	// Second mirror, replicated and placed in C3, created after the purge.
	createStream(&nats.StreamConfig{
		Name:      "M2",
		Mirror:    &nats.StreamSource{Name: "S1"},
		Replicas:  3,
		Placement: &nats.Placement{Cluster: "C3"},
	})

	// M2 should only hold the 50 post-purge messages. The 100 purged
	// messages were published first, so the first sequence expected is 101.
	checkFor(t, 10*time.Second, 100*time.Millisecond, func() error {
		si, err := js.StreamInfo("M2")
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if si.State.Msgs != 50 {
			return fmt.Errorf("Expected 50 msgs, got state: %+v", si.State)
		}
		if si.State.FirstSeq != 101 {
			return fmt.Errorf("Expected start seq of 101, got state: %+v", si.State)
		}
		return nil
	})

	// Remember the current M2 leader so it can be shut down mid-publish.
	sl := sc.clusterForName("C3").streamLeader("$G", "M2")
	doneCh := make(chan bool)

	// Now test that if the mirror gets interrupted that it picks up where it left off etc.
	go func() {
		// Send 100 more messages.
		for i := 0; i < 100; i++ {
			if _, err := js.Publish("foo", []byte("MIRRORS!")); err != nil {
				// t.Errorf rather than t.Fatalf: this runs outside the test goroutine.
				t.Errorf("Unexpected publish on %d error: %v", i, err)
			}
			time.Sleep(2 * time.Millisecond)
		}
		doneCh <- true
	}()

	// Let the publisher get going, then shut down the mirror's leader.
	time.Sleep(20 * time.Millisecond)
	sl.Shutdown()

	// Wait for the publisher to finish before checking the mirror.
	<-doneCh
	sc.clusterForName("C3").waitOnStreamLeader("$G", "M2")

	// M2 should now hold 50 + 100 messages, still starting at sequence 101.
	checkFor(t, 10*time.Second, 100*time.Millisecond, func() error {
		si, err := js.StreamInfo("M2")
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if si.State.Msgs != 150 {
			return fmt.Errorf("Expected 150 msgs, got state: %+v", si.State)
		}
		if si.State.FirstSeq != 101 {
			return fmt.Errorf("Expected start seq of 101, got state: %+v", si.State)
		}
		return nil
	})
}
// TestNoRaceJetStreamSuperClusterMixedModeMirrors runs a super cluster in
// which some servers have JetStream disabled, connects a client to one of
// those servers, fills 10 origin streams, and then repeatedly creates 10
// R3 mirrors concurrently and checks that each mirror ends up with the full
// message count before deleting it.
func TestNoRaceJetStreamSuperClusterMixedModeMirrors(t *testing.T) {
	// Unlike the similar sources test, this test is not reliably catching the bug
	// that would cause mirrors to not have the expected messages count.
	// Still, adding this test in case we have a regression and we are lucky in
	// getting the failure while running this.
	tmpl := `
listen: 127.0.0.1:-1
server_name: %s
jetstream: { domain: ngs, max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'}
leaf: { listen: 127.0.0.1:-1 }
cluster {
name: %s
listen: 127.0.0.1:%d
routes = [%s]
}
accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } }
`
	sc := createJetStreamSuperClusterWithTemplateAndModHook(t, tmpl, 7, 4,
		func(serverName, clusterName, storeDir, conf string) string {
			// The part of the server name after the first "-" selects the mode:
			// S5, S6 and S7 get the jetstream line commented out, every other
			// server gets the leaf line commented out.
			sname := serverName[strings.Index(serverName, "-")+1:]
			switch sname {
			case "S5", "S6", "S7":
				conf = strings.ReplaceAll(conf, "jetstream: { ", "#jetstream: { ")
			default:
				conf = strings.ReplaceAll(conf, "leaf: { ", "#leaf: { ")
			}
			return conf
		})
	defer sc.shutdown()

	// Connect our client to a non JS server
	c := sc.randomCluster()
	var s *Server
	// Keep drawing random servers until one without JetStream enabled is found.
	for s == nil {
		if as := c.randomServer(); !as.JetStreamEnabled() {
			s = as
			break
		}
	}
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	toSend := 1000
	// Create 10 origin streams
	for i := 0; i < 10; i++ {
		name := fmt.Sprintf("S%d", i+1)
		if _, err := js.AddStream(&nats.StreamConfig{Name: name}); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		// Load them up with a bunch of messages.
		for n := 0; n < toSend; n++ {
			m := nats.NewMsg(name)
			m.Header.Set("stream", name)
			m.Header.Set("idx", strconv.FormatInt(int64(n+1), 10))
			if err := nc.PublishMsg(m); err != nil {
				t.Fatalf("Unexpected publish error: %v", err)
			}
		}
	}

	// Run the create/verify/delete cycle three times.
	for i := 0; i < 3; i++ {
		// Now create our mirrors
		wg := sync.WaitGroup{}
		mirrorsCount := 10
		wg.Add(mirrorsCount)
		// Buffered for one error; only the first failure is kept.
		errCh := make(chan error, 1)
		for m := 0; m < mirrorsCount; m++ {
			// Each mirror picks one of the 10 origin streams at random.
			sname := fmt.Sprintf("S%d", rand.Intn(10)+1)
			go func(sname string, mirrorIdx int) {
				defer wg.Done()
				if _, err := js.AddStream(&nats.StreamConfig{
					Name:     fmt.Sprintf("M%d", mirrorIdx),
					Mirror:   &nats.StreamSource{Name: sname},
					Replicas: 3,
				}); err != nil {
					// Non-blocking send so later failures do not block the goroutine.
					select {
					case errCh <- err:
					default:
					}
				}
			}(sname, m+1)
		}
		wg.Wait()
		// Fail from the test goroutine if any mirror creation failed.
		select {
		case err := <-errCh:
			t.Fatalf("Error creating mirrors: %v", err)
		default:
		}
		// Now check the mirrors have all expected messages
		for m := 0; m < mirrorsCount; m++ {
			name := fmt.Sprintf("M%d", m+1)
			checkFor(t, 15*time.Second, 500*time.Millisecond, func() error {
				si, err := js.StreamInfo(name)
				if err != nil {
					t.Fatalf("Could not retrieve stream info")
				}
				if si.State.Msgs != uint64(toSend) {
					return fmt.Errorf("Expected %d msgs, got state: %+v", toSend, si.State)
				}
				return nil
			})
			// Delete the mirror so the next cycle can reuse its name.
			err := js.DeleteStream(name)
			require_NoError(t, err)
		}
	}
}
// TestNoRaceJetStreamSuperClusterSources creates three origin streams (foo,
// bar, baz) and aggregate streams that source from all of them. It checks
// that:
//   - MS receives the 50 messages initially published to the origins,
//   - MS2 (R3, placed in C3), created after the origins were purged and
//     refilled, holds 50 messages starting at sequence 1,
//   - MS2 reaches 200 messages after its stream leader is shut down while
//     50 more messages are being published to each origin.
func TestNoRaceJetStreamSuperClusterSources(t *testing.T) {
	sc := createJetStreamSuperCluster(t, 3, 3)
	defer sc.shutdown()

	// Client based API
	s := sc.clusterForName("C1").randomServer()
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	// Create our source streams.
	for _, sname := range []string{"foo", "bar", "baz"} {
		if _, err := js.AddStream(&nats.StreamConfig{Name: sname, Replicas: 1}); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
	}

	// sendBatch publishes n messages with bodies MSG-1..MSG-n to subject.
	sendBatch := func(subject string, n int) {
		for i := 0; i < n; i++ {
			msg := fmt.Sprintf("MSG-%d", i+1)
			if _, err := js.Publish(subject, []byte(msg)); err != nil {
				t.Fatalf("Unexpected publish error: %v", err)
			}
		}
	}
	// Populate each one.
	sendBatch("foo", 10)
	sendBatch("bar", 15)
	sendBatch("baz", 25)

	// Needed while Go client does not have mirror support for creating mirror or source streams.
	// Fails the test if the stream cannot be added.
	createStream := func(cfg *nats.StreamConfig) {
		t.Helper()
		if _, err := js.AddStream(cfg); err != nil {
			t.Fatalf("Unexpected error: %+v", err)
		}
	}

	cfg := &nats.StreamConfig{
		Name: "MS",
		Sources: []*nats.StreamSource{
			{Name: "foo"},
			{Name: "bar"},
			{Name: "baz"},
		},
	}

	createStream(cfg)
	time.Sleep(time.Second)

	// Faster timeout since we loop below checking for condition.
	js2, err := nc.JetStream(nats.MaxWait(50 * time.Millisecond))
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	// 10 + 15 + 25 = 50 messages expected in the aggregate stream.
	checkFor(t, 10*time.Second, 100*time.Millisecond, func() error {
		si, err := js2.StreamInfo("MS")
		if err != nil {
			return err
		}
		if si.State.Msgs != 50 {
			return fmt.Errorf("Expected 50 msgs, got state: %+v", si.State)
		}
		return nil
	})

	// Purge the source streams.
	for _, sname := range []string{"foo", "bar", "baz"} {
		if err := js.PurgeStream(sname); err != nil {
			t.Fatalf("Unexpected purge error: %v", err)
		}
	}

	if err := js.DeleteStream("MS"); err != nil {
		t.Fatalf("Unexpected delete error: %v", err)
	}

	// Send more msgs now.
	sendBatch("foo", 10)
	sendBatch("bar", 15)
	sendBatch("baz", 25)

	// Second aggregate stream, replicated and placed in C3.
	cfg = &nats.StreamConfig{
		Name: "MS2",
		Sources: []*nats.StreamSource{
			{Name: "foo"},
			{Name: "bar"},
			{Name: "baz"},
		},
		Replicas:  3,
		Placement: &nats.Placement{Cluster: "C3"},
	}

	createStream(cfg)

	// MS2 should hold only the 50 post-purge messages, numbered from 1.
	checkFor(t, 5*time.Second, 100*time.Millisecond, func() error {
		si, err := js2.StreamInfo("MS2")
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if si.State.Msgs != 50 {
			return fmt.Errorf("Expected 50 msgs, got state: %+v", si.State)
		}
		if si.State.FirstSeq != 1 {
			return fmt.Errorf("Expected start seq of 1, got state: %+v", si.State)
		}
		return nil
	})

	// Remember the current MS2 leader so it can be shut down mid-publish.
	sl := sc.clusterForName("C3").streamLeader("$G", "MS2")
	doneCh := make(chan bool)

	// If that server is also the super cluster leader, ask it to step down
	// first and wait for a new leader.
	if sl == sc.leader() {
		nc.Request(JSApiLeaderStepDown, nil, time.Second)
		sc.waitOnLeader()
	}

	// Now test that if the mirror gets interrupted that it picks up where it left off etc.
	go func() {
		// Send 50 more messages each.
		for i := 0; i < 50; i++ {
			msg := fmt.Sprintf("R-MSG-%d", i+1)
			for _, sname := range []string{"foo", "bar", "baz"} {
				m := nats.NewMsg(sname)
				// Message ID header is the stream name joined to the body.
				m.Header.Set(nats.MsgIdHdr, sname+"-"+msg)
				m.Data = []byte(msg)
				if _, err := js.PublishMsg(m); err != nil {
					// t.Errorf rather than t.Fatalf: this runs outside the test goroutine.
					t.Errorf("Unexpected publish error: %v", err)
				}
			}
			time.Sleep(2 * time.Millisecond)
		}
		doneCh <- true
	}()

	// Let the publisher get going, then shut down the MS2 leader.
	time.Sleep(20 * time.Millisecond)
	sl.Shutdown()

	sc.clusterForName("C3").waitOnStreamLeader("$G", "MS2")
	// Wait for the publisher to finish before checking the totals.
	<-doneCh

	// 50 existing + 3 origins * 50 new = 200 messages expected.
	checkFor(t, 15*time.Second, 100*time.Millisecond, func() error {
		si, err := js2.StreamInfo("MS2")
		if err != nil {
			return err
		}
		if si.State.Msgs != 200 {
			return fmt.Errorf("Expected 200 msgs, got state: %+v", si.State)
		}
		return nil
	})
}
// TestNoRaceJetStreamClusterSourcesMuxd fills 10 origin streams with 10000
// 1KB messages each and checks that a single R2 stream sourcing from all of
// them ends up holding every message.
func TestNoRaceJetStreamClusterSourcesMuxd(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "SMUX", 3)
	defer c.shutdown()

	// Client for API requests.
	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	const (
		numOrigins = 10
		// Messages published to each origin stream.
		toSend = 10000
	)
	payload := make([]byte, 1024)
	rand.Read(payload)

	// Create the origin streams, remembering each as a source.
	sources := make([]*nats.StreamSource, 0, numOrigins)
	for i := 1; i <= numOrigins; i++ {
		origin := fmt.Sprintf("O-%d", i)
		if _, err := js.AddStream(&nats.StreamConfig{Name: origin}); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		// Load them up with a bunch of messages.
		for sent := 0; sent < toSend; sent++ {
			if err := nc.Publish(origin, payload); err != nil {
				t.Fatalf("Unexpected publish error: %v", err)
			}
		}
		sources = append(sources, &nats.StreamSource{Name: origin})
	}

	// Now create our downstream stream that sources from all of them.
	muxCfg := &nats.StreamConfig{Name: "S", Replicas: 2, Sources: sources}
	if _, err := js.AddStream(muxCfg); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	checkFor(t, 20*time.Second, 500*time.Millisecond, func() error {
		si, err := js.StreamInfo("S")
		if err != nil {
			t.Fatalf("Could not retrieve stream info")
		}
		if si.State.Msgs == uint64(numOrigins*toSend) {
			return nil
		}
		return fmt.Errorf("Expected %d msgs, got state: %+v", toSend*numOrigins, si.State)
	})
}
// TestNoRaceJetStreamSuperClusterMixedModeSources runs a super cluster in
// which some servers have JetStream disabled, connects a client to one of
// those servers, fills 100 origin streams, and then repeatedly creates an
// R3 stream sourcing from all of them, checks that it ends up with the full
// message count, and deletes it.
func TestNoRaceJetStreamSuperClusterMixedModeSources(t *testing.T) {
	tmpl := `
listen: 127.0.0.1:-1
server_name: %s
jetstream: { domain: ngs, max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'}
leaf: { listen: 127.0.0.1:-1 }
cluster {
name: %s
listen: 127.0.0.1:%d
routes = [%s]
}
accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } }
`
	sc := createJetStreamSuperClusterWithTemplateAndModHook(t, tmpl, 7, 2,
		func(serverName, clusterName, storeDir, conf string) string {
			// The part of the server name after the first "-" selects the mode:
			// S5, S6 and S7 get the jetstream line commented out, every other
			// server gets the leaf line commented out.
			sname := serverName[strings.Index(serverName, "-")+1:]
			switch sname {
			case "S5", "S6", "S7":
				conf = strings.ReplaceAll(conf, "jetstream: { ", "#jetstream: { ")
			default:
				conf = strings.ReplaceAll(conf, "leaf: { ", "#leaf: { ")
			}
			return conf
		})
	defer sc.shutdown()

	// Connect our client to a non JS server
	c := sc.randomCluster()
	var s *Server
	// Keep drawing random servers until one without JetStream enabled is found.
	for s == nil {
		if as := c.randomServer(); !as.JetStreamEnabled() {
			s = as
			break
		}
	}
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	toSend := 1000
	var sources []*nats.StreamSource
	// Create 100 origin streams.
	for i := 1; i <= 100; i++ {
		name := fmt.Sprintf("O-%d", i)
		if _, err := js.AddStream(&nats.StreamConfig{Name: name}); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		// Load them up with a bunch of messages.
		for n := 0; n < toSend; n++ {
			m := nats.NewMsg(name)
			m.Header.Set("stream", name)
			m.Header.Set("idx", strconv.FormatInt(int64(n+1), 10))
			if err := nc.PublishMsg(m); err != nil {
				t.Fatalf("Unexpected publish error: %v", err)
			}
		}
		sources = append(sources, &nats.StreamSource{Name: name})
	}

	// Run the create/verify/delete cycle three times.
	for i := 0; i < 3; i++ {
		// Now create our downstream stream that sources from all of them.
		if _, err := js.AddStream(&nats.StreamConfig{Name: "S", Replicas: 3, Sources: sources}); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}

		// 100 origins * toSend messages expected in the aggregate stream.
		checkFor(t, 15*time.Second, 1000*time.Millisecond, func() error {
			si, err := js.StreamInfo("S")
			if err != nil {
				t.Fatalf("Could not retrieve stream info")
			}
			if si.State.Msgs != uint64(100*toSend) {
				return fmt.Errorf("Expected %d msgs, got state: %+v", toSend*100, si.State)
			}
			return nil
		})

		// Delete the stream so the next cycle can recreate it.
		err := js.DeleteStream("S")
		require_NoError(t, err)
	}
}
// TestNoRaceJetStreamClusterExtendedStreamPurgeStall fills a file-storage
// stream with 50000 100KB messages spread over 1000 subjects, purges a single
// subject through the API and checks that the purge succeeds within one
// second and that the server's reported memory stays under 100MB.
func TestNoRaceJetStreamClusterExtendedStreamPurgeStall(t *testing.T) {
	// Comment out the skip below to run. Needs to be on a big machine. Do not want as part of Travis tests atm.
	skip(t)

	// cerr fails the test on any non-nil error.
	cerr := func(t *testing.T, err error) {
		t.Helper()
		if err != nil {
			t.Fatalf("unexpected err: %s", err)
		}
	}

	s := RunBasicJetStreamServer()
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	si, err := js.AddStream(&nats.StreamConfig{
		Name:     "KV",
		Subjects: []string{"kv.>"},
		Storage:  nats.FileStorage,
	})
	cerr(t, err)

	// 100kb messages spread over 1000 different subjects
	body := make([]byte, 100*1024)
	for i := 0; i < 50000; i++ {
		_, err = js.PublishAsync(fmt.Sprintf("kv.%d", i%1000), body)
		cerr(t, err)
	}
	// Wait until every async publish has been stored.
	checkFor(t, 5*time.Second, 200*time.Millisecond, func() error {
		if si, err = js.StreamInfo("KV"); err != nil {
			return err
		}
		if si.State.Msgs == 50000 {
			return nil
		}
		return fmt.Errorf("waiting for more")
	})

	jp, err := json.Marshal(&JSApiStreamPurgeRequest{Subject: "kv.20"})
	cerr(t, err)

	// Time only the purge request itself.
	start := time.Now()
	res, err := nc.Request(fmt.Sprintf(JSApiStreamPurgeT, "KV"), jp, time.Minute)
	elapsed := time.Since(start)
	cerr(t, err)

	pres := JSApiStreamPurgeResponse{}
	err = json.Unmarshal(res.Data, &pres)
	cerr(t, err)
	if !pres.Success {
		t.Fatalf("purge failed: %#v", pres)
	}
	if elapsed > time.Second {
		t.Fatalf("Purge took too long %s", elapsed)
	}

	// Check the Varz error before reading v.Mem, so a failure is reported
	// instead of dereferencing a nil result.
	v, err := s.Varz(nil)
	cerr(t, err)
	if v.Mem > 100*1024*1024 { // 100MB limit but in practice < 100MB -> Was ~7GB when failing.
		t.Fatalf("Used too much memory: %v", friendlyBytes(v.Mem))
	}
}
// TestNoRaceJetStreamClusterMirrorExpirationAndMissingSequences mirrors a
// stream whose messages expire after 100ms. It shuts down the mirror's
// leader, publishes a batch that expires from the origin while that server
// is down, restarts it, publishes another batch and checks that the mirror
// reports 20 messages.
func TestNoRaceJetStreamClusterMirrorExpirationAndMissingSequences(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "MMS", 9)
	defer c.shutdown()

	// Client for API requests.
	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	// publishN sends n messages to the origin stream.
	publishN := func(n int) {
		t.Helper()
		for sent := 0; sent < n; sent++ {
			if _, err := js.Publish("TEST", []byte("OK")); err != nil {
				t.Fatalf("Unexpected publish error: %v", err)
			}
		}
	}

	// expectMsgs waits for the named stream to report exactly want messages.
	expectMsgs := func(stream string, want uint64) {
		t.Helper()
		checkFor(t, 10*time.Second, 50*time.Millisecond, func() error {
			info, err := js.StreamInfo(stream)
			if err != nil {
				return err
			}
			if got := info.State.Msgs; got != want {
				return fmt.Errorf("Expected %d msgs, got %d", want, got)
			}
			return nil
		})
	}

	// Origin
	originCfg := &nats.StreamConfig{
		Name:   "TEST",
		MaxAge: 100 * time.Millisecond,
	}
	if _, err := js.AddStream(originCfg); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	originLeader := c.streamLeader("$G", "TEST")
	metaLeader := c.leader()

	// Create mirror now. Recreate it until its leader is neither the origin
	// stream's leader nor the meta leader.
	for {
		_, err := js.AddStream(&nats.StreamConfig{
			Name:     "M",
			Mirror:   &nats.StreamSource{Name: "TEST"},
			Replicas: 2,
		})
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		mirrorLeader := c.streamLeader("$G", "M")
		if mirrorLeader != originLeader && mirrorLeader != metaLeader {
			break
		}
		// Delete and retry.
		js.DeleteStream("M")
	}

	publishN(10)
	expectMsgs("M", 10)

	// Now shutdown the server with the mirror.
	ms := c.streamLeader("$G", "M")
	ms.Shutdown()
	c.waitOnLeader()

	// Send more messages but let them expire.
	publishN(10)
	expectMsgs("TEST", 0)

	c.restartServer(ms)
	c.checkClusterFormed()
	c.waitOnStreamLeader("$G", "M")

	publishN(10)
	expectMsgs("M", 20)
}
// TestNoRaceLargeActiveOnReplica repeatedly creates and deletes an R3 stream
// for 60 seconds and fails if any replica in the create response reports an
// Active value above 5 seconds. Skipped by default.
func TestNoRaceLargeActiveOnReplica(t *testing.T) {
	// Uncomment to run.
	skip(t)

	c := createJetStreamClusterExplicit(t, "LAG", 3)
	defer c.shutdown()

	// Client for API requests.
	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	const maxActive = 5 * time.Second

	for deadline := time.Now().Add(60 * time.Second); time.Now().Before(deadline); {
		info, err := js.AddStream(&nats.StreamConfig{
			Name:     "TEST",
			Subjects: []string{"foo", "bar"},
			Replicas: 3,
		})
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		for _, replica := range info.Cluster.Replicas {
			if replica.Active > maxActive {
				t.Fatalf("Bad Active value: %+v", replica)
			}
		}
		// Remove the stream so the next iteration creates it afresh.
		err = js.DeleteStream("TEST")
		if err != nil {
			t.Fatalf("Unexpected delete error: %v", err)
		}
	}
}
// TestNoRaceJetStreamSuperClusterRIPStress is a long-running stress test,
// skipped by default. It creates 50 R3 streams in each of the clusters C1, C2
// and C3, one R2 stream per cluster that sources from all 50 streams of
// another cluster, and one R3 mirror per cluster of one of those sourcing
// streams. It then publishes asynchronously to every base stream from 64
// clients for 480 seconds. It makes no assertions beyond stream creation and
// publishing not returning an error.
func TestNoRaceJetStreamSuperClusterRIPStress(t *testing.T) {
	// Uncomment to run. Needs to be on a big machine.
	skip(t)

	sc := createJetStreamSuperCluster(t, 3, 3)
	defer sc.shutdown()

	// Client based API
	s := sc.clusterForName("C2").randomServer()
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	// Maps a cluster name to the names of the streams placed in it.
	scm := make(map[string][]string)

	// Create 50 streams per cluster.
	for _, cn := range []string{"C1", "C2", "C3"} {
		var streams []string
		for i := 0; i < 50; i++ {
			sn := fmt.Sprintf("%s-S%d", cn, i+1)
			streams = append(streams, sn)
			_, err := js.AddStream(&nats.StreamConfig{
				Name:      sn,
				Replicas:  3,
				Placement: &nats.Placement{Cluster: cn},
				MaxAge:    2 * time.Minute,
				MaxMsgs:   50_000,
			})
			if err != nil {
				t.Fatalf("Unexpected error: %v", err)
			}
		}
		scm[cn] = streams
	}

	// sourceForCluster maps a cluster name to the next cluster in the ring
	// C1 -> C2 -> C3 -> C1 and returns that cluster's streams as sources.
	sourceForCluster := func(cn string) []*nats.StreamSource {
		var sns []string
		switch cn {
		case "C1":
			sns = scm["C2"]
		case "C2":
			sns = scm["C3"]
		case "C3":
			sns = scm["C1"]
		default:
			t.Fatalf("Unknown cluster %q", cn)
		}
		var ss []*nats.StreamSource
		for _, sn := range sns {
			ss = append(ss, &nats.StreamSource{Name: sn})
		}
		return ss
	}

	// Mux all 50 streams from one cluster to a single stream across a GW connection to another cluster.
	_, err := js.AddStream(&nats.StreamConfig{
		Name:      "C1-S-MUX",
		Replicas:  2,
		Placement: &nats.Placement{Cluster: "C1"},
		Sources:   sourceForCluster("C2"),
		MaxAge:    time.Minute,
		MaxMsgs:   20_000,
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	_, err = js.AddStream(&nats.StreamConfig{
		Name:      "C2-S-MUX",
		Replicas:  2,
		Placement: &nats.Placement{Cluster: "C2"},
		Sources:   sourceForCluster("C3"),
		MaxAge:    time.Minute,
		MaxMsgs:   20_000,
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	_, err = js.AddStream(&nats.StreamConfig{
		Name:      "C3-S-MUX",
		Replicas:  2,
		Placement: &nats.Placement{Cluster: "C3"},
		Sources:   sourceForCluster("C1"),
		MaxAge:    time.Minute,
		MaxMsgs:   20_000,
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Now create mirrors for our mux'd streams.
	_, err = js.AddStream(&nats.StreamConfig{
		Name:      "C1-MIRROR",
		Replicas:  3,
		Placement: &nats.Placement{Cluster: "C1"},
		Mirror:    &nats.StreamSource{Name: "C3-S-MUX"},
		MaxAge:    5 * time.Minute,
		MaxMsgs:   10_000,
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// NOTE(review): this mirror and its origin are both placed in C2, unlike
	// the C1 and C3 mirrors which cross clusters; confirm this is intended.
	_, err = js.AddStream(&nats.StreamConfig{
		Name:      "C2-MIRROR",
		Replicas:  3,
		Placement: &nats.Placement{Cluster: "C2"},
		Mirror:    &nats.StreamSource{Name: "C2-S-MUX"},
		MaxAge:    5 * time.Minute,
		MaxMsgs:   10_000,
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	_, err = js.AddStream(&nats.StreamConfig{
		Name:      "C3-MIRROR",
		Replicas:  3,
		Placement: &nats.Placement{Cluster: "C3"},
		Mirror:    &nats.StreamSource{Name: "C1-S-MUX"},
		MaxAge:    5 * time.Minute,
		MaxMsgs:   10_000,
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	var jsc []nats.JetStream

	// Create 64 clients.
	for i := 0; i < 64; i++ {
		s := sc.randomCluster().randomServer()
		nc, _ := jsClientConnect(t, s)
		defer nc.Close()
		// Use a dedicated JetStream context configured with an async
		// publish max pending of 8K.
		js, err := nc.JetStream(nats.PublishAsyncMaxPending(8 * 1024))
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		jsc = append(jsc, js)
	}

	msg := make([]byte, 1024)
	rand.Read(msg)

	// Run for 8 minutes (480 seconds).
	expires := time.Now().Add(480 * time.Second)
	for time.Now().Before(expires) {
		for _, sns := range scm {
			// Publish to each cluster's streams in a random order, each
			// time through a randomly chosen client.
			rand.Shuffle(len(sns), func(i, j int) { sns[i], sns[j] = sns[j], sns[i] })
			for _, sn := range sns {
				js := jsc[rand.Intn(len(jsc))]
				if _, err = js.PublishAsync(sn, msg); err != nil {
					t.Fatalf("Unexpected publish error: %v", err)
				}
			}
		}
		time.Sleep(10 * time.Millisecond)
	}
}
// TestNoRaceJetStreamSlowFilteredInititalPendingAndFirstMsg loads a file backed
// stream with 500k messages and checks that creating filtered consumers stays
// under a time threshold and reports the expected NumPending, including after
// message removals and after a server restart.
func TestNoRaceJetStreamSlowFilteredInititalPendingAndFirstMsg(t *testing.T) {
	s := RunBasicJetStreamServer()
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	// Add the stream by hand with an explicit store config to force multiple blocks, etc.
	acc, err := s.LookupAccount("$G")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	streamCfg := &StreamConfig{
		Name:     "S",
		Subjects: []string{"foo", "bar", "baz", "foo.bar.baz", "foo.*"},
	}
	storeCfg := &FileStoreConfig{BlockSize: 4 * 1024 * 1024, AsyncFlush: true}
	mset, err := acc.addStreamWithStore(streamCfg, storeCfg)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	// 100k per round, 500k messages overall.
	perRound := 100_000

	// Round one: 'foo', 'bar', 'baz' interleaved, repeated 100k times.
	interleaved := []struct{ subj, body string }{
		{"foo", "HELLO"},
		{"bar", "WORLD"},
		{"baz", "AGAIN"},
	}
	for i := 0; i < perRound; i++ {
		for _, m := range interleaved {
			js.PublishAsync(m.subj, []byte(m.body))
		}
	}
	// Round two: one contiguous run of 'foo.bar.baz'.
	for i := 0; i < perRound; i++ {
		js.PublishAsync("foo.bar.baz", []byte("ALL-TOGETHER"))
	}
	// Round three: foo.N for N in 1..100000 at the end of the stream.
	for n := 1; n <= perRound; n++ {
		js.PublishAsync(fmt.Sprintf("foo.%d", n), []byte("LATER"))
	}

	// Wait until the stream reports all of the messages.
	checkFor(t, 10*time.Second, 250*time.Millisecond, func() error {
		si, err := js.StreamInfo("S")
		if err != nil {
			return err
		}
		if want := uint64(5 * perRound); si.State.Msgs != want {
			return fmt.Errorf("Expected %d msgs, got %d", want, si.State.Msgs)
		}
		return nil
	})

	// Anything slower than this is considered too long.
	const thresh = 50 * time.Millisecond

	// Each call creates a new durable ("dur-N"), times the creation and checks NumPending.
	var dindex int
	testConsumerCreate := func(subj string, startSeq, expectedNumPending uint64) {
		t.Helper()
		dindex++
		cfg := ConsumerConfig{
			FilterSubject: subj,
			Durable:       fmt.Sprintf("dur-%d", dindex),
			AckPolicy:     AckExplicit,
		}
		if startSeq > 1 {
			cfg.DeliverPolicy = DeliverByStartSequence
			cfg.OptStartSeq = startSeq
		}
		begin := time.Now()
		o, err := mset.addConsumer(&cfg)
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if took := time.Since(begin); took > thresh {
			t.Fatalf("Creating consumer for %q and start: %d took too long: %v", subj, startSeq, took)
		}
		if got := o.info().NumPending; got != expectedNumPending {
			t.Fatalf("Expected NumPending of %d, got %d", expectedNumPending, got)
		}
	}

	for _, tc := range []struct {
		subj     string
		startSeq uint64
		pending  uint64
	}{
		{"foo.100000", 1, 1},
		{"foo.100000", 222_000, 1},
		{"foo", 1, 100_000},
		{"foo", 4, 100_000 - 1},
		{"foo.bar.baz", 1, 100_000},
		{"foo.bar.baz", 350_001, 50_000},
		{"*", 1, 300_000},
		{"*", 4, 300_000 - 3},
		{">", 1, 500_000},
		{">", 50_000, 500_000 - 50_000 + 1},
		{"foo.10", 1, 1},
	} {
		testConsumerCreate(tc.subj, tc.startSeq, tc.pending)
	}

	// A pull subscriber whose first match is late in the stream must be quick too.
	sub, err := js.PullSubscribe("foo.100000", "dlc")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	begin := time.Now()
	fetchMsgs(t, sub, 1, time.Second)
	if took := time.Since(begin); took > thresh {
		t.Fatalf("Took too long for pull subscriber to fetch the message: %v", took)
	}

	// Remove three 'foo' messages and make sure pending accounts for them.
	for _, seq := range []uint64{1, 4, 7} {
		mset.removeMsg(seq)
	}
	testConsumerCreate("foo", 1, 100_000-3)

	// A consumer over everything should see the removals as well.
	wide := &ConsumerConfig{FilterSubject: ">", Durable: "cat", AckPolicy: AckExplicit}
	o, err := mset.addConsumer(wide)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	expected := uint64(500_000 - 3)
	ci := o.info()
	if ci.NumPending != expected {
		t.Fatalf("Expected NumPending of %d, got %d", expected, ci.NumPending)
	}

	// One more message must show up as pending on the wide consumer.
	js.Publish("foo", []byte("HELLO AGAIN"))
	ci = o.info()
	if ci.NumPending != expected+1 {
		t.Fatalf("Expected the consumer to recognize the wide scoped consumer, wanted pending of %d, got %d", expected+1, ci.NumPending)
	}

	// Shut the server down and bring a new one up on the same store directory.
	sd := s.JetStreamConfig().StoreDir
	s.Shutdown()
	s = RunJetStreamServerOnPort(-1, sd)
	defer s.Shutdown()

	acc, err = s.LookupAccount("$G")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	mset, err = acc.lookupStream("S")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Per subject state must have been recovered on restart.
	testConsumerCreate("foo.100000", 1, 1)
	testConsumerCreate("foo", 1, 100_000-2)
}
// TestNoRaceJetStreamFileStoreBufferReuse publishes 200k 8K messages to a file
// backed stream and then consumes them all, printing how long each phase took
// and the server memory reported by Varz. It is skipped by default.
func TestNoRaceJetStreamFileStoreBufferReuse(t *testing.T) {
	// Uncomment to run. Needs to be on a big machine.
	skip(t)

	s := RunBasicJetStreamServer()
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	cfg := &StreamConfig{Name: "TEST", Subjects: []string{"foo", "bar", "baz"}, Storage: FileStorage}
	if _, err := s.GlobalAccount().addStreamWithStore(cfg, nil); err != nil {
		t.Fatalf("Unexpected error adding stream: %v", err)
	}

	// Client for API requests.
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	toSend := 200_000

	// A single message is reused for every publish; only subject and header change.
	m := nats.NewMsg("foo")
	m.Data = make([]byte, 8*1024)
	rand.Read(m.Data)

	start := time.Now()
	for i := 0; i < toSend; i++ {
		m.Reply = _EMPTY_
		// Rotate across the three subjects.
		switch i % 3 {
		case 0:
			m.Subject = "foo"
		case 1:
			m.Subject = "bar"
		case 2:
			m.Subject = "baz"
		}
		m.Header.Set("X-ID2", fmt.Sprintf("XXXXX-%d", i))
		if _, err := js.PublishMsgAsync(m); err != nil {
			t.Fatalf("Err on publish: %v", err)
		}
	}
	<-js.PublishAsyncComplete()
	fmt.Printf("TOOK %v to publish\n", time.Since(start))

	v, err := s.Varz(nil)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	fmt.Printf("MEM AFTER PUBLISH is %v\n", friendlyBytes(v.Mem))

	// Check the error here: si is dereferenced right below.
	si, err := js.StreamInfo("TEST")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	fmt.Printf("si is %+v\n", si.State)

	// Count deliveries and signal once everything published has been received.
	received := 0
	done := make(chan bool)
	cb := func(m *nats.Msg) {
		received++
		if received >= toSend {
			done <- true
		}
	}

	start = time.Now()
	sub, err := js.Subscribe("*", cb, nats.EnableFlowControl(), nats.IdleHeartbeat(time.Second), nats.AckNone())
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	defer sub.Unsubscribe()

	<-done
	fmt.Printf("TOOK %v to consume\n", time.Since(start))

	v, err = s.Varz(nil)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	fmt.Printf("MEM AFTER SUBSCRIBE is %v\n", friendlyBytes(v.Mem))
}
// Reproduces a report of a slow restart when a server has many messages that
// expired while it was not running. The restart has to complete within two
// seconds and the stream has to come back empty.
func TestNoRaceJetStreamSlowRestartWithManyExpiredMsgs(t *testing.T) {
	opts := DefaultTestOptions
	opts.Port = -1
	opts.JetStream = true
	s := RunServer(&opts)
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	// Client for API requests.
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	ttl := 2 * time.Second
	streamCfg := &nats.StreamConfig{
		Name:     "ORDERS",
		Subjects: []string{"orders.*"},
		MaxAge:   ttl,
	}
	if _, err := js.AddStream(streamCfg); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Also attach a consumer filtering on a wildcard subject.
	// It turned out not to influence the restart time but it is kept here anyway.
	consumerCfg := &nats.ConsumerConfig{
		Durable:       "c22",
		FilterSubject: "orders.*",
		AckPolicy:     nats.AckExplicitPolicy,
	}
	if _, err := js.AddConsumer("ORDERS", consumerCfg); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Fill the stream, one subject per message.
	const toSend = 100_000
	for n := 1; n <= toSend; n++ {
		js.PublishAsync(fmt.Sprintf("orders.%d", n), []byte("OK"))
	}
	<-js.PublishAsyncComplete()

	sdir := strings.TrimSuffix(s.JetStreamConfig().StoreDir, JetStreamStoreDir)
	s.Shutdown()

	// Everything expires while the server is down.
	time.Sleep(ttl + 500*time.Millisecond)

	// Time the restart on top of the same store directory.
	start := time.Now()
	opts.Port = -1
	opts.StoreDir = sdir
	s = RunServer(&opts)
	elapsed := time.Since(start)
	defer s.Shutdown()

	if limit := 2 * time.Second; elapsed > limit {
		t.Fatalf("Took %v for restart which is too long", elapsed)
	}

	// Reconnect and make sure nothing survived.
	nc, js = jsClientConnect(t, s)
	defer nc.Close()

	si, err := js.StreamInfo("ORDERS")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if remaining := si.State.Msgs; remaining != 0 {
		t.Fatalf("Expected no msgs after restart, got %d", remaining)
	}
}
// TestNoRaceJetStreamStalledMirrorsAfterExpire sends a batch to a stream with a
// 100ms MaxAge, turns expiration off, sends a second batch and then checks that
// the mirror's last sequence reaches the total number of messages sent.
func TestNoRaceJetStreamStalledMirrorsAfterExpire(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "JSC", 3)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	origin := &nats.StreamConfig{
		Name:     "TEST",
		Subjects: []string{"foo.*"},
		Replicas: 1,
		MaxAge:   100 * time.Millisecond,
	}
	if _, err := js.AddStream(origin); err != nil {
		t.Fatalf("Error creating stream: %v", err)
	}

	mirror := &nats.StreamConfig{
		Name:     "M",
		Replicas: 2,
		Mirror:   &nats.StreamSource{Name: "TEST"},
	}
	if _, err := js.AddStream(mirror); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// sendBatch publishes batch messages asynchronously and waits for completion.
	sendBatch := func(batch int) {
		t.Helper()
		for sent := 0; sent < batch; sent++ {
			js.PublishAsync("foo.bar", []byte("Hello"))
		}
		select {
		case <-js.PublishAsyncComplete():
		case <-time.After(5 * time.Second):
			t.Fatalf("Did not receive completion signal")
		}
	}

	const numMsgs = 10_000
	sendBatch(numMsgs)

	// Disable expiration so we can tell whether the mirror stalled.
	origin.MaxAge = 0
	if _, err := js.UpdateStream(origin); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	sendBatch(numMsgs)

	// The mirror has to reach the last sequence of the origin.
	wantLast := uint64(2 * numMsgs)
	checkFor(t, 10*time.Second, 500*time.Millisecond, func() error {
		si, err := js.StreamInfo("M")
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if si.State.LastSeq == wantLast {
			return nil
		}
		return fmt.Errorf("Expected %d as last sequence, got state: %+v", 2*numMsgs, si.State)
	})
}
// We will use JetStream helpers to create a supercluster but this test is about exposing the ability to access
// account scoped connz with subject interest filtering.
func TestNoRaceJetStreamSuperClusterAccountConnz(t *testing.T) {
	// This has 4 different accounts, 3 general and system.
	sc := createJetStreamSuperClusterWithTemplate(t, jsClusterAccountsTempl, 3, 3)
	defer sc.shutdown()

	// Create 20 connections on account one and two.
	// Create JetStream assets for each as well to make sure by default we do not report them.
	// The connections are closed via defer so that they stay open for the whole test.
	num := 20
	for i := 0; i < num; i++ {
		nc, _ := jsClientConnect(t, sc.randomServer(), nats.UserInfo("one", "p"), nats.Name("one"))
		defer nc.Close()

		if i%2 == 0 {
			nc.SubscribeSync("foo")
		} else {
			nc.SubscribeSync("bar")
		}

		nc, js := jsClientConnect(t, sc.randomServer(), nats.UserInfo("two", "p"), nats.Name("two"))
		nc.SubscribeSync("baz")
		nc.SubscribeSync("foo.bar.*")
		nc.SubscribeSync(fmt.Sprintf("id.%d", i+1))

		js.AddStream(&nats.StreamConfig{Name: fmt.Sprintf("TEST:%d", i+1)})
		defer nc.Close()
	}

	// Shape of a connz API response.
	type czapi struct {
		Server *ServerInfo
		Data   *Connz
		Error  *ApiError
	}

	// parseConnz decodes a response and fails the test on a decode or API error.
	parseConnz := func(buf []byte) *Connz {
		t.Helper()
		var cz czapi
		if err := json.Unmarshal(buf, &cz); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if cz.Error != nil {
			t.Fatalf("Unexpected error: %+v", cz.Error)
		}
		return cz.Data
	}

	// doRequest sends a connz request on reqSubj as a user of account acc, optionally
	// with a subject filter, collects the responses until none arrives for 200ms and
	// checks that expected connections (not counting the requestor) were reported.
	doRequest := func(reqSubj, acc, filter string, expected int) {
		t.Helper()
		nc, _ := jsClientConnect(t, sc.randomServer(), nats.UserInfo(acc, "p"), nats.Name(acc))
		defer nc.Close()

		mch := make(chan *nats.Msg, 9)
		// Check the error: sub is dereferenced below.
		sub, err := nc.ChanSubscribe(nats.NewInbox(), mch)
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}

		var req []byte
		if filter != _EMPTY_ {
			req, _ = json.Marshal(&ConnzOptions{FilterSubject: filter})
		}

		if err := nc.PublishRequest(reqSubj, sub.Subject, req); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}

		// So we can ignore ourselves.
		cid, _ := nc.GetClientID()
		sid := nc.ConnectedServerId()

		wt := time.NewTimer(200 * time.Millisecond)
		defer wt.Stop()
		var conns []*ConnInfo
	LOOP:
		for {
			select {
			case m := <-mch:
				if len(m.Data) == 0 {
					t.Fatalf("No responders")
				}
				cr := parseConnz(m.Data)
				// For account scoped, NumConns and Total should be the same (sans limits and offsets).
				// Total should not include other accounts since that would leak information about the system.
				if filter == _EMPTY_ && cr.NumConns != cr.Total {
					t.Fatalf("NumConns and Total should be same with account scoped connz, got %+v", cr)
				}
				for _, c := range cr.Conns {
					// Every connection was named after its account when it was created.
					if c.Name != acc {
						t.Fatalf("Got wrong account: %q vs %q for %+v", acc, c.Name, c)
					}
					if !(c.Cid == cid && cr.ID == sid) {
						conns = append(conns, c)
					}
				}
				// Stop and drain before Reset so that a tick which already fired
				// cannot end the loop right after a response was processed.
				if !wt.Stop() {
					select {
					case <-wt.C:
					default:
					}
				}
				wt.Reset(200 * time.Millisecond)
			case <-wt.C:
				break LOOP
			}
		}
		if len(conns) != expected {
			t.Fatalf("Expected to see %d conns but got %d", expected, len(conns))
		}
	}

	doSysRequest := func(acc string, expected int) {
		t.Helper()
		doRequest("$SYS.REQ.SERVER.PING.CONNZ", acc, _EMPTY_, expected)
	}
	doAccRequest := func(acc string, expected int) {
		t.Helper()
		doRequest("$SYS.REQ.ACCOUNT.PING.CONNZ", acc, _EMPTY_, expected)
	}
	doFiltered := func(acc, filter string, expected int) {
		t.Helper()
		doRequest("$SYS.REQ.SERVER.PING.CONNZ", acc, filter, expected)
	}

	doSysRequest("one", 20)
	doAccRequest("one", 20)

	doSysRequest("two", 20)
	doAccRequest("two", 20)

	// Now check filtering.
	doFiltered("one", _EMPTY_, 20)
	doFiltered("one", ">", 20)
	doFiltered("one", "bar", 10)
	doFiltered("two", "bar", 0)
	doFiltered("two", "id.1", 1)
	doFiltered("two", "id.*", 20)
	doFiltered("two", "foo.bar.*", 20)
	doFiltered("two", "foo.>", 20)
}
// TestNoRaceCompressedConnz requests account connz with an Accept-Encoding
// header and checks that the reply carries a known Content-Encoding and, once
// decompressed, decodes into a connz response without an error.
func TestNoRaceCompressedConnz(t *testing.T) {
	s := RunBasicJetStreamServer()
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	nc, _ := jsClientConnect(t, s)
	defer nc.Close()

	doRequest := func(compress string) {
		t.Helper()

		req := nats.NewMsg("$SYS.REQ.ACCOUNT.PING.CONNZ")
		req.Header.Add("Accept-Encoding", compress)
		resp, err := nc.RequestMsg(req, time.Second)
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}

		// Choose the decompressor from the Content-Encoding header of the reply.
		var rd io.Reader
		ce := resp.Header.Get("Content-Encoding")
		switch strings.ToLower(ce) {
		case "gzip":
			zr, err := gzip.NewReader(bytes.NewReader(resp.Data))
			if err != nil {
				t.Fatalf("Unexpected error: %v", err)
			}
			defer zr.Close()
			rd = zr
		case "snappy", "s2":
			rd = s2.NewReader(bytes.NewReader(resp.Data))
		default:
			t.Fatalf("Unknown content-encoding of %q", ce)
		}

		// An unexpected EOF is tolerated when reading the decompressed payload.
		payload, err := ioutil.ReadAll(rd)
		if err != nil && err != io.ErrUnexpectedEOF {
			t.Fatalf("Unexpected error: %v", err)
		}

		var cz ServerAPIConnzResponse
		if err := json.Unmarshal(payload, &cz); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if cz.Error != nil {
			t.Fatalf("Unexpected error: %+v", cz.Error)
		}
	}

	for _, encoding := range []string{"gzip", "snappy", "s2"} {
		doRequest(encoding)
	}
}
// TestNoRaceJetStreamClusterExtendedStreamPurge exercises the extended stream
// purge request (Subject, Sequence and Keep) against a clustered R2 stream with
// a per subject limit of 100, once for file storage and once for memory storage.
func TestNoRaceJetStreamClusterExtendedStreamPurge(t *testing.T) {
	for _, st := range []StorageType{FileStorage, MemoryStorage} {
		t.Run(st.String(), func(t *testing.T) {
			c := createJetStreamClusterExplicit(t, "JSC", 3)
			defer c.shutdown()

			nc, js := jsClientConnect(t, c.randomServer())
			defer nc.Close()

			cfg := StreamConfig{
				Name:       "KV",
				Subjects:   []string{"kv.>"},
				Storage:    st,
				Replicas:   2,
				MaxMsgsPer: 100,
			}
			req, err := json.Marshal(cfg)
			if err != nil {
				t.Fatalf("Unexpected error: %v", err)
			}
			// Do manually for now.
			nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second)
			c.waitOnStreamLeader("$G", "KV")

			si, err := js.StreamInfo("KV")
			if err != nil {
				t.Fatalf("Unexpected error: %v", err)
			}
			if si == nil || si.Config.Name != "KV" {
				t.Fatalf("StreamInfo is not correct %+v", si)
			}

			for i := 0; i < 1000; i++ {
				js.PublishAsync("kv.foo", []byte("OK")) // 1 * i
				js.PublishAsync("kv.bar", []byte("OK")) // 2 * i
				js.PublishAsync("kv.baz", []byte("OK")) // 3 * i
			}
			// With 100 kept per subject, first is 2701, last is 3000.
			for i := 0; i < 700; i++ {
				js.PublishAsync(fmt.Sprintf("kv.%d", i+1), []byte("OK"))
			}
			// Now first is 2701, last is 3700.
			select {
			case <-js.PublishAsyncComplete():
			case <-time.After(10 * time.Second):
				t.Fatalf("Did not receive completion signal")
			}

			si, err = js.StreamInfo("KV")
			if err != nil {
				t.Fatalf("Unexpected error: %v", err)
			}
			if si.State.Msgs != 1000 {
				t.Fatalf("Expected %d msgs, got %d", 1000, si.State.Msgs)
			}

			// shouldFail sends the purge request and expects an error response.
			shouldFail := func(preq *JSApiStreamPurgeRequest) {
				t.Helper()
				req, _ := json.Marshal(preq)
				resp, err := nc.Request(fmt.Sprintf(JSApiStreamPurgeT, "KV"), req, time.Second)
				if err != nil {
					t.Fatalf("Unexpected error: %v", err)
				}
				var pResp JSApiStreamPurgeResponse
				if err = json.Unmarshal(resp.Data, &pResp); err != nil {
					t.Fatalf("Unexpected error: %v", err)
				}
				if pResp.Success || pResp.Error == nil {
					t.Fatalf("Expected an error response but got none")
				}
			}

			// Sequence and Keep should be mutually exclusive.
			shouldFail(&JSApiStreamPurgeRequest{Sequence: 10, Keep: 10})

			// purge sends the purge request, expects success and then checks
			// that the stream holds newTotal messages.
			purge := func(preq *JSApiStreamPurgeRequest, newTotal uint64) {
				t.Helper()
				req, _ := json.Marshal(preq)
				resp, err := nc.Request(fmt.Sprintf(JSApiStreamPurgeT, "KV"), req, time.Second)
				if err != nil {
					t.Fatalf("Unexpected error: %v", err)
				}
				var pResp JSApiStreamPurgeResponse
				if err = json.Unmarshal(resp.Data, &pResp); err != nil {
					t.Fatalf("Unexpected error: %v", err)
				}
				if !pResp.Success || pResp.Error != nil {
					t.Fatalf("Got a bad response %+v", pResp)
				}
				si, err = js.StreamInfo("KV")
				if err != nil {
					t.Fatalf("Unexpected error: %v", err)
				}
				if si.State.Msgs != newTotal {
					t.Fatalf("Expected total after purge to be %d but got %d", newTotal, si.State.Msgs)
				}
			}

			// expectLeft creates a temporary durable filtered on subject and
			// checks its NumPending; the durable is deleted again on return.
			expectLeft := func(subject string, expected uint64) {
				t.Helper()
				ci, err := js.AddConsumer("KV", &nats.ConsumerConfig{Durable: "dlc", FilterSubject: subject, AckPolicy: nats.AckExplicitPolicy})
				if err != nil {
					t.Fatalf("Unexpected error: %v", err)
				}
				defer js.DeleteConsumer("KV", "dlc")
				if ci.NumPending != expected {
					t.Fatalf("Expected %d remaining but got %d", expected, ci.NumPending)
				}
			}

			purge(&JSApiStreamPurgeRequest{Subject: "kv.foo"}, 900)
			expectLeft("kv.foo", 0)

			purge(&JSApiStreamPurgeRequest{Subject: "kv.bar", Keep: 1}, 801)
			expectLeft("kv.bar", 1)

			purge(&JSApiStreamPurgeRequest{Subject: "kv.baz", Sequence: 2851}, 751)
			expectLeft("kv.baz", 50)

			purge(&JSApiStreamPurgeRequest{Subject: "kv.*"}, 0)

			// RESET
			js.DeleteStream("KV")
			// Do manually for now.
			nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second)
			c.waitOnStreamLeader("$G", "KV")

			if _, err := js.StreamInfo("KV"); err != nil {
				t.Fatalf("Unexpected error: %v", err)
			}
			// Put in 100.
			for i := 0; i < 100; i++ {
				js.PublishAsync("kv.foo", []byte("OK"))
			}
			select {
			case <-js.PublishAsyncComplete():
			case <-time.After(time.Second):
				t.Fatalf("Did not receive completion signal")
			}
			// Purging twice with the same Keep must leave the same 10 messages.
			purge(&JSApiStreamPurgeRequest{Subject: "kv.foo", Keep: 10}, 10)
			purge(&JSApiStreamPurgeRequest{Subject: "kv.foo", Keep: 10}, 10)
			expectLeft("kv.foo", 10)

			// RESET AGAIN
			js.DeleteStream("KV")
			// Do manually for now.
			nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second)
			c.waitOnStreamLeader("$G", "KV")

			if _, err := js.StreamInfo("KV"); err != nil {
				t.Fatalf("Unexpected error: %v", err)
			}
			// Put in 100.
			for i := 0; i < 100; i++ {
				js.Publish("kv.foo", []byte("OK"))
			}
			purge(&JSApiStreamPurgeRequest{Keep: 10}, 10)
			expectLeft(">", 10)

			// RESET AGAIN
			js.DeleteStream("KV")
			// Do manually for now.
			nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second)
			// Wait for a leader here as well, like the resets above do.
			c.waitOnStreamLeader("$G", "KV")

			if _, err := js.StreamInfo("KV"); err != nil {
				t.Fatalf("Unexpected error: %v", err)
			}
			// Put in 100.
			for i := 0; i < 100; i++ {
				js.Publish("kv.foo", []byte("OK"))
			}
			purge(&JSApiStreamPurgeRequest{Sequence: 90}, 11) // Up to 90 so we keep that, hence the 11.
			expectLeft(">", 11)
		})
	}
}
// TestNoRaceJetStreamFileStoreCompaction overwrites every key of a stream that
// keeps one message per subject and then checks that the store still reports
// at least 80% utilization.
func TestNoRaceJetStreamFileStoreCompaction(t *testing.T) {
	s := RunBasicJetStreamServer()
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	streamCfg := &nats.StreamConfig{
		Name:              "KV",
		Subjects:          []string{"KV.>"},
		MaxMsgsPerSubject: 1,
	}
	if _, err := js.AddStream(streamCfg); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	const toSend = 10_000
	payload := make([]byte, 4*1024)
	rand.Read(payload)

	// First one.
	js.PublishAsync("KV.FM", payload)

	// Two passes over the same keys; the second pass overwrites the first.
	for pass := 0; pass < 2; pass++ {
		for k := 1; k <= toSend; k++ {
			js.PublishAsync(fmt.Sprintf("KV.%d", k), payload)
		}
	}
	select {
	case <-js.PublishAsyncComplete():
	case <-time.After(time.Second):
		t.Fatalf("Did not receive completion signal")
	}

	// Inspect the utilization level of the store directly.
	mset, err := s.GlobalAccount().lookupStream("KV")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	total, used, _ := mset.Store().Utilization()
	pct := 100.0 * float32(used) / float32(total)
	if pct < 80.0 {
		t.Fatalf("Utilization is less than 80%%, got %.2f", pct)
	}
}
// TestNoRaceJetStreamEncryptionEnabledOnRestartWithExpire fills a stream on a
// server started without a JetStream key, then restarts on the same store
// directory with a key configured and checks the restart takes no more than
// five seconds.
func TestNoRaceJetStreamEncryptionEnabledOnRestartWithExpire(t *testing.T) {
	conf := createConfFile(t, []byte(`
listen: 127.0.0.1:-1
jetstream: enabled
`))
	defer removeFile(t, conf)

	s, _ := RunServerWithConfig(conf)
	defer s.Shutdown()

	config := s.JetStreamConfig()
	if config == nil {
		t.Fatalf("Expected config but got none")
	}
	defer removeDir(t, config.StoreDir)

	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	const toSend = 10_000

	streamCfg := &nats.StreamConfig{
		Name:     "TEST",
		Subjects: []string{"foo", "bar"},
		MaxMsgs:  int64(toSend),
	}
	if _, err := js.AddStream(streamCfg); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// 4K payload, sent to both subjects on every round.
	payload := make([]byte, 4*1024)
	rand.Read(payload)

	for round := 0; round < toSend; round++ {
		for _, subj := range []string{"foo", "bar"} {
			js.PublishAsync(subj, payload)
		}
	}
	select {
	case <-js.PublishAsyncComplete():
	case <-time.After(5 * time.Second):
		t.Fatalf("Did not receive completion signal")
	}

	consumerCfg := &nats.ConsumerConfig{Durable: "dlc", AckPolicy: nats.AckExplicitPolicy}
	if _, err := js.AddConsumer("TEST", consumerCfg); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Restart with a key configured on the same store directory.
	nc.Close()
	s.Shutdown()

	ncs := fmt.Sprintf("\nlisten: 127.0.0.1:-1\njetstream: {key: %q, store_dir: %q}\n", "s3cr3t!", config.StoreDir)
	conf = createConfFile(t, []byte(ncs))
	defer removeFile(t, conf)

	// Pull 32Mb of crypto rand to try to drain entropy and see whether that affects startup time.
	drain := make([]byte, 32*1024*1024)
	crand.Read(drain)

	begin := time.Now()
	s, _ = RunServerWithConfig(conf)
	defer s.Shutdown()
	took := time.Since(begin)
	if took > 5*time.Second {
		t.Fatalf("Restart took longer than expected: %v", took)
	}
}
// This test was from Ivan K. and showed a bug in the filestore implementation.
// This is skipped by default since it takes >40s to run.
//
// Ten ordered consumers each receive one million messages and verify for every
// message that the consumer sequence equals the stream sequence.
func TestNoRaceJetStreamOrderedConsumerMissingMsg(t *testing.T) {
	// Uncomment to run. Needs to be on a big machine. Do not want as part of Travis tests atm.
	skip(t)

	s := RunBasicJetStreamServer()
	config := s.JetStreamConfig()
	if config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	nc, js := jsClientConnect(t, s)
	defer nc.Close()
	if _, err := js.AddStream(&nats.StreamConfig{
		Name:     "benchstream",
		Subjects: []string{"testsubject"},
		Replicas: 1,
	}); err != nil {
		t.Fatalf("add stream failed: %s", err)
	}

	total := 1_000_000

	numSubs := 10
	// ch gets one signal per subscriber that received everything.
	ch := make(chan struct{}, numSubs)
	var wg sync.WaitGroup
	wg.Add(numSubs)
	// errCh keeps the first reported error only.
	errCh := make(chan error, 1)
	// reportErr never blocks so that a message callback can not get stuck
	// when an error has already been recorded.
	reportErr := func(err error) {
		select {
		case errCh <- err:
		default:
		}
	}
	for i := 0; i < numSubs; i++ {
		nc, js := jsClientConnect(t, s)
		defer nc.Close()
		go func(nc *nats.Conn, js nats.JetStreamContext) {
			defer wg.Done()
			received := 0
			_, err := js.Subscribe("testsubject", func(m *nats.Msg) {
				meta, err := m.Metadata()
				if err != nil {
					nc.Close()
					reportErr(fmt.Errorf("Error getting metadata: %v", err))
					return
				}
				if meta.Sequence.Consumer != meta.Sequence.Stream {
					nc.Close()
					reportErr(fmt.Errorf("Bad meta: %+v", meta))
					return
				}
				received++
				if received == total {
					ch <- struct{}{}
				}
			}, nats.OrderedConsumer())
			if err != nil {
				reportErr(fmt.Errorf("Error creating sub: %v", err))
			}
		}(nc, js)
	}
	// Wait for all subscriptions to be set up and fail early if one could not be created.
	wg.Wait()
	select {
	case e := <-errCh:
		t.Fatal(e)
	default:
	}

	payload := make([]byte, 500)
	for i := 1; i <= total; i++ {
		js.PublishAsync("testsubject", payload)
	}
	select {
	case <-js.PublishAsyncComplete():
	case <-time.After(10 * time.Second):
		t.Fatalf("Did not send all messages")
	}

	// Now wait for consumers to be done, reporting any error raised from a
	// message callback instead of only timing out.
	for i := 0; i < numSubs; i++ {
		select {
		case <-ch:
		case e := <-errCh:
			t.Fatal(e)
		case <-time.After(10 * time.Second):
			t.Fatal("Did not receive all messages for all consumers in time")
		}
	}
}
// Issue #2488 - Bad accounting. The stalled consumers could not be reproduced
// after the last several PRs, but the issue did expose a bug in the ack logic
// for no-ack consumers on interest based retention streams.
func TestNoRaceJetStreamClusterInterestPolicyAckNone(t *testing.T) {
	cases := []struct {
		name    string
		durable string
	}{
		{name: "durable", durable: "dlc"},
		{name: "ephemeral", durable: _EMPTY_},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			c := createJetStreamClusterExplicit(t, "R3S", 3)
			defer c.shutdown()

			// Client based API
			nc, js := jsClientConnect(t, c.randomServer())
			defer nc.Close()

			streamCfg := &nats.StreamConfig{
				Name:      "cluster",
				Subjects:  []string{"cluster.*"},
				Retention: nats.InterestPolicy,
				Discard:   nats.DiscardOld,
				Replicas:  3,
			}
			if _, err := js.AddStream(streamCfg); err != nil {
				t.Fatalf("Unexpected error: %v", err)
			}

			// The handler only counts deliveries.
			var received uint32
			onMsg := func(m *nats.Msg) {
				atomic.AddUint32(&received, 1)
			}
			if _, err := js.Subscribe("cluster.created", onMsg, nats.Durable(tc.durable), nats.DeliverNew(), nats.AckNone()); err != nil {
				t.Fatalf("Unexepected error: %v", err)
			}

			const total = uint32(1_000)
			body := []byte("ACK ME")
			for sent := uint32(0); sent < total; sent++ {
				if _, err := js.Publish("cluster.created", body); err != nil {
					t.Fatalf("Unexpected error: %v", err)
				}
			}

			// Every message has to be delivered.
			checkFor(t, 2*time.Second, 100*time.Millisecond, func() error {
				got := atomic.LoadUint32(&received)
				if got != total {
					return fmt.Errorf("Received only %d out of %d", got, total)
				}
				return nil
			})

			// And the stream has to end up empty.
			checkFor(t, 5*time.Second, 100*time.Millisecond, func() error {
				si, err := js.StreamInfo("cluster")
				if err != nil {
					t.Fatalf("Error getting stream info: %v", err)
				}
				if left := si.State.Msgs; left != 0 {
					return fmt.Errorf("Expected no messages, got %d", left)
				}
				return nil
			})
		})
	}
}
// There was a bug in the filestore compact code that would cause a store
// with JSExpectedLastSubjSeq to fail with "wrong last sequence: 0"
//
// For 4000 subjects, twice over, a message is published and then replaced by
// one that sets JSExpectedLastSubjSeq to the sequence of the first.
func TestNoRaceJetStreamLastSubjSeqAndFilestoreCompact(t *testing.T) {
	s := RunBasicJetStreamServer()
	// Remove the store directory when done, like the other tests here do.
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	// Client based API
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:              "MQTT_sess",
		Subjects:          []string{"MQTT.sess.>"},
		Storage:           nats.FileStorage,
		Retention:         nats.LimitsPolicy,
		Replicas:          1,
		MaxMsgsPerSubject: 1,
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	firstPayload := make([]byte, 40)
	secondPayload := make([]byte, 380)
	for iter := 0; iter < 2; iter++ {
		for i := 0; i < 4000; i++ {
			subj := "MQTT.sess." + string(getHash(fmt.Sprintf("client_%d", i)))
			pa, err := js.Publish(subj, firstPayload)
			if err != nil {
				t.Fatalf("Error on publish: %v", err)
			}
			// The follow up is only accepted when the last sequence for the
			// subject is the one that was just stored.
			m := nats.NewMsg(subj)
			m.Data = secondPayload
			eseq := strconv.FormatUint(pa.Sequence, 10)
			m.Header.Set(JSExpectedLastSubjSeq, eseq)
			if _, err := js.PublishMsg(m); err != nil {
				t.Fatalf("Error on publish (iter=%v seq=%v): %v", iter+1, pa.Sequence, err)
			}
		}
	}
}
// Issue #2548
// Publishes 10k messages to an R3 memory stream with a slow queue subscriber
// and checks that the memory WAL of the consumer leader's raft node does not
// stay above 8192 messages.
func TestNoRaceJetStreamClusterMemoryStreamConsumerRaftGrowth(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "R3S", 3)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	streamCfg := &nats.StreamConfig{
		Name:      "memory-leak",
		Subjects:  []string{"memory-leak"},
		Retention: nats.LimitsPolicy,
		MaxMsgs:   1000,
		Discard:   nats.DiscardOld,
		MaxAge:    time.Minute,
		Storage:   nats.MemoryStorage,
		Replicas:  3,
	}
	if _, err := js.AddStream(streamCfg); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// A slow subscriber: a second of work before every ack.
	slowAck := func(msg *nats.Msg) {
		time.Sleep(1 * time.Second)
		msg.AckSync()
	}
	if _, err := js.QueueSubscribe("memory-leak", "q1", slowAck); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Send 10k (must be > 8192, which is compactNumMin from monitorConsumer).
	const numMsgs = 10_000
	body := []byte("NATS is a connective technology that powers modern distributed systems.")
	for sent := 0; sent < numMsgs; sent++ {
		if _, err := js.Publish("memory-leak", body); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
	}

	// Reach into the consumer leader to get at its raft node.
	leader := c.consumerLeader("$G", "memory-leak", "q1")
	mset, err := leader.GlobalAccount().lookupStream("memory-leak")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	o := mset.lookupConsumer("q1")
	if o == nil {
		t.Fatalf("Error looking up consumer %q", "q1")
	}
	node := o.raftNode().(*raft)

	// The raft WAL for the leader must not hold more than 8192 entries.
	checkFor(t, 5*time.Second, 100*time.Millisecond, func() error {
		wal := node.wal.(*memStore)
		if wal.State().Msgs > 8192 {
			return fmt.Errorf("Did not compact the raft memory WAL")
		}
		return nil
	})
}
// TestNoRaceJetStreamClusterCorruptWAL checks that a clustered pull consumer
// keeps a consistent state when its raft WAL gets a non-contiguous append entry
// written into it while the cluster is down, and that a consumer leader is back
// with the expected ack floor after the WAL is truncated underneath it.
func TestNoRaceJetStreamClusterCorruptWAL(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "R3S", 3)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	if _, err := js.AddStream(&nats.StreamConfig{Name: "TEST", Subjects: []string{"foo"}, Replicas: 3}); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	sub, err := js.PullSubscribe("foo", "dlc")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	numMsgs := 1000
	for i := 0; i < numMsgs; i++ {
		js.PublishAsync("foo", []byte("WAL"))
	}
	select {
	case <-js.PublishAsyncComplete():
	case <-time.After(5 * time.Second):
		t.Fatalf("Did not receive completion signal")
	}

	for i, m := range fetchMsgs(t, sub, 200, 5*time.Second) {
		// Ack the first 50 and then every other one after that.
		// This leaves an ack floor of 50 and 75 messages pending an ack.
		if i < 50 || i%2 == 1 {
			m.AckSync()
		}
	}
	// Make sure acks processed.
	time.Sleep(200 * time.Millisecond)
	nc.Close()

	// Check consumer consistency.
	checkConsumerWith := func(delivered, ackFloor uint64, ackPending int) {
		t.Helper()
		nc, js := jsClientConnect(t, c.randomServer())
		defer nc.Close()

		checkFor(t, 5*time.Second, 100*time.Millisecond, func() error {
			ci, err := js.ConsumerInfo("TEST", "dlc")
			if err != nil {
				return fmt.Errorf("Unexpected error: %v", err)
			}
			if ci.Delivered.Consumer != ci.Delivered.Stream || ci.Delivered.Consumer != delivered {
				return fmt.Errorf("Expected %d for delivered, got %+v", delivered, ci.Delivered)
			}
			if ci.AckFloor.Consumer != ci.AckFloor.Stream || ci.AckFloor.Consumer != ackFloor {
				return fmt.Errorf("Expected %d for ack floor, got %+v", ackFloor, ci.AckFloor)
			}
			nm := uint64(numMsgs)
			if ci.NumPending != nm-delivered {
				return fmt.Errorf("Expected num pending to be %d, got %d", nm-delivered, ci.NumPending)
			}
			if ci.NumAckPending != ackPending {
				return fmt.Errorf("Expected num ack pending to be %d, got %d", ackPending, ci.NumAckPending)
			}
			return nil
		})
	}

	checkConsumer := func() {
		t.Helper()
		checkConsumerWith(200, 50, 75)
	}

	checkConsumer()

	// Grab the consumer leader.
	cl := c.consumerLeader("$G", "TEST", "dlc")
	mset, err := cl.GlobalAccount().lookupStream("TEST")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	o := mset.lookupConsumer("dlc")
	if o == nil {
		t.Fatalf("Error looking up consumer %q", "dlc")
	}
	// Grab underlying raft node and the WAL (filestore) and we will attempt to "corrupt" it.
	node := o.raftNode().(*raft)
	fs := node.wal.(*fileStore)
	fcfg, cfg := fs.fcfg, fs.cfg.StreamConfig
	// Stop all the servers.
	c.stopAll()

	// Manipulate directly with cluster down.
	fs, err = newFileStore(fcfg, cfg)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	state := fs.State()
	sm, err := fs.LoadMsg(state.LastSeq, nil)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	ae, err := node.decodeAppendEntry(sm.msg, nil, _EMPTY_)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// dentry encodes a delivered update: the op byte followed by the varint
	// encoded delivered sequence, stream sequence, delivery count and timestamp.
	dentry := func(dseq, sseq, dc uint64, ts int64) []byte {
		b := make([]byte, 4*binary.MaxVarintLen64+1)
		b[0] = byte(updateDeliveredOp)
		n := 1
		n += binary.PutUvarint(b[n:], dseq)
		n += binary.PutUvarint(b[n:], sseq)
		n += binary.PutUvarint(b[n:], dc)
		n += binary.PutVarint(b[n:], ts)
		return b[:n]
	}

	// Let's put a non-contiguous AppendEntry into the system.
	ae.pindex += 10
	// Add in delivered record.
	ae.entries = []*Entry{{EntryNormal, dentry(1000, 1000, 1, time.Now().UnixNano())}}
	encoded, err := ae.encode(nil)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if _, _, err := fs.StoreMsg(_EMPTY_, nil, encoded); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	fs.Stop()

	c.restartAllSamePorts()
	c.waitOnStreamLeader("$G", "TEST")
	c.waitOnConsumerLeader("$G", "TEST", "dlc")

	// The consumer state must be the same as before the bad entry was written.
	checkConsumer()

	// Now we will truncate the WAL out from underneath the leader.
	// Grab the consumer leader.
	nc, js = jsClientConnect(t, c.randomServer())
	defer nc.Close()

	cl = c.consumerLeader("$G", "TEST", "dlc")
	mset, err = cl.GlobalAccount().lookupStream("TEST")
	require_NoError(t, err)
	// lookupConsumer reports a miss with nil, there is no error to check here.
	o = mset.lookupConsumer("dlc")
	if o == nil {
		t.Fatalf("Error looking up consumer %q", "dlc")
	}

	// Grab underlying raft node and the WAL (filestore) and truncate it.
	// This will simulate the WAL losing state due to truncate and we want to make sure it recovers.
	fs = o.raftNode().(*raft).wal.(*fileStore)
	state = fs.State()
	err = fs.Truncate(state.FirstSeq)
	require_NoError(t, err)

	sub, err = js.PullSubscribe("foo", "dlc")
	require_NoError(t, err)

	// This will cause us to stepdown and truncate our WAL.
	// The result of the fetch itself is deliberately not checked.
	sub.Fetch(100)
	c.waitOnConsumerLeader("$G", "TEST", "dlc")
	// We can't trust the results sans that we have a leader back in place and the ackFloor.
	ci, err := js.ConsumerInfo("TEST", "dlc")
	require_NoError(t, err)
	if ci.AckFloor.Consumer != ci.AckFloor.Stream || ci.AckFloor.Consumer != 50 {
		t.Fatalf("Expected %d for ack floor, got %+v", 50, ci.AckFloor)
	}
}
// TestNoRaceJetStreamClusterInterestRetentionDeadlock queues 20k messages into
// an interest retention stream that has a mirror and checks that the origin
// stream drains down to zero messages.
func TestNoRaceJetStreamClusterInterestRetentionDeadlock(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "R3S", 3)
	defer c.shutdown()

	// Client based API
	s := c.randomServer()
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	// This can trigger a deadlock with the current architecture.
	// When the stream is !limitsRetention and the consumer is DIRECT with ack none, the msg seq
	// is placed onto a chan for the stream to consider removing. All of these conditions must hold.
	// A stream mirror is used to get there since it replicates msgs through an R=1 DIRECT consumer.
	origin := &nats.StreamConfig{Name: "S", Retention: nats.InterestPolicy, Storage: nats.MemoryStorage}
	if _, err := js.AddStream(origin); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// The mirror creates the consumer profile that triggers the condition.
	mirror := &nats.StreamConfig{Name: "M", Mirror: &nats.StreamSource{Name: "S"}}
	if _, err := js.AddStream(mirror); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Queue up a lot of messages.
	const numRequests = 20_000
	for sent := 0; sent < numRequests; sent++ {
		js.PublishAsync("S", []byte("Q"))
	}
	select {
	case <-js.PublishAsyncComplete():
	case <-time.After(5 * time.Second):
		t.Fatalf("Did not receive completion signal")
	}

	// The origin stream has to end up empty.
	checkFor(t, 5*time.Second, 100*time.Millisecond, func() error {
		si, err := js.StreamInfo("S")
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if si.State.Msgs == 0 {
			return nil
		}
		return fmt.Errorf("Expected 0 msgs, got state: %+v", si.State)
	})
}
// TestNoRaceJetStreamClusterMaxConsumersAndDirect creates a stream limited to
// a single consumer, attaches ten mirrors to it, and verifies every mirror
// ends up with all published messages.
func TestNoRaceJetStreamClusterMaxConsumersAndDirect(t *testing.T) {
	cluster := createJetStreamClusterExplicit(t, "R3S", 3)
	defer cluster.shutdown()

	// Client based API
	nc, js := jsClientConnect(t, cluster.randomServer())
	defer nc.Close()

	// Max consumer limits must not get in the way of mirrors or sources etc.
	origin := &nats.StreamConfig{Name: "S", Storage: nats.MemoryStorage, MaxConsumers: 1}
	if _, err := js.AddStream(origin); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	const numMirrors = 10
	mirrors := make([]string, 0, numMirrors)
	for idx := 1; idx <= numMirrors; idx++ {
		// Each mirror is named M-1 through M-10.
		name := fmt.Sprintf("M-%d", idx)
		mirrors = append(mirrors, name)
		cfg := &nats.StreamConfig{Name: name, Mirror: &nats.StreamSource{Name: "S"}}
		if _, err := js.AddStream(cfg); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
	}

	// Queue up messages.
	const numRequests = 20
	for sent := 0; sent < numRequests; sent++ {
		js.Publish("S", []byte("Q"))
	}

	// Every mirror must eventually report the full message count.
	allMirrored := func() error {
		for _, name := range mirrors {
			info, err := js.StreamInfo(name)
			if err != nil {
				t.Fatalf("Unexpected error: %v", err)
			}
			if info.State.Msgs != uint64(numRequests) {
				return fmt.Errorf("Expected %d msgs for %q, got state: %+v", numRequests, name, info.State)
			}
		}
		return nil
	}
	checkFor(t, 5*time.Second, 100*time.Millisecond, allMirrored)
}
// Make sure when we try to hard reset a stream state in a cluster that we also re-create the consumers.
//
// The test speeds up raft timers, builds an R2 work-queue stream with a
// durable consumer, forces a clustered state reset by hand on the consumer
// leader, and checks that the consumer becomes reachable again. It then
// injects a write error into the consumer's raft node and restarts the
// whole cluster to verify leaders are re-established.
func TestNoRaceJetStreamClusterStreamReset(t *testing.T) {
	// Speed up raft
	omin, omax, ohb := minElectionTimeout, maxElectionTimeout, hbInterval
	minElectionTimeout = 250 * time.Millisecond
	maxElectionTimeout = time.Second
	hbInterval = 50 * time.Millisecond
	defer func() {
		// Restore the package-level timers for subsequent tests.
		minElectionTimeout = omin
		maxElectionTimeout = omax
		hbInterval = ohb
	}()

	c := createJetStreamClusterExplicit(t, "R3S", 3)
	defer c.shutdown()

	// Client based API
	s := c.randomServer()
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:      "TEST",
		Subjects:  []string{"foo.*"},
		Replicas:  2,
		Retention: nats.WorkQueuePolicy,
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	numRequests := 20
	for i := 0; i < numRequests; i++ {
		js.Publish("foo.created", []byte("REQ"))
	}

	// Durable.
	sub, err := js.SubscribeSync("foo.created", nats.Durable("d1"))
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	defer sub.Unsubscribe()

	si, err := js.StreamInfo("TEST")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if si.State.Msgs != uint64(numRequests) {
		t.Fatalf("Expected %d msgs, got bad state: %+v", numRequests, si.State)
	}

	// Let settle a bit for Go routine checks.
	time.Sleep(250 * time.Millisecond)

	// Grab number go routines.
	base := runtime.NumGoroutine()

	// Make the consumer busy here by async sending a bunch of messages.
	// Errors are deliberately not checked; this is only load generation.
	for i := 0; i < numRequests*10; i++ {
		js.PublishAsync("foo.created", []byte("REQ"))
	}

	// Grab a server that is the consumer leader for the durable.
	cl := c.consumerLeader("$G", "TEST", "d1")
	mset, err := cl.GlobalAccount().lookupStream("TEST")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	// Do a hard reset here by hand.
	mset.resetClusteredState(nil)

	// Wait til we have the consumer leader re-elected.
	c.waitOnConsumerLeader("$G", "TEST", "d1")

	// So we do not wait all 10s in each call to ConsumerInfo.
	js2, err := nc.JetStream(nats.MaxWait(250 * time.Millisecond))
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	// Make sure we can get the consumer info eventually.
	checkFor(t, 5*time.Second, 200*time.Millisecond, func() error {
		_, err := js2.ConsumerInfo("TEST", "d1")
		return err
	})

	// Grab number go routines.
	if after := runtime.NumGoroutine(); base > after {
		t.Fatalf("Expected %d go routines, got %d", base, after)
	}

	// Simulate a low level write error on our consumer and make sure we can recover etc.
	cl = c.consumerLeader("$G", "TEST", "d1")
	mset, err = cl.GlobalAccount().lookupStream("TEST")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	o := mset.lookupConsumer("d1")
	if o == nil {
		t.Fatalf("Did not retrieve consumer")
	}
	// Use the comma-ok form so a missing raft node is reported as a test
	// failure instead of panicking on the type assertion.
	node, ok := o.raftNode().(*raft)
	if !ok || node == nil {
		t.Fatalf("could not retrieve the raft node for consumer")
	}

	nc.Close()
	node.setWriteErr(io.ErrShortWrite)

	c.stopAll()
	c.restartAll()

	c.waitOnStreamLeader("$G", "TEST")
	c.waitOnConsumerLeader("$G", "TEST", "d1")
}
// Reports of high cpu on compaction for a KV store.
//
// Writes 5,000 values of 128KiB each into an R3 bucket, spread randomly over
// 256 keys, and fails on the first put error.
func TestNoRaceJetStreamKeyValueCompaction(t *testing.T) {
	cluster := createJetStreamClusterExplicit(t, "R3S", 3)
	defer cluster.shutdown()

	// Client based API
	nc, js := jsClientConnect(t, cluster.randomServer())
	defer nc.Close()

	bucket, err := js.CreateKeyValue(&nats.KeyValueConfig{Bucket: "COMPACT", Replicas: 3})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	const (
		numPuts   = 5_000
		numKeys   = 256
		valueSize = 128 * 1024
	)
	payload := strings.Repeat("A", valueSize)
	for put := 0; put < numPuts; put++ {
		// Keys are K-1 through K-256, so most puts overwrite an existing key.
		key := fmt.Sprintf("K-%d", rand.Intn(numKeys)+1)
		_, err := bucket.PutString(key, payload)
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
	}
}
// Trying to recreate an issue rip saw with KV and server restarts complaining about
// mismatch for a few minutes and growing memory.
//
// A follower is shut down, the bucket state is changed and snapshotted on the
// leader, and the follower is restarted. The test then checks that further
// writes do not cause the follower's stream object to be replaced.
func TestNoRaceJetStreamClusterStreamSeqMismatchIssue(t *testing.T) {
	cluster := createJetStreamClusterExplicit(t, "R3S", 3)
	defer cluster.shutdown()

	// Client based API
	nc, js := jsClientConnect(t, cluster.randomServer())
	defer nc.Close()

	kv, err := js.CreateKeyValue(&nats.KeyValueConfig{
		Bucket:   "MM",
		Replicas: 3,
		TTL:      500 * time.Millisecond,
	})
	require_NoError(t, err)

	// putTimes stores value under key the given number of times using the
	// bucket handle currently held in kv.
	putTimes := func(key, value string, times int) {
		for ; times > 0; times-- {
			if _, err := kv.PutString(key, value); err != nil {
				t.Fatalf("Unexpected error: %v", err)
			}
		}
	}

	putTimes("k", "1", 10)

	// Close in case we are connected here. Will recreate.
	nc.Close()

	// Shutdown a non-leader.
	follower := cluster.randomNonStreamLeader("$G", "KV_MM")
	follower.Shutdown()

	nc, js = jsClientConnect(t, cluster.randomServer())
	defer nc.Close()

	kv, err = js.KeyValue("MM")
	require_NoError(t, err)

	// Now change the state of the stream such that we have to do a compact upon restart
	// of the downed server.
	putTimes("k", "2", 10)

	// Now wait for expiration
	time.Sleep(time.Second)

	// Raft could save us here so need to run a compact on the leader.
	leader := cluster.streamLeader("$G", "KV_MM")
	if leader == nil {
		t.Fatalf("Did not get the leader")
	}
	leaderStream, err := leader.GlobalAccount().lookupStream("KV_MM")
	require_NoError(t, err)
	group := leaderStream.raftNode()
	if group == nil {
		t.Fatalf("Could not get stream group")
	}
	if err := group.InstallSnapshot(leaderStream.stateSnapshot()); err != nil {
		t.Fatalf("Error installing snapshot: %v", err)
	}

	follower = cluster.restartServer(follower)
	cluster.waitOnServerCurrent(follower)

	// We want to make sure we do not reset the raft state on a catchup due to no request yield.
	// Bug was if we did not actually request any help from snapshot we did not set mset.lseq properly.
	// So when we send next batch that would cause raft reset due to cluster reset for our stream.
	before, err := follower.GlobalAccount().lookupStream("KV_MM")
	require_NoError(t, err)

	putTimes("k1", "X", 10)

	cluster.waitOnStreamCurrent(follower, "$G", "KV_MM")

	// Make sure we did not reset our stream: the lookup must hand back the
	// very same stream object as before the writes.
	after, err := follower.GlobalAccount().lookupStream("KV_MM")
	require_NoError(t, err)
	if after != before {
		t.Fatalf("Stream was reset")
	}
}
// TestNoRaceJetStreamClusterStreamDropCLFS repeatedly issues KV creates that
// are expected to fail (duplicate key) around a follower shutdown, a leader
// snapshot and a follower restart, and then verifies the follower's stream
// object was not replaced.
func TestNoRaceJetStreamClusterStreamDropCLFS(t *testing.T) {
	cluster := createJetStreamClusterExplicit(t, "R3S", 3)
	defer cluster.shutdown()

	// Client based API
	nc, js := jsClientConnect(t, cluster.randomServer())
	defer nc.Close()

	kv, err := js.CreateKeyValue(&nats.KeyValueConfig{Bucket: "CLFS", Replicas: 3})
	require_NoError(t, err)

	// mustCreate creates key and requires success.
	mustCreate := func(key, value string) {
		_, err := kv.Create(key, []byte(value))
		require_NoError(t, err)
	}
	// driveCLFS re-creates the already existing key "k.1" ten times, each of
	// which is required to fail, to drive up CLFS state on the leader.
	driveCLFS := func() {
		for attempt := 0; attempt < 10; attempt++ {
			_, err := kv.Create("k.1", []byte("X"))
			require_Error(t, err)
		}
	}

	// Will work
	mustCreate("k.1", "X")
	driveCLFS()
	// Bookend with new key success.
	mustCreate("k.2", "Z")

	// Close in case we are connected here. Will recreate.
	nc.Close()

	// Shutdown, which will also clear clfs.
	follower := cluster.randomNonStreamLeader("$G", "KV_CLFS")
	follower.Shutdown()

	nc, js = jsClientConnect(t, cluster.randomServer())
	defer nc.Close()

	kv, err = js.KeyValue("CLFS")
	require_NoError(t, err)

	driveCLFS()

	// Snapshot the stream on the current leader.
	leader := cluster.streamLeader("$G", "KV_CLFS")
	if leader == nil {
		t.Fatalf("Did not get the leader")
	}
	leaderStream, err := leader.GlobalAccount().lookupStream("KV_CLFS")
	require_NoError(t, err)
	group := leaderStream.raftNode()
	if group == nil {
		t.Fatalf("Could not get stream group")
	}
	if err := group.InstallSnapshot(leaderStream.stateSnapshot()); err != nil {
		t.Fatalf("Error installing snapshot: %v", err)
	}

	mustCreate("k.3", "ZZZ")

	follower = cluster.restartServer(follower)
	cluster.waitOnServerCurrent(follower)

	before, err := follower.GlobalAccount().lookupStream("KV_CLFS")
	require_NoError(t, err)

	mustCreate("k.4", "YYY")

	cluster.waitOnStreamCurrent(follower, "$G", "KV_CLFS")

	// Make sure we did not reset our stream.
	after, err := follower.GlobalAccount().lookupStream("KV_CLFS")
	require_NoError(t, err)
	if after != before {
		t.Fatalf("Stream was reset")
	}
}
// TestNoRaceJetStreamMemstoreWithLargeInteriorDeletes builds a memory stream
// that keeps one message per subject, publishes one "foo" followed by a
// million "bar" messages, and checks that a detailed state snapshot is both
// fast and reports the expected counts.
func TestNoRaceJetStreamMemstoreWithLargeInteriorDeletes(t *testing.T) {
	srv := RunBasicJetStreamServer()
	if cfg := srv.JetStreamConfig(); cfg != nil {
		defer removeDir(t, cfg.StoreDir)
	}
	defer srv.Shutdown()

	// Client for API requests.
	nc, js := jsClientConnect(t, srv)
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:              "TEST",
		Subjects:          []string{"foo", "bar"},
		MaxMsgsPerSubject: 1,
		Storage:           nats.MemoryStorage,
	})
	require_NoError(t, err)

	account, err := srv.lookupAccount("$G")
	require_NoError(t, err)
	stream, err := account.lookupStream("TEST")
	require_NoError(t, err)

	payload := []byte("Hello World!")
	if _, err := js.PublishAsync("foo", payload); err != nil {
		t.Fatalf("Unexpected publish error: %v", err)
	}
	// With one message kept per subject, each new "bar" replaces the last.
	const numBar = 1_000_000
	for sent := 0; sent < numBar; sent++ {
		if _, err := js.PublishAsync("bar", payload); err != nil {
			t.Fatalf("Unexpected publish error: %v", err)
		}
	}
	select {
	case <-js.PublishAsyncComplete():
	case <-time.After(5 * time.Second):
		t.Fatalf("Did not receive completion signal")
	}

	began := time.Now()
	state := stream.stateWithDetail(true)
	// Before the fix the snapshot for this test would be > 200ms on my setup.
	took := time.Since(began)
	if took > 50*time.Millisecond {
		t.Fatalf("Took too long to snapshot: %v", took)
	}

	asExpected := state.Msgs == 2 &&
		state.FirstSeq == 1 &&
		state.LastSeq == 1_000_001 &&
		state.NumDeleted == 999999
	if !asExpected {
		// To not print out on error.
		state.Deleted = nil
		t.Fatalf("Bad State: %+v", state)
	}
}
// This is related to an issue reported where we were exhausting threads by trying to
// cleanup too many consumers at the same time.
// https://github.com/nats-io/nats-server/issues/2742
//
// The test lowers the runtime thread limit, creates 1000 consumer stores and
// has one goroutine per store update, write and delete its state, with all
// goroutines released at the same moment.
func TestNoRaceConsumerFileStoreConcurrentDiskIO(t *testing.T) {
	storeDir := createDir(t, JetStreamStoreDir)
	defer removeDir(t, storeDir)

	// Artificially adjust our environment for this test.
	gmp := runtime.GOMAXPROCS(32)
	defer runtime.GOMAXPROCS(gmp)

	maxT := debug.SetMaxThreads(64)
	defer debug.SetMaxThreads(maxT)

	fs, err := newFileStore(FileStoreConfig{StoreDir: storeDir}, StreamConfig{Name: "MT", Storage: FileStorage})
	require_NoError(t, err)
	defer fs.Stop()

	startCh := make(chan bool)
	// wg tracks worker completion; swg tracks workers that have started.
	var wg sync.WaitGroup
	var swg sync.WaitGroup

	ts := time.Now().UnixNano()

	// Create 1000 consumerStores
	n := 1000
	swg.Add(n)

	for i := 1; i <= n; i++ {
		name := fmt.Sprintf("o%d", i)
		o, err := fs.ConsumerStore(name, &ConsumerConfig{AckPolicy: AckExplicit})
		require_NoError(t, err)
		wg.Add(1)

		go func() {
			defer wg.Done()
			// Report that this worker is running. This has to happen inside
			// the goroutine: if the spawning loop called Done itself,
			// swg.Wait() below would return before any worker was scheduled.
			swg.Done()
			// Will make everyone run concurrently.
			<-startCh
			o.UpdateDelivered(22, 22, 1, ts)
			buf, _ := o.(*consumerFileStore).encodeState()
			o.(*consumerFileStore).writeState(buf)
			o.Delete()
		}()
	}

	// Only open the gate once every worker has started.
	swg.Wait()
	close(startCh)
	wg.Wait()
}
// TestNoRaceJetStreamClusterHealthz takes one server down, publishes data to
// replicated streams in two accounts, restarts the server with monitoring
// enabled, and polls /healthz until it returns a 2xx status. At least one
// non-2xx response must have been seen along the way.
func TestNoRaceJetStreamClusterHealthz(t *testing.T) {
	cluster := createJetStreamCluster(t, jsClusterAccountsTempl, "HZ", _EMPTY_, 3, 23033, true)
	defer cluster.shutdown()

	nc1, js1 := jsClientConnect(t, cluster.randomServer(), nats.UserInfo("one", "p"))
	defer nc1.Close()

	nc2, js2 := jsClientConnect(t, cluster.randomServer(), nats.UserInfo("two", "p"))
	defer nc2.Close()

	// Same three R3 streams for both users.
	for _, stream := range []string{"foo", "bar", "baz"} {
		_, err := js1.AddStream(&nats.StreamConfig{Name: stream, Replicas: 3})
		require_NoError(t, err)
		_, err = js2.AddStream(&nats.StreamConfig{Name: stream, Replicas: 3})
		require_NoError(t, err)
	}
	// R1
	_, err := js1.AddStream(&nats.StreamConfig{Name: "r1", Replicas: 1})
	require_NoError(t, err)

	// Now shutdown then send a bunch of data.
	srv := cluster.servers[0]
	srv.Shutdown()

	const numMsgs = 5_000
	for sent := 0; sent < numMsgs; sent++ {
		_, err = js1.PublishAsync("foo", []byte("OK"))
		require_NoError(t, err)
		_, err = js2.PublishAsync("bar", []byte("OK"))
		require_NoError(t, err)
	}
	select {
	case <-js1.PublishAsyncComplete():
	case <-time.After(5 * time.Second):
		t.Fatalf("Did not receive completion signal")
	}
	select {
	case <-js2.PublishAsyncComplete():
	case <-time.After(5 * time.Second):
		t.Fatalf("Did not receive completion signal")
	}

	// Bring the server back and expose its monitoring endpoint.
	srv = cluster.restartServer(srv)
	opts := srv.getOpts()
	opts.HTTPHost = "127.0.0.1"
	opts.HTTPPort = 11222
	err = srv.StartMonitoring()
	require_NoError(t, err)

	healthzURL := fmt.Sprintf("http://127.0.0.1:%d/healthz", opts.HTTPPort)

	// fetchHealth returns the HTTP status code and the decoded body.
	fetchHealth := func() (int, *HealthStatus) {
		resp, err := http.Get(healthzURL)
		require_NoError(t, err)
		defer resp.Body.Close()
		raw, err := ioutil.ReadAll(resp.Body)
		require_NoError(t, err)
		status := new(HealthStatus)
		err = json.Unmarshal(raw, status)
		require_NoError(t, err)
		return resp.StatusCode, status
	}

	unhealthy := 0
	checkFor(t, 20*time.Second, 100*time.Millisecond, func() error {
		code, status := fetchHealth()
		if code < 200 || code >= 300 {
			unhealthy++
			return fmt.Errorf("Got %d status with %+v", code, status)
		}
		return nil
	})
	if unhealthy == 0 {
		t.Fatalf("Expected to have some errors until we became current, got none")
	}
}
// Test that we can receive larger messages with stream subject details.
// Also test that we will fail at some point and the user can fall back to
// an ordered consumer like we do with watch for KV Keys() call.
func TestNoRaceJetStreamStreamInfoSubjectDetailsLimits(t *testing.T) {
	conf := createConfFile(t, []byte(`
listen: 127.0.0.1:-1
jetstream: enabled
accounts: {
default: {
jetstream: true
users: [ {user: me, password: pwd} ]
limits { max_payload: 256 }
}
}
`))
	defer removeFile(t, conf)

	srv, _ := RunServerWithConfig(conf)
	if cfg := srv.JetStreamConfig(); cfg != nil {
		defer removeDir(t, cfg.StoreDir)
	}
	defer srv.Shutdown()

	nc, js := jsClientConnect(t, srv, nats.UserInfo("me", "pwd"))
	defer nc.Close()

	// Make sure we cannot send larger than 256 bytes.
	// But we can receive larger.
	oversized := []byte(strings.Repeat("A", 300))
	sub, err := nc.SubscribeSync("foo")
	require_NoError(t, err)
	require_Error(t, nc.Publish("foo", oversized), nats.ErrMaxPayload)
	sub.Unsubscribe()

	_, err = js.AddStream(&nats.StreamConfig{
		Name:     "TEST",
		Subjects: []string{"*", "X.*"},
	})
	require_NoError(t, err)

	// One message on each of JSMaxSubjectDetails distinct subjects.
	limit := JSMaxSubjectDetails
	for idx := 0; idx < limit; idx++ {
		_, err := js.PublishAsync(fmt.Sprintf("X.%d", idx), []byte("OK"))
		require_NoError(t, err)
	}
	select {
	case <-js.PublishAsyncComplete():
	case <-time.After(5 * time.Second):
		t.Fatalf("Did not receive completion signal")
	}

	// Need to grab StreamInfo by hand for now. rawStreamInfo issues the API
	// request with the given subjects filter and returns the raw reply.
	rawStreamInfo := func(filter string) []byte {
		t.Helper()
		req, err := json.Marshal(&JSApiStreamInfoRequest{SubjectsFilter: filter})
		require_NoError(t, err)
		resp, err := nc.Request(fmt.Sprintf(JSApiStreamInfoT, "TEST"), req, 5*time.Second)
		require_NoError(t, err)
		return resp.Data
	}

	var info StreamInfo
	err = json.Unmarshal(rawStreamInfo("X.*"), &info)
	require_NoError(t, err)
	if got := len(info.State.Subjects); got != limit {
		t.Fatalf("Expected to get %d subject details, got %d", limit, got)
	}

	// Now add one more message in which will exceed our internal limits for subject details.
	_, err = js.Publish("foo", []byte("TOO MUCH"))
	require_NoError(t, err)

	var apiResp JSApiStreamInfoResponse
	err = json.Unmarshal(rawStreamInfo(nats.AllKeys), &apiResp)
	require_NoError(t, err)
	if !IsNatsErr(apiResp.Error, JSStreamInfoMaxSubjectsErr) {
		t.Fatalf("Did not get correct error response: %+v", apiResp.Error)
	}
}
// TestNoRaceJetStreamSparseConsumers checks, for both memory and file
// storage, that a consumer filtered on a subject with only a handful of
// messages scattered among two million others receives them quickly.
func TestNoRaceJetStreamSparseConsumers(t *testing.T) {
	srv := RunBasicJetStreamServer()
	if cfg := srv.JetStreamConfig(); cfg != nil {
		defer removeDir(t, cfg.StoreDir)
	}
	defer srv.Shutdown()

	nc, js := jsClientConnect(t, srv)
	defer nc.Close()

	payload := []byte("ok")

	type storageCase struct {
		name    string
		mconfig *nats.StreamConfig
	}
	cases := []storageCase{
		{
			name: "MemoryStore",
			mconfig: &nats.StreamConfig{Name: "TEST", Storage: nats.MemoryStorage, MaxMsgsPerSubject: 25_000_000,
				Subjects: []string{"*"}},
		},
		{
			name: "FileStore",
			mconfig: &nats.StreamConfig{Name: "TEST", Storage: nats.FileStorage, MaxMsgsPerSubject: 25_000_000,
				Subjects: []string{"*"}},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Start from a clean slate; both cases use the same stream name.
			js.DeleteStream("TEST")
			_, err := js.AddStream(tc.mconfig)
			require_NoError(t, err)

			// We will purposely place foo msgs near the beginning, then in middle, then at the end.
			for round := 0; round < 2; round++ {
				_, err = js.PublishAsync("foo", payload)
				require_NoError(t, err)

				for filler := 0; filler < 1_000_000; filler++ {
					_, err = js.PublishAsync("bar", payload)
					require_NoError(t, err)
				}

				_, err = js.PublishAsync("foo", payload)
				require_NoError(t, err)
			}
			select {
			case <-js.PublishAsyncComplete():
			case <-time.After(5 * time.Second):
				t.Fatalf("Did not receive completion signal")
			}

			// Now create a consumer on foo.
			ci, err := js.AddConsumer("TEST", &nats.ConsumerConfig{DeliverSubject: "x.x", FilterSubject: "foo", AckPolicy: nats.AckNonePolicy})
			require_NoError(t, err)

			var received uint64
			done := make(chan bool)

			// Signal once we have seen as many messages as were pending
			// when the consumer was created.
			sub, err := nc.Subscribe("x.x", func(m *nats.Msg) {
				received++
				if received >= ci.NumPending {
					done <- true
				}
			})
			require_NoError(t, err)
			defer sub.Unsubscribe()

			start := time.Now()
			select {
			case <-done:
				if elapsed := time.Since(start); elapsed > 500*time.Millisecond {
					t.Fatalf("Getting all messages took longer than expected: %v", elapsed)
				}
			case <-time.After(10 * time.Second):
				t.Fatal("Did not receive all messages for all consumers in time")
			}
		})
	}
}
// TestNoRaceJetStreamConsumerFilterPerfDegradation has five goroutines publish
// a total of 50,000 messages, each on a distinct subject matching
// "test.*.subj", while a wildcard-filtered subscriber acks them. All five
// publishers and the subscriber must finish within ten seconds.
func TestNoRaceJetStreamConsumerFilterPerfDegradation(t *testing.T) {
	s := RunBasicJetStreamServer()
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	nc, _ := jsClientConnect(t, s)
	defer nc.Close()

	js, err := nc.JetStream(nats.PublishAsyncMaxPending(256))
	require_NoError(t, err)

	_, err = js.AddStream(&nats.StreamConfig{
		Name:     "test",
		Subjects: []string{"test.*.subj"},
		Replicas: 1,
	})
	require_NoError(t, err)

	toSend := 50_000
	// count is only touched from the subscription callback.
	count := 0
	// Six completion signals are expected: five publishers plus the
	// subscriber. The buffer lets all of them signal without blocking.
	ch := make(chan struct{}, 6)
	_, err = js.Subscribe("test.*.subj", func(m *nats.Msg) {
		m.Ack()
		if count++; count == toSend {
			ch <- struct{}{}
		}
	}, nats.DeliverNew(), nats.ManualAck())
	require_NoError(t, err)

	msg := make([]byte, 1024)
	sent := int32(0)
	send := func() {
		// Always signal, even on error, so the wait loop below is not held
		// up by a publisher that bailed early.
		defer func() { ch <- struct{}{} }()
		for i := 0; i < toSend/5; i++ {
			// The shared counter gives every message its own subject.
			msgID := atomic.AddInt32(&sent, 1)
			_, err := js.Publish(fmt.Sprintf("test.%d.subj", msgID), msg)
			if err != nil {
				t.Error(err)
				return
			}
		}
	}
	for i := 0; i < 5; i++ {
		go send()
	}

	// A single timer bounds the whole wait, not each individual signal.
	timeout := time.NewTimer(10 * time.Second)
	// Release the timer as soon as the test returns.
	defer timeout.Stop()
	for i := 0; i < 6; i++ {
		select {
		case <-ch:
		case <-timeout.C:
			t.Fatal("Took too long")
		}
	}
}
// TestNoRaceFileStoreSubjectInfoWithSnapshotCleanup stores 10,000 messages,
// drains a snapshot of the store, removes every message in random order and
// then requires that no per-subject info (fss) files are left in the
// message directory.
func TestNoRaceFileStoreSubjectInfoWithSnapshotCleanup(t *testing.T) {
	storeDir := createDir(t, JetStreamStoreDir)
	defer removeDir(t, storeDir)

	fs, err := newFileStore(
		FileStoreConfig{StoreDir: storeDir, BlockSize: 1024 * 1024},
		StreamConfig{Name: "TEST", Storage: FileStorage},
	)
	require_NoError(t, err)
	defer fs.Stop()

	const total = 10_000
	payload := []byte(strings.Repeat("Z", 1024))
	for idx := 0; idx < total; idx++ {
		_, _, err := fs.StoreMsg(fmt.Sprintf("X.%d", idx), nil, payload)
		require_NoError(t, err)
	}

	// Snapshot causes us to write out per subject info, fss files.
	// We want to make sure they get cleaned up.
	snap, err := fs.Snapshot(5*time.Second, false, false)
	require_NoError(t, err)

	// Read the snapshot to the end, discarding the data.
	scratch := make([]byte, 4*1024*1024)
	for {
		_, rerr := snap.Reader.Read(scratch)
		if rerr == io.EOF {
			break
		}
		require_NoError(t, rerr)
	}

	// Sequences 1..total, visited in random order.
	seqs := make([]uint64, total)
	for idx := range seqs {
		seqs[idx] = uint64(idx + 1)
	}
	// Randomly delete msgs, make sure we cleanup as we empty the message blocks.
	rand.Shuffle(len(seqs), func(a, b int) { seqs[a], seqs[b] = seqs[b], seqs[a] })

	for _, seq := range seqs {
		_, err := fs.RemoveMsg(seq)
		require_NoError(t, err)
	}

	// We will have cleanup the main .blk and .idx sans the lmb, but we should not have any *.fss files.
	leftovers, err := filepath.Glob(filepath.Join(storeDir, msgDir, fssScanAll))
	require_NoError(t, err)

	if found := len(leftovers); found > 0 {
		t.Fatalf("Expected to find no fss files, found %d", found)
	}
}
// TestNoRaceFileStoreKeyFileCleanup creates a file store with a key
// derivation function, stores 10,000 messages, removes them all in random
// order and requires that at most one key file remains in the message
// directory.
func TestNoRaceFileStoreKeyFileCleanup(t *testing.T) {
	storeDir := createDir(t, JetStreamStoreDir)
	defer removeDir(t, storeDir)

	// HMAC-SHA256 over the supplied context with a fixed key.
	keyGen := func(context []byte) ([]byte, error) {
		mac := hmac.New(sha256.New, []byte("dlc22"))
		_, err := mac.Write(context)
		if err != nil {
			return nil, err
		}
		return mac.Sum(nil), nil
	}

	fs, err := newFileStoreWithCreated(
		FileStoreConfig{StoreDir: storeDir, BlockSize: 1024 * 1024},
		StreamConfig{Name: "TEST", Storage: FileStorage},
		time.Now(),
		keyGen)
	require_NoError(t, err)
	defer fs.Stop()

	const total = 10_000
	payload := []byte(strings.Repeat("Z", 1024))
	for idx := 0; idx < total; idx++ {
		_, _, err := fs.StoreMsg(fmt.Sprintf("X.%d", idx), nil, payload)
		require_NoError(t, err)
	}

	// Sequences 1..total, visited in random order.
	seqs := make([]uint64, total)
	for idx := range seqs {
		seqs[idx] = uint64(idx + 1)
	}
	// Randomly delete msgs, make sure we cleanup as we empty the message blocks.
	rand.Shuffle(len(seqs), func(a, b int) { seqs[a], seqs[b] = seqs[b], seqs[a] })

	for _, seq := range seqs {
		_, err := fs.RemoveMsg(seq)
		require_NoError(t, err)
	}

	// Once the blocks have been emptied, no more than a single key file
	// may still match the key file pattern.
	keyFiles, err := filepath.Glob(filepath.Join(storeDir, msgDir, keyScanAll))
	require_NoError(t, err)

	if found := len(keyFiles); found > 1 {
		t.Fatalf("Expected to find only 1 key file, found %d", found)
	}
}
// TestNoRaceMsgIdPerfDuringCatchup publishes one million messages carrying
// message IDs to an R3 stream while the former stream leader is shut down
// part way through and restarted later, prints throughput figures, and
// verifies every server ends up with all messages.
func TestNoRaceMsgIdPerfDuringCatchup(t *testing.T) {
	// Uncomment to run. Needs to be on a bigger machine. Do not want as part of Travis tests atm.
	skip(t)

	c := createJetStreamClusterExplicit(t, "JSC", 3)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.serverByName("S-1"))
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:     "TEST",
		Replicas: 3,
	})
	require_NoError(t, err)

	// This will be the one we restart.
	sl := c.streamLeader("$G", "TEST")
	// Now move leader.
	_, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST"), nil, time.Second)
	require_NoError(t, err)
	c.waitOnStreamLeader("$G", "TEST")

	// Connect to new leader.
	nc, _ = jsClientConnect(t, c.streamLeader("$G", "TEST"))
	defer nc.Close()

	js, err = nc.JetStream(nats.PublishAsyncMaxPending(1024))
	require_NoError(t, err)

	// n is the total to send; ss and sr are the message indexes at which the
	// old leader is shut down and restarted respectively.
	n, ss, sr := 1_000_000, 250_000, 800_000

	m := nats.NewMsg("TEST")
	m.Data = []byte(strings.Repeat("Z", 2048))

	// Target rate 10k msgs/sec
	start := time.Now()

	for i := 0; i < n; i++ {
		m.Header.Set(JSMsgId, strconv.Itoa(i))
		_, err := js.PublishMsgAsync(m)
		require_NoError(t, err)
		//time.Sleep(42 * time.Microsecond)
		if i == ss {
			fmt.Printf("SD")
			sl.Shutdown()
		} else if i == sr {
			nc.Flush()
			// Give outstanding publishes a chance to complete before the
			// restart; a timeout here is tolerated.
			select {
			case <-js.PublishAsyncComplete():
			case <-time.After(10 * time.Second):
			}
			fmt.Printf("RS")
			sl = c.restartServer(sl)
		}
		if i%10_000 == 0 {
			fmt.Print("#")
		}
	}
	fmt.Println()

	// Wait to receive all messages.
	select {
	case <-js.PublishAsyncComplete():
	case <-time.After(20 * time.Second):
		t.Fatalf("Did not receive completion signal")
	}

	tt := time.Since(start)
	si, err := js.StreamInfo("TEST")
	require_NoError(t, err)

	fmt.Printf("Took %v to send %d msgs\n", tt, n)
	fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds())
	fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds())

	c.waitOnStreamCurrent(sl, "$G", "TEST")
	for _, s := range c.servers {
		// Fail with the lookup error rather than dereferencing the stream
		// returned alongside it.
		mset, err := s.GlobalAccount().lookupStream("TEST")
		require_NoError(t, err)
		if state := mset.store.State(); state.Msgs != uint64(n) {
			t.Fatalf("Expected server %v to have correct number of msgs %d but got %d", s, n, state.Msgs)
		}
	}
}
// TestNoRaceRebuildDeDupeAndMemoryPerf is a manual measurement that publishes
// one million 2KB messages with message IDs to stream "DD", times a forced
// rebuild of the dedupe state, then times an ordered consumer reading every
// message. It prints throughput and the server's reported memory after each
// phase.
func TestNoRaceRebuildDeDupeAndMemoryPerf(t *testing.T) {
skip(t)
s := RunBasicJetStreamServer()
if config := s.JetStreamConfig(); config != nil {
defer removeDir(t, config.StoreDir)
}
defer s.Shutdown()
nc, js := jsClientConnect(t, s)
defer nc.Close()
_, err := js.AddStream(&nats.StreamConfig{Name: "DD"})
require_NoError(t, err)
// A single message is reused; only its message ID header changes per publish.
m := nats.NewMsg("DD")
m.Data = []byte(strings.Repeat("Z", 2048))
start := time.Now()
n := 1_000_000
for i := 0; i < n; i++ {
m.Header.Set(JSMsgId, strconv.Itoa(i))
_, err := js.PublishMsgAsync(m)
require_NoError(t, err)
}
select {
case <-js.PublishAsyncComplete():
case <-time.After(20 * time.Second):
t.Fatalf("Did not receive completion signal")
}
tt := time.Since(start)
si, err := js.StreamInfo("DD")
require_NoError(t, err)
fmt.Printf("Took %v to send %d msgs\n", tt, n)
fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds())
fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds())
// NOTE(review): the Varz error is discarded here and below; v is used unchecked.
v, _ := s.Varz(nil)
fmt.Printf("Memory AFTER SEND: %v\n", friendlyBytes(v.Mem))
mset, err := s.GlobalAccount().lookupStream("DD")
require_NoError(t, err)
// Clear the loaded flag and time the rebuild while holding the stream lock.
mset.mu.Lock()
mset.ddloaded = false
start = time.Now()
mset.rebuildDedupe()
fmt.Printf("TOOK %v to rebuild dd\n", time.Since(start))
mset.mu.Unlock()
v, _ = s.Varz(nil)
fmt.Printf("Memory: %v\n", friendlyBytes(v.Mem))
// Now do an ephemeral consumer and whip through every message. Doing same calculations.
start = time.Now()
received, done := 0, make(chan bool)
sub, err := js.Subscribe("DD", func(m *nats.Msg) {
received++
if received >= n {
done <- true
}
}, nats.OrderedConsumer())
require_NoError(t, err)
select {
case <-done:
case <-time.After(10 * time.Second):
// Distinguish a slow-consumer condition from a plain timeout in the failure message.
if s.NumSlowConsumers() > 0 {
t.Fatalf("Did not receive all large messages due to slow consumer status: %d of %d", received, n)
}
t.Fatalf("Failed to receive all large messages: %d of %d\n", received, n)
}
fmt.Printf("TOOK %v to receive all %d msgs\n", time.Since(start), n)
sub.Unsubscribe()
v, _ = s.Varz(nil)
fmt.Printf("Memory: %v\n", friendlyBytes(v.Mem))
}
// TestNoRaceMemoryUsageOnLimitedStreamWithMirror is a manual measurement
// that pushes one million messages, each on its own subject, through a
// stream capped at 10,000 messages that has an equally capped mirror, and
// prints throughput plus the server's reported memory.
func TestNoRaceMemoryUsageOnLimitedStreamWithMirror(t *testing.T) {
	skip(t)

	srv := RunBasicJetStreamServer()
	if cfg := srv.JetStreamConfig(); cfg != nil {
		defer removeDir(t, cfg.StoreDir)
	}
	defer srv.Shutdown()

	nc, js := jsClientConnect(t, srv)
	defer nc.Close()

	origin := &nats.StreamConfig{Name: "DD", Subjects: []string{"ORDERS.*"}, MaxMsgs: 10_000}
	_, err := js.AddStream(origin)
	require_NoError(t, err)

	mirror := &nats.StreamConfig{
		Name:    "M",
		Mirror:  &nats.StreamSource{Name: "DD"},
		MaxMsgs: 10_000,
	}
	_, err = js.AddStream(mirror)
	require_NoError(t, err)

	// One message object is reused; subject and message ID change per send.
	msg := nats.NewMsg("ORDERS.0")
	msg.Data = []byte(strings.Repeat("Z", 2048))

	began := time.Now()

	total := 1_000_000
	for seq := 0; seq < total; seq++ {
		msg.Subject = fmt.Sprintf("ORDERS.%d", seq)
		msg.Header.Set(JSMsgId, strconv.Itoa(seq))
		_, err := js.PublishMsgAsync(msg)
		require_NoError(t, err)
	}

	select {
	case <-js.PublishAsyncComplete():
	case <-time.After(20 * time.Second):
		t.Fatalf("Did not receive completion signal")
	}

	took := time.Since(began)
	info, err := js.StreamInfo("DD")
	require_NoError(t, err)

	// Whole megabytes currently held by the stream.
	megabytes := float64(info.State.Bytes / (1024 * 1024))
	fmt.Printf("Took %v to send %d msgs\n", took, total)
	fmt.Printf("%.0f msgs/s\n", float64(total)/took.Seconds())
	fmt.Printf("%.0f mb/s\n\n", megabytes/took.Seconds())

	varz, _ := srv.Varz(nil)
	fmt.Printf("Memory AFTER SEND: %v\n", friendlyBytes(varz.Mem))
}
// TestNoRaceOrderedConsumerLongRTTPerformance is a manual measurement that
// fills a stream with 100,000 messages of 30,000 bytes each and times an
// ordered consumer reading them all, first over a direct connection and then
// through a network proxy configured with a 10ms RTT.
func TestNoRaceOrderedConsumerLongRTTPerformance(t *testing.T) {
	skip(t)

	s := RunBasicJetStreamServer()
	if config := s.JetStreamConfig(); config != nil {
		defer removeDir(t, config.StoreDir)
	}
	defer s.Shutdown()

	nc, _ := jsClientConnect(t, s)
	defer nc.Close()

	js, err := nc.JetStream(nats.PublishAsyncMaxPending(1000))
	require_NoError(t, err)

	_, err = js.AddStream(&nats.StreamConfig{Name: "OCP"})
	require_NoError(t, err)

	n, msg := 100_000, []byte(strings.Repeat("D", 30_000))

	for i := 0; i < n; i++ {
		_, err := js.PublishAsync("OCP", msg)
		require_NoError(t, err)
	}
	select {
	case <-js.PublishAsyncComplete():
	case <-time.After(5 * time.Second):
		t.Fatalf("Did not receive completion signal")
	}

	// Approximately 3GB
	si, err := js.StreamInfo("OCP")
	require_NoError(t, err)

	// Phase 1: direct connection.
	start := time.Now()
	received, done := 0, make(chan bool)
	sub, err := js.Subscribe("OCP", func(m *nats.Msg) {
		received++
		if received >= n {
			done <- true
		}
	}, nats.OrderedConsumer())
	require_NoError(t, err)
	defer sub.Unsubscribe()

	// Wait to receive all messages.
	select {
	case <-done:
	case <-time.After(30 * time.Second):
		t.Fatalf("Did not receive all of our messages")
	}

	tt := time.Since(start)
	fmt.Printf("Took %v to receive %d msgs\n", tt, n)
	fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds())
	fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds())

	sub.Unsubscribe()

	// Phase 2: same read through a proxy that adds latency.
	rtt := 10 * time.Millisecond
	bw := 10 * 1024 * 1024 * 1024
	proxy := newNetProxy(rtt, bw, bw, s.ClientURL())
	defer proxy.stop()

	nc, err = nats.Connect(proxy.clientURL())
	require_NoError(t, err)
	// The earlier deferred Close was bound to the first connection, so the
	// proxied connection needs its own.
	defer nc.Close()
	js, err = nc.JetStream()
	require_NoError(t, err)

	start, received = time.Now(), 0
	sub, err = js.Subscribe("OCP", func(m *nats.Msg) {
		received++
		if received >= n {
			done <- true
		}
	}, nats.OrderedConsumer())
	require_NoError(t, err)
	defer sub.Unsubscribe()

	// Wait to receive all messages.
	select {
	case <-done:
	case <-time.After(60 * time.Second):
		t.Fatalf("Did not receive all of our messages")
	}

	tt = time.Since(start)
	fmt.Printf("Proxy RTT: %v, UP: %d, DOWN: %d\n", rtt, bw, bw)
	fmt.Printf("Took %v to receive %d msgs\n", tt, n)
	fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds())
	fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds())
}
// jsClusterStallCatchupTempl is a server config template for the catchup
// tests below. Its format verbs are filled, in order, with: the server name
// (%s), the JetStream store directory (%s), the cluster name (%s), the
// cluster listen port (%d) and the routes list (%s). It sets JetStream
// memory and file store limits of 256MB and 32GB, enables a leafnode
// listener, and defines an "admin" user in the $SYS account.
var jsClusterStallCatchupTempl = `
listen: 127.0.0.1:-1
server_name: %s
jetstream: {max_mem_store: 256MB, max_file_store: 32GB, store_dir: '%s'}
leaf {
listen: 127.0.0.1:-1
}
cluster {
name: %s
listen: 127.0.0.1:%d
routes = [%s]
}
# For access to system account.
accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } }
`
// Test our global stall gate for outstanding catchup bytes.
//
// Manual measurement: 200 R3 streams are filled concurrently, one server has
// its storage wiped and is restarted, and the server's reported memory is
// printed before the wipe and after it reports healthy again.
func TestNoRaceJetStreamClusterCatchupStallGate(t *testing.T) {
	skip(t)

	c := createJetStreamClusterWithTemplate(t, jsClusterStallCatchupTempl, "GSG", 3)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	// ~100k per message.
	msg := []byte(strings.Repeat("A", 99_960))

	// Create 200 streams with 100 messages (~10MB) each.
	// Each server has ~2GB
	var wg sync.WaitGroup
	for i := 0; i < 20; i++ {
		wg.Add(1)
		go func(x int) {
			defer wg.Done()
			// Failures are recorded with t.Errorf because a fatal test
			// failure may only be raised from the test goroutine; the
			// test is aborted after wg.Wait() below.
			for n := 1; n <= 10; n++ {
				sn := fmt.Sprintf("S-%d", n+x)
				_, err := js.AddStream(&nats.StreamConfig{
					Name:     sn,
					Replicas: 3,
				})
				if err != nil {
					t.Errorf("Unexpected error adding stream %q: %v", sn, err)
					return
				}
				for j := 0; j < 100; j++ {
					if _, err := js.Publish(sn, msg); err != nil {
						t.Errorf("Unexpected error publishing to %q: %v", sn, err)
						return
					}
				}
			}
		}(i * 20)
	}
	wg.Wait()
	// Stop here if any worker reported a failure.
	if t.Failed() {
		t.FailNow()
	}

	info, err := js.AccountInfo()
	require_NoError(t, err)
	require_True(t, info.Streams == 200)

	runtime.GC()
	debug.FreeOSMemory()

	// Now bring a server down and wipe its storage.
	s := c.servers[0]
	vz, err := s.Varz(nil)
	require_NoError(t, err)
	fmt.Printf("MEM BEFORE is %v\n", friendlyBytes(vz.Mem))

	sd := s.JetStreamConfig().StoreDir
	s.Shutdown()
	removeDir(t, sd)
	s = c.restartServer(s)

	c.waitOnServerHealthz(s)

	runtime.GC()
	debug.FreeOSMemory()

	vz, err = s.Varz(nil)
	require_NoError(t, err)
	fmt.Printf("MEM AFTER is %v\n", friendlyBytes(vz.Mem))
}
// TestNoRaceJetStreamClusterCatchupBailMidway wipes a follower's storage,
// restarts it, shuts it down again once it has recovered more than half of
// the expected messages, and then verifies that after one more restart the
// follower ends up with the full message count reported by the leader.
func TestNoRaceJetStreamClusterCatchupBailMidway(t *testing.T) {
	skip(t)

	c := createJetStreamClusterWithTemplate(t, jsClusterStallCatchupTempl, "GSG", 3)
	defer c.shutdown()

	ml := c.leader()

	nc, js := jsClientConnect(t, ml)
	defer nc.Close()

	msg := []byte(strings.Repeat("A", 480))

	// Twice as many streams as maxConcurrentSyncRequests, 10k messages each.
	for i := 0; i < maxConcurrentSyncRequests*2; i++ {
		sn := fmt.Sprintf("CUP-%d", i+1)
		_, err := js.AddStream(&nats.StreamConfig{
			Name:     sn,
			Replicas: 3,
		})
		require_NoError(t, err)

		for j := 0; j < 10_000; j++ {
			_, err := js.PublishAsync(sn, msg)
			require_NoError(t, err)
		}
		select {
		case <-js.PublishAsyncComplete():
		case <-time.After(10 * time.Second):
			t.Fatalf("Did not receive completion signal")
		}
	}

	// The leader's total message count is what the follower must reach.
	jsz, err := ml.Jsz(nil)
	require_NoError(t, err)
	expectedMsgs := jsz.Messages

	// Now select a server and shut it down, removing the storage directory.
	s := c.randomNonLeader()
	sd := s.JetStreamConfig().StoreDir
	s.Shutdown()
	removeDir(t, sd)

	// Now restart the server.
	s = c.restartServer(s)

	// We want to force the follower to bail before the catchup through the
	// upper level catchup logic completes.
	checkFor(t, 5*time.Second, 10*time.Millisecond, func() error {
		// Returning the Jsz error makes checkFor retry instead of
		// dereferencing a nil result.
		jsz, err := s.Jsz(nil)
		if err != nil {
			return err
		}
		if jsz.Messages > expectedMsgs/2 {
			s.Shutdown()
			return nil
		}
		return fmt.Errorf("Not enough yet")
	})

	// Now restart the server.
	s = c.restartServer(s)

	checkFor(t, 5*time.Second, 500*time.Millisecond, func() error {
		jsz, err := s.Jsz(nil)
		if err != nil {
			return err
		}
		if jsz.Messages == expectedMsgs {
			return nil
		}
		return fmt.Errorf("Not enough yet")
	})
}
// TestNoRaceJetStreamAccountLimitsAndRestart publishes into an R3 stream
// until either 20,000 messages were sent or a publish fails, shutting down a
// random non stream leader right after the publish with index 5,000. It then
// stops and restarts every server and waits for a meta leader, a stream
// leader, and for waitOnStreamCurrent to return for each server.
func TestNoRaceJetStreamAccountLimitsAndRestart(t *testing.T) {
	cluster := createJetStreamClusterWithTemplate(t, jsClusterAccountLimitsTempl, "A3S", 3)
	defer cluster.shutdown()

	nc, js := jsClientConnect(t, cluster.randomServer())
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{Name: "TEST", Replicas: 3})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	const (
		maxPublishes = 20_000
		shutdownAt   = 5_000
	)
	for sent := 0; sent < maxPublishes; sent++ {
		// A publish failure is not a test failure here, it only ends the loop.
		_, perr := js.Publish("TEST", []byte("A"))
		if perr != nil {
			break
		}
		if sent != shutdownAt {
			continue
		}
		// Take one replica away while publishing continues.
		cluster.randomNonStreamLeader("$JS", "TEST").Shutdown()
	}

	// Full cluster bounce.
	cluster.stopAll()
	cluster.restartAll()

	cluster.waitOnLeader()
	cluster.waitOnStreamLeader("$JS", "TEST")
	for _, srv := range cluster.servers {
		cluster.waitOnStreamCurrent(srv, "$JS", "TEST")
	}
}
// TestNoRaceJetStreamPullConsumersAndInteriorDeletes runs 5 publishers and 5
// pull subscribers (all bound to the same durable) against an R3
// interest-retention stream, requests a consumer leader step-down one second
// into the run, and then requires every producer and consumer goroutine to
// finish within 20 seconds.
func TestNoRaceJetStreamPullConsumersAndInteriorDeletes(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "ID", 3)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:      "foo",
		Replicas:  3,
		MaxMsgs:   50000,
		Retention: nats.InterestPolicy,
	})
	require_NoError(t, err)

	c.waitOnStreamLeader(globalAccountName, "foo")

	_, err = js.AddConsumer("foo", &nats.ConsumerConfig{
		Durable:       "foo",
		FilterSubject: "foo",
		MaxAckPending: 20000,
		AckWait:       time.Minute,
		AckPolicy:     nats.AckExplicitPolicy,
	})
	require_NoError(t, err)

	c.waitOnConsumerLeader(globalAccountName, "foo", "foo")

	// rcv counts acked messages across all consumer goroutines.
	rcv := int32(0)
	prods := 5
	cons := 5
	wg := sync.WaitGroup{}
	wg.Add(prods + cons)
	toSend := 100000

	for i := 0; i < cons; i++ {
		go func() {
			defer wg.Done()

			sub, err := js.PullSubscribe("foo", "foo")
			if err != nil {
				return
			}
			for {
				msgs, err := sub.Fetch(200, nats.MaxWait(250*time.Millisecond))
				if err != nil {
					// A consumer only exits after a failed Fetch once the
					// shared counter has reached toSend; otherwise it retries.
					if n := int(atomic.LoadInt32(&rcv)); n >= toSend {
						return
					}
					continue
				}
				for _, m := range msgs {
					m.Ack()
					atomic.AddInt32(&rcv, 1)
				}
			}
		}()
	}

	for i := 0; i < prods; i++ {
		go func() {
			defer wg.Done()

			// Publish errors are deliberately ignored.
			for j := 0; j < toSend/prods; j++ {
				js.Publish("foo", []byte("hello"))
			}
		}()
	}

	// Let traffic flow for a bit, then force the consumer leader to step down.
	time.Sleep(time.Second)

	resp, err := nc.Request(fmt.Sprintf(JSApiConsumerLeaderStepDownT, "foo", "foo"), nil, time.Second)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	var cdResp JSApiConsumerLeaderStepDownResponse
	if err := json.Unmarshal(resp.Data, &cdResp); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if cdResp.Error != nil {
		t.Fatalf("Unexpected error: %+v", cdResp.Error)
	}

	// Turn wg.Wait into a channel so it can be raced against a timeout.
	ch := make(chan struct{})
	go func() {
		wg.Wait()
		close(ch)
	}()

	select {
	case <-ch:
		// OK
	case <-time.After(20 * time.Second):
		t.Fatalf("Consumers took too long to consume all messages")
	}
}
// TestNoRaceJetStreamClusterInterestPullConsumerStreamLimitBug publishes
// continuously into an R3 interest-retention stream limited to 2000 messages
// while 100 pull subscribers on the same durable fetch one message at a time
// and ack it after a random delay. After 5 seconds everything is stopped and
// the consumer's NumPending is checked against the stream state.
func TestNoRaceJetStreamClusterInterestPullConsumerStreamLimitBug(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "JSC", 3)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:      "TEST",
		Subjects:  []string{"foo"},
		Retention: nats.InterestPolicy,
		MaxMsgs:   2000,
		Replicas:  3,
	})
	require_NoError(t, err)

	_, err = js.AddConsumer("TEST", &nats.ConsumerConfig{Durable: "dur", AckPolicy: nats.AckExplicitPolicy})
	require_NoError(t, err)

	// Closing qch tells the publisher and all consumers to stop.
	qch := make(chan bool)
	var wg sync.WaitGroup

	// Publisher
	wg.Add(1)
	go func() {
		defer wg.Done()
		for {
			pt := time.NewTimer(time.Duration(rand.Intn(2)) * time.Millisecond)
			select {
			case <-pt.C:
				_, err := js.Publish("foo", []byte("BUG!"))
				if err != nil {
					t.Logf("Got a publisher error: %v", err)
					return
				}
			case <-qch:
				pt.Stop()
				return
			}
		}
	}()

	time.Sleep(time.Second)

	// Connections opened by the pull consumers. They must stay open after the
	// consumer goroutines exit because delayed acks may still be in flight,
	// so they are only closed when the test function returns.
	var (
		cmu  sync.Mutex
		cncs []*nats.Conn
	)
	defer func() {
		cmu.Lock()
		defer cmu.Unlock()
		for _, cnc := range cncs {
			cnc.Close()
		}
	}()

	// Pull Consumers
	wg.Add(100)
	for i := 0; i < 100; i++ {
		go func() {
			defer wg.Done()

			cnc, js := jsClientConnect(t, c.randomServer())
			cmu.Lock()
			cncs = append(cncs, cnc)
			cmu.Unlock()

			// Use Errorf instead of a fatal helper: FailNow/Fatalf must only
			// be called from the goroutine running the test function.
			sub, err := js.PullSubscribe("foo", "dur")
			if err != nil {
				t.Errorf("Unexpected PullSubscribe error: %v", err)
				return
			}

			for {
				pt := time.NewTimer(time.Duration(rand.Intn(300)) * time.Millisecond)
				select {
				case <-pt.C:
					msgs, err := sub.Fetch(1)
					if err != nil {
						t.Logf("Got a Fetch error: %v", err)
						return
					}
					if len(msgs) > 0 {
						// Ack asynchronously after 15-389ms.
						go func() {
							ackDelay := time.Duration(rand.Intn(375)+15) * time.Millisecond
							m := msgs[0]
							time.AfterFunc(ackDelay, func() { m.AckSync() })
						}()
					}
				case <-qch:
					pt.Stop()
					return
				}
			}
		}()
	}

	time.Sleep(5 * time.Second)
	close(qch)
	wg.Wait()
	// Give the delayed acks time to fire before inspecting state.
	time.Sleep(time.Second)

	si, err := js.StreamInfo("TEST")
	require_NoError(t, err)

	ci, err := js.ConsumerInfo("TEST", "dur")
	require_NoError(t, err)

	// ld is the last delivered stream sequence, moved up to FirstSeq-1 when
	// the stream's first sequence is already past it.
	ld := ci.Delivered.Stream
	if si.State.FirstSeq > ld {
		ld = si.State.FirstSeq - 1
	}
	if si.State.LastSeq-ld != ci.NumPending {
		t.Fatalf("Expected NumPending to be %d got %d", si.State.LastSeq-ld, ci.NumPending)
	}
}
// Net Proxy - For introducing RTT and BW constraints.
// A netProxy listens on a local TCP port and relays a single accepted client
// connection to a server, see newNetProxy.
type netProxy struct {
// listener accepts the client connection; stop() closes it and sets it to nil.
listener net.Listener
// conns holds the accepted client connection and the dialed server
// connection so that stop() can close them.
conns []net.Conn
// url is the nats:// address of the listener, returned by clientURL().
url string
}
// newNetProxy starts a TCP proxy on a random 127.0.0.1 port that relays one
// client connection to the server at serverURL.
//
// rtt is the round trip time to simulate (each direction is handed rtt to
// its copy loop), upRate limits client->server traffic and downRate limits
// server->client traffic; both rates are passed to the rate limiter as-is.
// serverURL is normally of the form "nats://host:port"; any URL with a host
// component is accepted, and a string without one is dialed verbatim as
// "host:port".
//
// Only a single client connection is ever accepted. The function panics if
// the listener cannot be created, and the accept goroutine panics if the
// server cannot be dialed within one second.
func newNetProxy(rtt time.Duration, upRate, downRate int, serverURL string) *netProxy {
	// Derive the dial target by parsing the URL instead of slicing off a
	// fixed number of bytes, which would panic on short strings and mangle
	// anything not starting with exactly "nats://".
	target := serverURL
	if u, err := url.Parse(serverURL); err == nil && u.Host != "" {
		target = u.Host
	}

	hp := net.JoinHostPort("127.0.0.1", "0")
	l, e := net.Listen("tcp", hp)
	if e != nil {
		panic(fmt.Sprintf("Error listening on port: %s, %q", hp, e))
	}
	port := l.Addr().(*net.TCPAddr).Port
	proxy := &netProxy{listener: l}
	go func() {
		// Accept fails once stop() closes the listener; just exit then.
		client, err := l.Accept()
		if err != nil {
			return
		}
		server, err := net.DialTimeout("tcp", target, time.Second)
		if err != nil {
			panic("Can't connect to NATS server")
		}
		// NOTE(review): conns is written here without synchronization while
		// stop() reads it; callers should not race stop() with the first
		// client connect.
		proxy.conns = append(proxy.conns, client, server)
		go proxy.loop(rtt, upRate, client, server)
		go proxy.loop(rtt, downRate, server, client)
	}()
	proxy.url = fmt.Sprintf("nats://127.0.0.1:%d", port)
	return proxy
}
// clientURL returns the proxy's url field, i.e. the nats:// address of the
// proxy's local listener as set by newNetProxy. Clients connect to it to have
// their traffic relayed through the proxy.
func (np *netProxy) clientURL() string {
return np.url
}
// loop relays data from r to w in chunks of up to 8192 bytes until a read,
// a rate limiter wait or a write fails.
//
// Half of rtt is slept before forwarding the very first chunk and before any
// chunk whose Read call took longer than 2ms. Every chunk additionally has
// to wait on a rate limiter created with a limit of tbw and a burst of one
// chunk.
func (np *netProxy) loop(rtt time.Duration, tbw int, r, w net.Conn) {
	const chunkSize = 8192

	var chunk [chunkSize]byte
	oneWay := rtt / 2
	limiter := rate.NewLimiter(rate.Limit(tbw), chunkSize)
	ctx := context.Background()

	firstRead := true
	for {
		readStart := time.Now()
		n, err := r.Read(chunk[:])
		if err != nil {
			return
		}

		// RTT delays
		if firstRead || time.Since(readStart) > 2*time.Millisecond {
			firstRead = false
			if oneWay > 0 {
				time.Sleep(oneWay)
			}
		}

		// Bandwidth limit.
		if limiter.WaitN(ctx, n) != nil {
			return
		}
		if _, werr := w.Write(chunk[:n]); werr != nil {
			return
		}
	}
}
// stop closes the proxy's listener and every connection recorded in conns.
// The listener is set to nil afterwards, so calling stop again is a no-op.
func (np *netProxy) stop() {
	// Already stopped (or never started).
	if np.listener == nil {
		return
	}
	np.listener.Close()
	np.listener = nil
	for i := range np.conns {
		np.conns[i].Close()
	}
}