From adef8281a2b6178f997f32ee8a7015e16f3c5d29 Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Mon, 28 Aug 2023 10:22:59 -0700 Subject: [PATCH] Updates to the way meta indexing is handled for filestore. Historically we kept indexing information, either by sequence or by subject, as a per msg block operation. These were the "*.idx" and "*.fss" indexing files. When streams became very large this could have an impact on recovery time. Also, for encryption the fast path for determining if the indexing was current would require loading and decrypting the complete block. This design moves to a more traditional WAL and snapshot approach. The snapshots for the complete stream, including summary information, global per subject information maps (PSIM) and per msg block details including summary and dmap, are processed asynchronously. The snapshot includes the msg block and hash for the last record considered in the snapshot. On recovery the snapshot is read and processed and any additional records past the point of the snapshot itself are processed. To this end, any removal of a message has to be expressed as a delete tombstone that is always added to the fs.lmb file. These are processed on recovery and our indexing layer knows to skip them. Changing to this method drastically improves startup and recovery times, and has simplified the code. Some normal performance benefits have been seen as well. 
Signed-off-by: Derek Collison --- server/filestore.go | 1916 ++++++++++++++++++++++---------------- server/filestore_test.go | 1277 ++++++++++++------------- server/jetstream_test.go | 55 +- server/norace_test.go | 12 +- server/stream.go | 4 +- 5 files changed, 1732 insertions(+), 1532 deletions(-) diff --git a/server/filestore.go b/server/filestore.go index 11c1f558..57f61806 100644 --- a/server/filestore.go +++ b/server/filestore.go @@ -18,6 +18,7 @@ import ( "bytes" "crypto/aes" "crypto/cipher" + "crypto/rand" "crypto/sha256" "encoding/binary" "encoding/hex" @@ -35,8 +36,6 @@ import ( "sync/atomic" "time" - crand "crypto/rand" - "github.com/klauspost/compress/s2" "github.com/minio/highwayhash" "github.com/nats-io/nats-server/v2/server/avl" @@ -158,6 +157,7 @@ type fileStore struct { srv *Server mu sync.RWMutex state StreamState + tombs []uint64 ld *LostStreamData scb StorageUpdateHandler ageChk *time.Timer @@ -173,8 +173,11 @@ type fileStore struct { psim map[string]*psi hh hash.Hash64 qch chan struct{} + fch chan struct{} + fsld chan struct{} cfs []ConsumerStore sips int + dirty int closed bool fip bool receivedAny bool @@ -193,8 +196,6 @@ type msgBlock struct { nonce []byte mfn string mfd *os.File - ifn string - ifd *os.File cmp StoreCompression // Effective compression at the time of loading the block liwsz int64 index uint32 @@ -202,9 +203,7 @@ type msgBlock struct { rbytes uint64 // Total bytes (raw) including deleted. Used for rolling to new blk. msgs uint64 // User visible message count. fss map[string]*SimpleState - sfn string kfn string - lwits int64 lwts int64 llts int64 lrts int64 @@ -224,10 +223,6 @@ type msgBlock struct { noTrack bool closed bool - // To avoid excessive writes when expiring cache. - // These can be big. - fssNeedsWrite bool - // Used to mock write failures. mockWriteErr bool } @@ -269,8 +264,10 @@ const ( newScan = "%d.new" // used to scan index file names. indexScan = "%d.idx" - // used to load per subject meta information. 
- fssScan = "%d.fss" + // to look for orphans + indexScanAll = "*.idx" + // to look for orphans + fssScanAll = "*.fss" // used to store our block encryption key. keyScan = "%d.key" // to look for orphans @@ -301,6 +298,9 @@ const ( JetStreamMetaFileSum = "meta.sum" JetStreamMetaFileKey = "meta.key" + // This is the full snapshotted state for the stream. + streamStreamStateFile = "index.db" + // AEK key sizes minMetaKeySize = 64 minBlkKeySize = 64 @@ -326,10 +326,6 @@ const ( FileStoreMaxBlkSize = maxBlockSize // Check for bad record length value due to corrupt data. rlBadThresh = 32 * 1024 * 1024 - // Time threshold to write index info. - wiThresh = int64(30 * time.Second) - // Time threshold to write index info for non FIFO cases - winfThresh = int64(2 * time.Second) // Checksum size for hash for msg records. recordHashSize = 8 ) @@ -385,6 +381,8 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim prf: prf, oldprf: oldprf, qch: make(chan struct{}), + fch: make(chan struct{}, 1), + fsld: make(chan struct{}), } // Set flush in place to AsyncFlush which by default is false. @@ -415,9 +413,64 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim } } - // Recover our message state. - if err := fs.recoverMsgs(); err != nil { - return nil, err + // Attempt to recover our state. + err = fs.recoverFullState() + if err != nil { + // Hold onto state + prior := fs.state + // Reset anything that could have been set from above. + fs.state = StreamState{} + fs.psim = make(map[string]*psi) + fs.bim = make(map[uint32]*msgBlock) + fs.blks = nil + fs.tombs = nil + + // Recover our message state the old way + if err := fs.recoverMsgs(); err != nil { + return nil, err + } + + // Check if our prior remember a last past where we can see. 
+ if fs.ld != nil && prior.LastSeq > fs.state.LastSeq { + fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime + if _, err = fs.newMsgBlockForWrite(); err != nil { + return nil, err + } + } + // Since we recovered here, make sure to kick ourselves to write out our stream state. + fs.dirty++ + defer fs.kickFlushStateLoop() + // Also make sure we get rid of old idx and fss files on return. + defer func() { + os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, indexScanAll)) + os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, fssScanAll)) + }() + } + + // Check if we have any left over tombstones to process. + if len(fs.tombs) > 0 { + fs.mu.Lock() + for _, seq := range fs.tombs { + fs.removeMsg(seq, false, false, false) + } + // Not needed after this phase. + fs.tombs = nil + fs.mu.Unlock() + } + + // Limits checks and enforcement. + fs.enforceMsgLimit() + fs.enforceBytesLimit() + + // Do age checks too, make sure to call in place. + if fs.cfg.MaxAge != 0 { + fs.expireMsgsOnRecover() + fs.startAgeChk() + } + + // If we have max msgs per subject make sure the is also enforced. + if fs.cfg.MaxMsgsPer > 0 { + fs.enforceMsgPerSubjectLimit() } // If the stream has an initial sequence number then make sure we @@ -456,6 +509,9 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks) + // Spin up the go routine that will write out or full state stream index. + go fs.flushStreamStateLoop(fs.fch, fs.qch, fs.fsld) + return fs, nil } @@ -606,7 +662,7 @@ func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cip const seedSize = 32 seed = make([]byte, seedSize) - if n, err := crand.Read(seed); err != nil || n != seedSize { + if n, err := rand.Read(seed); err != nil || n != seedSize { return nil, nil, nil, nil, err } @@ -617,7 +673,7 @@ func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cip // Generate our nonce. 
Use same buffer to hold encrypted seed. nonce := make([]byte, kek.NonceSize(), kek.NonceSize()+len(seed)+kek.Overhead()) - crand.Read(nonce) + rand.Read(nonce) bek, err = genBlockEncryptionKey(sc, seed[:], nonce) if err != nil { @@ -641,9 +697,37 @@ func genBlockEncryptionKey(sc StoreCipher, seed, nonce []byte) (cipher.Stream, e return nil, errUnknownCipher } -// Write out meta and the checksum. // Lock should be held. -func (fs *fileStore) writeStreamMeta() error { +func (fs *fileStore) recoverAEK() error { + if fs.prf != nil && fs.aek == nil { + ekey, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)) + if err != nil { + return err + } + rb, err := fs.prf([]byte(fs.cfg.Name)) + if err != nil { + return err + } + kek, err := genEncryptionKey(fs.fcfg.Cipher, rb) + if err != nil { + return err + } + ns := kek.NonceSize() + seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) + if err != nil { + return err + } + aek, err := genEncryptionKey(fs.fcfg.Cipher, seed) + if err != nil { + return err + } + fs.aek = aek + } + return nil +} + +// Lock should be held. +func (fs *fileStore) setupAEK() error { if fs.prf != nil && fs.aek == nil { key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name) if err != nil { @@ -659,6 +743,15 @@ func (fs *fileStore) writeStreamMeta() error { // Set our aek. fs.aek = key } + return nil +} + +// Write out meta and the checksum. +// Lock should be held. +func (fs *fileStore) writeStreamMeta() error { + if err := fs.setupAEK(); err != nil { + return err + } meta := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile) if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) { @@ -671,7 +764,7 @@ func (fs *fileStore) writeStreamMeta() error { // Encrypt if needed. 
if fs.aek != nil { nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(b)+fs.aek.Overhead()) - crand.Read(nonce) + rand.Read(nonce) b = fs.aek.Seal(nonce, nonce, b, nil) } @@ -750,74 +843,95 @@ func (fs *fileStore) noTrackSubjects() bool { return !(len(fs.psim) > 0 || len(fs.cfg.Subjects) > 0 || fs.cfg.Mirror != nil || len(fs.cfg.Sources) > 0) } -// Lock held on entry -func (fs *fileStore) recoverMsgBlock(fi os.FileInfo, index uint32) (*msgBlock, error) { +// Will init the basics for a message block. +func (fs *fileStore) initMsgBlock(index uint32) *msgBlock { mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects()} mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) - mb.mfn = filepath.Join(mdir, fi.Name()) - mb.ifn = filepath.Join(mdir, fmt.Sprintf(indexScan, index)) - mb.sfn = filepath.Join(mdir, fmt.Sprintf(fssScan, index)) + mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, index)) if mb.hh == nil { key := sha256.Sum256(fs.hashKeyForBlock(index)) mb.hh, _ = highwayhash.New64(key[:]) } + return mb +} + +// Lock for fs should be held. +func (fs *fileStore) loadEncryptionForMsgBlock(mb *msgBlock) error { + if fs.prf == nil { + return nil + } var createdKeys bool - - // Check if encryption is enabled. - if fs.prf != nil { - ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))) + mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) + ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))) + if err != nil { + // We do not seem to have keys even though we should. Could be a plaintext conversion. + // Create the keys and we will double check below. + if err := fs.genEncryptionKeysForBlock(mb); err != nil { + return err + } + createdKeys = true + } else { + if len(ekey) < minBlkKeySize { + return errBadKeySize + } + // Recover key encryption key. 
+ rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))) if err != nil { - // We do not seem to have keys even though we should. Could be a plaintext conversion. - // Create the keys and we will double check below. - if err := fs.genEncryptionKeysForBlock(mb); err != nil { - return nil, err - } - createdKeys = true - } else { - if len(ekey) < minBlkKeySize { - return nil, errBadKeySize - } - // Recover key encryption key. - rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))) - if err != nil { - return nil, err - } + return err + } - sc := fs.fcfg.Cipher - kek, err := genEncryptionKey(sc, rb) - if err != nil { - return nil, err - } - ns := kek.NonceSize() - seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) - if err != nil { - // We may be here on a cipher conversion, so attempt to convert. - if err = mb.convertCipher(); err != nil { - return nil, err - } - } else { - mb.seed, mb.nonce = seed, ekey[:ns] - } - mb.aek, err = genEncryptionKey(sc, mb.seed) - if err != nil { - return nil, err - } - if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil { - return nil, err + sc := fs.fcfg.Cipher + kek, err := genEncryptionKey(sc, rb) + if err != nil { + return err + } + ns := kek.NonceSize() + seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) + if err != nil { + // We may be here on a cipher conversion, so attempt to convert. + if err = mb.convertCipher(); err != nil { + return err } + } else { + mb.seed, mb.nonce = seed, ekey[:ns] + } + mb.aek, err = genEncryptionKey(sc, mb.seed) + if err != nil { + return err + } + if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil { + return err } } // If we created keys here, let's check the data and if it is plaintext convert here. if createdKeys { if err := mb.convertToEncrypted(); err != nil { - return nil, err + return err } } + return nil +} + +// Load a last checksum if needed from the block file. +// Lock should be held. 
+func (mb *msgBlock) ensureLastChecksumLoaded() { + var empty [8]byte + if mb.lchk != empty { + return + } + copy(mb.lchk[0:], mb.lastChecksum()) +} + +// Lock held on entry +func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) { + mb := fs.initMsgBlock(index) + fs.loadEncryptionForMsgBlock(mb) + // Open up the message file, but we will try to recover from the index file. // We will check that the last checksums match. file, err := os.Open(mb.mfn) @@ -840,7 +954,7 @@ func (fs *fileStore) recoverMsgBlock(fi os.FileInfo, index uint32) (*msgBlock, e copy(lchk[0:], buf[len(buf)-checksumSize:]) } } else { - file.ReadAt(lchk[:], fi.Size()-checksumSize) + file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize) } } @@ -862,9 +976,14 @@ func (fs *fileStore) recoverMsgBlock(fi os.FileInfo, index uint32) (*msgBlock, e } // If we get data loss rebuilding the message block state record that with the fs itself. - if ld, _ := mb.rebuildState(); ld != nil { + ld, tombs, _ := mb.rebuildState() + if ld != nil { fs.addLostData(ld) } + // Collect all tombstones. + if len(tombs) > 0 { + fs.tombs = append(fs.tombs, tombs...) + } if mb.msgs > 0 && !mb.noTrack && fs.psim != nil { fs.populateGlobalPerSubjectInfo(mb) @@ -872,10 +991,9 @@ func (fs *fileStore) recoverMsgBlock(fi os.FileInfo, index uint32) (*msgBlock, e mb.tryForceExpireCacheLocked() } - // Rewrite this to make sure we are sync'd. - mb.writeIndexInfo() mb.closeFDs() fs.addMsgBlock(mb) + return mb, nil } @@ -889,12 +1007,6 @@ func (fs *fileStore) lostData() *LostStreamData { return &nld } -func (fs *fileStore) rebuildState(ld *LostStreamData) { - fs.mu.Lock() - defer fs.mu.Unlock() - fs.rebuildStateLocked(ld) -} - // Lock should be held. 
func (fs *fileStore) addLostData(ld *LostStreamData) { if ld == nil { @@ -910,6 +1022,12 @@ func (fs *fileStore) addLostData(ld *LostStreamData) { } } +func (fs *fileStore) rebuildState(ld *LostStreamData) { + fs.mu.Lock() + defer fs.mu.Unlock() + fs.rebuildStateLocked(ld) +} + // Lock should be held. func (fs *fileStore) rebuildStateLocked(ld *LostStreamData) { fs.addLostData(ld) @@ -1007,9 +1125,6 @@ func (mb *msgBlock) convertCipher() error { if err := os.WriteFile(mb.mfn, buf, defaultFilePerms); err != nil { return err } - // If we are here we want to delete other meta, e.g. idx, fss. - os.Remove(mb.ifn) - os.Remove(mb.sfn) return nil } return fmt.Errorf("unable to recover keys") @@ -1035,30 +1150,24 @@ func (mb *msgBlock) convertToEncrypted() error { if err := os.WriteFile(mb.mfn, buf, defaultFilePerms); err != nil { return err } - if buf, err = os.ReadFile(mb.ifn); err == nil && len(buf) > 0 { - if err := checkNewHeader(buf); err != nil { - return err - } - buf = mb.aek.Seal(buf[:0], mb.nonce, buf, nil) - if err := os.WriteFile(mb.ifn, buf, defaultFilePerms); err != nil { - return err - } - } return nil } -func (mb *msgBlock) rebuildState() (*LostStreamData, error) { +// Rebuild the state of the blk based on what we have on disk in the N.blk file. +// We will return any lost data, and we will return any delete tombstones we encountered. +func (mb *msgBlock) rebuildState() (*LostStreamData, []uint64, error) { mb.mu.Lock() defer mb.mu.Unlock() return mb.rebuildStateLocked() } -func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { +// Rebuild the state of the blk based on what we have on disk in the N.blk file. +// Lock should be held. +func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { startLastSeq := mb.last.seq // Remove the .fss file and clear any cache we have set. 
mb.clearCacheAndOffset() - mb.removePerSubjectInfoLocked() buf, err := mb.loadBlock(nil) if err != nil || len(buf) == 0 { @@ -1077,7 +1186,7 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { mb.dmap.Empty() mb.first.seq = mb.last.seq + 1 } - return ld, err + return ld, nil, err } // Clear state we need to rebuild. @@ -1090,14 +1199,14 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { // Recreate to reset counter. mb.bek, err = genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) if err != nil { - return nil, err + return nil, nil, err } mb.bek.XORKeyStream(buf, buf) } // Check for compression. if buf, err = mb.decompressIfNeeded(buf); err != nil { - return nil, err + return nil, nil, err } mb.rbytes = uint64(len(buf)) @@ -1144,10 +1253,17 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { return &ld } + // For tombstones that we find and collect. + var ( + tombstones []uint64 + minTombstoneSeq uint64 + minTombstoneTs int64 + ) + for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; { if index+msgHdrSize > lbuf { truncate(index) - return gatherLost(lbuf - index), nil + return gatherLost(lbuf - index), tombstones, nil } hdr := buf[index : index+msgHdrSize] @@ -1160,24 +1276,39 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { // Do some quick sanity checks here. if dlen < 0 || int(slen) > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh { truncate(index) - return gatherLost(lbuf - index), errBadMsg + return gatherLost(lbuf - index), tombstones, errBadMsg } seq := le.Uint64(hdr[4:]) ts := int64(le.Uint64(hdr[12:])) + // Check if this is a delete tombstone. + if seq&tbit != 0 { + seq = seq &^ tbit + // Need to process this here and make sure we have accounted for this properly. 
+ tombstones = append(tombstones, seq) + index += rl + if minTombstoneSeq == 0 || seq < minTombstoneSeq { + minTombstoneSeq, minTombstoneTs = seq, ts + } + continue + } + // This is an old erased message, or a new one that we can track. if seq == 0 || seq&ebit != 0 || seq < mb.first.seq { seq = seq &^ ebit - // Only add to dmap if past recorded first seq and non-zero. - if seq != 0 && seq >= mb.first.seq { - addToDmap(seq) - } - index += rl if seq >= mb.first.seq { + // Only add to dmap if past recorded first seq and non-zero. + if seq != 0 { + addToDmap(seq) + } mb.last.seq = seq mb.last.ts = ts + if mb.msgs == 0 { + mb.first.seq, mb.first.ts = seq+1, 0 + } } + index += rl continue } @@ -1188,13 +1319,11 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { firstNeedsSet, mb.first.seq, mb.first.ts = false, seq, ts } - deleted := mb.dmap.Exists(seq) - // Always set last. mb.last.seq = seq mb.last.ts = ts - if !deleted { + if !mb.dmap.Exists(seq) { data := buf[index+msgHdrSize : index+rl] if hh := mb.hh; hh != nil { hh.Reset() @@ -1208,7 +1337,7 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { checksum := hh.Sum(nil) if !bytes.Equal(checksum, data[len(data)-recordHashSize:]) { truncate(index) - return gatherLost(lbuf - index), errBadMsg + return gatherLost(lbuf - index), tombstones, errBadMsg } copy(mb.lchk[0:], checksum) } @@ -1235,7 +1364,6 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { subj := mb.subjString(data[:slen]) mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} } - mb.fssNeedsWrite = true } } // Advance to next record. @@ -1243,16 +1371,388 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { } // For empty msg blocks make sure we recover last seq correctly based off of first. 
- if mb.msgs == 0 && mb.first.seq > 0 { - mb.last.seq = mb.first.seq - 1 + // Or if we seem to have no messages but had a tombstone, which we use to remember + // sequences and timestamps now, use that to properly setup the first and last. + if mb.msgs == 0 { + if mb.first.seq > 0 { + mb.last.seq = mb.first.seq - 1 + } else if mb.first.seq == 0 && minTombstoneSeq > 0 { + mb.first.seq, mb.first.ts = minTombstoneSeq+1, 0 + if mb.last.seq == 0 { + mb.last.seq, mb.last.ts = minTombstoneSeq, minTombstoneTs + } + } } - // Update our fss file if needed. - if len(mb.fss) > 0 { - mb.writePerSubjectInfo() + return nil, tombstones, nil +} + +// Used when we scan the msg blocks. +type blockFiles struct { + blksSeen map[uint32]struct{} + maxIndex uint32 +} + +// This will grab all the block files. +func (fs *fileStore) grabMsgBlockFiles(ch chan *blockFiles) { + f, err := os.Open(filepath.Join(fs.fcfg.StoreDir, msgDir)) + if err != nil { + ch <- nil + return + } + defer f.Close() + + dirs, err := f.ReadDir(-1) + if err != nil { + ch <- nil + return } - return nil, nil + result := &blockFiles{blksSeen: make(map[uint32]struct{})} + + for _, fi := range dirs { + var index uint32 + if n, err := fmt.Sscanf(fi.Name(), blkScan, &index); err == nil && n == 1 { + result.blksSeen[index] = struct{}{} + if index > result.maxIndex { + result.maxIndex = index + } + } + } + ch <- result +} + +// recoverFullState will attempt to receover our last full state and re-process any state changes +// that happened afterwards. +func (fs *fileStore) recoverFullState() (rerr error) { + // Grab all the msgBlock files in parallel in case there are many. + rch := make(chan *blockFiles, 1) + go fs.grabMsgBlockFiles(rch) + + fs.mu.Lock() + defer fs.mu.Unlock() + + // Check for any left over purged messages. + <-dios + pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) + if _, err := os.Stat(pdir); err == nil { + os.RemoveAll(pdir) + } + // Grab our stream state file and load it in. 
+ fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) + buf, err := os.ReadFile(fn) + dios <- struct{}{} + + if err != nil { + return err + } + + const minLen = 32 + if len(buf) < minLen { + os.Remove(fn) + return errCorruptState + } + + // The highwayhash will be on the end. Check that it still matches. + h := buf[len(buf)-highwayhash.Size64:] + buf = buf[:len(buf)-highwayhash.Size64] + fs.hh.Reset() + fs.hh.Write(buf) + if !bytes.Equal(h, fs.hh.Sum(nil)) { + os.Remove(fn) + return errCorruptState + } + + // Decrypt if needed. + if fs.prf != nil { + // We can be setup for encryption but if this is a snapshot restore we will be missing the keyfile + // since snapshots strip encryption. + if err := fs.recoverAEK(); err == nil { + ns := fs.aek.NonceSize() + buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:], nil) + if err != nil { + return err + } + } + } + + if buf[0] != fullStateMagic || buf[1] != fullStateVersion { + os.Remove(fn) + return errCorruptState + } + + bi := hdrLen + + readU64 := func() uint64 { + if bi < 0 { + return 0 + } + v, n := binary.Uvarint(buf[bi:]) + if n <= 0 { + bi = -1 + return 0 + } + bi += n + return v + } + readI64 := func() int64 { + if bi < 0 { + return 0 + } + v, n := binary.Varint(buf[bi:]) + if n <= 0 { + bi = -1 + return -1 + } + bi += n + return v + } + + setTime := func(t *time.Time, ts int64) { + if ts == 0 { + *t = time.Time{} + } else { + *t = time.Unix(0, ts).UTC() + } + } + + var state StreamState + state.Msgs = readU64() + state.Bytes = readU64() + state.FirstSeq = readU64() + baseTime := readI64() + setTime(&state.FirstTime, baseTime) + state.LastSeq = readU64() + setTime(&state.LastTime, readI64()) + + // Check for per subject info. 
+ if numSubjects := int(readU64()); numSubjects > 0 { + fs.psim = make(map[string]*psi, numSubjects) + for i := 0; i < numSubjects; i++ { + if lsubj := int(readU64()); lsubj > 0 { + if bi+lsubj > len(buf) { + os.Remove(fn) + return errCorruptState + } + subj := fs.subjString(buf[bi : bi+lsubj]) + bi += lsubj + psi := &psi{total: readU64(), fblk: uint32(readU64())} + if psi.total > 1 { + psi.lblk = uint32(readU64()) + } else { + psi.lblk = psi.fblk + } + fs.psim[subj] = psi + } + } + } + + if numBlocks := readU64(); numBlocks > 0 { + fs.blks = make([]*msgBlock, 0, numBlocks) + for i := 0; i < int(numBlocks); i++ { + index, nbytes, fseq, fts, lseq, lts, numDeleted := uint32(readU64()), readU64(), readU64(), readI64(), readU64(), readI64(), readU64() + if bi < 0 { + break + } + mb := fs.initMsgBlock(index) + mb.first.seq, mb.last.seq, mb.msgs, mb.bytes = fseq, lseq, lseq-fseq+1, nbytes + mb.first.ts, mb.last.ts = fts+baseTime, lts+baseTime + if numDeleted > 0 { + dmap, n, err := avl.Decode(buf[bi:]) + if err != nil { + os.Remove(fn) + return errCorruptState + } + mb.dmap = *dmap + mb.msgs -= numDeleted + bi += n + } + fs.addMsgBlock(mb) + } + } + + // Pull in last block index for the block that had last checksum when we wrote the full state. + blkIndex := uint32(readU64()) + var lchk [8]byte + if bi+len(lchk) > len(buf) { + bi = -1 + } else { + copy(lchk[0:], buf[bi:bi+len(lchk)]) + } + + // Check if we had any errors. + if bi < 0 { + os.Remove(fn) + return errCorruptState + } + + // Grab the max blk index we see from scanning the directory. The full snapshot has the index that was lmb when + // we created it, so with that and max we know blocks to process. We do this in parallel in casee lots of blks. + blkFiles := <-rch + + defer func() { + // Make sure we saw all of our blk files. + for _, mb := range fs.blks { + if _, ok := blkFiles.blksSeen[mb.index]; !ok { + if ld, _, _ := mb.rebuildState(); ld != nil { + // If we have lost data make sure we track here. 
+ fs.addLostData(ld) + rerr = errCorruptState + } + } + } + }() + + // Move into place our state, msgBlks and subject info. + fs.state = state + + // If our saved state is past what we see on disk, fallback and rebuild. + if blkFiles != nil && blkFiles.maxIndex < blkIndex { + return errPriorState + } + + // First let's check the happy path, open the blk file that was the lmb when we created the full state. + // See if we have the last block available. + var matched bool + var mb *msgBlock + if mb = fs.bim[blkIndex]; mb != nil { + matched = bytes.Equal(mb.lastChecksum(), lchk[:]) + if matched && blkIndex == blkFiles.maxIndex { + return nil + } + // Remove the last message block since we will re-process below. + fs.removeMsgBlockFromList(mb) + } + + // If we are here we did not match the happy path. + // We need to go through and find our checksum. This should be in blkIndex, but might not be. + start, stop := blkIndex, blkFiles.maxIndex + if matched { + start++ + } + + for bi := start; bi <= stop; bi++ { + nmb, err := fs.recoverMsgBlock(bi) + if err != nil { + return err + } + if nmb != nil { + // Check if we have to account for a partial message block. + if !matched && mb != nil && mb.index == nmb.index { + if err := fs.adjustAccounting(mb, nmb); err != nil { + return err + } + } + // Update top level accounting. + if fs.state.FirstSeq == 0 || nmb.first.seq < fs.state.FirstSeq { + fs.state.FirstSeq = nmb.first.seq + fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC() + } + if nmb.last.seq > fs.state.LastSeq { + fs.state.LastSeq = nmb.last.seq + fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC() + } + fs.state.Msgs += nmb.msgs + fs.state.Bytes += nmb.bytes + } + } + + return nil +} + +// adjustAccounting will be called when a stream state was only partially accounted for +// with a message block, e.g. additional records were added after the stream state. +// Lock should be held. 
+func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) error { + nmb.mu.Lock() + defer nmb.mu.Unlock() + + // First make sure the new block is loaded. + if nmb.cacheNotLoaded() { + nmb.loadMsgsWithLock() + } + nmb.ensurePerSubjectInfoLoaded() + + lookupAndAdjust := func(seq uint64) error { + var smv StoreMsg + // Lookup the message. + sm, err := nmb.cacheLookup(seq, &smv) + if err != nil { + return err + } + // Since we found it we just need to adjust fs totals and psim. + fs.state.Msgs-- + fs.state.Bytes -= fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) + if len(sm.subj) > 0 && fs.psim != nil { + fs.removePerSubject(sm.subj) + } + return nil + } + + // Walk all the original mb's sequences that were included in the stream state. + for seq := mb.first.seq; seq <= mb.last.seq; seq++ { + // If we had already declared it deleted we can move on since you can not undelete. + if mb.dmap.Exists(seq) { + continue + } + // Lookup the message. + if err := lookupAndAdjust(seq); err != nil { + return err + } + } + + // Now check to see if we had a higher first for the recovered state mb vs nmb. + if nmb.first.seq < mb.first.seq { + for seq := nmb.first.seq; seq < mb.first.seq; seq++ { + // Lookup the message. + if err := lookupAndAdjust(seq); err != nil { + return err + } + } + // Now set first for nmb. + nmb.first = mb.first + } + + return nil +} + +// Grabs last checksum for the named block file. +// Takes into account encryption etc. +func (mb *msgBlock) lastChecksum() []byte { + f, err := os.Open(mb.mfn) + if err != nil { + return nil + } + defer f.Close() + + var lchk [8]byte + if fi, _ := f.Stat(); fi != nil { + mb.rbytes = uint64(fi.Size()) + } + if mb.rbytes < checksumSize { + return nil + } + // Encrypted? + // Check for encryption, we do not load keys on startup anymore so might need to load them here. 
+ if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) { + if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil { + return nil + } + } + if mb.bek != nil { + if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize { + bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) + if err != nil { + return nil + } + mb.bek = bek + mb.bek.XORKeyStream(buf, buf) + copy(lchk[0:], buf[len(buf)-checksumSize:]) + } + } else { + f.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize) + } + return lchk[:] } func (fs *fileStore) recoverMsgs() error { @@ -1260,55 +1760,65 @@ func (fs *fileStore) recoverMsgs() error { defer fs.mu.Unlock() // Check for any left over purged messages. - pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) <-dios + pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) if _, err := os.Stat(pdir); err == nil { os.RemoveAll(pdir) } + mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) + f, err := os.Open(mdir) + if err != nil { + dios <- struct{}{} + return errNotReadable + } + dirs, err := f.ReadDir(-1) + f.Close() dios <- struct{}{} - mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) - fis, err := os.ReadDir(mdir) if err != nil { return errNotReadable } - // Recover all of the msg blocks. - // These can come in a random order, so account for that. - for _, fi := range fis { - var index uint32 + indices := make(sort.IntSlice, 0, len(dirs)) + var index int + for _, fi := range dirs { if n, err := fmt.Sscanf(fi.Name(), blkScan, &index); err == nil && n == 1 { - finfo, err := fi.Info() - if err != nil { - return err + indices = append(indices, index) + } + } + indices.Sort() + + // Recover all of the msg blocks. + // We now guarantee they are coming in order. + for _, index := range indices { + if mb, err := fs.recoverMsgBlock(uint32(index)); err == nil && mb != nil { + // This is a truncate block with possibly no index. If the OS got shutdown + // out from underneath of us this is possible. 
+ if mb.first.seq == 0 { + mb.dirtyCloseWithRemove(true) + fs.removeMsgBlockFromList(mb) + continue } - if mb, err := fs.recoverMsgBlock(finfo, index); err == nil && mb != nil { - // This is a truncate block with possibly no index. If the OS got shutdown - // out from underneath of us this is possible. - if mb.first.seq == 0 { - mb.dirtyCloseWithRemove(true) - fs.removeMsgBlockFromList(mb) - continue - } - if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq { - fs.state.FirstSeq = mb.first.seq + if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq { + fs.state.FirstSeq = mb.first.seq + if mb.first.ts == 0 { + fs.state.FirstTime = time.Time{} + } else { fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() } - if mb.last.seq > fs.state.LastSeq { - fs.state.LastSeq = mb.last.seq - fs.state.LastTime = time.Unix(0, mb.last.ts).UTC() - } - fs.state.Msgs += mb.msgs - fs.state.Bytes += mb.bytes - } else { - return err } + if mb.last.seq > fs.state.LastSeq { + fs.state.LastSeq = mb.last.seq + fs.state.LastTime = time.Unix(0, mb.last.ts).UTC() + } + fs.state.Msgs += mb.msgs + fs.state.Bytes += mb.bytes + } else { + return err } } - // Now make sure to sort blks for efficient lookup later with selectMsgBlock(). if len(fs.blks) > 0 { - sort.Slice(fs.blks, func(i, j int) bool { return fs.blks[i].index < fs.blks[j].index }) fs.lmb = fs.blks[len(fs.blks)-1] } else { _, err = fs.newMsgBlockForWrite() @@ -1319,12 +1829,7 @@ func (fs *fileStore) recoverMsgs() error { var emptyBlks []*msgBlock for _, mb := range fs.blks { if mb.msgs == 0 && mb.rbytes == 0 { - if mb == fs.lmb { - mb.first.seq, mb.first.ts = mb.last.seq+1, 0 - mb.closeAndKeepIndex(false) - } else { - emptyBlks = append(emptyBlks, mb) - } + emptyBlks = append(emptyBlks, mb) } } for _, mb := range emptyBlks { @@ -1354,21 +1859,6 @@ func (fs *fileStore) recoverMsgs() error { } } - // Limits checks and enforcement. 
- fs.enforceMsgLimit() - fs.enforceBytesLimit() - - // Do age checks too, make sure to call in place. - if fs.cfg.MaxAge != 0 { - fs.expireMsgsOnRecover() - fs.startAgeChk() - } - - // If we have max msgs per subject make sure the is also enforced. - if fs.cfg.MaxMsgsPer > 0 { - fs.enforceMsgPerSubjectLimit() - } - return nil } @@ -1376,8 +1866,10 @@ func (fs *fileStore) recoverMsgs() error { // We will treat this differently in case we have a recovery // that will expire alot of messages on startup. // Should only be called on startup. -// Lock should be held. func (fs *fileStore) expireMsgsOnRecover() { + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.state.Msgs == 0 { return } @@ -1387,15 +1879,20 @@ func (fs *fileStore) expireMsgsOnRecover() { var deleted int var nts int64 - deleteEmptyBlock := func(mb *msgBlock) bool { - // If we are the last keep state to remember first sequence. + // If we expire all make sure to write out a tombstone. Need to be done by hand here, + // usually taken care of by fs.removeMsgBlock() but we do not call that here. + var last msgId + + deleteEmptyBlock := func(mb *msgBlock) { + // If we are the last keep state to remember first/last sequence. + // Do this part by hand since not deleting one by one. if mb == fs.lmb { - // Do this part by hand since not deleting one by one. - mb.first.seq, mb.first.ts = mb.last.seq+1, 0 - mb.closeAndKeepIndex(false) - // Clear any global subject state. - fs.psim = make(map[string]*psi) - return false + last = mb.last + } + // Make sure we do subject cleanup as well. + mb.ensurePerSubjectInfoLoaded() + for subj := range mb.fss { + fs.removePerSubject(subj) } // Make sure we do subject cleanup as well. 
mb.ensurePerSubjectInfoLoaded() @@ -1404,7 +1901,6 @@ func (fs *fileStore) expireMsgsOnRecover() { } mb.dirtyCloseWithRemove(true) deleted++ - return true } for _, mb := range fs.blks { @@ -1418,11 +1914,8 @@ func (fs *fileStore) expireMsgsOnRecover() { if mb.last.ts <= minAge { purged += mb.msgs bytes += mb.bytes - didRemove := deleteEmptyBlock(mb) + deleteEmptyBlock(mb) mb.mu.Unlock() - if !didRemove { - mb.writeIndexInfo() - } continue } @@ -1488,14 +1981,10 @@ func (fs *fileStore) expireMsgsOnRecover() { mb.selectNextFirst() } // Check if empty after processing, could happen if tail of messages are all deleted. - needWriteIndex := true if mb.msgs == 0 { - needWriteIndex = !deleteEmptyBlock(mb) + deleteEmptyBlock(mb) } mb.mu.Unlock() - if needWriteIndex { - mb.writeIndexInfo() - } break } @@ -1532,6 +2021,15 @@ func (fs *fileStore) expireMsgsOnRecover() { } // Make sure to we properly set the fs first sequence and timestamp. fs.selectNextFirst() + + // Check if we have no messages and blocks left. + if fs.lmb == nil && last.seq != 0 { + if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil { + lmb.writeTombstone(last.seq, last.ts) + } + // Clear any global subject state. + fs.psim = make(map[string]*psi) + } } func copyMsgBlocks(src []*msgBlock) []*msgBlock { @@ -1584,12 +2082,17 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor mb.mu.Lock() defer mb.mu.Unlock() - if err := mb.ensurePerSubjectInfoLoaded(); err != nil { - return nil, false, err - } - fseq, isAll, subs := start, filter == _EMPTY_ || filter == fwcs, []string{filter} + if mb.cacheNotLoaded() { + if err := mb.loadMsgsWithLock(); err != nil { + return nil, false, err + } + if err := mb.ensurePerSubjectInfoLoaded(); err != nil { + return nil, false, err + } + } + // If we only have 1 subject currently and it matches our filter we can also set isAll. 
if !isAll && len(mb.fss) == 1 { _, isAll = mb.fss[filter] @@ -1630,12 +2133,6 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor return nil, false, ErrStoreMsgNotFound } - if mb.cacheNotLoaded() { - if err := mb.loadMsgsWithLock(); err != nil { - return nil, false, err - } - } - if sm == nil { sm = new(StoreMsg) } @@ -1647,10 +2144,10 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor continue } expireOk := seq == mb.last.seq && mb.llseq == seq + if isAll { + return fsm, expireOk, nil + } if doLinearScan { - if isAll { - return fsm, expireOk, nil - } if wc && subjectIsSubsetMatch(fsm.subj, filter) { return fsm, expireOk, nil } else if !wc && fsm.subj == filter { @@ -2268,11 +2765,6 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { if lmb := fs.lmb; lmb != nil { index = lmb.index + 1 - // Make sure to write out our index file if needed. - if lmb.indexNeedsUpdate() { - lmb.writeIndexInfo() - } - // Determine if we can reclaim any resources here. if fs.fip { lmb.mu.Lock() @@ -2291,6 +2783,13 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { mb.mu.Lock() mb.setupWriteCache(rbuf) mb.fss = make(map[string]*SimpleState) + + // Set cache time to creation time to start. + ts := time.Now().UnixNano() + mb.llts, mb.lwts = 0, ts + // Remember our last sequence number. + mb.first.seq = fs.state.LastSeq + 1 + mb.last.seq = fs.state.LastSeq mb.mu.Unlock() // Now do local hash. @@ -2310,17 +2809,6 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { } mb.mfd = mfd - mb.ifn = filepath.Join(mdir, fmt.Sprintf(indexScan, mb.index)) - ifd, err := os.OpenFile(mb.ifn, os.O_CREATE|os.O_RDWR, defaultFilePerms) - if err != nil { - mb.dirtyCloseWithRemove(true) - return nil, fmt.Errorf("Error creating msg index file [%q]: %v", mb.mfn, err) - } - mb.ifd = ifd - - // For subject based info. 
- mb.sfn = filepath.Join(mdir, fmt.Sprintf(fssScan, mb.index)) - // Check if encryption is enabled. if fs.prf != nil { if err := fs.genEncryptionKeysForBlock(mb); err != nil { @@ -2328,16 +2816,6 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { } } - // Set cache time to creation time to start. - ts := time.Now().UnixNano() - // Race detector wants these protected. - mb.mu.Lock() - mb.llts, mb.lwts = 0, ts - // Remember our last sequence number. - mb.first.seq = fs.state.LastSeq + 1 - mb.last.seq = fs.state.LastSeq - mb.mu.Unlock() - // If we know we will need this so go ahead and spin up. if !fs.fip { mb.spinUpFlushLoop() @@ -2346,6 +2824,10 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { // Add to our list of blocks and mark as last. fs.addMsgBlock(mb) + if fs.dirty > 0 { + fs.kickFlushStateLoop() + } + return mb, nil } @@ -2426,6 +2908,10 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts in return err } + // Mark dirty here since we added in a new message. + // We do not kick the flusher, that happens on new msg block for write or Stop(). + fs.dirty++ + // Adjust top level tracking of per subject msg counts. if len(subj) > 0 { index := fs.lmb.index @@ -2471,6 +2957,12 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts in } } } + } else if mb := fs.selectMsgBlock(fseq); mb != nil { + // If we are here we could not remove fseq from above, so rebuild. + var ld *LostStreamData + if ld, _, _ = mb.rebuildState(); ld != nil { + fs.rebuildStateLocked(ld) + } } } @@ -2547,10 +3039,6 @@ func (mb *msgBlock) skipMsg(seq uint64, now time.Time) { mb.last.ts = nowts mb.first.seq = seq + 1 mb.first.ts = nowts - // Take care of index if needed. 
- if nowts-mb.lwits > wiThresh { - mb.writeIndexInfoLocked() - } } else { needsRecord = true mb.dmap.Insert(seq) @@ -2593,15 +3081,12 @@ func (fs *fileStore) rebuildFirst() { return } - fmb.removeIndexFile() - ld, _ := fmb.rebuildState() + ld, _, _ := fmb.rebuildState() fmb.mu.RLock() isEmpty := fmb.msgs == 0 fmb.mu.RUnlock() if isEmpty { fs.removeMsgBlock(fmb) - } else { - fmb.writeIndexInfo() } fs.selectNextFirst() fs.rebuildStateLocked(ld) @@ -2708,9 +3193,7 @@ func (fs *fileStore) enforceMsgPerSubjectLimit() { // Clear any global subject state. fs.psim = make(map[string]*psi) for _, mb := range fs.blks { - mb.removeIndexFile() - ld, err := mb.rebuildState() - mb.writeIndexInfo() + ld, _, err := mb.rebuildState() if err != nil && ld != nil { fs.addLostData(ld) } @@ -2771,7 +3254,6 @@ func (fs *fileStore) enforceMsgPerSubjectLimit() { // Now write updated index for all affected msgBlks. for mb := range blks { - mb.writeIndexInfo() mb.tryForceExpireCacheLocked() } } @@ -2804,7 +3286,6 @@ func (fs *fileStore) removePerSubject(subj string) { if len(subj) == 0 { return } - // We do not update sense of fblk here but will do so when we resolve during lookup. if info, ok := fs.psim[subj]; ok { info.total-- @@ -2836,7 +3317,7 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( fsUnlock() return false, ErrStoreClosed } - if fs.sips > 0 { + if !viaLimits && fs.sips > 0 { fsUnlock() return false, ErrStoreSnapshotInProgress } @@ -2944,6 +3425,9 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( mb.bytes = 0 } + // Mark as dirty for stream state. + fs.dirty++ + // If we are tracking subjects here make sure we update that accounting. 
mb.ensurePerSubjectInfoLoaded() @@ -2960,9 +3444,6 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( fifo := seq == mb.first.seq isLastBlock := mb == fs.lmb isEmpty := mb.msgs == 0 - // If we are removing the message via limits we do not need to write the index file here. - // If viaLimits this means on a restart we will properly cleanup these messages regardless. - shouldWriteIndex := !isEmpty && !viaLimits if fifo { mb.selectNextFirst() @@ -2970,7 +3451,11 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( // Can update this one in place. if seq == fs.state.FirstSeq { fs.state.FirstSeq = mb.first.seq // new one. - fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() + if mb.first.ts == 0 { + fs.state.FirstTime = time.Time{} + } else { + fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() + } } } } else if !isEmpty { @@ -2990,70 +3475,40 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( } } - var firstSeqNeedsUpdate bool - - // Decide how we want to clean this up. If last block and the only block left we will hold into index. - if isEmpty { - if isLastBlock { - mb.closeAndKeepIndex(viaLimits) - // We do not need to writeIndex since just did above. - shouldWriteIndex = false - } else { - fs.removeMsgBlock(mb) - } - firstSeqNeedsUpdate = seq == fs.state.FirstSeq - } - - var qch, fch chan struct{} - if shouldWriteIndex { - qch, fch = mb.qch, mb.fch - } - cb := fs.scb - if secure { if ld, _ := mb.flushPendingMsgsLocked(); ld != nil { // We have the mb lock here, this needs the mb locks so do in its own go routine. go fs.rebuildState(ld) } } - // Check if we need to write the index file and we are flush in place (fip). - if shouldWriteIndex && fs.fip { - // Check if this is the first message, common during expirations etc. 
- threshold := wiThresh - if !fifo { - // For out-of-order deletes, we will have a shorter threshold, but - // still won't write the index for every single delete. - threshold = winfThresh - } - if time.Now().UnixNano()-mb.lwits > threshold { - mb.writeIndexInfoLocked() - } + + // If empty remove this block and check if we need to update first sequence. + // We will write a tombstone at the end. + var firstSeqNeedsUpdate bool + if isEmpty { + fs.removeMsgBlock(mb) + firstSeqNeedsUpdate = seq == fs.state.FirstSeq } mb.mu.Unlock() - // Kick outside of lock. - if !fs.fip && shouldWriteIndex { - if qch == nil { - mb.spinUpFlushLoop() - } - select { - case fch <- struct{}{}: - default: - } - } - - // If we emptied the current message block and the seq was state.First.Seq + // If we emptied the current message block and the seq was state.FirstSeq // then we need to jump message blocks. We will also write the index so // we don't lose track of the first sequence. if firstSeqNeedsUpdate { fs.selectNextFirst() - // Write out the new first message block if we have one. - // We can ignore if we really have not changed message blocks from above. - if len(fs.blks) > 0 && fs.blks[0] != mb { - fmb := fs.blks[0] - fmb.writeIndexInfo() - } } + + // Check if we need to write a deleted record tombstone. + // This is for user initiated removes or to hold the first seq + // when the last block is empty. + if !viaLimits || (isEmpty && isLastBlock) { + if lmb := fs.lmb; sm != nil && lmb != nil { + lmb.writeTombstone(sm.seq, sm.ts) + } + fs.kickFlushStateLoop() + } + + cb := fs.scb fs.mu.Unlock() // Storage updates. @@ -3116,10 +3571,12 @@ func (mb *msgBlock) compact() { } // Only need to process non-deleted messages. seq := le.Uint64(hdr[4:]) + if !isDeleted(seq) { // Normal message here. nbuf = append(nbuf, buf[index:index+rl]...) - if !firstSet { + // Do not set based on tombstone. 
+ if !firstSet && seq&tbit == 0 { firstSet = true mb.first.seq = seq } @@ -3167,7 +3624,6 @@ func (mb *msgBlock) compact() { } // Remove index file and wipe delete map, then rebuild. - mb.removeIndexFileLocked() mb.deleteDmap() mb.rebuildStateLocked() @@ -3269,22 +3725,6 @@ func (mb *msgBlock) flushLoop(fch, qch chan struct{}) { mb.setInFlusher() defer mb.clearInFlusher() - // Will use to test if we have meta data updates. - var firstSeq, lastSeq uint64 - var dmapLen int - - infoChanged := func() bool { - mb.mu.RLock() - defer mb.mu.RUnlock() - var changed bool - if firstSeq != mb.first.seq || lastSeq != mb.last.seq || dmapLen != mb.dmap.Size() { - changed = true - firstSeq, lastSeq = mb.first.seq, mb.last.seq - dmapLen = mb.dmap.Size() - } - return changed - } - for { select { case <-fch: @@ -3319,9 +3759,6 @@ func (mb *msgBlock) flushLoop(fch, qch chan struct{}) { } } } - if infoChanged() { - mb.writeIndexInfo() - } case <-qch: return } @@ -3340,7 +3777,7 @@ func (mb *msgBlock) eraseMsg(seq uint64, ri, rl int) error { // Randomize record data := make([]byte, rl-emptyRecordLen) - crand.Read(data) + rand.Read(data) // Now write to underlying buffer. var b bytes.Buffer @@ -3502,8 +3939,6 @@ func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) { mb.mu.Unlock() - // Write our index file. - mb.writeIndexInfo() // Load msgs again. mb.loadMsgs() @@ -3708,11 +4143,7 @@ func (mb *msgBlock) expireCacheLocked() { // Check if we can clear out our fss and idx unless under force expire. // We used to hold onto the idx longer but removes need buf now so no point. 
- mb.writePerSubjectInfo() mb.fss = nil - if mb.indexNeedsUpdateLocked() { - mb.writeIndexInfoLocked() - } mb.clearCache() } @@ -3793,9 +4224,6 @@ func (fs *fileStore) checkAndFlushAllBlocks() { fs.rebuildStateLocked(ld) } } - if mb.indexNeedsUpdate() { - mb.writeIndexInfo() - } } } @@ -3810,7 +4238,8 @@ func (fs *fileStore) checkMsgs() *LostStreamData { fs.psim = make(map[string]*psi) for _, mb := range fs.blks { - if ld, err := mb.rebuildState(); err != nil && ld != nil { + // FIXME(dlc) - check tombstones here too? + if ld, _, err := mb.rebuildState(); err != nil && ld != nil { // Rebuild fs state too. mb.fs.rebuildStateLocked(ld) } @@ -3842,16 +4271,18 @@ func (mb *msgBlock) enableForWriting(fip bool) error { return nil } +// Helper function to place a delete tombstone. +// Lock should be held. +func (mb *msgBlock) writeTombstone(seq uint64, ts int64) error { + return mb.writeMsgRecord(emptyRecordLen, seq|tbit, _EMPTY_, nil, nil, ts, true) +} + // Will write the message record to the underlying message block. // filestore lock will be held. func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush bool) error { mb.mu.Lock() defer mb.mu.Unlock() - // Make sure we have a cache setup. - if mb.cache == nil { - mb.setupWriteCache(nil) - } // Enable for writing if our mfd is not open. if mb.mfd == nil { if err := mb.enableForWriting(flush); err != nil { @@ -3859,6 +4290,11 @@ func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte } } + // Make sure we have a cache setup. + if mb.cache == nil { + mb.setupWriteCache(nil) + } + // Check if we are tracking per subject for our simple state. // Do this before changing the cache that would trigger a flush pending msgs call // if we needed to regenerate the per subject info. 
@@ -3872,7 +4308,6 @@ func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte } else { mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} } - mb.fssNeedsWrite = true } // Indexing @@ -3926,20 +4361,23 @@ func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte // Update write through cache. // Write to msg record. mb.cache.buf = append(mb.cache.buf, checksum...) - // Write index - mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit) mb.cache.lrl = uint32(rl) - if mb.cache.fseq == 0 { - mb.cache.fseq = seq - } // Set cache timestamp for last store. mb.lwts = ts - // Decide if we write index info if flushing in place. - writeIndex := ts-mb.lwits > wiThresh - // Accounting - mb.updateAccounting(seq, ts, rl) + // Only update index and do accounting if not a delete tombstone. + if seq&tbit == 0 { + // Strip ebit if set. + seq = seq &^ ebit + if mb.cache.fseq == 0 { + mb.cache.fseq = seq + } + // Write index + mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit) + // Accounting + mb.updateAccounting(seq, ts, rl) + } fch, werr := mb.fch, mb.werr @@ -3953,11 +4391,6 @@ func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte if err != nil { return err } - if writeIndex { - // If this fails still proceed on since the write above succeeded. - // We can recover this condition. - mb.writeIndexInfoLocked() - } } else { // Kick the flusher here. kickFlusher(fch) @@ -4002,10 +4435,6 @@ func (mb *msgBlock) closeFDsLockedNoCheck() { mb.mfd.Close() mb.mfd = nil } - if mb.ifd != nil { - mb.ifd.Close() - mb.ifd = nil - } } // bytesPending returns the buffer to be used for writing to the underlying file. 
@@ -4044,7 +4473,7 @@ func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) { seq = seq &^ ebit } - if mb.first.seq == 0 || mb.first.ts == 0 { + if (mb.first.seq == 0 || mb.first.ts == 0) && seq >= mb.first.seq { mb.first.seq = seq mb.first.ts = ts } @@ -4052,7 +4481,6 @@ func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) { atomic.StoreUint64(&mb.last.seq, seq) mb.last.ts = ts mb.rbytes += rl - // Only update this accounting if message is not a deleted message. if !isDeleted { mb.bytes += rl mb.msgs++ @@ -4070,6 +4498,10 @@ func (fs *fileStore) writeMsgRecord(seq uint64, ts int64, subj string, hdr, msg } // Grab our current last message block. mb := fs.lmb + + // Mark as dirty for stream state. + fs.dirty++ + if mb == nil || mb.msgs > 0 && mb.blkSize()+rl > fs.fcfg.BlockSize { if mb != nil && fs.fcfg.Compression != NoCompression { // We've now reached the end of this message block, if we want @@ -4253,21 +4685,14 @@ func (fs *fileStore) syncBlocks() { if mb.pendingWriteSize() > 0 { mb.flushPendingMsgs() } - if mb.indexNeedsUpdate() { - mb.writeIndexInfo() - } // Do actual sync. Hold lock for consistency. mb.mu.Lock() if !mb.closed { if mb.mfd != nil { mb.mfd.Sync() } - if mb.ifd != nil { - mb.ifd.Truncate(mb.liwsz) - mb.ifd.Sync() - } // See if we can close FDs due to being idle. - if mb.ifd != nil || mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle { + if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle { mb.dirtyCloseWithRemove(false) } } @@ -4365,27 +4790,38 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error { buf = append(mb.cache.buf, buf...) } - lbuf := uint32(len(buf)) + // Create FSS if we should track. 
+ if !mb.noTrack { + mb.fss = make(map[string]*SimpleState) + } + lbuf := uint32(len(buf)) for index < lbuf { if index+msgHdrSize > lbuf { return errCorruptState } hdr := buf[index : index+msgHdrSize] - rl, seq, slen := le.Uint32(hdr[0:]), le.Uint64(hdr[4:]), le.Uint16(hdr[20:]) + rl, seq, slen := le.Uint32(hdr[0:]), le.Uint64(hdr[4:]), int(le.Uint16(hdr[20:])) // Clear any headers bit that could be set. rl &^= hbit dlen := int(rl) - msgHdrSize // Do some quick sanity checks here. - if dlen < 0 || int(slen) > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh { + if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh { // This means something is off. // TODO(dlc) - Add into bad list? return errCorruptState } - // Clear erase bit. + // Check for tombstones which we can skip in terms of indexing. + if seq&tbit != 0 { + index += rl + continue + } + + // Clear any erase bits. + erased := seq&ebit != 0 seq = seq &^ ebit // We defer checksum checks to individual msg cache lookups to amortorize costs and @@ -4400,16 +4836,29 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error { } } pseq = seq - + // Add to our index. idx = append(idx, index) mb.cache.lrl = uint32(rl) // Adjust if we guessed wrong. if seq != 0 && seq < fseq { fseq = seq } + + // Handle FSS inline here. 
+ if slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) { + bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)] + if ss := mb.fss[string(bsubj)]; ss != nil { + ss.Msgs++ + ss.Last = seq + } else { + subj := mb.subjString(bsubj) + mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} + } + } } index += rl } + mb.cache.buf = buf mb.cache.idx = idx mb.cache.fseq = fseq @@ -4490,12 +4939,10 @@ func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) { for lbb := lob; lbb > 0; lbb = len(buf) { n, err := mb.writeAt(buf, woff) if err != nil { - mb.removePerSubjectInfoLocked() - mb.removeIndexFileLocked() mb.dirtyCloseWithRemove(false) - fsLostData, _ := mb.rebuildStateLocked() + ld, _, _ := mb.rebuildStateLocked() mb.werr = err - return fsLostData, err + return ld, err } // Update our write offset. woff += int64(n) @@ -4613,11 +5060,22 @@ func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) { } n, err := io.ReadFull(f, buf) + // On success capture raw bytes size. + if err == nil { + mb.rbytes = uint64(n) + } return buf[:n], err } // Lock should be held. func (mb *msgBlock) loadMsgsWithLock() error { + // Check for encryption, we do not load keys on startup anymore so might need to load them here. + if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) { + if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil { + return err + } + } + // Check to see if we are loading already. if mb.loading { return nil @@ -4685,7 +5143,7 @@ checkCache: if err := mb.indexCacheBuf(buf); err != nil { if err == errCorruptState { var ld *LostStreamData - if ld, err = mb.rebuildStateLocked(); ld != nil { + if ld, _, err = mb.rebuildStateLocked(); ld != nil { // We do not know if fs is locked or not at this point. // This should be an exceptional condition so do so in Go routine. 
go mb.fs.rebuildState(ld) @@ -4732,26 +5190,27 @@ var ( errNoPending = errors.New("message block does not have pending data") errNotReadable = errors.New("storage directory not readable") errCorruptState = errors.New("corrupt state file") + errPriorState = errors.New("prior state file") errPendingData = errors.New("pending data still present") errNoEncryption = errors.New("encryption not enabled") errBadKeySize = errors.New("encryption bad key size") errNoMsgBlk = errors.New("no message block") - errMsgBlkClosed = errors.New("message block is closed") errMsgBlkTooBig = errors.New("message block size exceeded int capacity") errUnknownCipher = errors.New("unknown cipher") - errDIOStalled = errors.New("IO is stalled") errNoMainKey = errors.New("encrypted store encountered with no main key") ) -// Used for marking messages that have had their checksums checked. -// Used to signal a message record with headers. -const hbit = 1 << 31 - -// Used for marking erased messages sequences. -const ebit = 1 << 63 - -// Used to mark a bad index as deleted. -const dbit = 1 << 30 +const ( + // Used for marking messages that have had their checksums checked. + // Used to signal a message record with headers. + hbit = 1 << 31 + // Used for marking erased messages sequences. + ebit = 1 << 63 + // Used for marking tombstone sequences. + tbit = 1 << 62 + // Used to mark a bad index as deleted. + dbit = 1 << 30 +) // Will do a lookup from cache. // Lock should be held. @@ -4806,7 +5265,7 @@ func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) { return nil, err } - // Deleted messages that are decoded return a 0 for seqeunce. + // Deleted messages that are decoded return a 0 for sequence. if fsm.seq == 0 { return nil, errDeletedMsg } @@ -4974,19 +5433,19 @@ func subjFromBytes(b []byte) string { // Given the `key` byte slice, this function will return the subject // as an interned string of `key` or a configured subject as to minimize memory allocations. 
// Lock should be held. -func (mb *msgBlock) subjString(skey []byte) string { - if len(skey) == 0 { +func (fs *fileStore) subjString(skey []byte) string { + if fs == nil || len(skey) == 0 { return _EMPTY_ } - if lsubjs := len(mb.fs.cfg.Subjects); lsubjs > 0 { + if lsubjs := len(fs.cfg.Subjects); lsubjs > 0 { if lsubjs == 1 { // The cast for the comparison does not make a copy - if string(skey) == mb.fs.cfg.Subjects[0] { - return mb.fs.cfg.Subjects[0] + if string(skey) == fs.cfg.Subjects[0] { + return fs.cfg.Subjects[0] } } else { - for _, subj := range mb.fs.cfg.Subjects { + for _, subj := range fs.cfg.Subjects { if string(skey) == subj { return subj } @@ -4996,6 +5455,13 @@ func (mb *msgBlock) subjString(skey []byte) string { return subjFromBytes(skey) } +// Given the `key` byte slice, this function will return the subject +// as an interned string of `key` or a configured subject as to minimize memory allocations. +// Lock should be held. +func (mb *msgBlock) subjString(skey []byte) string { + return mb.fs.subjString(skey) +} + // LoadMsg will lookup the message by sequence number and return it if found. func (fs *fileStore) LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) { return fs.msgForSeq(seq, sm) @@ -5223,104 +5689,6 @@ func (mb *msgBlock) sinceLastWriteActivity() time.Duration { return time.Since(time.Unix(0, last).UTC()) } -// Determine if we need to write out this index info. -func (mb *msgBlock) indexNeedsUpdate() bool { - mb.mu.RLock() - defer mb.mu.RUnlock() - return mb.indexNeedsUpdateLocked() -} - -// Determine if we need to write out this index info. -// Lock should be held. -func (mb *msgBlock) indexNeedsUpdateLocked() bool { - return mb.lwits < mb.lwts || mb.lwits < mb.lrts -} - -// Write index info to the appropriate file. -// Filestore lock should be held. -func (mb *msgBlock) writeIndexInfo() error { - mb.mu.Lock() - defer mb.mu.Unlock() - return mb.writeIndexInfoLocked() -} - -// Write index info to the appropriate file. 
-// Filestore lock and mb lock should be held. -func (mb *msgBlock) writeIndexInfoLocked() error { - if mb.closed { - return errMsgBlkClosed - } - - // HEADER: magic version msgs bytes fseq fts lseq lts ndel checksum - // Make large enough to hold almost all possible maximum interior delete scenarios. - var hdr [42 * 1024]byte - - // Write header - hdr[0] = magic - hdr[1] = newVersion - - n := hdrLen - n += binary.PutUvarint(hdr[n:], mb.msgs) - n += binary.PutUvarint(hdr[n:], mb.bytes) - n += binary.PutUvarint(hdr[n:], mb.first.seq) - n += binary.PutVarint(hdr[n:], mb.first.ts) - n += binary.PutUvarint(hdr[n:], mb.last.seq) - n += binary.PutVarint(hdr[n:], mb.last.ts) - n += binary.PutUvarint(hdr[n:], uint64(mb.dmap.Size())) - buf := append(hdr[:n], mb.lchk[:]...) - - // Append a delete map if needed - if !mb.dmap.IsEmpty() { - // Always attempt to tack it onto end. - dmap, err := mb.dmap.Encode(hdr[len(buf):]) - if err != nil { - return err - } - if len(dmap) < cap(hdr)-len(buf) { - buf = hdr[:len(buf)+len(dmap)] - } else { - buf = append(buf, dmap...) - } - } - - // Open our FD if needed. - if mb.ifd == nil { - ifd, err := os.OpenFile(mb.ifn, os.O_CREATE|os.O_RDWR, defaultFilePerms) - if err != nil { - return err - } - if fi, _ := ifd.Stat(); fi != nil { - mb.liwsz = fi.Size() - } - mb.ifd = ifd - } - - // Encrypt if needed. - if mb.aek != nil { - buf = mb.aek.Seal(buf[:0], mb.nonce, buf, nil) - } - - // Check if this will be a short write, and if so truncate before writing here. - // We only really need to truncate if we are encryptyed or we have dmap entries. - // If no dmap entries readIndexInfo does the right thing in the presence of extra data left over. 
- if int64(len(buf)) < mb.liwsz && (mb.aek != nil || !mb.dmap.IsEmpty()) { - if err := mb.ifd.Truncate(0); err != nil { - mb.werr = err - return err - } - } - - var err error - if n, err = mb.ifd.WriteAt(buf, 0); err == nil { - mb.lwits = time.Now().UnixNano() - mb.liwsz = int64(n) - mb.werr = nil - } else { - mb.werr = err - } - return err -} - func checkNewHeader(hdr []byte) error { if hdr == nil || len(hdr) < 2 || hdr[0] != magic || (hdr[1] != version && hdr[1] != newVersion) { @@ -5331,7 +5699,8 @@ func checkNewHeader(hdr []byte) error { // readIndexInfo will read in the index information for the message block. func (mb *msgBlock) readIndexInfo() error { - buf, err := os.ReadFile(mb.ifn) + ifn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, mb.index)) + buf, err := os.ReadFile(ifn) if err != nil { return err } @@ -5350,7 +5719,7 @@ func (mb *msgBlock) readIndexInfo() error { } if err := checkNewHeader(buf); err != nil { - defer os.Remove(mb.ifn) + defer os.Remove(ifn) return fmt.Errorf("bad index file") } @@ -5392,13 +5761,13 @@ func (mb *msgBlock) readIndexInfo() error { // Check if this is a short write index file. if bi < 0 || bi+checksumSize > len(buf) { - os.Remove(mb.ifn) + os.Remove(ifn) return fmt.Errorf("short index file") } // Check for consistency if accounting. If something is off bail and we will rebuild. if mb.msgs != (mb.last.seq-mb.first.seq+1)-dmapLen { - os.Remove(mb.ifn) + os.Remove(ifn) return fmt.Errorf("accounting inconsistent") } @@ -5430,17 +5799,6 @@ func (mb *msgBlock) readIndexInfo() error { return nil } -func syncAndClose(mfd, ifd *os.File) { - if mfd != nil { - mfd.Sync() - mfd.Close() - } - if ifd != nil { - ifd.Sync() - ifd.Close() - } -} - // Will return total number of cache loads. 
func (fs *fileStore) cacheLoads() uint64 { var tl uint64 @@ -5604,8 +5962,6 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint mb.tryForceExpireCacheLocked() } mb.mu.Unlock() - // Update our index info on disk. - mb.writeIndexInfo() // Check if we should break out of top level too. if maxp > 0 && purged >= maxp { @@ -5616,9 +5972,12 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint fs.selectNextFirst() } + fs.dirty++ cb := fs.scb fs.mu.Unlock() + fs.kickFlushStateLoop() + if cb != nil { cb(-int64(purged), -int64(bytes), 0, _EMPTY_) } @@ -5655,6 +6014,10 @@ func (fs *fileStore) purge(fseq uint64) (uint64, error) { fs.blks = nil fs.lmb = nil fs.bim = make(map[uint32]*msgBlock) + // Clear any per subject tracking. + fs.psim = make(map[string]*psi) + // Mark dirty + fs.dirty++ // Move the msgs directory out of the way, will delete out of band. // FIXME(dlc) - These can error and we need to change api above to propagate? @@ -5666,7 +6029,9 @@ func (fs *fileStore) purge(fseq uint64) (uint64, error) { os.RemoveAll(pdir) } os.Rename(mdir, pdir) + go os.RemoveAll(pdir) + // Create new one. os.MkdirAll(mdir, defaultDirPerms) @@ -5681,14 +6046,17 @@ func (fs *fileStore) purge(fseq uint64) (uint64, error) { fs.state.FirstSeq = fseq fs.state.LastSeq = fseq - 1 } - fs.lmb.first.seq = fs.state.FirstSeq - fs.lmb.last.seq = fs.state.LastSeq - fs.lmb.last.ts = fs.state.LastTime.UnixNano() - fs.lmb.writeIndexInfo() + lmb := fs.lmb + lmb.first.seq = fs.state.FirstSeq + lmb.last.seq = fs.state.LastSeq + lmb.last.ts = fs.state.LastTime.UnixNano() - // Clear any per subject tracking. - fs.psim = make(map[string]*psi) + if fs.lmb.last.seq > 1 { + // Leave a tombstone so we can remember our starting sequence in case + // full state becomes corrupted. 
+ lmb.writeTombstone(fs.lmb.last.seq, fs.lmb.last.ts) + } cb := fs.scb fs.mu.Unlock() @@ -5749,7 +6117,6 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) { smb.mu.Lock() if smb.first.seq == seq { - isEmpty = smb.msgs == 0 goto SKIP } @@ -5832,18 +6199,12 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) { } // Make sure to remove fss state. smb.fss = nil - smb.removePerSubjectInfoLocked() smb.clearCacheAndOffset() smb.rbytes = uint64(len(nbuf)) } } SKIP: - if !isEmpty { - // Make sure to write out our index info. - smb.writeIndexInfoLocked() - } - smb.mu.Unlock() if deleted > 0 { @@ -5873,6 +6234,9 @@ SKIP: } fs.state.Bytes -= bytes + fs.dirty++ + fs.kickFlushStateLoop() + cb := fs.scb fs.mu.Unlock() @@ -6004,6 +6368,9 @@ func (fs *fileStore) Truncate(seq uint64) error { // Reset our subject lookup info. fs.resetGlobalPerSubjectInfo() + fs.dirty++ + fs.kickFlushStateLoop() + cb := fs.scb fs.mu.Unlock() @@ -6028,29 +6395,6 @@ func (fs *fileStore) numMsgBlocks() int { return len(fs.blks) } -// Will remove our index file. -func (mb *msgBlock) removeIndexFile() { - mb.mu.RLock() - defer mb.mu.RUnlock() - mb.removeIndexFileLocked() -} - -func (mb *msgBlock) removeIndexFileLocked() { - if mb.ifd != nil { - mb.ifd.Close() - mb.ifd = nil - } - if mb.ifn != _EMPTY_ { - os.Remove(mb.ifn) - } -} - -func (mb *msgBlock) removePerSubjectInfoLocked() { - if mb.sfn != _EMPTY_ { - os.Remove(mb.sfn) - } -} - // Will add a new msgBlock. // Lock should be held. func (fs *fileStore) addMsgBlock(mb *msgBlock) { @@ -6082,37 +6426,14 @@ func (fs *fileStore) removeMsgBlock(mb *msgBlock) { fs.removeMsgBlockFromList(mb) // Check for us being last message block if mb == fs.lmb { + last := mb.last // Creating a new message write block requires that the lmb lock is not held. mb.mu.Unlock() - fs.newMsgBlockForWrite() - mb.mu.Lock() - } -} - -// When we have an empty block but want to keep the index for timestamp info etc. -// Lock should be held. 
-func (mb *msgBlock) closeAndKeepIndex(viaLimits bool) { - // We will leave a 0 length blk marker. - if mb.mfd != nil { - mb.mfd.Truncate(0) - } else { - // We were closed, so just write out an empty file. - os.WriteFile(mb.mfn, nil, defaultFilePerms) - } - // Make sure to write the index file so we can remember last seq and ts. - mb.writeIndexInfoLocked() - // Close - mb.dirtyCloseWithRemove(false) - - // Make sure to remove fss state. - mb.fss = nil - mb.removePerSubjectInfoLocked() - - // If we are encrypted we should reset our bek counter. - if mb.bek != nil { - if bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce); err == nil { - mb.bek = bek + // Write the tombstone to remember since this was last block. + if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil { + lmb.writeTombstone(last.seq, last.ts) } + mb.mu.Lock() } } @@ -6134,13 +6455,8 @@ func (mb *msgBlock) dirtyCloseWithRemove(remove bool) { mb.ctmr.Stop() mb.ctmr = nil } - // Check if we are tracking by subject. - if mb.fss != nil { - if !remove { - mb.writePerSubjectInfo() - } - mb.fss = nil - } + // Clear any tracking by subject. + mb.fss = nil // Close cache mb.clearCacheAndOffset() // Quit our loops. @@ -6152,26 +6468,16 @@ func (mb *msgBlock) dirtyCloseWithRemove(remove bool) { mb.mfd.Close() mb.mfd = nil } - if mb.ifd != nil { - mb.ifd.Close() - mb.ifd = nil - } if remove { - if mb.ifn != _EMPTY_ { - os.Remove(mb.ifn) - mb.ifn = _EMPTY_ - } if mb.mfn != _EMPTY_ { os.Remove(mb.mfn) mb.mfn = _EMPTY_ } - if mb.sfn != _EMPTY_ { - os.Remove(mb.sfn) - mb.sfn = _EMPTY_ - } if mb.kfn != _EMPTY_ { os.Remove(mb.kfn) } + // Since we are removing a block kick the state flusher. 
+ mb.fs.kickFlushStateLoop() } } @@ -6186,7 +6492,6 @@ func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) { if ss.Msgs == 1 { delete(mb.fss, subj) - mb.fssNeedsWrite = true // Mark dirty return } @@ -6200,7 +6505,6 @@ ss.First = ss.Last } ss.firstNeedsUpdate = false - mb.fssNeedsWrite = true // Mark dirty return } @@ -6248,7 +6552,6 @@ func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *Si continue } ss.First = seq - mb.fssNeedsWrite = true // Mark dirty return } } @@ -6266,17 +6569,12 @@ func (fs *fileStore) resetGlobalPerSubjectInfo() { // Lock should be held. func (mb *msgBlock) resetPerSubjectInfo() error { mb.fss = nil - mb.removePerSubjectInfoLocked() - return mb.generatePerSubjectInfo(true) + return mb.generatePerSubjectInfo() } // generatePerSubjectInfo will generate the per subject info via the raw msg block. -func (mb *msgBlock) generatePerSubjectInfo(hasLock bool) error { - if !hasLock { - mb.mu.Lock() - defer mb.mu.Unlock() - } - +// Lock should be held. +func (mb *msgBlock) generatePerSubjectInfo() error { // Check if this mb is empty. This can happen when its the last one and we are holding onto it for seq and timestamp info. if mb.msgs == 0 { return nil @@ -6286,6 +6584,10 @@ if err := mb.loadMsgsWithLock(); err != nil { return err } + // indexCacheBuf can produce fss now, so if non-nil we are good. + if mb.fss != nil { + return nil + } } // Create new one regardless. 
@@ -6312,7 +6614,6 @@ func (mb *msgBlock) generatePerSubjectInfo(hasLock bool) error { } else { mb.fss[sm.subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} } - mb.fssNeedsWrite = true } } @@ -6324,38 +6625,6 @@ func (mb *msgBlock) generatePerSubjectInfo(hasLock bool) error { return nil } -func (mb *msgBlock) loadPerSubjectInfo() ([]byte, error) { - const ( - fileHashIndex = 16 - mbHashIndex = 8 - minFileSize = 24 - ) - - buf, err := os.ReadFile(mb.sfn) - if err != nil { - return nil, err - } - - if len(buf) < minFileSize || checkHeader(buf) != nil { - return nil, errors.New("short fss state") - } - - // Check that we did not have any bit flips. - mb.hh.Reset() - mb.hh.Write(buf[0 : len(buf)-fileHashIndex]) - fhash := buf[len(buf)-fileHashIndex : len(buf)-mbHashIndex] - if checksum := mb.hh.Sum(nil); !bytes.Equal(checksum, fhash) { - return nil, errors.New("corrupt fss state") - } - - // Make sure it matches the last update recorded. - if !bytes.Equal(buf[len(buf)-mbHashIndex:], mb.lchk[:]) { - return nil, errors.New("outdated fss state") - } - - return buf, nil -} - // Helper to make sure fss loaded if we are tracking. // Lock should be held func (mb *msgBlock) ensurePerSubjectInfoLoaded() error { @@ -6366,8 +6635,7 @@ func (mb *msgBlock) ensurePerSubjectInfoLoaded() error { mb.fss = make(map[string]*SimpleState) return nil } - // Load from file. - return mb.readPerSubjectInfo(true) + return mb.generatePerSubjectInfo() } // Called on recovery to populate the global psim state. @@ -6376,23 +6644,10 @@ func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) { mb.mu.Lock() defer mb.mu.Unlock() - if err := mb.readPerSubjectInfo(true); err != nil { + if err := mb.ensurePerSubjectInfoLoaded(); err != nil { return } - // Quick sanity check. - // TODO(dlc) - This is here to auto-clear a bug. - fssMsgs := uint64(0) - for subj, ss := range mb.fss { - if len(subj) > 0 { - fssMsgs += ss.Msgs - } - } - // If we are off rebuild. 
- if fssMsgs != mb.msgs { - mb.generatePerSubjectInfo(true) - } - // Now populate psim. for subj, ss := range mb.fss { if len(subj) > 0 { @@ -6408,113 +6663,6 @@ func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) { } } -// readPerSubjectInfo will attempt to restore the per subject information. -func (mb *msgBlock) readPerSubjectInfo(hasLock bool) error { - if mb.noTrack { - return nil - } - - buf, err := mb.loadPerSubjectInfo() - // On failure re-generate. - if err != nil { - return mb.generatePerSubjectInfo(hasLock) - } - - bi := hdrLen - readU64 := func() uint64 { - if bi < 0 { - return 0 - } - num, n := binary.Uvarint(buf[bi:]) - if n <= 0 { - bi = -1 - return 0 - } - bi += n - return num - } - - numEntries := readU64() - fss := make(map[string]*SimpleState, numEntries) - - if !hasLock { - mb.mu.Lock() - } - for i := uint64(0); i < numEntries; i++ { - lsubj := readU64() - // Make a copy or use a configured subject (to avoid mem allocation) - subj := mb.subjString(buf[bi : bi+int(lsubj)]) - bi += int(lsubj) - msgs, first, last := readU64(), readU64(), readU64() - fss[subj] = &SimpleState{Msgs: msgs, First: first, Last: last} - } - mb.fss = fss - mb.fssNeedsWrite = false - - // Make sure we run the cache expire timer. - if len(mb.fss) > 0 { - mb.llts = time.Now().UnixNano() - mb.startCacheExpireTimer() - } - - if !hasLock { - mb.mu.Unlock() - } - - return nil -} - -// writePerSubjectInfo will write out per subject information if we are tracking per subject. -// Lock should be held. -func (mb *msgBlock) writePerSubjectInfo() error { - // Raft groups do not have any subjects. 
- if len(mb.fss) == 0 || len(mb.sfn) == 0 || !mb.fssNeedsWrite { - return nil - } - var scratch [4 * binary.MaxVarintLen64]byte - var b bytes.Buffer - b.WriteByte(magic) - b.WriteByte(version) - n := binary.PutUvarint(scratch[0:], uint64(len(mb.fss))) - b.Write(scratch[0:n]) - for subj, ss := range mb.fss { - if ss.firstNeedsUpdate { - mb.recalculateFirstForSubj(subj, ss.First, ss) - } - n := binary.PutUvarint(scratch[0:], uint64(len(subj))) - b.Write(scratch[0:n]) - b.WriteString(subj) - // Encode all three parts of our simple state into same scratch buffer. - n = binary.PutUvarint(scratch[0:], ss.Msgs) - n += binary.PutUvarint(scratch[n:], ss.First) - n += binary.PutUvarint(scratch[n:], ss.Last) - b.Write(scratch[0:n]) - } - // Calculate hash for this information. - mb.hh.Reset() - mb.hh.Write(b.Bytes()) - b.Write(mb.hh.Sum(nil)) - // Now copy over checksum from the block itself, this allows us to know if we are in sync. - b.Write(mb.lchk[:]) - - // Gate this for when we have a large number of blocks expiring at the same time. - // Since we have the lock we would rather fail here then block. - // This is an optional structure that can be rebuilt on restart. - var err error - select { - case <-dios: - if err = os.WriteFile(mb.sfn, b.Bytes(), defaultFilePerms); err == nil { - // Clear write flag if no error. - mb.fssNeedsWrite = false - } - dios <- struct{}{} - default: - err = errDIOStalled - } - - return err -} - // Close the message block. func (mb *msgBlock) close(sync bool) { if mb == nil { @@ -6533,12 +6681,7 @@ func (mb *msgBlock) close(sync bool) { mb.ctmr = nil } - // Check if we are tracking by subject. 
- if len(mb.fss) > 0 && mb.fssNeedsWrite { - mb.writePerSubjectInfo() - } mb.fss = nil - mb.fssNeedsWrite = false // Close cache mb.clearCacheAndOffset() @@ -6547,18 +6690,13 @@ func (mb *msgBlock) close(sync bool) { close(mb.qch) mb.qch = nil } - if sync { - syncAndClose(mb.mfd, mb.ifd) - } else { - if mb.mfd != nil { - mb.mfd.Close() - } - if mb.ifd != nil { - mb.ifd.Close() + if mb.mfd != nil { + if sync { + mb.mfd.Sync() } + mb.mfd.Close() } mb.mfd = nil - mb.ifd = nil // Mark as closed. mb.closed = true } @@ -6620,14 +6758,178 @@ func (fs *fileStore) cancelSyncTimer() { } } +const ( + fullStateMagic = uint8(11) + fullStateVersion = uint8(1) +) + +// This go routine runs and receives kicks to write out our full stream state index. +// This will get kicked when we create a new block or when we delete a block in general. +// This is also called during Stop(). +func (fs *fileStore) flushStreamStateLoop(fch, qch, done chan struct{}) { + for { + select { + case <-fch: + fs.writeFullState() + case <-qch: + close(done) + return + } + } +} + +// Kick the flusher. +func (fs *fileStore) kickFlushStateLoop() { + kickFlusher(fs.fch) +} + +// Helper since unixnano of zero time undefined. +func timestampNormalized(t time.Time) int64 { + if t.IsZero() { + return 0 + } + return t.UnixNano() +} + +// This will write the full binary state for the stream. +// This plus everything new since last hash will be the total recovered state. +// This state dump will have the following. +// 1. Stream summary - Msgs, Bytes, First and Last (Sequence and Timestamp) +// 2. PSIM - Per Subject Index Map - Tracks first and last blocks with subjects present. +// 3. MBs - Index, Bytes, First and Last Sequence and Timestamps, and the deleted map (avl.seqset). +// 4. Last block index and hash of record inclusive to this stream state. 
+func (fs *fileStore) writeFullState() error { + fs.mu.Lock() + + if fs.closed || fs.dirty == 0 { + fs.mu.Unlock() + return nil + } + + var _buf [32 * 1024]byte + _buf[0], _buf[1] = fullStateMagic, fullStateVersion + buf := _buf[:hdrLen] + + buf = binary.AppendUvarint(buf, fs.state.Msgs) + buf = binary.AppendUvarint(buf, fs.state.Bytes) + buf = binary.AppendUvarint(buf, fs.state.FirstSeq) + buf = binary.AppendVarint(buf, timestampNormalized(fs.state.FirstTime)) + buf = binary.AppendUvarint(buf, fs.state.LastSeq) + buf = binary.AppendVarint(buf, timestampNormalized(fs.state.LastTime)) + + // Do per subject information map if applicable. + numSubjects := len(fs.psim) + buf = binary.AppendUvarint(buf, uint64(numSubjects)) + + if numSubjects > 0 { + for subj, psi := range fs.psim { + buf = binary.AppendUvarint(buf, uint64(len(subj))) + buf = append(buf, subj...) + buf = binary.AppendUvarint(buf, psi.total) + buf = binary.AppendUvarint(buf, uint64(psi.fblk)) + if psi.total > 1 { + buf = binary.AppendUvarint(buf, uint64(psi.lblk)) + } + } + } + + // Now walk all blocks and write out first and last and optional dmap encoding. + var lbi uint32 + var lchk [8]byte + + nb := len(fs.blks) + buf = binary.AppendUvarint(buf, uint64(nb)) + + // Use basetime to save some space. + baseTime := timestampNormalized(fs.state.FirstTime) + + for _, mb := range fs.blks { + mb.mu.RLock() + buf = binary.AppendUvarint(buf, uint64(mb.index)) + buf = binary.AppendUvarint(buf, mb.bytes) + buf = binary.AppendUvarint(buf, mb.first.seq) + buf = binary.AppendVarint(buf, mb.first.ts-baseTime) + buf = binary.AppendUvarint(buf, mb.last.seq) + buf = binary.AppendVarint(buf, mb.last.ts-baseTime) + + numDeleted := mb.dmap.Size() + buf = binary.AppendUvarint(buf, uint64(numDeleted)) + if numDeleted > 0 { + var scratch [8 * 1024]byte + dmap, _ := mb.dmap.Encode(scratch[:0]) + buf = append(buf, dmap...) + } + // If this is the last one grab the last checksum and the block index, e.g. 
22.blk, 22 is the block index. + // We use this to quickly open this file on recovery. + if mb == fs.lmb { + lbi = mb.index + mb.ensureLastChecksumLoaded() + copy(lchk[0:], mb.lchk[:]) + } + mb.mu.RUnlock() + } + + // Place block index and hash onto the end. + buf = binary.AppendUvarint(buf, uint64(lbi)) + buf = append(buf, lchk[:]...) + + // Encrypt if needed. + if fs.prf != nil { + if err := fs.setupAEK(); err != nil { + fs.mu.Unlock() + return err + } + nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(buf)+fs.aek.Overhead()) + rand.Read(nonce) + buf = fs.aek.Seal(nonce, nonce, buf, nil) + } + + fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) + + fs.hh.Reset() + fs.hh.Write(buf) + buf = fs.hh.Sum(buf) + + // Snapshot prior dirty count. + priorDirty := fs.dirty + // Release lock. + fs.mu.Unlock() + + // Write to a tmp file and rename. + const tmpPre = streamStreamStateFile + tsep + f, err := os.CreateTemp(filepath.Join(fs.fcfg.StoreDir, msgDir), tmpPre) + if err != nil { + return err + } + tmpName := f.Name() + defer os.Remove(tmpName) + _, err = f.Write(buf) + f.Close() + if err != nil { + return err + } + + // Rename into position under our lock, clear prior dirty pending on success. + fs.mu.Lock() + if !fs.closed { + if err := os.Rename(tmpName, fn); err != nil { + fs.mu.Unlock() + return err + } + fs.dirty -= priorDirty + } + fs.mu.Unlock() + + return nil +} + +// Stop the current filestore. func (fs *fileStore) Stop() error { fs.mu.Lock() if fs.closed { fs.mu.Unlock() return ErrStoreClosed } - fs.closed = true - fs.lmb = nil fs.checkAndFlushAllBlocks() fs.closeAllMsgBlocks(false) @@ -6635,6 +6937,21 @@ func (fs *fileStore) Stop() error { fs.cancelSyncTimer() fs.cancelAgeChk() + // Release the state flusher loop. + close(fs.qch) + + // Wait for the state flush loop to exit. + fsld := fs.fsld + fs.mu.Unlock() + <-fsld + // Write full state if needed. If not dirty this is a no-op. 
+ fs.writeFullState() + fs.mu.Lock() + + // Mark as closed. + fs.closed = true + fs.lmb = nil + // We should update the upper usage layer on a stop. cb, bytes := fs.scb, int64(fs.state.Bytes) @@ -6722,37 +7039,32 @@ func (fs *fileStore) streamSnapshot(w io.WriteCloser, state *StreamState, includ // Can't use join path here, tar only recognizes relative paths with forward slashes. msgPre := msgDir + "/" - var bbuf []byte + const minLen = 32 + sfn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) + if buf, err := os.ReadFile(sfn); err == nil && len(buf) >= minLen { + if fs.aek != nil { + ns := fs.aek.NonceSize() + buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:len(buf)-highwayhash.Size64], nil) + if err == nil { + // Redo hash checksum at end on plaintext. + fs.hh.Reset() + fs.hh.Write(buf) + buf = fs.hh.Sum(buf) + } + } + if err == nil && writeFile(msgPre+streamStreamStateFile, buf) != nil { + return + } + } + // Now do messages themselves. for _, mb := range blks { if mb.pendingWriteSize() > 0 { mb.flushPendingMsgs() } - if mb.indexNeedsUpdate() { - mb.writeIndexInfo() - } mb.mu.Lock() - buf, err := os.ReadFile(mb.ifn) - if err != nil { - mb.mu.Unlock() - writeErr(fmt.Sprintf("Could not read message block [%d] index file: %v", mb.index, err)) - return - } - // Check for encryption. - if mb.aek != nil && len(buf) > 0 { - buf, err = mb.aek.Open(buf[:0], mb.nonce, buf, nil) - if err != nil { - mb.mu.Unlock() - writeErr(fmt.Sprintf("Could not decrypt message block [%d] index file: %v", mb.index, err)) - return - } - } - if writeFile(msgPre+fmt.Sprintf(indexScan, mb.index), buf) != nil { - mb.mu.Unlock() - return - } // We could stream but don't want to hold the lock and prevent changes, so just read in and // release the lock for now. 
bbuf, err = mb.loadBlock(bbuf) @@ -6777,16 +7089,8 @@ func (fs *fileStore) streamSnapshot(w io.WriteCloser, state *StreamState, includ writeErr(fmt.Sprintf("Could not decompress message block [%d]: %v", mb.index, err)) return } - - // Make sure we snapshot the per subject info. - mb.writePerSubjectInfo() - buf, err = os.ReadFile(mb.sfn) - // If not there that is ok and not fatal. - if err == nil && writeFile(msgPre+fmt.Sprintf(fssScan, mb.index), buf) != nil { - mb.mu.Unlock() - return - } mb.mu.Unlock() + // Do this one unlocked. if writeFile(msgPre+fmt.Sprintf(blkScan, mb.index), bbuf) != nil { return @@ -6872,7 +7176,7 @@ func (fs *fileStore) Snapshot(deadline time.Duration, checkMsgs, includeConsumer pw.SetWriteDeadline(time.Now().Add(deadline)) } - // We can add to our stream while snapshotting but not delete anything. + // We can add to our stream while snapshotting but not "user" delete anything. var state StreamState fs.FastState(&state) @@ -7551,7 +7855,7 @@ func (o *consumerFileStore) encryptState(buf []byte) []byte { } // TODO(dlc) - Optimize on space usage a bit? nonce := make([]byte, o.aek.NonceSize(), o.aek.NonceSize()+len(buf)+o.aek.Overhead()) - crand.Read(nonce) + rand.Read(nonce) return o.aek.Seal(nonce, nonce, buf, nil) } @@ -7643,7 +7947,7 @@ func (cfs *consumerFileStore) writeConsumerMeta() error { // Encrypt if needed. if cfs.aek != nil { nonce := make([]byte, cfs.aek.NonceSize(), cfs.aek.NonceSize()+len(b)+cfs.aek.Overhead()) - crand.Read(nonce) + rand.Read(nonce) b = cfs.aek.Seal(nonce, nonce, b, nil) } @@ -7660,14 +7964,6 @@ func (cfs *consumerFileStore) writeConsumerMeta() error { return nil } -// Make sure the header is correct. -func checkHeader(hdr []byte) error { - if hdr == nil || len(hdr) < 2 || hdr[0] != magic || hdr[1] != version { - return errCorruptState - } - return nil -} - // Consumer version. 
func checkConsumerHeader(hdr []byte) (uint8, error) { if hdr == nil || len(hdr) < 2 || hdr[0] != magic { diff --git a/server/filestore_test.go b/server/filestore_test.go index d43dd80c..b8676eea 100644 --- a/server/filestore_test.go +++ b/server/filestore_test.go @@ -19,7 +19,6 @@ import ( "crypto/hmac" crand "crypto/rand" "crypto/sha256" - "encoding/base64" "encoding/hex" "encoding/json" "errors" @@ -30,7 +29,6 @@ import ( "os" "path/filepath" "reflect" - "strings" "testing" "time" @@ -419,12 +417,8 @@ func TestFileStoreWriteExpireWrite(t *testing.T) { cexp := 10 * time.Millisecond fcfg.CacheExpire = cexp - fs, err := newFileStore( - fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() toSend := 10 @@ -483,9 +477,7 @@ func TestFileStoreWriteExpireWrite(t *testing.T) { func TestFileStoreMsgLimit(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxMsgs: 10}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -519,9 +511,7 @@ func TestFileStoreMsgLimit(t *testing.T) { func TestFileStoreMsgLimitBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxMsgs: 1}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -547,9 +537,7 @@ func TestFileStoreBytesLimit(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxBytes: int64(maxBytes)}) - if 
err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() for i := uint64(0); i < toStore; i++ { @@ -599,12 +587,8 @@ func TestFileStoreAgeLimit(t *testing.T) { fcfg.BlockSize = 256 - fs, err := newFileStore( - fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxAge: maxAge}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxAge: maxAge}) + require_NoError(t, err) defer fs.Stop() // Store some messages. Does not really matter how many. @@ -659,9 +643,7 @@ func TestFileStoreAgeLimit(t *testing.T) { func TestFileStoreTimeStamps(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() last := time.Now().UnixNano() @@ -691,9 +673,7 @@ func TestFileStorePurge(t *testing.T) { fcfg.BlockSize = blkSize fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", make([]byte, 8*1024) @@ -794,7 +774,7 @@ func TestFileStorePurge(t *testing.T) { checkPurgeState(toStore * 2) - checkFor(t, time.Second, 10*time.Millisecond, func() error { + checkFor(t, 2*time.Second, 100*time.Millisecond, func() error { if _, err := os.Stat(purgeDir); err == nil { return fmt.Errorf("purge directory still present") } @@ -823,9 +803,7 @@ func TestFileStoreCompact(t *testing.T) { time.Now(), prf, nil, ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -885,9 +863,7 @@ func TestFileStoreCompactLastPlusOne(t *testing.T) { fcfg.AsyncFlush = true fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: 
FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", make([]byte, 10_000) @@ -925,9 +901,7 @@ func TestFileStoreCompactLastPlusOne(t *testing.T) { func TestFileStoreCompactMsgCountBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -969,9 +943,7 @@ func TestFileStoreCompactPerf(t *testing.T) { fcfg.AsyncFlush = true fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -1022,9 +994,7 @@ func TestFileStoreStreamTruncate(t *testing.T) { time.Now(), prf, nil, ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() tseq := uint64(50) @@ -1107,9 +1077,7 @@ func TestFileStoreRemovePartialRecovery(t *testing.T) { fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -1157,9 +1125,7 @@ func TestFileStoreRemoveOutOfOrderRecovery(t *testing.T) { fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -1228,13 +1194,8 @@ func TestFileStoreAgeLimitRecovery(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.CacheExpire = 1 * time.Millisecond - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: 
FileStorage, MaxAge: maxAge}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxAge: maxAge}) + require_NoError(t, err) defer fs.Stop() // Store some messages. Does not really matter how many. @@ -1274,9 +1235,7 @@ func TestFileStoreAgeLimitRecovery(t *testing.T) { func TestFileStoreBitRot(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() // Store some messages. Does not really matter how many. @@ -1342,9 +1301,7 @@ func TestFileStoreBitRot(t *testing.T) { func TestFileStoreEraseMsg(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -1403,9 +1360,7 @@ func TestFileStoreEraseMsg(t *testing.T) { func TestFileStoreEraseAndNoIndexRecovery(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -1429,10 +1384,10 @@ func TestFileStoreEraseAndNoIndexRecovery(t *testing.T) { t.Fatalf("Expected %d msgs, got %d", toStore/2, state.Msgs) } - // Stop and remove the index file. + // Stop and remove the optional index file. 
fs.Stop() ifn := filepath.Join(fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, 1)) - removeFile(t, ifn) + os.Remove(ifn) fs, err = newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) if err != nil { @@ -1458,9 +1413,7 @@ func TestFileStoreMeta(t *testing.T) { mconfig := StreamConfig{Name: "ZZ-22-33", Storage: FileStorage, Subjects: []string{"foo.*"}, Replicas: 22} fs, err := newFileStore(fcfg, mconfig) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() metafile := filepath.Join(fcfg.StoreDir, JetStreamMetaFile) @@ -1489,9 +1442,13 @@ func TestFileStoreMeta(t *testing.T) { if err != nil { t.Fatalf("Error reading metafile checksum: %v", err) } + + fs.mu.Lock() fs.hh.Reset() fs.hh.Write(buf) mychecksum := hex.EncodeToString(fs.hh.Sum(nil)) + fs.mu.Unlock() + if mychecksum != string(checksum) { t.Fatalf("Checksums do not match, got %q vs %q", mychecksum, checksum) } @@ -1551,13 +1508,8 @@ func TestFileStoreMeta(t *testing.T) { func TestFileStoreWriteAndReadSameBlock(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World!") @@ -1578,13 +1530,8 @@ func TestFileStoreAndRetrieveMultiBlock(t *testing.T) { fcfg.BlockSize = 4 * storedMsgSize - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() for i := 0; i < 20; i++ { @@ -1621,13 +1568,8 @@ func TestFileStoreCollapseDmap(t *testing.T) { fcfg.BlockSize = 4 * storedMsgSize - fs, err := newFileStore( 
- fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() for i := 0; i < 10; i++ { @@ -1697,9 +1639,7 @@ func TestFileStoreReadCache(t *testing.T) { storedMsgSize := fileStoreMsgSize(subj, nil, msg) fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() toStore := 500 @@ -1750,9 +1690,7 @@ func TestFileStorePartialCacheExpiration(t *testing.T) { fcfg.CacheExpire = cexp fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() fs.StoreMsg("foo", nil, []byte("msg1")) @@ -1775,9 +1713,7 @@ func TestFileStorePartialIndexes(t *testing.T) { fcfg.CacheExpire = cexp fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() toSend := 5 @@ -1819,14 +1755,21 @@ func TestFileStorePartialIndexes(t *testing.T) { func TestFileStoreSnapshot(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { subj, msg := "foo", []byte("Hello Snappy!") + scfg := StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage} - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), 
prf, nil) + require_NoError(t, err) defer fs.Stop() toSend := 2233 @@ -1903,13 +1846,8 @@ func TestFileStoreSnapshot(t *testing.T) { } fcfg.StoreDir = rstoreDir - fsr, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Error restoring from snapshot: %v", err) - } + fsr, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) defer fsr.Stop() state := fs.State() rstate := fsr.State() @@ -1924,7 +1862,6 @@ func TestFileStoreSnapshot(t *testing.T) { if !reflect.DeepEqual(rstate, state) { t.Fatalf("Restored state does not match:\n%+v\n\n%+v", rstate, state) } - } // Simple case first. @@ -2006,9 +1943,7 @@ func TestFileStoreSnapshot(t *testing.T) { func TestFileStoreConsumer(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() o, err := fs.ConsumerStore("obs22", &ConsumerConfig{}) @@ -2193,9 +2128,7 @@ func TestFileStoreWriteFailures(t *testing.T) { subj, msg := "foo", []byte("Hello Write Failures!") fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() var lseq uint64 @@ -2289,13 +2222,8 @@ func TestFileStorePerf(t *testing.T) { friendlyBytes(int64(toStore*storedMsgSize)), ) - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() start := time.Now() @@ -2431,13 +2359,8 @@ func TestFileStoreReadBackMsgPerf(t *testing.T) { friendlyBytes(int64(toStore*storedMsgSize)), ) - fs, err := 
newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() start := time.Now() @@ -2480,13 +2403,8 @@ func TestFileStoreStoreLimitRemovePerf(t *testing.T) { toStore := 1 * 1024 * 1024 * 1024 / storedMsgSize testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() fs.RegisterStorageUpdates(func(md, bd int64, seq uint64, subj string) {}) @@ -2537,13 +2455,8 @@ func TestFileStorePubPerfWithSmallBlkSize(t *testing.T) { fcfg.BlockSize = FileStoreMinBlkSize - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() start := time.Now() @@ -2623,7 +2536,6 @@ func TestFileStoreConsumerRedeliveredLost(t *testing.T) { t.Fatalf("Did not clear pending correctly") } if len(state.Redelivered) != 0 { - fmt.Printf("redelivered is %+v\n", state.Redelivered) t.Fatalf("Did not clear redelivered correctly") } }) @@ -2632,9 +2544,7 @@ func TestFileStoreConsumerRedeliveredLost(t *testing.T) { func TestFileStoreConsumerFlusher(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() o, err := fs.ConsumerStore("o22", &ConsumerConfig{}) @@ -2665,9 +2575,7 @@ func 
TestFileStoreConsumerFlusher(t *testing.T) { func TestFileStoreConsumerDeliveredUpdates(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() // Simple consumer, no ack policy configured. @@ -2721,9 +2629,7 @@ func TestFileStoreConsumerDeliveredUpdates(t *testing.T) { func TestFileStoreConsumerDeliveredAndAckUpdates(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() // Simple consumer, no ack policy configured. @@ -2828,7 +2734,7 @@ func TestFileStoreConsumerDeliveredAndAckUpdates(t *testing.T) { t.Fatalf("Unexpected error getting state: %v", err) } if !reflect.DeepEqual(nstate, state) { - t.Fatalf("States don't match!") + t.Fatalf("States don't match! 
NEW %+v OLD %+v", nstate, state) } }) } @@ -2836,9 +2742,7 @@ func TestFileStoreConsumerDeliveredAndAckUpdates(t *testing.T) { func TestFileStoreStreamStateDeleted(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, toStore := "foo", uint64(10) @@ -2885,9 +2789,7 @@ func TestFileStoreStreamStateDeleted(t *testing.T) { func TestFileStoreStreamDeleteDirNotEmpty(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, toStore := "foo", uint64(10) @@ -2920,9 +2822,7 @@ func TestFileStoreConsumerPerf(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() o, err := fs.ConsumerStore("o22", &ConsumerConfig{AckPolicy: AckExplicit}) @@ -2982,27 +2882,13 @@ func TestFileStoreConsumerPerf(t *testing.T) { }) } -func TestFileStoreStreamIndexBug(t *testing.T) { - // https://github.com/nats-io/jetstream/issues/406 - badIdxBytes, _ := base64.StdEncoding.DecodeString("FgGBkw7D/f8/772iDPDIgbU=") - dir := t.TempDir() - fn := filepath.Join(dir, "1.idx") - os.WriteFile(fn, badIdxBytes, 0644) - mb := &msgBlock{index: 1, ifn: fn} - if err := mb.readIndexInfo(); err == nil || !strings.Contains(err.Error(), "short index") { - t.Fatalf("Expected error during readIndexInfo(): %v", err) - } -} - // Reported by Ivan. 
func TestFileStoreStreamDeleteCacheBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.CacheExpire = 50 * time.Millisecond fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -3023,77 +2909,13 @@ func TestFileStoreStreamDeleteCacheBug(t *testing.T) { }) } -// https://github.com/nats-io/nats-server/issues/2068 -func TestFileStoreStreamPurgeAndDirtyRestartBug(t *testing.T) { - testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - defer fs.Stop() - - // Load up some messages. - num, subj, hdr, msg := 100, "foo", []byte("name:derek"), []byte("Hello World") - for i := 0; i < num; i++ { - fs.StoreMsg(subj, hdr, msg) - } - // Now purge - fs.Purge() - - // Snapshot state. - state := fs.State() - if state.FirstSeq != uint64(num+1) || state.LastSeq != uint64(num) { - t.Fatalf("Unexpected state: %+v", state) - } - - // Now we will stop the store and corrupt the index such that on restart it will do a rebuild. 
- fs.mu.Lock() - lmb := fs.lmb - fs.mu.Unlock() - - lmb.mu.RLock() - ifn := lmb.ifn - lmb.mu.RUnlock() - - fs.Stop() - - fd, err := os.OpenFile(ifn, os.O_RDWR, 0644) - if err != nil { - t.Fatalf("Error opening the index file: %v", err) - } - defer fd.Close() - fi, _ := fd.Stat() - if _, err = fd.WriteAt([]byte{1, 1}, fi.Size()-2); err != nil { - t.Fatalf("Error writing the index file: %v", err) - } - fd.Close() - - // Restart - fs, err = newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - defer fs.Stop() - - state = fs.State() - if state.FirstSeq != uint64(num+1) || state.LastSeq != uint64(num) { - t.Fatalf("Unexpected state: %+v", state) - } - }) -} - // rip func TestFileStoreStreamFailToRollBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 512 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage, MaxBytes: 300}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxBytes: 300}) + require_NoError(t, err) defer fs.Stop() // Make sure we properly roll underlying blocks. @@ -3152,9 +2974,7 @@ func TestFileStoreExpireMsgsOnStart(t *testing.T) { startFS := func() *fileStore { t.Helper() fs, err := newFileStore(fcfg, cfg) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) return fs } @@ -3235,34 +3055,7 @@ func TestFileStoreExpireMsgsOnStart(t *testing.T) { if index >= len(fs.blks) { t.Fatalf("Out of range, wanted %d but only %d blks", index, len(fs.blks)) } - mb := fs.blks[index] fs.mu.RUnlock() - - var errStr string - - mb.mu.RLock() - // We will do a readIndex op on our clone and then compare. 
- mbc := &msgBlock{fs: fs, ifn: mb.ifn} - if err := mbc.readIndexInfo(); err != nil { - mb.mu.RUnlock() - t.Fatalf("Error during readIndexInfo: %v", err) - } - // Check state as represented by index info. - if mb.msgs != mbc.msgs { - errStr = fmt.Sprintf("msgs do not match: %d vs %d", mb.msgs, mbc.msgs) - } else if mb.bytes != mbc.bytes { - errStr = fmt.Sprintf("bytes do not match: %d vs %d", mb.bytes, mbc.bytes) - } else if mb.first != mbc.first { - errStr = fmt.Sprintf("first state does not match: %d vs %d", mb.first, mbc.first) - } else if mb.last != mbc.last { - errStr = fmt.Sprintf("last state does not match: %d vs %d", mb.last, mbc.last) - } else if !reflect.DeepEqual(mb.dmap, mbc.dmap) { - errStr = fmt.Sprintf("deleted map does not match: %+v vs %+v", mb.dmap, mbc.dmap) - } - mb.mu.RUnlock() - if errStr != _EMPTY_ { - t.Fatal(errStr) - } } lastSeqForBlk := func(index int) uint64 { @@ -3380,9 +3173,7 @@ func TestFileStoreSparseCompaction(t *testing.T) { var fs *fileStore fs, err := newFileStore(fcfg, cfg) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() msg := bytes.Repeat([]byte("ABC"), 33) // ~100bytes @@ -3517,9 +3308,7 @@ func TestFileStoreSparseCompactionWithInteriorDeletes(t *testing.T) { var fs *fileStore fs, err := newFileStore(fcfg, cfg) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() for i := 1; i <= 1000; i++ { @@ -3561,12 +3350,8 @@ func TestFileStorePurgeExKeepOneBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 128 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() fill := bytes.Repeat([]byte("X"), 128) @@ 
-3593,41 +3378,10 @@ func TestFileStorePurgeExKeepOneBug(t *testing.T) { }) } -func TestFileStoreRemoveLastWriteIndex(t *testing.T) { - testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - defer fs.Stop() - - for i := 0; i < 10; i++ { - fs.StoreMsg("foo", nil, []byte("msg")) - } - for i := 0; i < 10; i++ { - fs.RemoveMsg(uint64(i + 1)) - } - - fs.mu.Lock() - fname := fs.lmb.ifn - fs.mu.Unlock() - - fi, err := os.Stat(fname) - if err != nil { - t.Fatalf("Error getting stats for index file %q: %v", fname, err) - } - if fi.Size() == 0 { - t.Fatalf("Index file %q size is 0", fname) - } - }) -} - func TestFileStoreFilteredPendingBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() fs.StoreMsg("foo", nil, []byte("msg")) @@ -3658,9 +3412,7 @@ func TestFileStoreFetchPerf(t *testing.T) { fcfg.AsyncFlush = true fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() // Will create 25k msg blocks. 
@@ -3696,10 +3448,7 @@ func TestFileStoreCompactReclaimHeadSpace(t *testing.T) { fcfg.BlockSize = 4 * 1024 * 1024 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "TEST", Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -3970,10 +3719,7 @@ func TestFileStoreRebuildStateDmapAccountingBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 1024 * 1024 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "TEST", Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4012,7 +3758,7 @@ func TestFileStoreRebuildStateDmapAccountingBug(t *testing.T) { require_NoError(t, err) mb.mu.Lock() - _, err = mb.rebuildStateLocked() + _, _, err = mb.rebuildStateLocked() require_NoError(t, err) mb.mu.Unlock() @@ -4024,10 +3770,7 @@ func TestFileStorePurgeExWithSubject(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 1000 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "TEST", Subjects: []string{"foo.>"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Subjects: []string{"foo.>"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4094,15 +3837,6 @@ func TestFileStoreShortIndexWriteBug(t *testing.T) { t.Fatalf("Expected first sequence of 101 vs %d", state.FirstSeq) } - // I noticed that we also would dangle an open ifd when we did closeAndKeepIndex(), check that we do not anymore. - fs.mu.RLock() - mb := fs.lmb - mb.mu.RLock() - hasIfd := mb.ifd != nil - mb.mu.RUnlock() - fs.mu.RUnlock() - require_False(t, hasIfd) - // Now restart.. 
fs.Stop() fs, err = newFileStoreWithCreated( @@ -4249,10 +3983,7 @@ func TestFileStoreExpireSubjectMeta(t *testing.T) { fcfg.BlockSize = 1024 fcfg.CacheExpire = time.Second - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 1}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 1}) require_NoError(t, err) defer fs.Stop() @@ -4263,13 +3994,6 @@ func TestFileStoreExpireSubjectMeta(t *testing.T) { require_NoError(t, err) } - checkNoMeta := func() { - t.Helper() - if _, hasAnyFSS := fs.reportMeta(); hasAnyFSS { - t.Fatalf("Expected no mbs to have fss state") - } - } - // Test that on restart we do not have extensize metadata but do have correct number of subjects/keys. // Only thing really needed for store state / stream info. fs.Stop() @@ -4294,11 +4018,6 @@ func TestFileStoreExpireSubjectMeta(t *testing.T) { return nil }) - // Load by sequence should not load meta. - _, err = fs.LoadMsg(1, nil) - require_NoError(t, err) - checkNoMeta() - // LoadLast, which is what KV uses, should load meta and succeed. 
_, err = fs.LoadLastMsg("kv.22", nil) require_NoError(t, err) @@ -4317,10 +4036,7 @@ func TestFileStoreMaxMsgsPerSubject(t *testing.T) { fcfg.BlockSize = 128 fcfg.CacheExpire = time.Second - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 1}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 1}) require_NoError(t, err) defer fs.Stop() @@ -4401,10 +4117,7 @@ func TestFileStoreSubjectStateCacheExpiration(t *testing.T) { fcfg.BlockSize = 32 fcfg.CacheExpire = time.Second - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 2}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 2}) require_NoError(t, err) defer fs.Stop() @@ -4531,10 +4244,7 @@ func TestFileStoreEncrypted(t *testing.T) { // Make sure we do not go through block loads when we know no subjects will exists, e.g. raft. 
func TestFileStoreNoFSSWhenNoSubjects(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4577,10 +4287,7 @@ func TestFileStoreNoFSSBugAfterRemoveFirst(t *testing.T) { fcfg.BlockSize = 8 * 1024 * 1024 fcfg.CacheExpire = 200 * time.Millisecond - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo.bar.*"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo.bar.*"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4620,10 +4327,7 @@ func TestFileStoreNoFSSBugAfterRemoveFirst(t *testing.T) { func TestFileStoreNoFSSAfterRecover(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4661,10 +4365,7 @@ func TestFileStoreNoFSSAfterRecover(t *testing.T) { func TestFileStoreFSSCloseAndKeepOnExpireOnRecoverBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { ttl := 100 * time.Millisecond - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage, MaxAge: ttl}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage, MaxAge: ttl}) require_NoError(t, err) defer fs.Stop() @@ -4722,59 +4423,12 @@ func TestFileStoreExpireOnRecoverSubjectAccounting(t *testing.T) { }) } -func TestFileStoreFSSBadStateBug(t *testing.T) { - testFileStoreAllPermutations(t, func(t *testing.T, fcfg 
FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) - require_NoError(t, err) - defer fs.Stop() - - _, _, err = fs.StoreMsg("foo", nil, nil) - require_NoError(t, err) - _, _, err = fs.StoreMsg("foo", nil, nil) - require_NoError(t, err) - - // Force write of fss. - mb := fs.getFirstBlock() - mb.mu.Lock() - mb.writePerSubjectInfo() - fssFile := filepath.Join(fcfg.StoreDir, msgDir, fmt.Sprintf(fssScan, 1)) - buf, err := os.ReadFile(fssFile) - require_NoError(t, err) - mb.mu.Unlock() - - // Now remove one of them. - fs.RemoveMsg(1) - fs.Stop() - - // Now put back wrong fss with msgs == 2 - err = os.WriteFile(fssFile, buf, defaultFilePerms) - require_NoError(t, err) - - fs, err = newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) - require_NoError(t, err) - defer fs.Stop() - - if fss := fs.SubjectsState("foo")["foo"]; fss.Msgs != 1 { - t.Fatalf("Got bad state on restart: %+v", fss) - } - }) -} - func TestFileStoreFSSExpireNumPendingBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { cexp := 100 * time.Millisecond fcfg.CacheExpire = cexp - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"KV.>"}, MaxMsgsPer: 1, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"KV.>"}, MaxMsgsPer: 1, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4793,10 +4447,7 @@ func TestFileStoreFSSExpireNumPendingBug(t *testing.T) { // https://github.com/nats-io/nats-server/issues/3484 func TestFileStoreFilteredFirstMatchingBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo.>"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo.>"}, 
Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4817,11 +4468,10 @@ func TestFileStoreFilteredFirstMatchingBug(t *testing.T) { // Simulate swapping out the fss state and reading it back in with only one subject // present in the block. if mb.fss != nil { - mb.writePerSubjectInfo() mb.fss = nil } // Now load info back in. - mb.readPerSubjectInfo(true) + mb.generatePerSubjectInfo() mb.mu.Unlock() // Now add in a different subject. @@ -4838,10 +4488,7 @@ func TestFileStoreFilteredFirstMatchingBug(t *testing.T) { func TestFileStoreOutOfSpaceRebuildState(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4884,10 +4531,7 @@ func TestFileStoreRebuildStateProperlyWithMaxMsgsPerSubject(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 4096 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo", "bar", "baz"}, Storage: FileStorage, MaxMsgsPer: 1}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo", "bar", "baz"}, Storage: FileStorage, MaxMsgsPer: 1}) require_NoError(t, err) defer fs.Stop() @@ -4903,55 +4547,14 @@ func TestFileStoreRebuildStateProperlyWithMaxMsgsPerSubject(t *testing.T) { require_NoError(t, err) } - checkState := func() { - var ss StreamState - fs.FastState(&ss) - if ss.NumSubjects != 3 { - t.Fatalf("Expected NumSubjects of 3, got %d", ss.NumSubjects) - } - if ss.Msgs != 3 { - t.Fatalf("Expected NumMsgs of 3, got %d", ss.Msgs) - } + var ss StreamState + fs.FastState(&ss) + if ss.NumSubjects != 3 { + t.Fatalf("Expected NumSubjects of 3, got %d", ss.NumSubjects) } - - checkState() - - // Stop filestore but invalidate 
the idx files by removing them. - // This will simulate a server panic or kill -9 scenario. - fs.Stop() - - fs.mu.RLock() - for _, mb := range fs.blks { - mb.removeIndexFile() + if ss.Msgs != 3 { + t.Fatalf("Expected NumMsgs of 3, got %d", ss.Msgs) } - fs.mu.RUnlock() - - fs, err = newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo", "bar", "baz"}, Storage: FileStorage, MaxMsgsPer: 1}, - ) - require_NoError(t, err) - defer fs.Stop() - - checkState() - - // Make sure we wrote all index files from recovery. - fs.mu.RLock() - for _, mb := range fs.blks { - mb.mu.Lock() - if err := mb.readIndexInfo(); err != nil { - mb.mu.Unlock() - fs.mu.RUnlock() - t.Fatalf("Unexpected error reading index info: %v", err) - } - if mb.msgs == 0 { - mb.mu.Unlock() - fs.mu.RUnlock() - t.Fatalf("Expected msgs for all blks, got none for index %d", mb.index) - } - mb.mu.Unlock() - } - fs.mu.RUnlock() }) } @@ -4999,13 +4602,9 @@ func TestFileStoreUpdateMaxMsgsPerSubject(t *testing.T) { func TestFileStoreBadFirstAndFailedExpireAfterRestart(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 256 - ttl := time.Second - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage, MaxAge: ttl}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage, MaxAge: ttl}) require_NoError(t, err) defer fs.Stop() @@ -5069,10 +4668,7 @@ func TestFileStoreBadFirstAndFailedExpireAfterRestart(t *testing.T) { func TestFileStoreCompactAllWithDanglingLMB(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5096,10 +4692,7 @@ func 
TestFileStoreStateWithBlkFirstDeleted(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 4096 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5135,11 +4728,20 @@ func TestFileStoreStateWithBlkFirstDeleted(t *testing.T) { func TestFileStoreMsgBlkFailOnKernelFaultLostDataReporting(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 4096 + scfg := StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage} - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil + } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) require_NoError(t, err) defer fs.Stop() @@ -5155,20 +4757,16 @@ func TestFileStoreMsgBlkFailOnKernelFaultLostDataReporting(t *testing.T) { // First block fs.mu.RLock() - require_True(t, fs.numMsgBlocks() > 0) + require_True(t, len(fs.blks) > 0) mfn := fs.blks[0].mfn fs.mu.RUnlock() fs.Stop() - err = os.WriteFile(mfn, nil, defaultFilePerms) - require_NoError(t, err) + require_NoError(t, os.Remove(mfn)) // Restart. 
- fs, err = newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) require_NoError(t, err) defer fs.Stop() @@ -5179,21 +4777,17 @@ func TestFileStoreMsgBlkFailOnKernelFaultLostDataReporting(t *testing.T) { // Last block fs.mu.RLock() - require_True(t, fs.numMsgBlocks() > 0) + require_True(t, len(fs.blks) > 0) require_True(t, fs.lmb != nil) mfn = fs.lmb.mfn fs.mu.RUnlock() fs.Stop() - err = os.WriteFile(mfn, nil, defaultFilePerms) - require_NoError(t, err) + require_NoError(t, os.Remove(mfn)) // Restart. - fs, err = newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) require_NoError(t, err) defer fs.Stop() @@ -5206,20 +4800,16 @@ func TestFileStoreMsgBlkFailOnKernelFaultLostDataReporting(t *testing.T) { // Interior block. fs.mu.RLock() - require_True(t, fs.numMsgBlocks() > 3) + require_True(t, len(fs.blks) > 3) mfn = fs.blks[len(fs.blks)-3].mfn fs.mu.RUnlock() fs.Stop() - err = os.WriteFile(mfn, nil, defaultFilePerms) - require_NoError(t, err) + require_NoError(t, os.Remove(mfn)) // Restart. 
- fs, err = newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) require_NoError(t, err) defer fs.Stop() @@ -5236,10 +4826,7 @@ func TestFileStoreAllFilteredStateWithDeleted(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 1024 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5285,10 +4872,7 @@ func TestFileStoreStreamTruncateResetMultiBlock(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 128 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5332,10 +4916,7 @@ func TestFileStoreStreamCompactMultiBlockSubjectInfo(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 128 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo.*"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo.*"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5358,59 +4939,10 @@ func TestFileStoreStreamCompactMultiBlockSubjectInfo(t *testing.T) { }) } -func TestFileStoreOnlyWritePerSubjectInfoOnExpireWithUpdate(t *testing.T) { - testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fcfg.CacheExpire = 100 * time.Millisecond - - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo.*"}, Storage: FileStorage}, - ) - 
require_NoError(t, err) - defer fs.Stop() - - for i := 0; i < 1000; i++ { - subj := fmt.Sprintf("foo.%d", i) - _, _, err := fs.StoreMsg(subj, nil, []byte("Hello World")) - require_NoError(t, err) - } - - // Grab first msg block. - fs.mu.RLock() - mb := fs.blks[0] - fs.mu.RUnlock() - - needsUpdate := func() bool { - mb.mu.RLock() - defer mb.mu.RUnlock() - return mb.fssNeedsWrite - } - require_True(t, needsUpdate()) - time.Sleep(2 * fcfg.CacheExpire) - require_False(t, needsUpdate()) - - // Make sure reads do not trigger an update. - _, err = fs.LoadMsg(1, nil) - require_NoError(t, err) - require_False(t, needsUpdate()) - - // Remove will though. - _, err = fs.RemoveMsg(1) - require_NoError(t, err) - require_True(t, needsUpdate()) - - // We should update then clear. - time.Sleep(2 * fcfg.CacheExpire) - require_False(t, needsUpdate()) - }) -} - func TestFileStoreSubjectsTotals(t *testing.T) { // No need for all permutations here. storeDir := t.TempDir() - fcfg := FileStoreConfig{ - StoreDir: storeDir, - } + fcfg := FileStoreConfig{StoreDir: storeDir} fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*.*"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5485,70 +5017,6 @@ func TestFileStoreSubjectsTotals(t *testing.T) { } } -func TestFileStoreNewWriteIndexInfo(t *testing.T) { - testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fcfg.BlockSize = defaultLargeBlockSize - - fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) - require_NoError(t, err) - defer fs.Stop() - - // Fill a block. - numToFill := 254200 - for i := 0; i < numToFill; i++ { - _, _, err := fs.StoreMsg("A", nil, []byte("OK")) - require_NoError(t, err) - } - - // Maximize interior deletes for testing the new AVL sequence set. 
- for seq := uint64(2); seq < uint64(numToFill); seq++ { - removed, err := fs.RemoveMsg(seq) - require_NoError(t, err) - require_True(t, removed) - } - // Grab first block - fs.mu.RLock() - mb := fs.blks[0] - fs.mu.RUnlock() - - mb.mu.Lock() - start := time.Now() - err = mb.writeIndexInfoLocked() - if err != nil { - mb.mu.Unlock() - t.Fatalf("Unexpected error: %v", err) - } - elapsed := time.Since(start) - if elapsed > 3*time.Millisecond { - mb.mu.Unlock() - t.Errorf("Unexpected elapsed time: %v", elapsed) - } - fi, err := os.Stat(mb.ifn) - mb.mu.Unlock() - - require_NoError(t, err) - require_True(t, fi.Size() < 34*1024) // Just over 32k - - mb.mu.Lock() - mb.dmap.Empty() - err = mb.readIndexInfo() - numMsgs := mb.msgs - firstSeq := mb.first.seq - lastSeq := mb.last.seq - mb.mu.Unlock() - // Make sure consistent. - require_NoError(t, err) - require_True(t, numMsgs == 2) - require_True(t, firstSeq == 1) - require_True(t, lastSeq == uint64(numToFill)) - - fs.Stop() - fs, err = newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) - require_NoError(t, err) - defer fs.Stop() - }) -} - func TestFileStoreConsumerStoreEncodeAfterRestart(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) @@ -5593,6 +5061,7 @@ func TestFileStoreNumPendingLargeNumBlks(t *testing.T) { } fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"zzz"}, Storage: FileStorage}) require_NoError(t, err) + defer fs.Stop() subj, msg := "zzz", bytes.Repeat([]byte("X"), 100) numMsgs := 10_000 @@ -5603,12 +5072,12 @@ func TestFileStoreNumPendingLargeNumBlks(t *testing.T) { start := time.Now() total, _ := fs.NumPending(4000, "zzz", false) - require_True(t, time.Since(start) < 5*time.Millisecond) + require_True(t, time.Since(start) < 10*time.Millisecond) require_True(t, total == 6001) start = time.Now() total, _ = fs.NumPending(6000, 
"zzz", false) - require_True(t, time.Since(start) < 5*time.Millisecond) + require_True(t, time.Since(start) < 10*time.Millisecond) require_True(t, total == 4001) // Now delete a message in first half and second half. @@ -5635,6 +5104,7 @@ func TestFileStoreSkipMsgAndNumBlocks(t *testing.T) { } fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"zzz"}, Storage: FileStorage}) require_NoError(t, err) + defer fs.Stop() subj, msg := "zzz", bytes.Repeat([]byte("X"), 100) numMsgs := 10_000 @@ -5667,6 +5137,7 @@ func TestFileStoreRestoreEncryptedWithNoKeyFuncFails(t *testing.T) { prf, nil, ) require_NoError(t, err) + defer fs.Stop() subj, msg := "zzz", bytes.Repeat([]byte("X"), 100) numMsgs := 100 @@ -5752,6 +5223,7 @@ func TestFileStoreRecaluclateFirstForSubjBug(t *testing.T) { func TestFileStoreKeepWithDeletedMsgsBug(t *testing.T) { fs, err := newFileStore(FileStoreConfig{StoreDir: t.TempDir()}, StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) require_NoError(t, err) + defer fs.Stop() msg := bytes.Repeat([]byte("A"), 19) for i := 0; i < 5; i++ { @@ -5768,3 +5240,478 @@ func TestFileStoreKeepWithDeletedMsgsBug(t *testing.T) { require_NoError(t, err) require_True(t, n == 3) } + +/////////////////////////////////////////////////////////////////////////// +// New WAL based architecture tests +/////////////////////////////////////////////////////////////////////////// + +func TestFileStoreFullStateBasics(t *testing.T) { + testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { + fcfg.BlockSize = 100 + scfg := StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage} + + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil + } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, 
err) + defer fs.Stop() + + // This yields an internal record length of 50 bytes. So 2 msgs per blk. + subj, msgLen, recLen := "A", 19, uint64(50) + msgA := bytes.Repeat([]byte("A"), msgLen) + msgZ := bytes.Repeat([]byte("Z"), msgLen) + + // Send 2 msgs and stop, check for presence of our full state file. + fs.StoreMsg(subj, nil, msgA) + fs.StoreMsg(subj, nil, msgZ) + require_True(t, fs.numMsgBlocks() == 1) + + // Make sure there is a full state file after we do a stop. + fs.Stop() + + sfile := filepath.Join(fcfg.StoreDir, msgDir, streamStreamStateFile) + if _, err := os.Stat(sfile); err != nil { + t.Fatalf("Expected stream state file but got %v", err) + } + + // Read it in and make sure len > 0. + buf, err := os.ReadFile(sfile) + require_NoError(t, err) + require_True(t, len(buf) > 0) + + // Now make sure we recover properly. + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // Make sure there are no old idx or fss files. + matches, err := filepath.Glob(filepath.Join(fcfg.StoreDir, msgDir, "%d.fss")) + require_NoError(t, err) + require_True(t, len(matches) == 0) + matches, err = filepath.Glob(filepath.Join(fcfg.StoreDir, msgDir, "%d.idx")) + require_NoError(t, err) + require_True(t, len(matches) == 0) + + state := fs.State() + require_True(t, state.Msgs == 2) + require_True(t, state.FirstSeq == 1) + require_True(t, state.LastSeq == 2) + + // Now make sure we can read in values. + var smv StoreMsg + sm, err := fs.LoadMsg(1, &smv) + require_NoError(t, err) + require_True(t, bytes.Equal(sm.msg, msgA)) + + sm, err = fs.LoadMsg(2, &smv) + require_NoError(t, err) + require_True(t, bytes.Equal(sm.msg, msgZ)) + + // Now add in 1 more here to split the lmb. + fs.StoreMsg(subj, nil, msgZ) + + // Now stop the filestore and replace the old stream state and make sure we recover correctly. 
+ fs.Stop() + + // Regrab the stream state + buf, err = os.ReadFile(sfile) + require_NoError(t, err) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // Add in one more. + fs.StoreMsg(subj, nil, msgZ) + fs.Stop() + + // Put old stream state back with only 3. + err = os.WriteFile(sfile, buf, defaultFilePerms) + require_NoError(t, err) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + state = fs.State() + require_True(t, state.Msgs == 4) + require_True(t, state.Bytes == 4*recLen) + require_True(t, state.FirstSeq == 1) + require_True(t, state.LastSeq == 4) + require_True(t, fs.numMsgBlocks() == 2) + + // Make sure we are tracking subjects correctly. + fs.mu.RLock() + psi := *fs.psim[subj] + fs.mu.RUnlock() + + require_True(t, psi.total == 4) + require_True(t, psi.fblk == 1) + require_True(t, psi.lblk == 2) + + // Store 1 more + fs.StoreMsg(subj, nil, msgA) + fs.Stop() + // Put old stream state back with only 3. + err = os.WriteFile(sfile, buf, defaultFilePerms) + require_NoError(t, err) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + state = fs.State() + require_True(t, state.Msgs == 5) + require_True(t, state.FirstSeq == 1) + require_True(t, state.LastSeq == 5) + require_True(t, fs.numMsgBlocks() == 3) + // Make sure we are tracking subjects correctly. + fs.mu.RLock() + psi = *fs.psim[subj] + fs.mu.RUnlock() + require_True(t, psi.total == 5) + require_True(t, psi.fblk == 1) + require_True(t, psi.lblk == 3) + }) +} + +func TestFileStoreFullStatePurge(t *testing.T) { + testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { + fcfg.BlockSize = 132 // Leave room for tombstones. 
+ scfg := StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage} + + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil + } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // This yields an internal record length of 50 bytes. So 2 msgs per blk. + subj, msg := "A", bytes.Repeat([]byte("A"), 19) + + // Should be 2 per block, so 5 blocks. + for i := 0; i < 10; i++ { + fs.StoreMsg(subj, nil, msg) + } + n, err := fs.Purge() + require_NoError(t, err) + require_True(t, n == 10) + state := fs.State() + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + + // Add in more 10 more total, some B some C. + for i := 0; i < 5; i++ { + fs.StoreMsg("B", nil, msg) + fs.StoreMsg("C", nil, msg) + } + + n, err = fs.PurgeEx("B", 0, 0) + require_NoError(t, err) + require_True(t, n == 5) + + state = fs.State() + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + + // Purge with keep. + n, err = fs.PurgeEx(_EMPTY_, 0, 2) + require_NoError(t, err) + require_True(t, n == 3) + + state = fs.State() + + // Do some quick checks here, keep had a bug. 
+ require_True(t, state.Msgs == 2) + require_True(t, state.FirstSeq == 18) + require_True(t, state.LastSeq == 20) + + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + + // Make sure we can survive a purge with no full stream state and have the correct first sequence. + // This used to be provided by the idx file and is now tombstones and the full stream state snapshot. + n, err = fs.Purge() + require_NoError(t, err) + require_True(t, n == 2) + state = fs.State() + fs.Stop() + + sfile := filepath.Join(fcfg.StoreDir, msgDir, streamStreamStateFile) + os.Remove(sfile) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + }) +} + +func TestFileStoreFullStateTestUserRemoveWAL(t *testing.T) { + testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { + fcfg.BlockSize = 132 // Leave room for tombstones. + scfg := StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage} + + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil + } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // This yields an internal record length of 50 bytes. So 2 msgs per blk. + msgLen := 19 + msgA := bytes.Repeat([]byte("A"), msgLen) + msgZ := bytes.Repeat([]byte("Z"), msgLen) + + // Store 2 msgs and delete first. 
+ fs.StoreMsg("A", nil, msgA) + fs.StoreMsg("Z", nil, msgZ) + fs.RemoveMsg(1) + + // Check we can load things properly since the block will have a tombstone now for seq 1. + sm, err := fs.LoadMsg(2, nil) + require_NoError(t, err) + require_True(t, bytes.Equal(sm.msg, msgZ)) + + require_True(t, fs.numMsgBlocks() == 1) + state := fs.State() + fs.Stop() + + // Grab the state from this stop. + sfile := filepath.Join(fcfg.StoreDir, msgDir, streamStreamStateFile) + buf, err := os.ReadFile(sfile) + require_NoError(t, err) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // Check we can load things properly since the block will have a tombstone now for seq 1. + _, err = fs.LoadMsg(2, nil) + require_NoError(t, err) + _, err = fs.LoadMsg(1, nil) + require_Error(t, err, ErrStoreMsgNotFound) + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state does not match:\n%+v\n%+v", + state, newState) + } + require_True(t, !state.FirstTime.IsZero()) + + // Store 2 more msgs and delete 2 & 4. + fs.StoreMsg("A", nil, msgA) + fs.StoreMsg("Z", nil, msgZ) + fs.RemoveMsg(2) + fs.RemoveMsg(4) + + state = fs.State() + require_True(t, len(state.Deleted) == state.NumDeleted) + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state does not match:\n%+v\n%+v", + state, newState) + } + require_True(t, !state.FirstTime.IsZero()) + + // Now close again and put back old stream state. + // This will test that we can remember user deletes by placing tombstones in the lmb/wal. 
+ fs.Stop() + err = os.WriteFile(sfile, buf, defaultFilePerms) + require_NoError(t, err) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state does not match:\n%+v\n%+v", + state, newState) + } + require_True(t, !state.FirstTime.IsZero()) + }) +} + +func TestFileStoreFullStateTestSysRemovals(t *testing.T) { + testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { + fcfg.BlockSize = 100 + scfg := StreamConfig{ + Name: "zzz", + Subjects: []string{"*"}, + MaxMsgs: 10, + MaxMsgsPer: 1, + Storage: FileStorage, + } + + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil + } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // This yields an internal record length of 50 bytes. So 2 msgs per blk. 
+ msgLen := 19 + msg := bytes.Repeat([]byte("A"), msgLen) + + for _, subj := range []string{"A", "B", "A", "B"} { + fs.StoreMsg(subj, nil, msg) + } + + state := fs.State() + require_True(t, state.Msgs == 2) + require_True(t, state.FirstSeq == 3) + require_True(t, state.LastSeq == 4) + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + + for _, subj := range []string{"C", "D", "E", "F", "G", "H", "I", "J"} { + fs.StoreMsg(subj, nil, msg) + } + + state = fs.State() + require_True(t, state.Msgs == 10) + require_True(t, state.FirstSeq == 3) + require_True(t, state.LastSeq == 12) + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + + // Goes over limit + fs.StoreMsg("ZZZ", nil, msg) + + state = fs.State() + require_True(t, state.Msgs == 10) + require_True(t, state.FirstSeq == 4) + require_True(t, state.LastSeq == 13) + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + }) +} + +/////////////////////////////////////////////////////////////////////////// +// Benchmarks +/////////////////////////////////////////////////////////////////////////// + +func Benchmark_FileStoreSelectMsgBlock(b *testing.B) { + // We use small block size to create lots of blocks for this test. 
+ fs, err := newFileStore( + FileStoreConfig{StoreDir: b.TempDir(), BlockSize: 128}, + StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) + if err != nil { + b.Fatalf("Unexpected error: %v", err) + } + defer fs.Stop() + + subj, msg := "A", bytes.Repeat([]byte("ABC"), 33) // ~100bytes + + // Add in a bunch of blocks. + for i := 0; i < 1000; i++ { + fs.StoreMsg(subj, nil, msg) + } + if fs.numMsgBlocks() < 1000 { + b.Fatalf("Expected at least 1000 blocks, got %d", fs.numMsgBlocks()) + } + + fs.mu.RLock() + defer fs.mu.RUnlock() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, mb := fs.selectMsgBlockWithIndex(1) + if mb == nil { + b.Fatalf("Expected a non-nil mb") + } + } + b.StopTimer() +} diff --git a/server/jetstream_test.go b/server/jetstream_test.go index 59ba472c..9356dc5b 100644 --- a/server/jetstream_test.go +++ b/server/jetstream_test.go @@ -12232,6 +12232,7 @@ func TestJetStreamServerEncryption(t *testing.T) { // Check stream meta. checkEncrypted := func() { + t.Helper() checkKeyFile(filepath.Join(sdir, JetStreamMetaFileKey)) checkFor(filepath.Join(sdir, JetStreamMetaFile), "TEST", "foo", "bar", "baz", "max_msgs", "max_bytes") // Check a message block. @@ -15465,47 +15466,6 @@ func TestJetStreamStorageReservedBytes(t *testing.T) { } } -func TestJetStreamRecoverStreamWithDeletedMessagesNonCleanShutdown(t *testing.T) { - s := RunBasicJetStreamServer(t) - defer s.Shutdown() - - nc, js := jsClientConnect(t, s) - defer nc.Close() - - _, err := js.AddStream(&nats.StreamConfig{Name: "T"}) - require_NoError(t, err) - - for i := 0; i < 100; i++ { - js.Publish("T", []byte("OK")) - } - - js.DeleteMsg("T", 22) - - // Now we need a non-clean shutdown. - // For this use case that means we do *not* write the fss file. - sd := s.JetStreamConfig().StoreDir - fss := filepath.Join(sd, "$G", "streams", "T", "msgs", "1.fss") - - // Stop current - nc.Close() - s.Shutdown() - - // Remove fss file to simulate a non-clean shutdown. 
- err = os.Remove(fss) - require_NoError(t, err) - - // Restart. - s = RunJetStreamServerOnPort(-1, sd) - defer s.Shutdown() - - nc, js = jsClientConnect(t, s) - defer nc.Close() - - // Make sure we recovered our stream - _, err = js.StreamInfo("T") - require_NoError(t, err) -} - func TestJetStreamRestoreBadStream(t *testing.T) { s := RunBasicJetStreamServer(t) defer s.Shutdown() @@ -20387,9 +20347,8 @@ func TestJetStreamMsgBlkFailOnKernelFault(t *testing.T) { sd := s.JetStreamConfig().StoreDir s.Shutdown() - // Zero out the last block. - err = os.WriteFile(lmbf, nil, defaultFilePerms) - require_NoError(t, err) + // Remove block. + require_NoError(t, os.Remove(lmbf)) s = RunJetStreamServerOnPort(-1, sd) defer s.Shutdown() @@ -21263,14 +21222,8 @@ func TestJetStreamMaxBytesIgnored(t *testing.T) { sd := s.JetStreamConfig().StoreDir s.Shutdown() - // We will remove the idx file and truncate the blk and fss files. + // We will truncate blk file. mdir := filepath.Join(sd, "$G", "streams", "TEST", "msgs") - // Remove idx - err = os.Remove(filepath.Join(mdir, "1.idx")) - require_NoError(t, err) - // Truncate fss - err = os.WriteFile(filepath.Join(mdir, "1.fss"), nil, defaultFilePerms) - require_NoError(t, err) // Truncate blk err = os.WriteFile(filepath.Join(mdir, "1.blk"), nil, defaultFilePerms) require_NoError(t, err) diff --git a/server/norace_test.go b/server/norace_test.go index f43b46ef..6bbf7791 100644 --- a/server/norace_test.go +++ b/server/norace_test.go @@ -3120,10 +3120,9 @@ func TestNoRaceJetStreamFileStoreCompaction(t *testing.T) { for i := 0; i < toSend; i++ { js.PublishAsync(fmt.Sprintf("KV.%d", i+1), data) } - select { case <-js.PublishAsyncComplete(): - case <-time.After(time.Second): + case <-time.After(10 * time.Second): t.Fatalf("Did not receive completion signal") } @@ -5218,6 +5217,10 @@ func TestNoRaceJetStreamClusterDirectAccessAllPeersSubs(t *testing.T) { for { select { case <-qch: + select { + case <-js.PublishAsyncComplete(): + case 
<-time.After(10 * time.Second): + } return default: // Send as fast as we can. @@ -5227,7 +5230,7 @@ func TestNoRaceJetStreamClusterDirectAccessAllPeersSubs(t *testing.T) { }() } - time.Sleep(100 * time.Millisecond) + time.Sleep(200 * time.Millisecond) // Now let's scale up to an R3. cfg.Replicas = 3 @@ -5277,7 +5280,7 @@ func TestNoRaceJetStreamClusterDirectAccessAllPeersSubs(t *testing.T) { t.Fatalf("Expected to see messages increase, got %d", si.State.Msgs) } - checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { + checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { // Make sure they are all the same from a state perspective. // Leader will have the expected state. lmset, err := c.streamLeader("$G", "TEST").GlobalAccount().lookupStream("TEST") @@ -8742,6 +8745,7 @@ func TestNoRaceFilestoreBinaryStreamSnapshotEncodingLargeGaps(t *testing.T) { } fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"zzz"}, Storage: FileStorage}) require_NoError(t, err) + defer fs.Stop() subj, msg := "zzz", bytes.Repeat([]byte("X"), 128) numMsgs := 20_000 diff --git a/server/stream.go b/server/stream.go index f7b8f203..9bb254f3 100644 --- a/server/stream.go +++ b/server/stream.go @@ -971,8 +971,8 @@ func (mset *stream) lastSeqAndCLFS() (uint64, uint64) { } func (mset *stream) clearCLFS() uint64 { - mset.mu.Lock() - defer mset.mu.Unlock() + mset.clMu.Lock() + defer mset.clMu.Unlock() clfs := mset.clfs mset.clfs, mset.clseq = 0, 0 return clfs