diff --git a/server/filestore.go b/server/filestore.go index 11c1f558..57f61806 100644 --- a/server/filestore.go +++ b/server/filestore.go @@ -18,6 +18,7 @@ import ( "bytes" "crypto/aes" "crypto/cipher" + "crypto/rand" "crypto/sha256" "encoding/binary" "encoding/hex" @@ -35,8 +36,6 @@ import ( "sync/atomic" "time" - crand "crypto/rand" - "github.com/klauspost/compress/s2" "github.com/minio/highwayhash" "github.com/nats-io/nats-server/v2/server/avl" @@ -158,6 +157,7 @@ type fileStore struct { srv *Server mu sync.RWMutex state StreamState + tombs []uint64 ld *LostStreamData scb StorageUpdateHandler ageChk *time.Timer @@ -173,8 +173,11 @@ type fileStore struct { psim map[string]*psi hh hash.Hash64 qch chan struct{} + fch chan struct{} + fsld chan struct{} cfs []ConsumerStore sips int + dirty int closed bool fip bool receivedAny bool @@ -193,8 +196,6 @@ type msgBlock struct { nonce []byte mfn string mfd *os.File - ifn string - ifd *os.File cmp StoreCompression // Effective compression at the time of loading the block liwsz int64 index uint32 @@ -202,9 +203,7 @@ type msgBlock struct { rbytes uint64 // Total bytes (raw) including deleted. Used for rolling to new blk. msgs uint64 // User visible message count. fss map[string]*SimpleState - sfn string kfn string - lwits int64 lwts int64 llts int64 lrts int64 @@ -224,10 +223,6 @@ type msgBlock struct { noTrack bool closed bool - // To avoid excessive writes when expiring cache. - // These can be big. - fssNeedsWrite bool - // Used to mock write failures. mockWriteErr bool } @@ -269,8 +264,10 @@ const ( newScan = "%d.new" // used to scan index file names. indexScan = "%d.idx" - // used to load per subject meta information. - fssScan = "%d.fss" + // to look for orphans + indexScanAll = "*.idx" + // to look for orphans + fssScanAll = "*.fss" // used to store our block encryption key. 
keyScan = "%d.key" // to look for orphans @@ -301,6 +298,9 @@ const ( JetStreamMetaFileSum = "meta.sum" JetStreamMetaFileKey = "meta.key" + // This is the full snapshotted state for the stream. + streamStreamStateFile = "index.db" + // AEK key sizes minMetaKeySize = 64 minBlkKeySize = 64 @@ -326,10 +326,6 @@ const ( FileStoreMaxBlkSize = maxBlockSize // Check for bad record length value due to corrupt data. rlBadThresh = 32 * 1024 * 1024 - // Time threshold to write index info. - wiThresh = int64(30 * time.Second) - // Time threshold to write index info for non FIFO cases - winfThresh = int64(2 * time.Second) // Checksum size for hash for msg records. recordHashSize = 8 ) @@ -385,6 +381,8 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim prf: prf, oldprf: oldprf, qch: make(chan struct{}), + fch: make(chan struct{}, 1), + fsld: make(chan struct{}), } // Set flush in place to AsyncFlush which by default is false. @@ -415,9 +413,64 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim } } - // Recover our message state. - if err := fs.recoverMsgs(); err != nil { - return nil, err + // Attempt to recover our state. + err = fs.recoverFullState() + if err != nil { + // Hold onto state + prior := fs.state + // Reset anything that could have been set from above. + fs.state = StreamState{} + fs.psim = make(map[string]*psi) + fs.bim = make(map[uint32]*msgBlock) + fs.blks = nil + fs.tombs = nil + + // Recover our message state the old way + if err := fs.recoverMsgs(); err != nil { + return nil, err + } + + // Check if our prior remember a last past where we can see. + if fs.ld != nil && prior.LastSeq > fs.state.LastSeq { + fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime + if _, err = fs.newMsgBlockForWrite(); err != nil { + return nil, err + } + } + // Since we recovered here, make sure to kick ourselves to write out our stream state. 
+ fs.dirty++ + defer fs.kickFlushStateLoop() + // Also make sure we get rid of old idx and fss files on return. + defer func() { + os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, indexScanAll)) + os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, fssScanAll)) + }() + } + + // Check if we have any left over tombstones to process. + if len(fs.tombs) > 0 { + fs.mu.Lock() + for _, seq := range fs.tombs { + fs.removeMsg(seq, false, false, false) + } + // Not needed after this phase. + fs.tombs = nil + fs.mu.Unlock() + } + + // Limits checks and enforcement. + fs.enforceMsgLimit() + fs.enforceBytesLimit() + + // Do age checks too, make sure to call in place. + if fs.cfg.MaxAge != 0 { + fs.expireMsgsOnRecover() + fs.startAgeChk() + } + + // If we have max msgs per subject make sure the is also enforced. + if fs.cfg.MaxMsgsPer > 0 { + fs.enforceMsgPerSubjectLimit() } // If the stream has an initial sequence number then make sure we @@ -456,6 +509,9 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks) + // Spin up the go routine that will write out or full state stream index. + go fs.flushStreamStateLoop(fs.fch, fs.qch, fs.fsld) + return fs, nil } @@ -606,7 +662,7 @@ func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cip const seedSize = 32 seed = make([]byte, seedSize) - if n, err := crand.Read(seed); err != nil || n != seedSize { + if n, err := rand.Read(seed); err != nil || n != seedSize { return nil, nil, nil, nil, err } @@ -617,7 +673,7 @@ func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cip // Generate our nonce. Use same buffer to hold encrypted seed. 
nonce := make([]byte, kek.NonceSize(), kek.NonceSize()+len(seed)+kek.Overhead()) - crand.Read(nonce) + rand.Read(nonce) bek, err = genBlockEncryptionKey(sc, seed[:], nonce) if err != nil { @@ -641,9 +697,37 @@ func genBlockEncryptionKey(sc StoreCipher, seed, nonce []byte) (cipher.Stream, e return nil, errUnknownCipher } -// Write out meta and the checksum. // Lock should be held. -func (fs *fileStore) writeStreamMeta() error { +func (fs *fileStore) recoverAEK() error { + if fs.prf != nil && fs.aek == nil { + ekey, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)) + if err != nil { + return err + } + rb, err := fs.prf([]byte(fs.cfg.Name)) + if err != nil { + return err + } + kek, err := genEncryptionKey(fs.fcfg.Cipher, rb) + if err != nil { + return err + } + ns := kek.NonceSize() + seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) + if err != nil { + return err + } + aek, err := genEncryptionKey(fs.fcfg.Cipher, seed) + if err != nil { + return err + } + fs.aek = aek + } + return nil +} + +// Lock should be held. +func (fs *fileStore) setupAEK() error { if fs.prf != nil && fs.aek == nil { key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name) if err != nil { @@ -659,6 +743,15 @@ func (fs *fileStore) writeStreamMeta() error { // Set our aek. fs.aek = key } + return nil +} + +// Write out meta and the checksum. +// Lock should be held. +func (fs *fileStore) writeStreamMeta() error { + if err := fs.setupAEK(); err != nil { + return err + } meta := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile) if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) { @@ -671,7 +764,7 @@ func (fs *fileStore) writeStreamMeta() error { // Encrypt if needed. 
if fs.aek != nil { nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(b)+fs.aek.Overhead()) - crand.Read(nonce) + rand.Read(nonce) b = fs.aek.Seal(nonce, nonce, b, nil) } @@ -750,74 +843,95 @@ func (fs *fileStore) noTrackSubjects() bool { return !(len(fs.psim) > 0 || len(fs.cfg.Subjects) > 0 || fs.cfg.Mirror != nil || len(fs.cfg.Sources) > 0) } -// Lock held on entry -func (fs *fileStore) recoverMsgBlock(fi os.FileInfo, index uint32) (*msgBlock, error) { +// Will init the basics for a message block. +func (fs *fileStore) initMsgBlock(index uint32) *msgBlock { mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects()} mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) - mb.mfn = filepath.Join(mdir, fi.Name()) - mb.ifn = filepath.Join(mdir, fmt.Sprintf(indexScan, index)) - mb.sfn = filepath.Join(mdir, fmt.Sprintf(fssScan, index)) + mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, index)) if mb.hh == nil { key := sha256.Sum256(fs.hashKeyForBlock(index)) mb.hh, _ = highwayhash.New64(key[:]) } + return mb +} + +// Lock for fs should be held. +func (fs *fileStore) loadEncryptionForMsgBlock(mb *msgBlock) error { + if fs.prf == nil { + return nil + } var createdKeys bool - - // Check if encryption is enabled. - if fs.prf != nil { - ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))) + mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) + ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))) + if err != nil { + // We do not seem to have keys even though we should. Could be a plaintext conversion. + // Create the keys and we will double check below. + if err := fs.genEncryptionKeysForBlock(mb); err != nil { + return err + } + createdKeys = true + } else { + if len(ekey) < minBlkKeySize { + return errBadKeySize + } + // Recover key encryption key. 
+ rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))) if err != nil { - // We do not seem to have keys even though we should. Could be a plaintext conversion. - // Create the keys and we will double check below. - if err := fs.genEncryptionKeysForBlock(mb); err != nil { - return nil, err - } - createdKeys = true - } else { - if len(ekey) < minBlkKeySize { - return nil, errBadKeySize - } - // Recover key encryption key. - rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))) - if err != nil { - return nil, err - } + return err + } - sc := fs.fcfg.Cipher - kek, err := genEncryptionKey(sc, rb) - if err != nil { - return nil, err - } - ns := kek.NonceSize() - seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) - if err != nil { - // We may be here on a cipher conversion, so attempt to convert. - if err = mb.convertCipher(); err != nil { - return nil, err - } - } else { - mb.seed, mb.nonce = seed, ekey[:ns] - } - mb.aek, err = genEncryptionKey(sc, mb.seed) - if err != nil { - return nil, err - } - if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil { - return nil, err + sc := fs.fcfg.Cipher + kek, err := genEncryptionKey(sc, rb) + if err != nil { + return err + } + ns := kek.NonceSize() + seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) + if err != nil { + // We may be here on a cipher conversion, so attempt to convert. + if err = mb.convertCipher(); err != nil { + return err } + } else { + mb.seed, mb.nonce = seed, ekey[:ns] + } + mb.aek, err = genEncryptionKey(sc, mb.seed) + if err != nil { + return err + } + if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil { + return err } } // If we created keys here, let's check the data and if it is plaintext convert here. if createdKeys { if err := mb.convertToEncrypted(); err != nil { - return nil, err + return err } } + return nil +} + +// Load a last checksum if needed from the block file. +// Lock should be held. 
+func (mb *msgBlock) ensureLastChecksumLoaded() { + var empty [8]byte + if mb.lchk != empty { + return + } + copy(mb.lchk[0:], mb.lastChecksum()) +} + +// Lock held on entry +func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) { + mb := fs.initMsgBlock(index) + fs.loadEncryptionForMsgBlock(mb) + // Open up the message file, but we will try to recover from the index file. // We will check that the last checksums match. file, err := os.Open(mb.mfn) @@ -840,7 +954,7 @@ func (fs *fileStore) recoverMsgBlock(fi os.FileInfo, index uint32) (*msgBlock, e copy(lchk[0:], buf[len(buf)-checksumSize:]) } } else { - file.ReadAt(lchk[:], fi.Size()-checksumSize) + file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize) } } @@ -862,9 +976,14 @@ func (fs *fileStore) recoverMsgBlock(fi os.FileInfo, index uint32) (*msgBlock, e } // If we get data loss rebuilding the message block state record that with the fs itself. - if ld, _ := mb.rebuildState(); ld != nil { + ld, tombs, _ := mb.rebuildState() + if ld != nil { fs.addLostData(ld) } + // Collect all tombstones. + if len(tombs) > 0 { + fs.tombs = append(fs.tombs, tombs...) + } if mb.msgs > 0 && !mb.noTrack && fs.psim != nil { fs.populateGlobalPerSubjectInfo(mb) @@ -872,10 +991,9 @@ func (fs *fileStore) recoverMsgBlock(fi os.FileInfo, index uint32) (*msgBlock, e mb.tryForceExpireCacheLocked() } - // Rewrite this to make sure we are sync'd. - mb.writeIndexInfo() mb.closeFDs() fs.addMsgBlock(mb) + return mb, nil } @@ -889,12 +1007,6 @@ func (fs *fileStore) lostData() *LostStreamData { return &nld } -func (fs *fileStore) rebuildState(ld *LostStreamData) { - fs.mu.Lock() - defer fs.mu.Unlock() - fs.rebuildStateLocked(ld) -} - // Lock should be held. 
func (fs *fileStore) addLostData(ld *LostStreamData) { if ld == nil { @@ -910,6 +1022,12 @@ func (fs *fileStore) addLostData(ld *LostStreamData) { } } +func (fs *fileStore) rebuildState(ld *LostStreamData) { + fs.mu.Lock() + defer fs.mu.Unlock() + fs.rebuildStateLocked(ld) +} + // Lock should be held. func (fs *fileStore) rebuildStateLocked(ld *LostStreamData) { fs.addLostData(ld) @@ -1007,9 +1125,6 @@ func (mb *msgBlock) convertCipher() error { if err := os.WriteFile(mb.mfn, buf, defaultFilePerms); err != nil { return err } - // If we are here we want to delete other meta, e.g. idx, fss. - os.Remove(mb.ifn) - os.Remove(mb.sfn) return nil } return fmt.Errorf("unable to recover keys") @@ -1035,30 +1150,24 @@ func (mb *msgBlock) convertToEncrypted() error { if err := os.WriteFile(mb.mfn, buf, defaultFilePerms); err != nil { return err } - if buf, err = os.ReadFile(mb.ifn); err == nil && len(buf) > 0 { - if err := checkNewHeader(buf); err != nil { - return err - } - buf = mb.aek.Seal(buf[:0], mb.nonce, buf, nil) - if err := os.WriteFile(mb.ifn, buf, defaultFilePerms); err != nil { - return err - } - } return nil } -func (mb *msgBlock) rebuildState() (*LostStreamData, error) { +// Rebuild the state of the blk based on what we have on disk in the N.blk file. +// We will return any lost data, and we will return any delete tombstones we encountered. +func (mb *msgBlock) rebuildState() (*LostStreamData, []uint64, error) { mb.mu.Lock() defer mb.mu.Unlock() return mb.rebuildStateLocked() } -func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { +// Rebuild the state of the blk based on what we have on disk in the N.blk file. +// Lock should be held. +func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { startLastSeq := mb.last.seq // Remove the .fss file and clear any cache we have set. 
mb.clearCacheAndOffset() - mb.removePerSubjectInfoLocked() buf, err := mb.loadBlock(nil) if err != nil || len(buf) == 0 { @@ -1077,7 +1186,7 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { mb.dmap.Empty() mb.first.seq = mb.last.seq + 1 } - return ld, err + return ld, nil, err } // Clear state we need to rebuild. @@ -1090,14 +1199,14 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { // Recreate to reset counter. mb.bek, err = genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) if err != nil { - return nil, err + return nil, nil, err } mb.bek.XORKeyStream(buf, buf) } // Check for compression. if buf, err = mb.decompressIfNeeded(buf); err != nil { - return nil, err + return nil, nil, err } mb.rbytes = uint64(len(buf)) @@ -1144,10 +1253,17 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { return &ld } + // For tombstones that we find and collect. + var ( + tombstones []uint64 + minTombstoneSeq uint64 + minTombstoneTs int64 + ) + for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; { if index+msgHdrSize > lbuf { truncate(index) - return gatherLost(lbuf - index), nil + return gatherLost(lbuf - index), tombstones, nil } hdr := buf[index : index+msgHdrSize] @@ -1160,24 +1276,39 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { // Do some quick sanity checks here. if dlen < 0 || int(slen) > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh { truncate(index) - return gatherLost(lbuf - index), errBadMsg + return gatherLost(lbuf - index), tombstones, errBadMsg } seq := le.Uint64(hdr[4:]) ts := int64(le.Uint64(hdr[12:])) + // Check if this is a delete tombstone. + if seq&tbit != 0 { + seq = seq &^ tbit + // Need to process this here and make sure we have accounted for this properly. 
+ tombstones = append(tombstones, seq) + index += rl + if minTombstoneSeq == 0 || seq < minTombstoneSeq { + minTombstoneSeq, minTombstoneTs = seq, ts + } + continue + } + // This is an old erased message, or a new one that we can track. if seq == 0 || seq&ebit != 0 || seq < mb.first.seq { seq = seq &^ ebit - // Only add to dmap if past recorded first seq and non-zero. - if seq != 0 && seq >= mb.first.seq { - addToDmap(seq) - } - index += rl if seq >= mb.first.seq { + // Only add to dmap if past recorded first seq and non-zero. + if seq != 0 { + addToDmap(seq) + } mb.last.seq = seq mb.last.ts = ts + if mb.msgs == 0 { + mb.first.seq, mb.first.ts = seq+1, 0 + } } + index += rl continue } @@ -1188,13 +1319,11 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { firstNeedsSet, mb.first.seq, mb.first.ts = false, seq, ts } - deleted := mb.dmap.Exists(seq) - // Always set last. mb.last.seq = seq mb.last.ts = ts - if !deleted { + if !mb.dmap.Exists(seq) { data := buf[index+msgHdrSize : index+rl] if hh := mb.hh; hh != nil { hh.Reset() @@ -1208,7 +1337,7 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { checksum := hh.Sum(nil) if !bytes.Equal(checksum, data[len(data)-recordHashSize:]) { truncate(index) - return gatherLost(lbuf - index), errBadMsg + return gatherLost(lbuf - index), tombstones, errBadMsg } copy(mb.lchk[0:], checksum) } @@ -1235,7 +1364,6 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { subj := mb.subjString(data[:slen]) mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} } - mb.fssNeedsWrite = true } } // Advance to next record. @@ -1243,16 +1371,388 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { } // For empty msg blocks make sure we recover last seq correctly based off of first. 
- if mb.msgs == 0 && mb.first.seq > 0 { - mb.last.seq = mb.first.seq - 1 + // Or if we seem to have no messages but had a tombstone, which we use to remember + // sequences and timestamps now, use that to properly setup the first and last. + if mb.msgs == 0 { + if mb.first.seq > 0 { + mb.last.seq = mb.first.seq - 1 + } else if mb.first.seq == 0 && minTombstoneSeq > 0 { + mb.first.seq, mb.first.ts = minTombstoneSeq+1, 0 + if mb.last.seq == 0 { + mb.last.seq, mb.last.ts = minTombstoneSeq, minTombstoneTs + } + } } - // Update our fss file if needed. - if len(mb.fss) > 0 { - mb.writePerSubjectInfo() + return nil, tombstones, nil +} + +// Used when we scan the msg blocks. +type blockFiles struct { + blksSeen map[uint32]struct{} + maxIndex uint32 +} + +// This will grab all the block files. +func (fs *fileStore) grabMsgBlockFiles(ch chan *blockFiles) { + f, err := os.Open(filepath.Join(fs.fcfg.StoreDir, msgDir)) + if err != nil { + ch <- nil + return + } + defer f.Close() + + dirs, err := f.ReadDir(-1) + if err != nil { + ch <- nil + return } - return nil, nil + result := &blockFiles{blksSeen: make(map[uint32]struct{})} + + for _, fi := range dirs { + var index uint32 + if n, err := fmt.Sscanf(fi.Name(), blkScan, &index); err == nil && n == 1 { + result.blksSeen[index] = struct{}{} + if index > result.maxIndex { + result.maxIndex = index + } + } + } + ch <- result +} + +// recoverFullState will attempt to receover our last full state and re-process any state changes +// that happened afterwards. +func (fs *fileStore) recoverFullState() (rerr error) { + // Grab all the msgBlock files in parallel in case there are many. + rch := make(chan *blockFiles, 1) + go fs.grabMsgBlockFiles(rch) + + fs.mu.Lock() + defer fs.mu.Unlock() + + // Check for any left over purged messages. + <-dios + pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) + if _, err := os.Stat(pdir); err == nil { + os.RemoveAll(pdir) + } + // Grab our stream state file and load it in. 
+ fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) + buf, err := os.ReadFile(fn) + dios <- struct{}{} + + if err != nil { + return err + } + + const minLen = 32 + if len(buf) < minLen { + os.Remove(fn) + return errCorruptState + } + + // The highwayhash will be on the end. Check that it still matches. + h := buf[len(buf)-highwayhash.Size64:] + buf = buf[:len(buf)-highwayhash.Size64] + fs.hh.Reset() + fs.hh.Write(buf) + if !bytes.Equal(h, fs.hh.Sum(nil)) { + os.Remove(fn) + return errCorruptState + } + + // Decrypt if needed. + if fs.prf != nil { + // We can be setup for encryption but if this is a snapshot restore we will be missing the keyfile + // since snapshots strip encryption. + if err := fs.recoverAEK(); err == nil { + ns := fs.aek.NonceSize() + buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:], nil) + if err != nil { + return err + } + } + } + + if buf[0] != fullStateMagic || buf[1] != fullStateVersion { + os.Remove(fn) + return errCorruptState + } + + bi := hdrLen + + readU64 := func() uint64 { + if bi < 0 { + return 0 + } + v, n := binary.Uvarint(buf[bi:]) + if n <= 0 { + bi = -1 + return 0 + } + bi += n + return v + } + readI64 := func() int64 { + if bi < 0 { + return 0 + } + v, n := binary.Varint(buf[bi:]) + if n <= 0 { + bi = -1 + return -1 + } + bi += n + return v + } + + setTime := func(t *time.Time, ts int64) { + if ts == 0 { + *t = time.Time{} + } else { + *t = time.Unix(0, ts).UTC() + } + } + + var state StreamState + state.Msgs = readU64() + state.Bytes = readU64() + state.FirstSeq = readU64() + baseTime := readI64() + setTime(&state.FirstTime, baseTime) + state.LastSeq = readU64() + setTime(&state.LastTime, readI64()) + + // Check for per subject info. 
+ if numSubjects := int(readU64()); numSubjects > 0 { + fs.psim = make(map[string]*psi, numSubjects) + for i := 0; i < numSubjects; i++ { + if lsubj := int(readU64()); lsubj > 0 { + if bi+lsubj > len(buf) { + os.Remove(fn) + return errCorruptState + } + subj := fs.subjString(buf[bi : bi+lsubj]) + bi += lsubj + psi := &psi{total: readU64(), fblk: uint32(readU64())} + if psi.total > 1 { + psi.lblk = uint32(readU64()) + } else { + psi.lblk = psi.fblk + } + fs.psim[subj] = psi + } + } + } + + if numBlocks := readU64(); numBlocks > 0 { + fs.blks = make([]*msgBlock, 0, numBlocks) + for i := 0; i < int(numBlocks); i++ { + index, nbytes, fseq, fts, lseq, lts, numDeleted := uint32(readU64()), readU64(), readU64(), readI64(), readU64(), readI64(), readU64() + if bi < 0 { + break + } + mb := fs.initMsgBlock(index) + mb.first.seq, mb.last.seq, mb.msgs, mb.bytes = fseq, lseq, lseq-fseq+1, nbytes + mb.first.ts, mb.last.ts = fts+baseTime, lts+baseTime + if numDeleted > 0 { + dmap, n, err := avl.Decode(buf[bi:]) + if err != nil { + os.Remove(fn) + return errCorruptState + } + mb.dmap = *dmap + mb.msgs -= numDeleted + bi += n + } + fs.addMsgBlock(mb) + } + } + + // Pull in last block index for the block that had last checksum when we wrote the full state. + blkIndex := uint32(readU64()) + var lchk [8]byte + if bi+len(lchk) > len(buf) { + bi = -1 + } else { + copy(lchk[0:], buf[bi:bi+len(lchk)]) + } + + // Check if we had any errors. + if bi < 0 { + os.Remove(fn) + return errCorruptState + } + + // Grab the max blk index we see from scanning the directory. The full snapshot has the index that was lmb when + // we created it, so with that and max we know blocks to process. We do this in parallel in casee lots of blks. + blkFiles := <-rch + + defer func() { + // Make sure we saw all of our blk files. + for _, mb := range fs.blks { + if _, ok := blkFiles.blksSeen[mb.index]; !ok { + if ld, _, _ := mb.rebuildState(); ld != nil { + // If we have lost data make sure we track here. 
+ fs.addLostData(ld) + rerr = errCorruptState + } + } + } + }() + + // Move into place our state, msgBlks and subject info. + fs.state = state + + // If our saved state is past what we see on disk, fallback and rebuild. + if blkFiles != nil && blkFiles.maxIndex < blkIndex { + return errPriorState + } + + // First let's check the happy path, open the blk file that was the lmb when we created the full state. + // See if we have the last block available. + var matched bool + var mb *msgBlock + if mb = fs.bim[blkIndex]; mb != nil { + matched = bytes.Equal(mb.lastChecksum(), lchk[:]) + if matched && blkIndex == blkFiles.maxIndex { + return nil + } + // Remove the last message block since we will re-process below. + fs.removeMsgBlockFromList(mb) + } + + // If we are here we did not match the happy path. + // We need to go through and find our checksum. This should be in blkIndex, but might not be. + start, stop := blkIndex, blkFiles.maxIndex + if matched { + start++ + } + + for bi := start; bi <= stop; bi++ { + nmb, err := fs.recoverMsgBlock(bi) + if err != nil { + return err + } + if nmb != nil { + // Check if we have to account for a partial message block. + if !matched && mb != nil && mb.index == nmb.index { + if err := fs.adjustAccounting(mb, nmb); err != nil { + return err + } + } + // Update top level accounting. + if fs.state.FirstSeq == 0 || nmb.first.seq < fs.state.FirstSeq { + fs.state.FirstSeq = nmb.first.seq + fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC() + } + if nmb.last.seq > fs.state.LastSeq { + fs.state.LastSeq = nmb.last.seq + fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC() + } + fs.state.Msgs += nmb.msgs + fs.state.Bytes += nmb.bytes + } + } + + return nil +} + +// adjustAccounting will be called when a stream state was only partially accounted for +// with a message block, e.g. additional records were added after the stream state. +// Lock should be held. 
+func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) error { + nmb.mu.Lock() + defer nmb.mu.Unlock() + + // First make sure the new block is loaded. + if nmb.cacheNotLoaded() { + nmb.loadMsgsWithLock() + } + nmb.ensurePerSubjectInfoLoaded() + + lookupAndAdjust := func(seq uint64) error { + var smv StoreMsg + // Lookup the message. + sm, err := nmb.cacheLookup(seq, &smv) + if err != nil { + return err + } + // Since we found it we just need to adjust fs totals and psim. + fs.state.Msgs-- + fs.state.Bytes -= fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) + if len(sm.subj) > 0 && fs.psim != nil { + fs.removePerSubject(sm.subj) + } + return nil + } + + // Walk all the original mb's sequences that were included in the stream state. + for seq := mb.first.seq; seq <= mb.last.seq; seq++ { + // If we had already declared it deleted we can move on since you can not undelete. + if mb.dmap.Exists(seq) { + continue + } + // Lookup the message. + if err := lookupAndAdjust(seq); err != nil { + return err + } + } + + // Now check to see if we had a higher first for the recovered state mb vs nmb. + if nmb.first.seq < mb.first.seq { + for seq := nmb.first.seq; seq < mb.first.seq; seq++ { + // Lookup the message. + if err := lookupAndAdjust(seq); err != nil { + return err + } + } + // Now set first for nmb. + nmb.first = mb.first + } + + return nil +} + +// Grabs last checksum for the named block file. +// Takes into account encryption etc. +func (mb *msgBlock) lastChecksum() []byte { + f, err := os.Open(mb.mfn) + if err != nil { + return nil + } + defer f.Close() + + var lchk [8]byte + if fi, _ := f.Stat(); fi != nil { + mb.rbytes = uint64(fi.Size()) + } + if mb.rbytes < checksumSize { + return nil + } + // Encrypted? + // Check for encryption, we do not load keys on startup anymore so might need to load them here. 
+ if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) { + if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil { + return nil + } + } + if mb.bek != nil { + if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize { + bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) + if err != nil { + return nil + } + mb.bek = bek + mb.bek.XORKeyStream(buf, buf) + copy(lchk[0:], buf[len(buf)-checksumSize:]) + } + } else { + f.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize) + } + return lchk[:] } func (fs *fileStore) recoverMsgs() error { @@ -1260,55 +1760,65 @@ func (fs *fileStore) recoverMsgs() error { defer fs.mu.Unlock() // Check for any left over purged messages. - pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) <-dios + pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) if _, err := os.Stat(pdir); err == nil { os.RemoveAll(pdir) } + mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) + f, err := os.Open(mdir) + if err != nil { + dios <- struct{}{} + return errNotReadable + } + dirs, err := f.ReadDir(-1) + f.Close() dios <- struct{}{} - mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) - fis, err := os.ReadDir(mdir) if err != nil { return errNotReadable } - // Recover all of the msg blocks. - // These can come in a random order, so account for that. - for _, fi := range fis { - var index uint32 + indices := make(sort.IntSlice, 0, len(dirs)) + var index int + for _, fi := range dirs { if n, err := fmt.Sscanf(fi.Name(), blkScan, &index); err == nil && n == 1 { - finfo, err := fi.Info() - if err != nil { - return err + indices = append(indices, index) + } + } + indices.Sort() + + // Recover all of the msg blocks. + // We now guarantee they are coming in order. + for _, index := range indices { + if mb, err := fs.recoverMsgBlock(uint32(index)); err == nil && mb != nil { + // This is a truncate block with possibly no index. If the OS got shutdown + // out from underneath of us this is possible. 
+ if mb.first.seq == 0 { + mb.dirtyCloseWithRemove(true) + fs.removeMsgBlockFromList(mb) + continue } - if mb, err := fs.recoverMsgBlock(finfo, index); err == nil && mb != nil { - // This is a truncate block with possibly no index. If the OS got shutdown - // out from underneath of us this is possible. - if mb.first.seq == 0 { - mb.dirtyCloseWithRemove(true) - fs.removeMsgBlockFromList(mb) - continue - } - if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq { - fs.state.FirstSeq = mb.first.seq + if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq { + fs.state.FirstSeq = mb.first.seq + if mb.first.ts == 0 { + fs.state.FirstTime = time.Time{} + } else { fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() } - if mb.last.seq > fs.state.LastSeq { - fs.state.LastSeq = mb.last.seq - fs.state.LastTime = time.Unix(0, mb.last.ts).UTC() - } - fs.state.Msgs += mb.msgs - fs.state.Bytes += mb.bytes - } else { - return err } + if mb.last.seq > fs.state.LastSeq { + fs.state.LastSeq = mb.last.seq + fs.state.LastTime = time.Unix(0, mb.last.ts).UTC() + } + fs.state.Msgs += mb.msgs + fs.state.Bytes += mb.bytes + } else { + return err } } - // Now make sure to sort blks for efficient lookup later with selectMsgBlock(). if len(fs.blks) > 0 { - sort.Slice(fs.blks, func(i, j int) bool { return fs.blks[i].index < fs.blks[j].index }) fs.lmb = fs.blks[len(fs.blks)-1] } else { _, err = fs.newMsgBlockForWrite() @@ -1319,12 +1829,7 @@ func (fs *fileStore) recoverMsgs() error { var emptyBlks []*msgBlock for _, mb := range fs.blks { if mb.msgs == 0 && mb.rbytes == 0 { - if mb == fs.lmb { - mb.first.seq, mb.first.ts = mb.last.seq+1, 0 - mb.closeAndKeepIndex(false) - } else { - emptyBlks = append(emptyBlks, mb) - } + emptyBlks = append(emptyBlks, mb) } } for _, mb := range emptyBlks { @@ -1354,21 +1859,6 @@ func (fs *fileStore) recoverMsgs() error { } } - // Limits checks and enforcement. 
- fs.enforceMsgLimit() - fs.enforceBytesLimit() - - // Do age checks too, make sure to call in place. - if fs.cfg.MaxAge != 0 { - fs.expireMsgsOnRecover() - fs.startAgeChk() - } - - // If we have max msgs per subject make sure the is also enforced. - if fs.cfg.MaxMsgsPer > 0 { - fs.enforceMsgPerSubjectLimit() - } - return nil } @@ -1376,8 +1866,10 @@ func (fs *fileStore) recoverMsgs() error { // We will treat this differently in case we have a recovery // that will expire alot of messages on startup. // Should only be called on startup. -// Lock should be held. func (fs *fileStore) expireMsgsOnRecover() { + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.state.Msgs == 0 { return } @@ -1387,15 +1879,20 @@ func (fs *fileStore) expireMsgsOnRecover() { var deleted int var nts int64 - deleteEmptyBlock := func(mb *msgBlock) bool { - // If we are the last keep state to remember first sequence. + // If we expire all make sure to write out a tombstone. Need to be done by hand here, + // usually taken care of by fs.removeMsgBlock() but we do not call that here. + var last msgId + + deleteEmptyBlock := func(mb *msgBlock) { + // If we are the last keep state to remember first/last sequence. + // Do this part by hand since not deleting one by one. if mb == fs.lmb { - // Do this part by hand since not deleting one by one. - mb.first.seq, mb.first.ts = mb.last.seq+1, 0 - mb.closeAndKeepIndex(false) - // Clear any global subject state. - fs.psim = make(map[string]*psi) - return false + last = mb.last + } + // Make sure we do subject cleanup as well. + mb.ensurePerSubjectInfoLoaded() + for subj := range mb.fss { + fs.removePerSubject(subj) } // Make sure we do subject cleanup as well. 
mb.ensurePerSubjectInfoLoaded() @@ -1404,7 +1901,6 @@ func (fs *fileStore) expireMsgsOnRecover() { } mb.dirtyCloseWithRemove(true) deleted++ - return true } for _, mb := range fs.blks { @@ -1418,11 +1914,8 @@ func (fs *fileStore) expireMsgsOnRecover() { if mb.last.ts <= minAge { purged += mb.msgs bytes += mb.bytes - didRemove := deleteEmptyBlock(mb) + deleteEmptyBlock(mb) mb.mu.Unlock() - if !didRemove { - mb.writeIndexInfo() - } continue } @@ -1488,14 +1981,10 @@ func (fs *fileStore) expireMsgsOnRecover() { mb.selectNextFirst() } // Check if empty after processing, could happen if tail of messages are all deleted. - needWriteIndex := true if mb.msgs == 0 { - needWriteIndex = !deleteEmptyBlock(mb) + deleteEmptyBlock(mb) } mb.mu.Unlock() - if needWriteIndex { - mb.writeIndexInfo() - } break } @@ -1532,6 +2021,15 @@ func (fs *fileStore) expireMsgsOnRecover() { } // Make sure to we properly set the fs first sequence and timestamp. fs.selectNextFirst() + + // Check if we have no messages and blocks left. + if fs.lmb == nil && last.seq != 0 { + if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil { + lmb.writeTombstone(last.seq, last.ts) + } + // Clear any global subject state. + fs.psim = make(map[string]*psi) + } } func copyMsgBlocks(src []*msgBlock) []*msgBlock { @@ -1584,12 +2082,17 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor mb.mu.Lock() defer mb.mu.Unlock() - if err := mb.ensurePerSubjectInfoLoaded(); err != nil { - return nil, false, err - } - fseq, isAll, subs := start, filter == _EMPTY_ || filter == fwcs, []string{filter} + if mb.cacheNotLoaded() { + if err := mb.loadMsgsWithLock(); err != nil { + return nil, false, err + } + if err := mb.ensurePerSubjectInfoLoaded(); err != nil { + return nil, false, err + } + } + // If we only have 1 subject currently and it matches our filter we can also set isAll. 
if !isAll && len(mb.fss) == 1 { _, isAll = mb.fss[filter] @@ -1630,12 +2133,6 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor return nil, false, ErrStoreMsgNotFound } - if mb.cacheNotLoaded() { - if err := mb.loadMsgsWithLock(); err != nil { - return nil, false, err - } - } - if sm == nil { sm = new(StoreMsg) } @@ -1647,10 +2144,10 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor continue } expireOk := seq == mb.last.seq && mb.llseq == seq + if isAll { + return fsm, expireOk, nil + } if doLinearScan { - if isAll { - return fsm, expireOk, nil - } if wc && subjectIsSubsetMatch(fsm.subj, filter) { return fsm, expireOk, nil } else if !wc && fsm.subj == filter { @@ -2268,11 +2765,6 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { if lmb := fs.lmb; lmb != nil { index = lmb.index + 1 - // Make sure to write out our index file if needed. - if lmb.indexNeedsUpdate() { - lmb.writeIndexInfo() - } - // Determine if we can reclaim any resources here. if fs.fip { lmb.mu.Lock() @@ -2291,6 +2783,13 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { mb.mu.Lock() mb.setupWriteCache(rbuf) mb.fss = make(map[string]*SimpleState) + + // Set cache time to creation time to start. + ts := time.Now().UnixNano() + mb.llts, mb.lwts = 0, ts + // Remember our last sequence number. + mb.first.seq = fs.state.LastSeq + 1 + mb.last.seq = fs.state.LastSeq mb.mu.Unlock() // Now do local hash. @@ -2310,17 +2809,6 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { } mb.mfd = mfd - mb.ifn = filepath.Join(mdir, fmt.Sprintf(indexScan, mb.index)) - ifd, err := os.OpenFile(mb.ifn, os.O_CREATE|os.O_RDWR, defaultFilePerms) - if err != nil { - mb.dirtyCloseWithRemove(true) - return nil, fmt.Errorf("Error creating msg index file [%q]: %v", mb.mfn, err) - } - mb.ifd = ifd - - // For subject based info. 
- mb.sfn = filepath.Join(mdir, fmt.Sprintf(fssScan, mb.index)) - // Check if encryption is enabled. if fs.prf != nil { if err := fs.genEncryptionKeysForBlock(mb); err != nil { @@ -2328,16 +2816,6 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { } } - // Set cache time to creation time to start. - ts := time.Now().UnixNano() - // Race detector wants these protected. - mb.mu.Lock() - mb.llts, mb.lwts = 0, ts - // Remember our last sequence number. - mb.first.seq = fs.state.LastSeq + 1 - mb.last.seq = fs.state.LastSeq - mb.mu.Unlock() - // If we know we will need this so go ahead and spin up. if !fs.fip { mb.spinUpFlushLoop() @@ -2346,6 +2824,10 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { // Add to our list of blocks and mark as last. fs.addMsgBlock(mb) + if fs.dirty > 0 { + fs.kickFlushStateLoop() + } + return mb, nil } @@ -2426,6 +2908,10 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts in return err } + // Mark dirty here since we added in a new message. + // We do not kick the flusher, that happens on new msg block for write or Stop(). + fs.dirty++ + // Adjust top level tracking of per subject msg counts. if len(subj) > 0 { index := fs.lmb.index @@ -2471,6 +2957,12 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts in } } } + } else if mb := fs.selectMsgBlock(fseq); mb != nil { + // If we are here we could not remove fseq from above, so rebuild. + var ld *LostStreamData + if ld, _, _ = mb.rebuildState(); ld != nil { + fs.rebuildStateLocked(ld) + } } } @@ -2547,10 +3039,6 @@ func (mb *msgBlock) skipMsg(seq uint64, now time.Time) { mb.last.ts = nowts mb.first.seq = seq + 1 mb.first.ts = nowts - // Take care of index if needed. 
- if nowts-mb.lwits > wiThresh { - mb.writeIndexInfoLocked() - } } else { needsRecord = true mb.dmap.Insert(seq) @@ -2593,15 +3081,12 @@ func (fs *fileStore) rebuildFirst() { return } - fmb.removeIndexFile() - ld, _ := fmb.rebuildState() + ld, _, _ := fmb.rebuildState() fmb.mu.RLock() isEmpty := fmb.msgs == 0 fmb.mu.RUnlock() if isEmpty { fs.removeMsgBlock(fmb) - } else { - fmb.writeIndexInfo() } fs.selectNextFirst() fs.rebuildStateLocked(ld) @@ -2708,9 +3193,7 @@ func (fs *fileStore) enforceMsgPerSubjectLimit() { // Clear any global subject state. fs.psim = make(map[string]*psi) for _, mb := range fs.blks { - mb.removeIndexFile() - ld, err := mb.rebuildState() - mb.writeIndexInfo() + ld, _, err := mb.rebuildState() if err != nil && ld != nil { fs.addLostData(ld) } @@ -2771,7 +3254,6 @@ func (fs *fileStore) enforceMsgPerSubjectLimit() { // Now write updated index for all affected msgBlks. for mb := range blks { - mb.writeIndexInfo() mb.tryForceExpireCacheLocked() } } @@ -2804,7 +3286,6 @@ func (fs *fileStore) removePerSubject(subj string) { if len(subj) == 0 { return } - // We do not update sense of fblk here but will do so when we resolve during lookup. if info, ok := fs.psim[subj]; ok { info.total-- @@ -2836,7 +3317,7 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( fsUnlock() return false, ErrStoreClosed } - if fs.sips > 0 { + if !viaLimits && fs.sips > 0 { fsUnlock() return false, ErrStoreSnapshotInProgress } @@ -2944,6 +3425,9 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( mb.bytes = 0 } + // Mark as dirty for stream state. + fs.dirty++ + // If we are tracking subjects here make sure we update that accounting. 
mb.ensurePerSubjectInfoLoaded() @@ -2960,9 +3444,6 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( fifo := seq == mb.first.seq isLastBlock := mb == fs.lmb isEmpty := mb.msgs == 0 - // If we are removing the message via limits we do not need to write the index file here. - // If viaLimits this means on a restart we will properly cleanup these messages regardless. - shouldWriteIndex := !isEmpty && !viaLimits if fifo { mb.selectNextFirst() @@ -2970,7 +3451,11 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( // Can update this one in place. if seq == fs.state.FirstSeq { fs.state.FirstSeq = mb.first.seq // new one. - fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() + if mb.first.ts == 0 { + fs.state.FirstTime = time.Time{} + } else { + fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() + } } } } else if !isEmpty { @@ -2990,70 +3475,40 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( } } - var firstSeqNeedsUpdate bool - - // Decide how we want to clean this up. If last block and the only block left we will hold into index. - if isEmpty { - if isLastBlock { - mb.closeAndKeepIndex(viaLimits) - // We do not need to writeIndex since just did above. - shouldWriteIndex = false - } else { - fs.removeMsgBlock(mb) - } - firstSeqNeedsUpdate = seq == fs.state.FirstSeq - } - - var qch, fch chan struct{} - if shouldWriteIndex { - qch, fch = mb.qch, mb.fch - } - cb := fs.scb - if secure { if ld, _ := mb.flushPendingMsgsLocked(); ld != nil { // We have the mb lock here, this needs the mb locks so do in its own go routine. go fs.rebuildState(ld) } } - // Check if we need to write the index file and we are flush in place (fip). - if shouldWriteIndex && fs.fip { - // Check if this is the first message, common during expirations etc. 
- threshold := wiThresh - if !fifo { - // For out-of-order deletes, we will have a shorter threshold, but - // still won't write the index for every single delete. - threshold = winfThresh - } - if time.Now().UnixNano()-mb.lwits > threshold { - mb.writeIndexInfoLocked() - } + + // If empty remove this block and check if we need to update first sequence. + // We will write a tombstone at the end. + var firstSeqNeedsUpdate bool + if isEmpty { + fs.removeMsgBlock(mb) + firstSeqNeedsUpdate = seq == fs.state.FirstSeq } mb.mu.Unlock() - // Kick outside of lock. - if !fs.fip && shouldWriteIndex { - if qch == nil { - mb.spinUpFlushLoop() - } - select { - case fch <- struct{}{}: - default: - } - } - - // If we emptied the current message block and the seq was state.First.Seq + // If we emptied the current message block and the seq was state.FirstSeq // then we need to jump message blocks. We will also write the index so // we don't lose track of the first sequence. if firstSeqNeedsUpdate { fs.selectNextFirst() - // Write out the new first message block if we have one. - // We can ignore if we really have not changed message blocks from above. - if len(fs.blks) > 0 && fs.blks[0] != mb { - fmb := fs.blks[0] - fmb.writeIndexInfo() - } } + + // Check if we need to write a deleted record tombstone. + // This is for user initiated removes or to hold the first seq + // when the last block is empty. + if !viaLimits || (isEmpty && isLastBlock) { + if lmb := fs.lmb; sm != nil && lmb != nil { + lmb.writeTombstone(sm.seq, sm.ts) + } + fs.kickFlushStateLoop() + } + + cb := fs.scb fs.mu.Unlock() // Storage updates. @@ -3116,10 +3571,12 @@ func (mb *msgBlock) compact() { } // Only need to process non-deleted messages. seq := le.Uint64(hdr[4:]) + if !isDeleted(seq) { // Normal message here. nbuf = append(nbuf, buf[index:index+rl]...) - if !firstSet { + // Do not set based on tombstone. 
+ if !firstSet && seq&tbit == 0 { firstSet = true mb.first.seq = seq } @@ -3167,7 +3624,6 @@ func (mb *msgBlock) compact() { } // Remove index file and wipe delete map, then rebuild. - mb.removeIndexFileLocked() mb.deleteDmap() mb.rebuildStateLocked() @@ -3269,22 +3725,6 @@ func (mb *msgBlock) flushLoop(fch, qch chan struct{}) { mb.setInFlusher() defer mb.clearInFlusher() - // Will use to test if we have meta data updates. - var firstSeq, lastSeq uint64 - var dmapLen int - - infoChanged := func() bool { - mb.mu.RLock() - defer mb.mu.RUnlock() - var changed bool - if firstSeq != mb.first.seq || lastSeq != mb.last.seq || dmapLen != mb.dmap.Size() { - changed = true - firstSeq, lastSeq = mb.first.seq, mb.last.seq - dmapLen = mb.dmap.Size() - } - return changed - } - for { select { case <-fch: @@ -3319,9 +3759,6 @@ func (mb *msgBlock) flushLoop(fch, qch chan struct{}) { } } } - if infoChanged() { - mb.writeIndexInfo() - } case <-qch: return } @@ -3340,7 +3777,7 @@ func (mb *msgBlock) eraseMsg(seq uint64, ri, rl int) error { // Randomize record data := make([]byte, rl-emptyRecordLen) - crand.Read(data) + rand.Read(data) // Now write to underlying buffer. var b bytes.Buffer @@ -3502,8 +3939,6 @@ func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) { mb.mu.Unlock() - // Write our index file. - mb.writeIndexInfo() // Load msgs again. mb.loadMsgs() @@ -3708,11 +4143,7 @@ func (mb *msgBlock) expireCacheLocked() { // Check if we can clear out our fss and idx unless under force expire. // We used to hold onto the idx longer but removes need buf now so no point. 
- mb.writePerSubjectInfo() mb.fss = nil - if mb.indexNeedsUpdateLocked() { - mb.writeIndexInfoLocked() - } mb.clearCache() } @@ -3793,9 +4224,6 @@ func (fs *fileStore) checkAndFlushAllBlocks() { fs.rebuildStateLocked(ld) } } - if mb.indexNeedsUpdate() { - mb.writeIndexInfo() - } } } @@ -3810,7 +4238,8 @@ func (fs *fileStore) checkMsgs() *LostStreamData { fs.psim = make(map[string]*psi) for _, mb := range fs.blks { - if ld, err := mb.rebuildState(); err != nil && ld != nil { + // FIXME(dlc) - check tombstones here too? + if ld, _, err := mb.rebuildState(); err != nil && ld != nil { // Rebuild fs state too. mb.fs.rebuildStateLocked(ld) } @@ -3842,16 +4271,18 @@ func (mb *msgBlock) enableForWriting(fip bool) error { return nil } +// Helper function to place a delete tombstone. +// Lock should be held. +func (mb *msgBlock) writeTombstone(seq uint64, ts int64) error { + return mb.writeMsgRecord(emptyRecordLen, seq|tbit, _EMPTY_, nil, nil, ts, true) +} + // Will write the message record to the underlying message block. // filestore lock will be held. func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush bool) error { mb.mu.Lock() defer mb.mu.Unlock() - // Make sure we have a cache setup. - if mb.cache == nil { - mb.setupWriteCache(nil) - } // Enable for writing if our mfd is not open. if mb.mfd == nil { if err := mb.enableForWriting(flush); err != nil { @@ -3859,6 +4290,11 @@ func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte } } + // Make sure we have a cache setup. + if mb.cache == nil { + mb.setupWriteCache(nil) + } + // Check if we are tracking per subject for our simple state. // Do this before changing the cache that would trigger a flush pending msgs call // if we needed to regenerate the per subject info. 
@@ -3872,7 +4308,6 @@ func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte } else { mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} } - mb.fssNeedsWrite = true } // Indexing @@ -3926,20 +4361,23 @@ func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte // Update write through cache. // Write to msg record. mb.cache.buf = append(mb.cache.buf, checksum...) - // Write index - mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit) mb.cache.lrl = uint32(rl) - if mb.cache.fseq == 0 { - mb.cache.fseq = seq - } // Set cache timestamp for last store. mb.lwts = ts - // Decide if we write index info if flushing in place. - writeIndex := ts-mb.lwits > wiThresh - // Accounting - mb.updateAccounting(seq, ts, rl) + // Only update index and do accounting if not a delete tombstone. + if seq&tbit == 0 { + // Strip ebit if set. + seq = seq &^ ebit + if mb.cache.fseq == 0 { + mb.cache.fseq = seq + } + // Write index + mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit) + // Accounting + mb.updateAccounting(seq, ts, rl) + } fch, werr := mb.fch, mb.werr @@ -3953,11 +4391,6 @@ func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte if err != nil { return err } - if writeIndex { - // If this fails still proceed on since the write above succeeded. - // We can recover this condition. - mb.writeIndexInfoLocked() - } } else { // Kick the flusher here. kickFlusher(fch) @@ -4002,10 +4435,6 @@ func (mb *msgBlock) closeFDsLockedNoCheck() { mb.mfd.Close() mb.mfd = nil } - if mb.ifd != nil { - mb.ifd.Close() - mb.ifd = nil - } } // bytesPending returns the buffer to be used for writing to the underlying file. 
@@ -4044,7 +4473,7 @@ func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) { seq = seq &^ ebit } - if mb.first.seq == 0 || mb.first.ts == 0 { + if (mb.first.seq == 0 || mb.first.ts == 0) && seq >= mb.first.seq { mb.first.seq = seq mb.first.ts = ts } @@ -4052,7 +4481,6 @@ func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) { atomic.StoreUint64(&mb.last.seq, seq) mb.last.ts = ts mb.rbytes += rl - // Only update this accounting if message is not a deleted message. if !isDeleted { mb.bytes += rl mb.msgs++ @@ -4070,6 +4498,10 @@ func (fs *fileStore) writeMsgRecord(seq uint64, ts int64, subj string, hdr, msg } // Grab our current last message block. mb := fs.lmb + + // Mark as dirty for stream state. + fs.dirty++ + if mb == nil || mb.msgs > 0 && mb.blkSize()+rl > fs.fcfg.BlockSize { if mb != nil && fs.fcfg.Compression != NoCompression { // We've now reached the end of this message block, if we want @@ -4253,21 +4685,14 @@ func (fs *fileStore) syncBlocks() { if mb.pendingWriteSize() > 0 { mb.flushPendingMsgs() } - if mb.indexNeedsUpdate() { - mb.writeIndexInfo() - } // Do actual sync. Hold lock for consistency. mb.mu.Lock() if !mb.closed { if mb.mfd != nil { mb.mfd.Sync() } - if mb.ifd != nil { - mb.ifd.Truncate(mb.liwsz) - mb.ifd.Sync() - } // See if we can close FDs due to being idle. - if mb.ifd != nil || mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle { + if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle { mb.dirtyCloseWithRemove(false) } } @@ -4365,27 +4790,38 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error { buf = append(mb.cache.buf, buf...) } - lbuf := uint32(len(buf)) + // Create FSS if we should track. 
+ if !mb.noTrack { + mb.fss = make(map[string]*SimpleState) + } + lbuf := uint32(len(buf)) for index < lbuf { if index+msgHdrSize > lbuf { return errCorruptState } hdr := buf[index : index+msgHdrSize] - rl, seq, slen := le.Uint32(hdr[0:]), le.Uint64(hdr[4:]), le.Uint16(hdr[20:]) + rl, seq, slen := le.Uint32(hdr[0:]), le.Uint64(hdr[4:]), int(le.Uint16(hdr[20:])) // Clear any headers bit that could be set. rl &^= hbit dlen := int(rl) - msgHdrSize // Do some quick sanity checks here. - if dlen < 0 || int(slen) > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh { + if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh { // This means something is off. // TODO(dlc) - Add into bad list? return errCorruptState } - // Clear erase bit. + // Check for tombstones which we can skip in terms of indexing. + if seq&tbit != 0 { + index += rl + continue + } + + // Clear any erase bits. + erased := seq&ebit != 0 seq = seq &^ ebit // We defer checksum checks to individual msg cache lookups to amortorize costs and @@ -4400,16 +4836,29 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error { } } pseq = seq - + // Add to our index. idx = append(idx, index) mb.cache.lrl = uint32(rl) // Adjust if we guessed wrong. if seq != 0 && seq < fseq { fseq = seq } + + // Handle FSS inline here. 
+ if slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) { + bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)] + if ss := mb.fss[string(bsubj)]; ss != nil { + ss.Msgs++ + ss.Last = seq + } else { + subj := mb.subjString(bsubj) + mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} + } + } } index += rl } + mb.cache.buf = buf mb.cache.idx = idx mb.cache.fseq = fseq @@ -4490,12 +4939,10 @@ func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) { for lbb := lob; lbb > 0; lbb = len(buf) { n, err := mb.writeAt(buf, woff) if err != nil { - mb.removePerSubjectInfoLocked() - mb.removeIndexFileLocked() mb.dirtyCloseWithRemove(false) - fsLostData, _ := mb.rebuildStateLocked() + ld, _, _ := mb.rebuildStateLocked() mb.werr = err - return fsLostData, err + return ld, err } // Update our write offset. woff += int64(n) @@ -4613,11 +5060,22 @@ func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) { } n, err := io.ReadFull(f, buf) + // On success capture raw bytes size. + if err == nil { + mb.rbytes = uint64(n) + } return buf[:n], err } // Lock should be held. func (mb *msgBlock) loadMsgsWithLock() error { + // Check for encryption, we do not load keys on startup anymore so might need to load them here. + if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) { + if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil { + return err + } + } + // Check to see if we are loading already. if mb.loading { return nil @@ -4685,7 +5143,7 @@ checkCache: if err := mb.indexCacheBuf(buf); err != nil { if err == errCorruptState { var ld *LostStreamData - if ld, err = mb.rebuildStateLocked(); ld != nil { + if ld, _, err = mb.rebuildStateLocked(); ld != nil { // We do not know if fs is locked or not at this point. // This should be an exceptional condition so do so in Go routine. 
go mb.fs.rebuildState(ld) @@ -4732,26 +5190,27 @@ var ( errNoPending = errors.New("message block does not have pending data") errNotReadable = errors.New("storage directory not readable") errCorruptState = errors.New("corrupt state file") + errPriorState = errors.New("prior state file") errPendingData = errors.New("pending data still present") errNoEncryption = errors.New("encryption not enabled") errBadKeySize = errors.New("encryption bad key size") errNoMsgBlk = errors.New("no message block") - errMsgBlkClosed = errors.New("message block is closed") errMsgBlkTooBig = errors.New("message block size exceeded int capacity") errUnknownCipher = errors.New("unknown cipher") - errDIOStalled = errors.New("IO is stalled") errNoMainKey = errors.New("encrypted store encountered with no main key") ) -// Used for marking messages that have had their checksums checked. -// Used to signal a message record with headers. -const hbit = 1 << 31 - -// Used for marking erased messages sequences. -const ebit = 1 << 63 - -// Used to mark a bad index as deleted. -const dbit = 1 << 30 +const ( + // Used for marking messages that have had their checksums checked. + // Used to signal a message record with headers. + hbit = 1 << 31 + // Used for marking erased messages sequences. + ebit = 1 << 63 + // Used for marking tombstone sequences. + tbit = 1 << 62 + // Used to mark a bad index as deleted. + dbit = 1 << 30 +) // Will do a lookup from cache. // Lock should be held. @@ -4806,7 +5265,7 @@ func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) { return nil, err } - // Deleted messages that are decoded return a 0 for seqeunce. + // Deleted messages that are decoded return a 0 for sequence. if fsm.seq == 0 { return nil, errDeletedMsg } @@ -4974,19 +5433,19 @@ func subjFromBytes(b []byte) string { // Given the `key` byte slice, this function will return the subject // as an interned string of `key` or a configured subject as to minimize memory allocations. 
// Lock should be held. -func (mb *msgBlock) subjString(skey []byte) string { - if len(skey) == 0 { +func (fs *fileStore) subjString(skey []byte) string { + if fs == nil || len(skey) == 0 { return _EMPTY_ } - if lsubjs := len(mb.fs.cfg.Subjects); lsubjs > 0 { + if lsubjs := len(fs.cfg.Subjects); lsubjs > 0 { if lsubjs == 1 { // The cast for the comparison does not make a copy - if string(skey) == mb.fs.cfg.Subjects[0] { - return mb.fs.cfg.Subjects[0] + if string(skey) == fs.cfg.Subjects[0] { + return fs.cfg.Subjects[0] } } else { - for _, subj := range mb.fs.cfg.Subjects { + for _, subj := range fs.cfg.Subjects { if string(skey) == subj { return subj } @@ -4996,6 +5455,13 @@ func (mb *msgBlock) subjString(skey []byte) string { return subjFromBytes(skey) } +// Given the `key` byte slice, this function will return the subject +// as an interned string of `key` or a configured subject as to minimize memory allocations. +// Lock should be held. +func (mb *msgBlock) subjString(skey []byte) string { + return mb.fs.subjString(skey) +} + // LoadMsg will lookup the message by sequence number and return it if found. func (fs *fileStore) LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) { return fs.msgForSeq(seq, sm) @@ -5223,104 +5689,6 @@ func (mb *msgBlock) sinceLastWriteActivity() time.Duration { return time.Since(time.Unix(0, last).UTC()) } -// Determine if we need to write out this index info. -func (mb *msgBlock) indexNeedsUpdate() bool { - mb.mu.RLock() - defer mb.mu.RUnlock() - return mb.indexNeedsUpdateLocked() -} - -// Determine if we need to write out this index info. -// Lock should be held. -func (mb *msgBlock) indexNeedsUpdateLocked() bool { - return mb.lwits < mb.lwts || mb.lwits < mb.lrts -} - -// Write index info to the appropriate file. -// Filestore lock should be held. -func (mb *msgBlock) writeIndexInfo() error { - mb.mu.Lock() - defer mb.mu.Unlock() - return mb.writeIndexInfoLocked() -} - -// Write index info to the appropriate file. 
-// Filestore lock and mb lock should be held. -func (mb *msgBlock) writeIndexInfoLocked() error { - if mb.closed { - return errMsgBlkClosed - } - - // HEADER: magic version msgs bytes fseq fts lseq lts ndel checksum - // Make large enough to hold almost all possible maximum interior delete scenarios. - var hdr [42 * 1024]byte - - // Write header - hdr[0] = magic - hdr[1] = newVersion - - n := hdrLen - n += binary.PutUvarint(hdr[n:], mb.msgs) - n += binary.PutUvarint(hdr[n:], mb.bytes) - n += binary.PutUvarint(hdr[n:], mb.first.seq) - n += binary.PutVarint(hdr[n:], mb.first.ts) - n += binary.PutUvarint(hdr[n:], mb.last.seq) - n += binary.PutVarint(hdr[n:], mb.last.ts) - n += binary.PutUvarint(hdr[n:], uint64(mb.dmap.Size())) - buf := append(hdr[:n], mb.lchk[:]...) - - // Append a delete map if needed - if !mb.dmap.IsEmpty() { - // Always attempt to tack it onto end. - dmap, err := mb.dmap.Encode(hdr[len(buf):]) - if err != nil { - return err - } - if len(dmap) < cap(hdr)-len(buf) { - buf = hdr[:len(buf)+len(dmap)] - } else { - buf = append(buf, dmap...) - } - } - - // Open our FD if needed. - if mb.ifd == nil { - ifd, err := os.OpenFile(mb.ifn, os.O_CREATE|os.O_RDWR, defaultFilePerms) - if err != nil { - return err - } - if fi, _ := ifd.Stat(); fi != nil { - mb.liwsz = fi.Size() - } - mb.ifd = ifd - } - - // Encrypt if needed. - if mb.aek != nil { - buf = mb.aek.Seal(buf[:0], mb.nonce, buf, nil) - } - - // Check if this will be a short write, and if so truncate before writing here. - // We only really need to truncate if we are encryptyed or we have dmap entries. - // If no dmap entries readIndexInfo does the right thing in the presence of extra data left over. 
- if int64(len(buf)) < mb.liwsz && (mb.aek != nil || !mb.dmap.IsEmpty()) { - if err := mb.ifd.Truncate(0); err != nil { - mb.werr = err - return err - } - } - - var err error - if n, err = mb.ifd.WriteAt(buf, 0); err == nil { - mb.lwits = time.Now().UnixNano() - mb.liwsz = int64(n) - mb.werr = nil - } else { - mb.werr = err - } - return err -} - func checkNewHeader(hdr []byte) error { if hdr == nil || len(hdr) < 2 || hdr[0] != magic || (hdr[1] != version && hdr[1] != newVersion) { @@ -5331,7 +5699,8 @@ func checkNewHeader(hdr []byte) error { // readIndexInfo will read in the index information for the message block. func (mb *msgBlock) readIndexInfo() error { - buf, err := os.ReadFile(mb.ifn) + ifn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, mb.index)) + buf, err := os.ReadFile(ifn) if err != nil { return err } @@ -5350,7 +5719,7 @@ func (mb *msgBlock) readIndexInfo() error { } if err := checkNewHeader(buf); err != nil { - defer os.Remove(mb.ifn) + defer os.Remove(ifn) return fmt.Errorf("bad index file") } @@ -5392,13 +5761,13 @@ func (mb *msgBlock) readIndexInfo() error { // Check if this is a short write index file. if bi < 0 || bi+checksumSize > len(buf) { - os.Remove(mb.ifn) + os.Remove(ifn) return fmt.Errorf("short index file") } // Check for consistency if accounting. If something is off bail and we will rebuild. if mb.msgs != (mb.last.seq-mb.first.seq+1)-dmapLen { - os.Remove(mb.ifn) + os.Remove(ifn) return fmt.Errorf("accounting inconsistent") } @@ -5430,17 +5799,6 @@ func (mb *msgBlock) readIndexInfo() error { return nil } -func syncAndClose(mfd, ifd *os.File) { - if mfd != nil { - mfd.Sync() - mfd.Close() - } - if ifd != nil { - ifd.Sync() - ifd.Close() - } -} - // Will return total number of cache loads. 
func (fs *fileStore) cacheLoads() uint64 { var tl uint64 @@ -5604,8 +5962,6 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint mb.tryForceExpireCacheLocked() } mb.mu.Unlock() - // Update our index info on disk. - mb.writeIndexInfo() // Check if we should break out of top level too. if maxp > 0 && purged >= maxp { @@ -5616,9 +5972,12 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint fs.selectNextFirst() } + fs.dirty++ cb := fs.scb fs.mu.Unlock() + fs.kickFlushStateLoop() + if cb != nil { cb(-int64(purged), -int64(bytes), 0, _EMPTY_) } @@ -5655,6 +6014,10 @@ func (fs *fileStore) purge(fseq uint64) (uint64, error) { fs.blks = nil fs.lmb = nil fs.bim = make(map[uint32]*msgBlock) + // Clear any per subject tracking. + fs.psim = make(map[string]*psi) + // Mark dirty + fs.dirty++ // Move the msgs directory out of the way, will delete out of band. // FIXME(dlc) - These can error and we need to change api above to propagate? @@ -5666,7 +6029,9 @@ func (fs *fileStore) purge(fseq uint64) (uint64, error) { os.RemoveAll(pdir) } os.Rename(mdir, pdir) + go os.RemoveAll(pdir) + // Create new one. os.MkdirAll(mdir, defaultDirPerms) @@ -5681,14 +6046,17 @@ func (fs *fileStore) purge(fseq uint64) (uint64, error) { fs.state.FirstSeq = fseq fs.state.LastSeq = fseq - 1 } - fs.lmb.first.seq = fs.state.FirstSeq - fs.lmb.last.seq = fs.state.LastSeq - fs.lmb.last.ts = fs.state.LastTime.UnixNano() - fs.lmb.writeIndexInfo() + lmb := fs.lmb + lmb.first.seq = fs.state.FirstSeq + lmb.last.seq = fs.state.LastSeq + lmb.last.ts = fs.state.LastTime.UnixNano() - // Clear any per subject tracking. - fs.psim = make(map[string]*psi) + if fs.lmb.last.seq > 1 { + // Leave a tombstone so we can remember our starting sequence in case + // full state becomes corrupted. 
+ lmb.writeTombstone(fs.lmb.last.seq, fs.lmb.last.ts) + } cb := fs.scb fs.mu.Unlock() @@ -5749,7 +6117,6 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) { smb.mu.Lock() if smb.first.seq == seq { - isEmpty = smb.msgs == 0 goto SKIP } @@ -5832,18 +6199,12 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) { } // Make sure to remove fss state. smb.fss = nil - smb.removePerSubjectInfoLocked() smb.clearCacheAndOffset() smb.rbytes = uint64(len(nbuf)) } } SKIP: - if !isEmpty { - // Make sure to write out our index info. - smb.writeIndexInfoLocked() - } - smb.mu.Unlock() if deleted > 0 { @@ -5873,6 +6234,9 @@ SKIP: } fs.state.Bytes -= bytes + fs.dirty++ + fs.kickFlushStateLoop() + cb := fs.scb fs.mu.Unlock() @@ -6004,6 +6368,9 @@ func (fs *fileStore) Truncate(seq uint64) error { // Reset our subject lookup info. fs.resetGlobalPerSubjectInfo() + fs.dirty++ + fs.kickFlushStateLoop() + cb := fs.scb fs.mu.Unlock() @@ -6028,29 +6395,6 @@ func (fs *fileStore) numMsgBlocks() int { return len(fs.blks) } -// Will remove our index file. -func (mb *msgBlock) removeIndexFile() { - mb.mu.RLock() - defer mb.mu.RUnlock() - mb.removeIndexFileLocked() -} - -func (mb *msgBlock) removeIndexFileLocked() { - if mb.ifd != nil { - mb.ifd.Close() - mb.ifd = nil - } - if mb.ifn != _EMPTY_ { - os.Remove(mb.ifn) - } -} - -func (mb *msgBlock) removePerSubjectInfoLocked() { - if mb.sfn != _EMPTY_ { - os.Remove(mb.sfn) - } -} - // Will add a new msgBlock. // Lock should be held. func (fs *fileStore) addMsgBlock(mb *msgBlock) { @@ -6082,37 +6426,14 @@ func (fs *fileStore) removeMsgBlock(mb *msgBlock) { fs.removeMsgBlockFromList(mb) // Check for us being last message block if mb == fs.lmb { + last := mb.last // Creating a new message write block requires that the lmb lock is not held. mb.mu.Unlock() - fs.newMsgBlockForWrite() - mb.mu.Lock() - } -} - -// When we have an empty block but want to keep the index for timestamp info etc. -// Lock should be held. 
-func (mb *msgBlock) closeAndKeepIndex(viaLimits bool) { - // We will leave a 0 length blk marker. - if mb.mfd != nil { - mb.mfd.Truncate(0) - } else { - // We were closed, so just write out an empty file. - os.WriteFile(mb.mfn, nil, defaultFilePerms) - } - // Make sure to write the index file so we can remember last seq and ts. - mb.writeIndexInfoLocked() - // Close - mb.dirtyCloseWithRemove(false) - - // Make sure to remove fss state. - mb.fss = nil - mb.removePerSubjectInfoLocked() - - // If we are encrypted we should reset our bek counter. - if mb.bek != nil { - if bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce); err == nil { - mb.bek = bek + // Write the tombstone to remember since this was last block. + if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil { + lmb.writeTombstone(last.seq, last.ts) } + mb.mu.Lock() } } @@ -6134,13 +6455,8 @@ func (mb *msgBlock) dirtyCloseWithRemove(remove bool) { mb.ctmr.Stop() mb.ctmr = nil } - // Check if we are tracking by subject. - if mb.fss != nil { - if !remove { - mb.writePerSubjectInfo() - } - mb.fss = nil - } + // Clear any tracking by subject. + mb.fss = nil // Close cache mb.clearCacheAndOffset() // Quit our loops. @@ -6152,26 +6468,16 @@ func (mb *msgBlock) dirtyCloseWithRemove(remove bool) { mb.mfd.Close() mb.mfd = nil } - if mb.ifd != nil { - mb.ifd.Close() - mb.ifd = nil - } if remove { - if mb.ifn != _EMPTY_ { - os.Remove(mb.ifn) - mb.ifn = _EMPTY_ - } if mb.mfn != _EMPTY_ { os.Remove(mb.mfn) mb.mfn = _EMPTY_ } - if mb.sfn != _EMPTY_ { - os.Remove(mb.sfn) - mb.sfn = _EMPTY_ - } if mb.kfn != _EMPTY_ { os.Remove(mb.kfn) } + // Since we are removing a block kick the state flusher. 
+ mb.fs.kickFlushStateLoop() } } @@ -6186,7 +6492,6 @@ func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) { if ss.Msgs == 1 { delete(mb.fss, subj) - mb.fssNeedsWrite = true // Mark dirty return } @@ -6200,7 +6505,6 @@ ss.First = ss.Last } ss.firstNeedsUpdate = false - mb.fssNeedsWrite = true // Mark dirty return } @@ -6248,7 +6552,6 @@ func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *Si continue } ss.First = seq - mb.fssNeedsWrite = true // Mark dirty return } } @@ -6266,17 +6569,12 @@ func (fs *fileStore) resetGlobalPerSubjectInfo() { // Lock should be held. func (mb *msgBlock) resetPerSubjectInfo() error { mb.fss = nil - mb.removePerSubjectInfoLocked() - return mb.generatePerSubjectInfo(true) + return mb.generatePerSubjectInfo() } // generatePerSubjectInfo will generate the per subject info via the raw msg block. -func (mb *msgBlock) generatePerSubjectInfo(hasLock bool) error { - if !hasLock { - mb.mu.Lock() - defer mb.mu.Unlock() - } - +// Lock should be held. +func (mb *msgBlock) generatePerSubjectInfo() error { // Check if this mb is empty. This can happen when its the last one and we are holding onto it for seq and timestamp info. if mb.msgs == 0 { return nil @@ -6286,6 +6584,10 @@ if err := mb.loadMsgsWithLock(); err != nil { return err } + // indexCacheBuf can produce fss now, so if non-nil we are good. + if mb.fss != nil { + return nil + } } // Create new one regardless. 
@@ -6312,7 +6614,6 @@ func (mb *msgBlock) generatePerSubjectInfo(hasLock bool) error { } else { mb.fss[sm.subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} } - mb.fssNeedsWrite = true } } @@ -6324,38 +6625,6 @@ func (mb *msgBlock) generatePerSubjectInfo(hasLock bool) error { return nil } -func (mb *msgBlock) loadPerSubjectInfo() ([]byte, error) { - const ( - fileHashIndex = 16 - mbHashIndex = 8 - minFileSize = 24 - ) - - buf, err := os.ReadFile(mb.sfn) - if err != nil { - return nil, err - } - - if len(buf) < minFileSize || checkHeader(buf) != nil { - return nil, errors.New("short fss state") - } - - // Check that we did not have any bit flips. - mb.hh.Reset() - mb.hh.Write(buf[0 : len(buf)-fileHashIndex]) - fhash := buf[len(buf)-fileHashIndex : len(buf)-mbHashIndex] - if checksum := mb.hh.Sum(nil); !bytes.Equal(checksum, fhash) { - return nil, errors.New("corrupt fss state") - } - - // Make sure it matches the last update recorded. - if !bytes.Equal(buf[len(buf)-mbHashIndex:], mb.lchk[:]) { - return nil, errors.New("outdated fss state") - } - - return buf, nil -} - // Helper to make sure fss loaded if we are tracking. // Lock should be held func (mb *msgBlock) ensurePerSubjectInfoLoaded() error { @@ -6366,8 +6635,7 @@ func (mb *msgBlock) ensurePerSubjectInfoLoaded() error { mb.fss = make(map[string]*SimpleState) return nil } - // Load from file. - return mb.readPerSubjectInfo(true) + return mb.generatePerSubjectInfo() } // Called on recovery to populate the global psim state. @@ -6376,23 +6644,10 @@ func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) { mb.mu.Lock() defer mb.mu.Unlock() - if err := mb.readPerSubjectInfo(true); err != nil { + if err := mb.ensurePerSubjectInfoLoaded(); err != nil { return } - // Quick sanity check. - // TODO(dlc) - This is here to auto-clear a bug. - fssMsgs := uint64(0) - for subj, ss := range mb.fss { - if len(subj) > 0 { - fssMsgs += ss.Msgs - } - } - // If we are off rebuild. 
- if fssMsgs != mb.msgs { - mb.generatePerSubjectInfo(true) - } - // Now populate psim. for subj, ss := range mb.fss { if len(subj) > 0 { @@ -6408,113 +6663,6 @@ func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) { } } -// readPerSubjectInfo will attempt to restore the per subject information. -func (mb *msgBlock) readPerSubjectInfo(hasLock bool) error { - if mb.noTrack { - return nil - } - - buf, err := mb.loadPerSubjectInfo() - // On failure re-generate. - if err != nil { - return mb.generatePerSubjectInfo(hasLock) - } - - bi := hdrLen - readU64 := func() uint64 { - if bi < 0 { - return 0 - } - num, n := binary.Uvarint(buf[bi:]) - if n <= 0 { - bi = -1 - return 0 - } - bi += n - return num - } - - numEntries := readU64() - fss := make(map[string]*SimpleState, numEntries) - - if !hasLock { - mb.mu.Lock() - } - for i := uint64(0); i < numEntries; i++ { - lsubj := readU64() - // Make a copy or use a configured subject (to avoid mem allocation) - subj := mb.subjString(buf[bi : bi+int(lsubj)]) - bi += int(lsubj) - msgs, first, last := readU64(), readU64(), readU64() - fss[subj] = &SimpleState{Msgs: msgs, First: first, Last: last} - } - mb.fss = fss - mb.fssNeedsWrite = false - - // Make sure we run the cache expire timer. - if len(mb.fss) > 0 { - mb.llts = time.Now().UnixNano() - mb.startCacheExpireTimer() - } - - if !hasLock { - mb.mu.Unlock() - } - - return nil -} - -// writePerSubjectInfo will write out per subject information if we are tracking per subject. -// Lock should be held. -func (mb *msgBlock) writePerSubjectInfo() error { - // Raft groups do not have any subjects. 
- if len(mb.fss) == 0 || len(mb.sfn) == 0 || !mb.fssNeedsWrite { - return nil - } - var scratch [4 * binary.MaxVarintLen64]byte - var b bytes.Buffer - b.WriteByte(magic) - b.WriteByte(version) - n := binary.PutUvarint(scratch[0:], uint64(len(mb.fss))) - b.Write(scratch[0:n]) - for subj, ss := range mb.fss { - if ss.firstNeedsUpdate { - mb.recalculateFirstForSubj(subj, ss.First, ss) - } - n := binary.PutUvarint(scratch[0:], uint64(len(subj))) - b.Write(scratch[0:n]) - b.WriteString(subj) - // Encode all three parts of our simple state into same scratch buffer. - n = binary.PutUvarint(scratch[0:], ss.Msgs) - n += binary.PutUvarint(scratch[n:], ss.First) - n += binary.PutUvarint(scratch[n:], ss.Last) - b.Write(scratch[0:n]) - } - // Calculate hash for this information. - mb.hh.Reset() - mb.hh.Write(b.Bytes()) - b.Write(mb.hh.Sum(nil)) - // Now copy over checksum from the block itself, this allows us to know if we are in sync. - b.Write(mb.lchk[:]) - - // Gate this for when we have a large number of blocks expiring at the same time. - // Since we have the lock we would rather fail here then block. - // This is an optional structure that can be rebuilt on restart. - var err error - select { - case <-dios: - if err = os.WriteFile(mb.sfn, b.Bytes(), defaultFilePerms); err == nil { - // Clear write flag if no error. - mb.fssNeedsWrite = false - } - dios <- struct{}{} - default: - err = errDIOStalled - } - - return err -} - // Close the message block. func (mb *msgBlock) close(sync bool) { if mb == nil { @@ -6533,12 +6681,7 @@ func (mb *msgBlock) close(sync bool) { mb.ctmr = nil } - // Check if we are tracking by subject. 
- if len(mb.fss) > 0 && mb.fssNeedsWrite { - mb.writePerSubjectInfo() - } mb.fss = nil - mb.fssNeedsWrite = false // Close cache mb.clearCacheAndOffset() @@ -6547,18 +6690,13 @@ close(mb.qch) mb.qch = nil } - if sync { - syncAndClose(mb.mfd, mb.ifd) - } else { - if mb.mfd != nil { - mb.mfd.Close() - } - if mb.ifd != nil { - mb.ifd.Close() + if mb.mfd != nil { + if sync { + mb.mfd.Sync() } + mb.mfd.Close() } mb.mfd = nil - mb.ifd = nil // Mark as closed. mb.closed = true } @@ -6620,14 +6758,178 @@ func (fs *fileStore) cancelSyncTimer() { } } +const ( + fullStateMagic = uint8(11) + fullStateVersion = uint8(1) +) + +// This goroutine runs and receives kicks to write out our full stream state index. +// This will get kicked when we create a new block or when we delete a block in general. +// This is also called during Stop(). +func (fs *fileStore) flushStreamStateLoop(fch, qch, done chan struct{}) { + for { + select { + case <-fch: + fs.writeFullState() + case <-qch: + close(done) + return + } + } +} + +// Kick the flusher. +func (fs *fileStore) kickFlushStateLoop() { + kickFlusher(fs.fch) +} + +// Helper since unixnano of zero time undefined. +func timestampNormalized(t time.Time) int64 { + if t.IsZero() { + return 0 + } + return t.UnixNano() +} + +// This will write the full binary state for the stream. +// This plus everything new since last hash will be the total recovered state. +// This state dump will have the following. +// 1. Stream summary - Msgs, Bytes, First and Last (Sequence and Timestamp) +// 2. PSIM - Per Subject Index Map - Tracks first and last blocks with subjects present. +// 3. MBs - Index, Bytes, First and Last Sequence and Timestamps, and the deleted map (avl.seqset). +// 4. Last block index and hash of record inclusive to this stream state. 
+func (fs *fileStore) writeFullState() error { + fs.mu.Lock() + + if fs.closed || fs.dirty == 0 { + fs.mu.Unlock() + return nil + } + + var _buf [32 * 1024]byte + _buf[0], _buf[1] = fullStateMagic, fullStateVersion + buf := _buf[:hdrLen] + + buf = binary.AppendUvarint(buf, fs.state.Msgs) + buf = binary.AppendUvarint(buf, fs.state.Bytes) + buf = binary.AppendUvarint(buf, fs.state.FirstSeq) + buf = binary.AppendVarint(buf, timestampNormalized(fs.state.FirstTime)) + buf = binary.AppendUvarint(buf, fs.state.LastSeq) + buf = binary.AppendVarint(buf, timestampNormalized(fs.state.LastTime)) + + // Do per subject information map if applicable. + numSubjects := len(fs.psim) + buf = binary.AppendUvarint(buf, uint64(numSubjects)) + + if numSubjects > 0 { + for subj, psi := range fs.psim { + buf = binary.AppendUvarint(buf, uint64(len(subj))) + buf = append(buf, subj...) + buf = binary.AppendUvarint(buf, psi.total) + buf = binary.AppendUvarint(buf, uint64(psi.fblk)) + if psi.total > 1 { + buf = binary.AppendUvarint(buf, uint64(psi.lblk)) + } + } + } + + // Now walk all blocks and write out first and last and optional dmap encoding. + var lbi uint32 + var lchk [8]byte + + nb := len(fs.blks) + buf = binary.AppendUvarint(buf, uint64(nb)) + + // Use basetime to save some space. + baseTime := timestampNormalized(fs.state.FirstTime) + + for _, mb := range fs.blks { + mb.mu.RLock() + buf = binary.AppendUvarint(buf, uint64(mb.index)) + buf = binary.AppendUvarint(buf, mb.bytes) + buf = binary.AppendUvarint(buf, mb.first.seq) + buf = binary.AppendVarint(buf, mb.first.ts-baseTime) + buf = binary.AppendUvarint(buf, mb.last.seq) + buf = binary.AppendVarint(buf, mb.last.ts-baseTime) + + numDeleted := mb.dmap.Size() + buf = binary.AppendUvarint(buf, uint64(numDeleted)) + if numDeleted > 0 { + var scratch [8 * 1024]byte + dmap, _ := mb.dmap.Encode(scratch[:0]) + buf = append(buf, dmap...) + } + // If this is the last one grab the last checksum and the block index, e.g. 
22.blk, 22 is the block index. + // We use this to quickly open this file on recovery. + if mb == fs.lmb { + lbi = mb.index + mb.ensureLastChecksumLoaded() + copy(lchk[0:], mb.lchk[:]) + } + mb.mu.RUnlock() + } + + // Place block index and hash onto the end. + buf = binary.AppendUvarint(buf, uint64(lbi)) + buf = append(buf, lchk[:]...) + + // Encrypt if needed. + if fs.prf != nil { + if err := fs.setupAEK(); err != nil { + fs.mu.Unlock() + return err + } + nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(buf)+fs.aek.Overhead()) + rand.Read(nonce) + buf = fs.aek.Seal(nonce, nonce, buf, nil) + } + + fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) + + fs.hh.Reset() + fs.hh.Write(buf) + buf = fs.hh.Sum(buf) + + // Snapshot prior dirty count. + priorDirty := fs.dirty + // Release lock. + fs.mu.Unlock() + + // Write to a tmp file and rename. + const tmpPre = streamStreamStateFile + tsep + f, err := os.CreateTemp(filepath.Join(fs.fcfg.StoreDir, msgDir), tmpPre) + if err != nil { + return err + } + tmpName := f.Name() + defer os.Remove(tmpName) + _, err = f.Write(buf) + f.Close() + if err != nil { + return err + } + + // Rename into position under our lock, clear prior dirty pending on success. + fs.mu.Lock() + if !fs.closed { + if err := os.Rename(tmpName, fn); err != nil { + fs.mu.Unlock() + return err + } + fs.dirty -= priorDirty + } + fs.mu.Unlock() + + return nil +} + +// Stop the current filestore. func (fs *fileStore) Stop() error { fs.mu.Lock() if fs.closed { fs.mu.Unlock() return ErrStoreClosed } - fs.closed = true - fs.lmb = nil fs.checkAndFlushAllBlocks() fs.closeAllMsgBlocks(false) @@ -6635,6 +6937,21 @@ func (fs *fileStore) Stop() error { fs.cancelSyncTimer() fs.cancelAgeChk() + // Release the state flusher loop. + close(fs.qch) + + // Wait for the state flush loop to exit. + fsld := fs.fsld + fs.mu.Unlock() + <-fsld + // Write full state if needed. If not dirty this is a no-op. 
+ fs.writeFullState() + fs.mu.Lock() + + // Mark as closed. + fs.closed = true + fs.lmb = nil + // We should update the upper usage layer on a stop. cb, bytes := fs.scb, int64(fs.state.Bytes) @@ -6722,37 +7039,32 @@ func (fs *fileStore) streamSnapshot(w io.WriteCloser, state *StreamState, includ // Can't use join path here, tar only recognizes relative paths with forward slashes. msgPre := msgDir + "/" - var bbuf []byte + const minLen = 32 + sfn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) + if buf, err := os.ReadFile(sfn); err == nil && len(buf) >= minLen { + if fs.aek != nil { + ns := fs.aek.NonceSize() + buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:len(buf)-highwayhash.Size64], nil) + if err == nil { + // Redo hash checksum at end on plaintext. + fs.hh.Reset() + fs.hh.Write(buf) + buf = fs.hh.Sum(buf) + } + } + if err == nil && writeFile(msgPre+streamStreamStateFile, buf) != nil { + return + } + } + // Now do messages themselves. for _, mb := range blks { if mb.pendingWriteSize() > 0 { mb.flushPendingMsgs() } - if mb.indexNeedsUpdate() { - mb.writeIndexInfo() - } mb.mu.Lock() - buf, err := os.ReadFile(mb.ifn) - if err != nil { - mb.mu.Unlock() - writeErr(fmt.Sprintf("Could not read message block [%d] index file: %v", mb.index, err)) - return - } - // Check for encryption. - if mb.aek != nil && len(buf) > 0 { - buf, err = mb.aek.Open(buf[:0], mb.nonce, buf, nil) - if err != nil { - mb.mu.Unlock() - writeErr(fmt.Sprintf("Could not decrypt message block [%d] index file: %v", mb.index, err)) - return - } - } - if writeFile(msgPre+fmt.Sprintf(indexScan, mb.index), buf) != nil { - mb.mu.Unlock() - return - } // We could stream but don't want to hold the lock and prevent changes, so just read in and // release the lock for now. 
bbuf, err = mb.loadBlock(bbuf) @@ -6777,16 +7089,8 @@ func (fs *fileStore) streamSnapshot(w io.WriteCloser, state *StreamState, includ writeErr(fmt.Sprintf("Could not decompress message block [%d]: %v", mb.index, err)) return } - - // Make sure we snapshot the per subject info. - mb.writePerSubjectInfo() - buf, err = os.ReadFile(mb.sfn) - // If not there that is ok and not fatal. - if err == nil && writeFile(msgPre+fmt.Sprintf(fssScan, mb.index), buf) != nil { - mb.mu.Unlock() - return - } mb.mu.Unlock() + // Do this one unlocked. if writeFile(msgPre+fmt.Sprintf(blkScan, mb.index), bbuf) != nil { return @@ -6872,7 +7176,7 @@ func (fs *fileStore) Snapshot(deadline time.Duration, checkMsgs, includeConsumer pw.SetWriteDeadline(time.Now().Add(deadline)) } - // We can add to our stream while snapshotting but not delete anything. + // We can add to our stream while snapshotting but not "user" delete anything. var state StreamState fs.FastState(&state) @@ -7551,7 +7855,7 @@ func (o *consumerFileStore) encryptState(buf []byte) []byte { } // TODO(dlc) - Optimize on space usage a bit? nonce := make([]byte, o.aek.NonceSize(), o.aek.NonceSize()+len(buf)+o.aek.Overhead()) - crand.Read(nonce) + rand.Read(nonce) return o.aek.Seal(nonce, nonce, buf, nil) } @@ -7643,7 +7947,7 @@ func (cfs *consumerFileStore) writeConsumerMeta() error { // Encrypt if needed. if cfs.aek != nil { nonce := make([]byte, cfs.aek.NonceSize(), cfs.aek.NonceSize()+len(b)+cfs.aek.Overhead()) - crand.Read(nonce) + rand.Read(nonce) b = cfs.aek.Seal(nonce, nonce, b, nil) } @@ -7660,14 +7964,6 @@ func (cfs *consumerFileStore) writeConsumerMeta() error { return nil } -// Make sure the header is correct. -func checkHeader(hdr []byte) error { - if hdr == nil || len(hdr) < 2 || hdr[0] != magic || hdr[1] != version { - return errCorruptState - } - return nil -} - // Consumer version. 
func checkConsumerHeader(hdr []byte) (uint8, error) { if hdr == nil || len(hdr) < 2 || hdr[0] != magic { diff --git a/server/filestore_test.go b/server/filestore_test.go index d43dd80c..b8676eea 100644 --- a/server/filestore_test.go +++ b/server/filestore_test.go @@ -19,7 +19,6 @@ import ( "crypto/hmac" crand "crypto/rand" "crypto/sha256" - "encoding/base64" "encoding/hex" "encoding/json" "errors" @@ -30,7 +29,6 @@ import ( "os" "path/filepath" "reflect" - "strings" "testing" "time" @@ -419,12 +417,8 @@ func TestFileStoreWriteExpireWrite(t *testing.T) { cexp := 10 * time.Millisecond fcfg.CacheExpire = cexp - fs, err := newFileStore( - fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() toSend := 10 @@ -483,9 +477,7 @@ func TestFileStoreWriteExpireWrite(t *testing.T) { func TestFileStoreMsgLimit(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxMsgs: 10}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -519,9 +511,7 @@ func TestFileStoreMsgLimit(t *testing.T) { func TestFileStoreMsgLimitBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxMsgs: 1}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -547,9 +537,7 @@ func TestFileStoreBytesLimit(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxBytes: int64(maxBytes)}) - if 
err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() for i := uint64(0); i < toStore; i++ { @@ -599,12 +587,8 @@ func TestFileStoreAgeLimit(t *testing.T) { fcfg.BlockSize = 256 - fs, err := newFileStore( - fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxAge: maxAge}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxAge: maxAge}) + require_NoError(t, err) defer fs.Stop() // Store some messages. Does not really matter how many. @@ -659,9 +643,7 @@ func TestFileStoreAgeLimit(t *testing.T) { func TestFileStoreTimeStamps(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() last := time.Now().UnixNano() @@ -691,9 +673,7 @@ func TestFileStorePurge(t *testing.T) { fcfg.BlockSize = blkSize fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", make([]byte, 8*1024) @@ -794,7 +774,7 @@ func TestFileStorePurge(t *testing.T) { checkPurgeState(toStore * 2) - checkFor(t, time.Second, 10*time.Millisecond, func() error { + checkFor(t, 2*time.Second, 100*time.Millisecond, func() error { if _, err := os.Stat(purgeDir); err == nil { return fmt.Errorf("purge directory still present") } @@ -823,9 +803,7 @@ func TestFileStoreCompact(t *testing.T) { time.Now(), prf, nil, ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -885,9 +863,7 @@ func TestFileStoreCompactLastPlusOne(t *testing.T) { fcfg.AsyncFlush = true fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: 
FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", make([]byte, 10_000) @@ -925,9 +901,7 @@ func TestFileStoreCompactLastPlusOne(t *testing.T) { func TestFileStoreCompactMsgCountBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -969,9 +943,7 @@ func TestFileStoreCompactPerf(t *testing.T) { fcfg.AsyncFlush = true fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -1022,9 +994,7 @@ func TestFileStoreStreamTruncate(t *testing.T) { time.Now(), prf, nil, ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() tseq := uint64(50) @@ -1107,9 +1077,7 @@ func TestFileStoreRemovePartialRecovery(t *testing.T) { fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -1157,9 +1125,7 @@ func TestFileStoreRemoveOutOfOrderRecovery(t *testing.T) { fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -1228,13 +1194,8 @@ func TestFileStoreAgeLimitRecovery(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.CacheExpire = 1 * time.Millisecond - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: 
FileStorage, MaxAge: maxAge}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxAge: maxAge}) + require_NoError(t, err) defer fs.Stop() // Store some messages. Does not really matter how many. @@ -1274,9 +1235,7 @@ func TestFileStoreAgeLimitRecovery(t *testing.T) { func TestFileStoreBitRot(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() // Store some messages. Does not really matter how many. @@ -1342,9 +1301,7 @@ func TestFileStoreBitRot(t *testing.T) { func TestFileStoreEraseMsg(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -1403,9 +1360,7 @@ func TestFileStoreEraseMsg(t *testing.T) { func TestFileStoreEraseAndNoIndexRecovery(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -1429,10 +1384,10 @@ func TestFileStoreEraseAndNoIndexRecovery(t *testing.T) { t.Fatalf("Expected %d msgs, got %d", toStore/2, state.Msgs) } - // Stop and remove the index file. + // Stop and remove the optional index file. 
fs.Stop() ifn := filepath.Join(fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, 1)) - removeFile(t, ifn) + os.Remove(ifn) fs, err = newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) if err != nil { @@ -1458,9 +1413,7 @@ func TestFileStoreMeta(t *testing.T) { mconfig := StreamConfig{Name: "ZZ-22-33", Storage: FileStorage, Subjects: []string{"foo.*"}, Replicas: 22} fs, err := newFileStore(fcfg, mconfig) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() metafile := filepath.Join(fcfg.StoreDir, JetStreamMetaFile) @@ -1489,9 +1442,13 @@ func TestFileStoreMeta(t *testing.T) { if err != nil { t.Fatalf("Error reading metafile checksum: %v", err) } + + fs.mu.Lock() fs.hh.Reset() fs.hh.Write(buf) mychecksum := hex.EncodeToString(fs.hh.Sum(nil)) + fs.mu.Unlock() + if mychecksum != string(checksum) { t.Fatalf("Checksums do not match, got %q vs %q", mychecksum, checksum) } @@ -1551,13 +1508,8 @@ func TestFileStoreMeta(t *testing.T) { func TestFileStoreWriteAndReadSameBlock(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World!") @@ -1578,13 +1530,8 @@ func TestFileStoreAndRetrieveMultiBlock(t *testing.T) { fcfg.BlockSize = 4 * storedMsgSize - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() for i := 0; i < 20; i++ { @@ -1621,13 +1568,8 @@ func TestFileStoreCollapseDmap(t *testing.T) { fcfg.BlockSize = 4 * storedMsgSize - fs, err := newFileStore( 
- fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() for i := 0; i < 10; i++ { @@ -1697,9 +1639,7 @@ func TestFileStoreReadCache(t *testing.T) { storedMsgSize := fileStoreMsgSize(subj, nil, msg) fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() toStore := 500 @@ -1750,9 +1690,7 @@ func TestFileStorePartialCacheExpiration(t *testing.T) { fcfg.CacheExpire = cexp fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() fs.StoreMsg("foo", nil, []byte("msg1")) @@ -1775,9 +1713,7 @@ func TestFileStorePartialIndexes(t *testing.T) { fcfg.CacheExpire = cexp fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() toSend := 5 @@ -1819,14 +1755,21 @@ func TestFileStorePartialIndexes(t *testing.T) { func TestFileStoreSnapshot(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { subj, msg := "foo", []byte("Hello Snappy!") + scfg := StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage} - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), 
prf, nil) + require_NoError(t, err) defer fs.Stop() toSend := 2233 @@ -1903,13 +1846,8 @@ func TestFileStoreSnapshot(t *testing.T) { } fcfg.StoreDir = rstoreDir - fsr, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Error restoring from snapshot: %v", err) - } + fsr, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) defer fsr.Stop() state := fs.State() rstate := fsr.State() @@ -1924,7 +1862,6 @@ func TestFileStoreSnapshot(t *testing.T) { if !reflect.DeepEqual(rstate, state) { t.Fatalf("Restored state does not match:\n%+v\n\n%+v", rstate, state) } - } // Simple case first. @@ -2006,9 +1943,7 @@ func TestFileStoreSnapshot(t *testing.T) { func TestFileStoreConsumer(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() o, err := fs.ConsumerStore("obs22", &ConsumerConfig{}) @@ -2193,9 +2128,7 @@ func TestFileStoreWriteFailures(t *testing.T) { subj, msg := "foo", []byte("Hello Write Failures!") fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() var lseq uint64 @@ -2289,13 +2222,8 @@ func TestFileStorePerf(t *testing.T) { friendlyBytes(int64(toStore*storedMsgSize)), ) - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() start := time.Now() @@ -2431,13 +2359,8 @@ func TestFileStoreReadBackMsgPerf(t *testing.T) { friendlyBytes(int64(toStore*storedMsgSize)), ) - fs, err := 
newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() start := time.Now() @@ -2480,13 +2403,8 @@ func TestFileStoreStoreLimitRemovePerf(t *testing.T) { toStore := 1 * 1024 * 1024 * 1024 / storedMsgSize testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() fs.RegisterStorageUpdates(func(md, bd int64, seq uint64, subj string) {}) @@ -2537,13 +2455,8 @@ func TestFileStorePubPerfWithSmallBlkSize(t *testing.T) { fcfg.BlockSize = FileStoreMinBlkSize - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() start := time.Now() @@ -2623,7 +2536,6 @@ func TestFileStoreConsumerRedeliveredLost(t *testing.T) { t.Fatalf("Did not clear pending correctly") } if len(state.Redelivered) != 0 { - fmt.Printf("redelivered is %+v\n", state.Redelivered) t.Fatalf("Did not clear redelivered correctly") } }) @@ -2632,9 +2544,7 @@ func TestFileStoreConsumerRedeliveredLost(t *testing.T) { func TestFileStoreConsumerFlusher(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() o, err := fs.ConsumerStore("o22", &ConsumerConfig{}) @@ -2665,9 +2575,7 @@ func 
TestFileStoreConsumerFlusher(t *testing.T) { func TestFileStoreConsumerDeliveredUpdates(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() // Simple consumer, no ack policy configured. @@ -2721,9 +2629,7 @@ func TestFileStoreConsumerDeliveredUpdates(t *testing.T) { func TestFileStoreConsumerDeliveredAndAckUpdates(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() // Simple consumer, no ack policy configured. @@ -2828,7 +2734,7 @@ func TestFileStoreConsumerDeliveredAndAckUpdates(t *testing.T) { t.Fatalf("Unexpected error getting state: %v", err) } if !reflect.DeepEqual(nstate, state) { - t.Fatalf("States don't match!") + t.Fatalf("States don't match! 
NEW %+v OLD %+v", nstate, state) } }) } @@ -2836,9 +2742,7 @@ func TestFileStoreConsumerDeliveredAndAckUpdates(t *testing.T) { func TestFileStoreStreamStateDeleted(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, toStore := "foo", uint64(10) @@ -2885,9 +2789,7 @@ func TestFileStoreStreamStateDeleted(t *testing.T) { func TestFileStoreStreamDeleteDirNotEmpty(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, toStore := "foo", uint64(10) @@ -2920,9 +2822,7 @@ func TestFileStoreConsumerPerf(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() o, err := fs.ConsumerStore("o22", &ConsumerConfig{AckPolicy: AckExplicit}) @@ -2982,27 +2882,13 @@ func TestFileStoreConsumerPerf(t *testing.T) { }) } -func TestFileStoreStreamIndexBug(t *testing.T) { - // https://github.com/nats-io/jetstream/issues/406 - badIdxBytes, _ := base64.StdEncoding.DecodeString("FgGBkw7D/f8/772iDPDIgbU=") - dir := t.TempDir() - fn := filepath.Join(dir, "1.idx") - os.WriteFile(fn, badIdxBytes, 0644) - mb := &msgBlock{index: 1, ifn: fn} - if err := mb.readIndexInfo(); err == nil || !strings.Contains(err.Error(), "short index") { - t.Fatalf("Expected error during readIndexInfo(): %v", err) - } -} - // Reported by Ivan. 
func TestFileStoreStreamDeleteCacheBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.CacheExpire = 50 * time.Millisecond fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() subj, msg := "foo", []byte("Hello World") @@ -3023,77 +2909,13 @@ func TestFileStoreStreamDeleteCacheBug(t *testing.T) { }) } -// https://github.com/nats-io/nats-server/issues/2068 -func TestFileStoreStreamPurgeAndDirtyRestartBug(t *testing.T) { - testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - defer fs.Stop() - - // Load up some messages. - num, subj, hdr, msg := 100, "foo", []byte("name:derek"), []byte("Hello World") - for i := 0; i < num; i++ { - fs.StoreMsg(subj, hdr, msg) - } - // Now purge - fs.Purge() - - // Snapshot state. - state := fs.State() - if state.FirstSeq != uint64(num+1) || state.LastSeq != uint64(num) { - t.Fatalf("Unexpected state: %+v", state) - } - - // Now we will stop the store and corrupt the index such that on restart it will do a rebuild. 
- fs.mu.Lock() - lmb := fs.lmb - fs.mu.Unlock() - - lmb.mu.RLock() - ifn := lmb.ifn - lmb.mu.RUnlock() - - fs.Stop() - - fd, err := os.OpenFile(ifn, os.O_RDWR, 0644) - if err != nil { - t.Fatalf("Error opening the index file: %v", err) - } - defer fd.Close() - fi, _ := fd.Stat() - if _, err = fd.WriteAt([]byte{1, 1}, fi.Size()-2); err != nil { - t.Fatalf("Error writing the index file: %v", err) - } - fd.Close() - - // Restart - fs, err = newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - defer fs.Stop() - - state = fs.State() - if state.FirstSeq != uint64(num+1) || state.LastSeq != uint64(num) { - t.Fatalf("Unexpected state: %+v", state) - } - }) -} - // rip func TestFileStoreStreamFailToRollBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 512 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage, MaxBytes: 300}, - ) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage, MaxBytes: 300}) + require_NoError(t, err) defer fs.Stop() // Make sure we properly roll underlying blocks. @@ -3152,9 +2974,7 @@ func TestFileStoreExpireMsgsOnStart(t *testing.T) { startFS := func() *fileStore { t.Helper() fs, err := newFileStore(fcfg, cfg) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) return fs } @@ -3235,34 +3055,7 @@ func TestFileStoreExpireMsgsOnStart(t *testing.T) { if index >= len(fs.blks) { t.Fatalf("Out of range, wanted %d but only %d blks", index, len(fs.blks)) } - mb := fs.blks[index] fs.mu.RUnlock() - - var errStr string - - mb.mu.RLock() - // We will do a readIndex op on our clone and then compare. 
- mbc := &msgBlock{fs: fs, ifn: mb.ifn} - if err := mbc.readIndexInfo(); err != nil { - mb.mu.RUnlock() - t.Fatalf("Error during readIndexInfo: %v", err) - } - // Check state as represented by index info. - if mb.msgs != mbc.msgs { - errStr = fmt.Sprintf("msgs do not match: %d vs %d", mb.msgs, mbc.msgs) - } else if mb.bytes != mbc.bytes { - errStr = fmt.Sprintf("bytes do not match: %d vs %d", mb.bytes, mbc.bytes) - } else if mb.first != mbc.first { - errStr = fmt.Sprintf("first state does not match: %d vs %d", mb.first, mbc.first) - } else if mb.last != mbc.last { - errStr = fmt.Sprintf("last state does not match: %d vs %d", mb.last, mbc.last) - } else if !reflect.DeepEqual(mb.dmap, mbc.dmap) { - errStr = fmt.Sprintf("deleted map does not match: %+v vs %+v", mb.dmap, mbc.dmap) - } - mb.mu.RUnlock() - if errStr != _EMPTY_ { - t.Fatal(errStr) - } } lastSeqForBlk := func(index int) uint64 { @@ -3380,9 +3173,7 @@ func TestFileStoreSparseCompaction(t *testing.T) { var fs *fileStore fs, err := newFileStore(fcfg, cfg) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() msg := bytes.Repeat([]byte("ABC"), 33) // ~100bytes @@ -3517,9 +3308,7 @@ func TestFileStoreSparseCompactionWithInteriorDeletes(t *testing.T) { var fs *fileStore fs, err := newFileStore(fcfg, cfg) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() for i := 1; i <= 1000; i++ { @@ -3561,12 +3350,8 @@ func TestFileStorePurgeExKeepOneBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 128 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) + require_NoError(t, err) defer fs.Stop() fill := bytes.Repeat([]byte("X"), 128) @@ 
-3593,41 +3378,10 @@ func TestFileStorePurgeExKeepOneBug(t *testing.T) { }) } -func TestFileStoreRemoveLastWriteIndex(t *testing.T) { - testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - defer fs.Stop() - - for i := 0; i < 10; i++ { - fs.StoreMsg("foo", nil, []byte("msg")) - } - for i := 0; i < 10; i++ { - fs.RemoveMsg(uint64(i + 1)) - } - - fs.mu.Lock() - fname := fs.lmb.ifn - fs.mu.Unlock() - - fi, err := os.Stat(fname) - if err != nil { - t.Fatalf("Error getting stats for index file %q: %v", fname, err) - } - if fi.Size() == 0 { - t.Fatalf("Index file %q size is 0", fname) - } - }) -} - func TestFileStoreFilteredPendingBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() fs.StoreMsg("foo", nil, []byte("msg")) @@ -3658,9 +3412,7 @@ func TestFileStoreFetchPerf(t *testing.T) { fcfg.AsyncFlush = true fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Storage: FileStorage}) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + require_NoError(t, err) defer fs.Stop() // Will create 25k msg blocks. 
@@ -3696,10 +3448,7 @@ func TestFileStoreCompactReclaimHeadSpace(t *testing.T) { fcfg.BlockSize = 4 * 1024 * 1024 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "TEST", Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -3970,10 +3719,7 @@ func TestFileStoreRebuildStateDmapAccountingBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 1024 * 1024 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "TEST", Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4012,7 +3758,7 @@ func TestFileStoreRebuildStateDmapAccountingBug(t *testing.T) { require_NoError(t, err) mb.mu.Lock() - _, err = mb.rebuildStateLocked() + _, _, err = mb.rebuildStateLocked() require_NoError(t, err) mb.mu.Unlock() @@ -4024,10 +3770,7 @@ func TestFileStorePurgeExWithSubject(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 1000 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "TEST", Subjects: []string{"foo.>"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "TEST", Subjects: []string{"foo.>"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4094,15 +3837,6 @@ func TestFileStoreShortIndexWriteBug(t *testing.T) { t.Fatalf("Expected first sequence of 101 vs %d", state.FirstSeq) } - // I noticed that we also would dangle an open ifd when we did closeAndKeepIndex(), check that we do not anymore. - fs.mu.RLock() - mb := fs.lmb - mb.mu.RLock() - hasIfd := mb.ifd != nil - mb.mu.RUnlock() - fs.mu.RUnlock() - require_False(t, hasIfd) - // Now restart.. 
fs.Stop() fs, err = newFileStoreWithCreated( @@ -4249,10 +3983,7 @@ func TestFileStoreExpireSubjectMeta(t *testing.T) { fcfg.BlockSize = 1024 fcfg.CacheExpire = time.Second - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 1}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 1}) require_NoError(t, err) defer fs.Stop() @@ -4263,13 +3994,6 @@ func TestFileStoreExpireSubjectMeta(t *testing.T) { require_NoError(t, err) } - checkNoMeta := func() { - t.Helper() - if _, hasAnyFSS := fs.reportMeta(); hasAnyFSS { - t.Fatalf("Expected no mbs to have fss state") - } - } - // Test that on restart we do not have extensize metadata but do have correct number of subjects/keys. // Only thing really needed for store state / stream info. fs.Stop() @@ -4294,11 +4018,6 @@ func TestFileStoreExpireSubjectMeta(t *testing.T) { return nil }) - // Load by sequence should not load meta. - _, err = fs.LoadMsg(1, nil) - require_NoError(t, err) - checkNoMeta() - // LoadLast, which is what KV uses, should load meta and succeed. 
_, err = fs.LoadLastMsg("kv.22", nil) require_NoError(t, err) @@ -4317,10 +4036,7 @@ func TestFileStoreMaxMsgsPerSubject(t *testing.T) { fcfg.BlockSize = 128 fcfg.CacheExpire = time.Second - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 1}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 1}) require_NoError(t, err) defer fs.Stop() @@ -4401,10 +4117,7 @@ func TestFileStoreSubjectStateCacheExpiration(t *testing.T) { fcfg.BlockSize = 32 fcfg.CacheExpire = time.Second - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 2}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"kv.>"}, Storage: FileStorage, MaxMsgsPer: 2}) require_NoError(t, err) defer fs.Stop() @@ -4531,10 +4244,7 @@ func TestFileStoreEncrypted(t *testing.T) { // Make sure we do not go through block loads when we know no subjects will exists, e.g. raft. 
func TestFileStoreNoFSSWhenNoSubjects(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4577,10 +4287,7 @@ func TestFileStoreNoFSSBugAfterRemoveFirst(t *testing.T) { fcfg.BlockSize = 8 * 1024 * 1024 fcfg.CacheExpire = 200 * time.Millisecond - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo.bar.*"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo.bar.*"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4620,10 +4327,7 @@ func TestFileStoreNoFSSBugAfterRemoveFirst(t *testing.T) { func TestFileStoreNoFSSAfterRecover(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4661,10 +4365,7 @@ func TestFileStoreNoFSSAfterRecover(t *testing.T) { func TestFileStoreFSSCloseAndKeepOnExpireOnRecoverBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { ttl := 100 * time.Millisecond - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage, MaxAge: ttl}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage, MaxAge: ttl}) require_NoError(t, err) defer fs.Stop() @@ -4722,59 +4423,12 @@ func TestFileStoreExpireOnRecoverSubjectAccounting(t *testing.T) { }) } -func TestFileStoreFSSBadStateBug(t *testing.T) { - testFileStoreAllPermutations(t, func(t *testing.T, fcfg 
FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) - require_NoError(t, err) - defer fs.Stop() - - _, _, err = fs.StoreMsg("foo", nil, nil) - require_NoError(t, err) - _, _, err = fs.StoreMsg("foo", nil, nil) - require_NoError(t, err) - - // Force write of fss. - mb := fs.getFirstBlock() - mb.mu.Lock() - mb.writePerSubjectInfo() - fssFile := filepath.Join(fcfg.StoreDir, msgDir, fmt.Sprintf(fssScan, 1)) - buf, err := os.ReadFile(fssFile) - require_NoError(t, err) - mb.mu.Unlock() - - // Now remove one of them. - fs.RemoveMsg(1) - fs.Stop() - - // Now put back wrong fss with msgs == 2 - err = os.WriteFile(fssFile, buf, defaultFilePerms) - require_NoError(t, err) - - fs, err = newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) - require_NoError(t, err) - defer fs.Stop() - - if fss := fs.SubjectsState("foo")["foo"]; fss.Msgs != 1 { - t.Fatalf("Got bad state on restart: %+v", fss) - } - }) -} - func TestFileStoreFSSExpireNumPendingBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { cexp := 100 * time.Millisecond fcfg.CacheExpire = cexp - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"KV.>"}, MaxMsgsPer: 1, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"KV.>"}, MaxMsgsPer: 1, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4793,10 +4447,7 @@ func TestFileStoreFSSExpireNumPendingBug(t *testing.T) { // https://github.com/nats-io/nats-server/issues/3484 func TestFileStoreFilteredFirstMatchingBug(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo.>"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo.>"}, 
Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4817,11 +4468,10 @@ func TestFileStoreFilteredFirstMatchingBug(t *testing.T) { // Simulate swapping out the fss state and reading it back in with only one subject // present in the block. if mb.fss != nil { - mb.writePerSubjectInfo() mb.fss = nil } // Now load info back in. - mb.readPerSubjectInfo(true) + mb.generatePerSubjectInfo() mb.mu.Unlock() // Now add in a different subject. @@ -4838,10 +4488,7 @@ func TestFileStoreFilteredFirstMatchingBug(t *testing.T) { func TestFileStoreOutOfSpaceRebuildState(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -4884,10 +4531,7 @@ func TestFileStoreRebuildStateProperlyWithMaxMsgsPerSubject(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 4096 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo", "bar", "baz"}, Storage: FileStorage, MaxMsgsPer: 1}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo", "bar", "baz"}, Storage: FileStorage, MaxMsgsPer: 1}) require_NoError(t, err) defer fs.Stop() @@ -4903,55 +4547,14 @@ func TestFileStoreRebuildStateProperlyWithMaxMsgsPerSubject(t *testing.T) { require_NoError(t, err) } - checkState := func() { - var ss StreamState - fs.FastState(&ss) - if ss.NumSubjects != 3 { - t.Fatalf("Expected NumSubjects of 3, got %d", ss.NumSubjects) - } - if ss.Msgs != 3 { - t.Fatalf("Expected NumMsgs of 3, got %d", ss.Msgs) - } + var ss StreamState + fs.FastState(&ss) + if ss.NumSubjects != 3 { + t.Fatalf("Expected NumSubjects of 3, got %d", ss.NumSubjects) } - - checkState() - - // Stop filestore but invalidate 
the idx files by removing them. - // This will simulate a server panic or kill -9 scenario. - fs.Stop() - - fs.mu.RLock() - for _, mb := range fs.blks { - mb.removeIndexFile() + if ss.Msgs != 3 { + t.Fatalf("Expected NumMsgs of 3, got %d", ss.Msgs) } - fs.mu.RUnlock() - - fs, err = newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo", "bar", "baz"}, Storage: FileStorage, MaxMsgsPer: 1}, - ) - require_NoError(t, err) - defer fs.Stop() - - checkState() - - // Make sure we wrote all index files from recovery. - fs.mu.RLock() - for _, mb := range fs.blks { - mb.mu.Lock() - if err := mb.readIndexInfo(); err != nil { - mb.mu.Unlock() - fs.mu.RUnlock() - t.Fatalf("Unexpected error reading index info: %v", err) - } - if mb.msgs == 0 { - mb.mu.Unlock() - fs.mu.RUnlock() - t.Fatalf("Expected msgs for all blks, got none for index %d", mb.index) - } - mb.mu.Unlock() - } - fs.mu.RUnlock() }) } @@ -4999,13 +4602,9 @@ func TestFileStoreUpdateMaxMsgsPerSubject(t *testing.T) { func TestFileStoreBadFirstAndFailedExpireAfterRestart(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 256 - ttl := time.Second - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage, MaxAge: ttl}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage, MaxAge: ttl}) require_NoError(t, err) defer fs.Stop() @@ -5069,10 +4668,7 @@ func TestFileStoreBadFirstAndFailedExpireAfterRestart(t *testing.T) { func TestFileStoreCompactAllWithDanglingLMB(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5096,10 +4692,7 @@ func 
TestFileStoreStateWithBlkFirstDeleted(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 4096 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5135,11 +4728,20 @@ func TestFileStoreStateWithBlkFirstDeleted(t *testing.T) { func TestFileStoreMsgBlkFailOnKernelFaultLostDataReporting(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 4096 + scfg := StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage} - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil + } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) require_NoError(t, err) defer fs.Stop() @@ -5155,20 +4757,16 @@ func TestFileStoreMsgBlkFailOnKernelFaultLostDataReporting(t *testing.T) { // First block fs.mu.RLock() - require_True(t, fs.numMsgBlocks() > 0) + require_True(t, len(fs.blks) > 0) mfn := fs.blks[0].mfn fs.mu.RUnlock() fs.Stop() - err = os.WriteFile(mfn, nil, defaultFilePerms) - require_NoError(t, err) + require_NoError(t, os.Remove(mfn)) // Restart. 
- fs, err = newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) require_NoError(t, err) defer fs.Stop() @@ -5179,21 +4777,17 @@ func TestFileStoreMsgBlkFailOnKernelFaultLostDataReporting(t *testing.T) { // Last block fs.mu.RLock() - require_True(t, fs.numMsgBlocks() > 0) + require_True(t, len(fs.blks) > 0) require_True(t, fs.lmb != nil) mfn = fs.lmb.mfn fs.mu.RUnlock() fs.Stop() - err = os.WriteFile(mfn, nil, defaultFilePerms) - require_NoError(t, err) + require_NoError(t, os.Remove(mfn)) // Restart. - fs, err = newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) require_NoError(t, err) defer fs.Stop() @@ -5206,20 +4800,16 @@ func TestFileStoreMsgBlkFailOnKernelFaultLostDataReporting(t *testing.T) { // Interior block. fs.mu.RLock() - require_True(t, fs.numMsgBlocks() > 3) + require_True(t, len(fs.blks) > 3) mfn = fs.blks[len(fs.blks)-3].mfn fs.mu.RUnlock() fs.Stop() - err = os.WriteFile(mfn, nil, defaultFilePerms) - require_NoError(t, err) + require_NoError(t, os.Remove(mfn)) // Restart. 
- fs, err = newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) require_NoError(t, err) defer fs.Stop() @@ -5236,10 +4826,7 @@ func TestFileStoreAllFilteredStateWithDeleted(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 1024 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5285,10 +4872,7 @@ func TestFileStoreStreamTruncateResetMultiBlock(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 128 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5332,10 +4916,7 @@ func TestFileStoreStreamCompactMultiBlockSubjectInfo(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fcfg.BlockSize = 128 - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo.*"}, Storage: FileStorage}, - ) + fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"foo.*"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5358,59 +4939,10 @@ func TestFileStoreStreamCompactMultiBlockSubjectInfo(t *testing.T) { }) } -func TestFileStoreOnlyWritePerSubjectInfoOnExpireWithUpdate(t *testing.T) { - testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fcfg.CacheExpire = 100 * time.Millisecond - - fs, err := newFileStore( - fcfg, - StreamConfig{Name: "zzz", Subjects: []string{"foo.*"}, Storage: FileStorage}, - ) - 
require_NoError(t, err) - defer fs.Stop() - - for i := 0; i < 1000; i++ { - subj := fmt.Sprintf("foo.%d", i) - _, _, err := fs.StoreMsg(subj, nil, []byte("Hello World")) - require_NoError(t, err) - } - - // Grab first msg block. - fs.mu.RLock() - mb := fs.blks[0] - fs.mu.RUnlock() - - needsUpdate := func() bool { - mb.mu.RLock() - defer mb.mu.RUnlock() - return mb.fssNeedsWrite - } - require_True(t, needsUpdate()) - time.Sleep(2 * fcfg.CacheExpire) - require_False(t, needsUpdate()) - - // Make sure reads do not trigger an update. - _, err = fs.LoadMsg(1, nil) - require_NoError(t, err) - require_False(t, needsUpdate()) - - // Remove will though. - _, err = fs.RemoveMsg(1) - require_NoError(t, err) - require_True(t, needsUpdate()) - - // We should update then clear. - time.Sleep(2 * fcfg.CacheExpire) - require_False(t, needsUpdate()) - }) -} - func TestFileStoreSubjectsTotals(t *testing.T) { // No need for all permutations here. storeDir := t.TempDir() - fcfg := FileStoreConfig{ - StoreDir: storeDir, - } + fcfg := FileStoreConfig{StoreDir: storeDir} fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*.*"}, Storage: FileStorage}) require_NoError(t, err) defer fs.Stop() @@ -5485,70 +5017,6 @@ func TestFileStoreSubjectsTotals(t *testing.T) { } } -func TestFileStoreNewWriteIndexInfo(t *testing.T) { - testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { - fcfg.BlockSize = defaultLargeBlockSize - - fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) - require_NoError(t, err) - defer fs.Stop() - - // Fill a block. - numToFill := 254200 - for i := 0; i < numToFill; i++ { - _, _, err := fs.StoreMsg("A", nil, []byte("OK")) - require_NoError(t, err) - } - - // Maximize interior deletes for testing the new AVL sequence set. 
- for seq := uint64(2); seq < uint64(numToFill); seq++ { - removed, err := fs.RemoveMsg(seq) - require_NoError(t, err) - require_True(t, removed) - } - // Grab first block - fs.mu.RLock() - mb := fs.blks[0] - fs.mu.RUnlock() - - mb.mu.Lock() - start := time.Now() - err = mb.writeIndexInfoLocked() - if err != nil { - mb.mu.Unlock() - t.Fatalf("Unexpected error: %v", err) - } - elapsed := time.Since(start) - if elapsed > 3*time.Millisecond { - mb.mu.Unlock() - t.Errorf("Unexpected elapsed time: %v", elapsed) - } - fi, err := os.Stat(mb.ifn) - mb.mu.Unlock() - - require_NoError(t, err) - require_True(t, fi.Size() < 34*1024) // Just over 32k - - mb.mu.Lock() - mb.dmap.Empty() - err = mb.readIndexInfo() - numMsgs := mb.msgs - firstSeq := mb.first.seq - lastSeq := mb.last.seq - mb.mu.Unlock() - // Make sure consistent. - require_NoError(t, err) - require_True(t, numMsgs == 2) - require_True(t, firstSeq == 1) - require_True(t, lastSeq == uint64(numToFill)) - - fs.Stop() - fs, err = newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) - require_NoError(t, err) - defer fs.Stop() - }) -} - func TestFileStoreConsumerStoreEncodeAfterRestart(t *testing.T) { testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Storage: FileStorage}) @@ -5593,6 +5061,7 @@ func TestFileStoreNumPendingLargeNumBlks(t *testing.T) { } fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"zzz"}, Storage: FileStorage}) require_NoError(t, err) + defer fs.Stop() subj, msg := "zzz", bytes.Repeat([]byte("X"), 100) numMsgs := 10_000 @@ -5603,12 +5072,12 @@ func TestFileStoreNumPendingLargeNumBlks(t *testing.T) { start := time.Now() total, _ := fs.NumPending(4000, "zzz", false) - require_True(t, time.Since(start) < 5*time.Millisecond) + require_True(t, time.Since(start) < 10*time.Millisecond) require_True(t, total == 6001) start = time.Now() total, _ = fs.NumPending(6000, 
"zzz", false) - require_True(t, time.Since(start) < 5*time.Millisecond) + require_True(t, time.Since(start) < 10*time.Millisecond) require_True(t, total == 4001) // Now delete a message in first half and second half. @@ -5635,6 +5104,7 @@ func TestFileStoreSkipMsgAndNumBlocks(t *testing.T) { } fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"zzz"}, Storage: FileStorage}) require_NoError(t, err) + defer fs.Stop() subj, msg := "zzz", bytes.Repeat([]byte("X"), 100) numMsgs := 10_000 @@ -5667,6 +5137,7 @@ func TestFileStoreRestoreEncryptedWithNoKeyFuncFails(t *testing.T) { prf, nil, ) require_NoError(t, err) + defer fs.Stop() subj, msg := "zzz", bytes.Repeat([]byte("X"), 100) numMsgs := 100 @@ -5752,6 +5223,7 @@ func TestFileStoreRecaluclateFirstForSubjBug(t *testing.T) { func TestFileStoreKeepWithDeletedMsgsBug(t *testing.T) { fs, err := newFileStore(FileStoreConfig{StoreDir: t.TempDir()}, StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) require_NoError(t, err) + defer fs.Stop() msg := bytes.Repeat([]byte("A"), 19) for i := 0; i < 5; i++ { @@ -5768,3 +5240,478 @@ func TestFileStoreKeepWithDeletedMsgsBug(t *testing.T) { require_NoError(t, err) require_True(t, n == 3) } + +/////////////////////////////////////////////////////////////////////////// +// New WAL based architecture tests +/////////////////////////////////////////////////////////////////////////// + +func TestFileStoreFullStateBasics(t *testing.T) { + testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { + fcfg.BlockSize = 100 + scfg := StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage} + + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil + } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, 
err) + defer fs.Stop() + + // This yields an internal record length of 50 bytes. So 2 msgs per blk. + subj, msgLen, recLen := "A", 19, uint64(50) + msgA := bytes.Repeat([]byte("A"), msgLen) + msgZ := bytes.Repeat([]byte("Z"), msgLen) + + // Send 2 msgs and stop, check for presence of our full state file. + fs.StoreMsg(subj, nil, msgA) + fs.StoreMsg(subj, nil, msgZ) + require_True(t, fs.numMsgBlocks() == 1) + + // Make sure there is a full state file after we do a stop. + fs.Stop() + + sfile := filepath.Join(fcfg.StoreDir, msgDir, streamStreamStateFile) + if _, err := os.Stat(sfile); err != nil { + t.Fatalf("Expected stream state file but got %v", err) + } + + // Read it in and make sure len > 0. + buf, err := os.ReadFile(sfile) + require_NoError(t, err) + require_True(t, len(buf) > 0) + + // Now make sure we recover properly. + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // Make sure there are no old idx or fss files. + matches, err := filepath.Glob(filepath.Join(fcfg.StoreDir, msgDir, "%d.fss")) + require_NoError(t, err) + require_True(t, len(matches) == 0) + matches, err = filepath.Glob(filepath.Join(fcfg.StoreDir, msgDir, "%d.idx")) + require_NoError(t, err) + require_True(t, len(matches) == 0) + + state := fs.State() + require_True(t, state.Msgs == 2) + require_True(t, state.FirstSeq == 1) + require_True(t, state.LastSeq == 2) + + // Now make sure we can read in values. + var smv StoreMsg + sm, err := fs.LoadMsg(1, &smv) + require_NoError(t, err) + require_True(t, bytes.Equal(sm.msg, msgA)) + + sm, err = fs.LoadMsg(2, &smv) + require_NoError(t, err) + require_True(t, bytes.Equal(sm.msg, msgZ)) + + // Now add in 1 more here to split the lmb. + fs.StoreMsg(subj, nil, msgZ) + + // Now stop the filestore and replace the old stream state and make sure we recover correctly. 
+ fs.Stop() + + // Regrab the stream state + buf, err = os.ReadFile(sfile) + require_NoError(t, err) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // Add in one more. + fs.StoreMsg(subj, nil, msgZ) + fs.Stop() + + // Put old stream state back with only 3. + err = os.WriteFile(sfile, buf, defaultFilePerms) + require_NoError(t, err) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + state = fs.State() + require_True(t, state.Msgs == 4) + require_True(t, state.Bytes == 4*recLen) + require_True(t, state.FirstSeq == 1) + require_True(t, state.LastSeq == 4) + require_True(t, fs.numMsgBlocks() == 2) + + // Make sure we are tracking subjects correctly. + fs.mu.RLock() + psi := *fs.psim[subj] + fs.mu.RUnlock() + + require_True(t, psi.total == 4) + require_True(t, psi.fblk == 1) + require_True(t, psi.lblk == 2) + + // Store 1 more + fs.StoreMsg(subj, nil, msgA) + fs.Stop() + // Put old stream state back with only 3. + err = os.WriteFile(sfile, buf, defaultFilePerms) + require_NoError(t, err) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + state = fs.State() + require_True(t, state.Msgs == 5) + require_True(t, state.FirstSeq == 1) + require_True(t, state.LastSeq == 5) + require_True(t, fs.numMsgBlocks() == 3) + // Make sure we are tracking subjects correctly. + fs.mu.RLock() + psi = *fs.psim[subj] + fs.mu.RUnlock() + require_True(t, psi.total == 5) + require_True(t, psi.fblk == 1) + require_True(t, psi.lblk == 3) + }) +} + +func TestFileStoreFullStatePurge(t *testing.T) { + testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { + fcfg.BlockSize = 132 // Leave room for tombstones. 
+ scfg := StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage} + + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil + } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // This yields an internal record length of 50 bytes. So 2 msgs per blk. + subj, msg := "A", bytes.Repeat([]byte("A"), 19) + + // Should be 2 per block, so 5 blocks. + for i := 0; i < 10; i++ { + fs.StoreMsg(subj, nil, msg) + } + n, err := fs.Purge() + require_NoError(t, err) + require_True(t, n == 10) + state := fs.State() + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + + // Add in more 10 more total, some B some C. + for i := 0; i < 5; i++ { + fs.StoreMsg("B", nil, msg) + fs.StoreMsg("C", nil, msg) + } + + n, err = fs.PurgeEx("B", 0, 0) + require_NoError(t, err) + require_True(t, n == 5) + + state = fs.State() + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + + // Purge with keep. + n, err = fs.PurgeEx(_EMPTY_, 0, 2) + require_NoError(t, err) + require_True(t, n == 3) + + state = fs.State() + + // Do some quick checks here, keep had a bug. 
+ require_True(t, state.Msgs == 2) + require_True(t, state.FirstSeq == 18) + require_True(t, state.LastSeq == 20) + + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + + // Make sure we can survive a purge with no full stream state and have the correct first sequence. + // This used to be provided by the idx file and is now tombstones and the full stream state snapshot. + n, err = fs.Purge() + require_NoError(t, err) + require_True(t, n == 2) + state = fs.State() + fs.Stop() + + sfile := filepath.Join(fcfg.StoreDir, msgDir, streamStreamStateFile) + os.Remove(sfile) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + }) +} + +func TestFileStoreFullStateTestUserRemoveWAL(t *testing.T) { + testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { + fcfg.BlockSize = 132 // Leave room for tombstones. + scfg := StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage} + + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil + } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // This yields an internal record length of 50 bytes. So 2 msgs per blk. + msgLen := 19 + msgA := bytes.Repeat([]byte("A"), msgLen) + msgZ := bytes.Repeat([]byte("Z"), msgLen) + + // Store 2 msgs and delete first. 
+ fs.StoreMsg("A", nil, msgA) + fs.StoreMsg("Z", nil, msgZ) + fs.RemoveMsg(1) + + // Check we can load things properly since the block will have a tombstone now for seq 1. + sm, err := fs.LoadMsg(2, nil) + require_NoError(t, err) + require_True(t, bytes.Equal(sm.msg, msgZ)) + + require_True(t, fs.numMsgBlocks() == 1) + state := fs.State() + fs.Stop() + + // Grab the state from this stop. + sfile := filepath.Join(fcfg.StoreDir, msgDir, streamStreamStateFile) + buf, err := os.ReadFile(sfile) + require_NoError(t, err) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // Check we can load things properly since the block will have a tombstone now for seq 1. + _, err = fs.LoadMsg(2, nil) + require_NoError(t, err) + _, err = fs.LoadMsg(1, nil) + require_Error(t, err, ErrStoreMsgNotFound) + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state does not match:\n%+v\n%+v", + state, newState) + } + require_True(t, !state.FirstTime.IsZero()) + + // Store 2 more msgs and delete 2 & 4. + fs.StoreMsg("A", nil, msgA) + fs.StoreMsg("Z", nil, msgZ) + fs.RemoveMsg(2) + fs.RemoveMsg(4) + + state = fs.State() + require_True(t, len(state.Deleted) == state.NumDeleted) + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state does not match:\n%+v\n%+v", + state, newState) + } + require_True(t, !state.FirstTime.IsZero()) + + // Now close again and put back old stream state. + // This will test that we can remember user deletes by placing tombstones in the lmb/wal. 
+ fs.Stop() + err = os.WriteFile(sfile, buf, defaultFilePerms) + require_NoError(t, err) + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state does not match:\n%+v\n%+v", + state, newState) + } + require_True(t, !state.FirstTime.IsZero()) + }) +} + +func TestFileStoreFullStateTestSysRemovals(t *testing.T) { + testFileStoreAllPermutations(t, func(t *testing.T, fcfg FileStoreConfig) { + fcfg.BlockSize = 100 + scfg := StreamConfig{ + Name: "zzz", + Subjects: []string{"*"}, + MaxMsgs: 10, + MaxMsgsPer: 1, + Storage: FileStorage, + } + + prf := func(context []byte) ([]byte, error) { + h := hmac.New(sha256.New, []byte("dlc22")) + if _, err := h.Write(context); err != nil { + return nil, err + } + return h.Sum(nil), nil + } + if fcfg.Cipher == NoCipher { + prf = nil + } + + fs, err := newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + // This yields an internal record length of 50 bytes. So 2 msgs per blk. 
+ msgLen := 19 + msg := bytes.Repeat([]byte("A"), msgLen) + + for _, subj := range []string{"A", "B", "A", "B"} { + fs.StoreMsg(subj, nil, msg) + } + + state := fs.State() + require_True(t, state.Msgs == 2) + require_True(t, state.FirstSeq == 3) + require_True(t, state.LastSeq == 4) + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + + for _, subj := range []string{"C", "D", "E", "F", "G", "H", "I", "J"} { + fs.StoreMsg(subj, nil, msg) + } + + state = fs.State() + require_True(t, state.Msgs == 10) + require_True(t, state.FirstSeq == 3) + require_True(t, state.LastSeq == 12) + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + + // Goes over limit + fs.StoreMsg("ZZZ", nil, msg) + + state = fs.State() + require_True(t, state.Msgs == 10) + require_True(t, state.FirstSeq == 4) + require_True(t, state.LastSeq == 13) + fs.Stop() + + fs, err = newFileStoreWithCreated(fcfg, scfg, time.Now(), prf, nil) + require_NoError(t, err) + defer fs.Stop() + + if newState := fs.State(); !reflect.DeepEqual(state, newState) { + t.Fatalf("Restore state after purge does not match:\n%+v\n%+v", + state, newState) + } + }) +} + +/////////////////////////////////////////////////////////////////////////// +// Benchmarks +/////////////////////////////////////////////////////////////////////////// + +func Benchmark_FileStoreSelectMsgBlock(b *testing.B) { + // We use small block size to create lots of blocks for this test. 
+ fs, err := newFileStore( + FileStoreConfig{StoreDir: b.TempDir(), BlockSize: 128}, + StreamConfig{Name: "zzz", Subjects: []string{"*"}, Storage: FileStorage}) + if err != nil { + b.Fatalf("Unexpected error: %v", err) + } + defer fs.Stop() + + subj, msg := "A", bytes.Repeat([]byte("ABC"), 33) // ~100bytes + + // Add in a bunch of blocks. + for i := 0; i < 1000; i++ { + fs.StoreMsg(subj, nil, msg) + } + if fs.numMsgBlocks() < 1000 { + b.Fatalf("Expected at least 1000 blocks, got %d", fs.numMsgBlocks()) + } + + fs.mu.RLock() + defer fs.mu.RUnlock() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, mb := fs.selectMsgBlockWithIndex(1) + if mb == nil { + b.Fatalf("Expected a non-nil mb") + } + } + b.StopTimer() +} diff --git a/server/jetstream_test.go b/server/jetstream_test.go index 59ba472c..9356dc5b 100644 --- a/server/jetstream_test.go +++ b/server/jetstream_test.go @@ -12232,6 +12232,7 @@ func TestJetStreamServerEncryption(t *testing.T) { // Check stream meta. checkEncrypted := func() { + t.Helper() checkKeyFile(filepath.Join(sdir, JetStreamMetaFileKey)) checkFor(filepath.Join(sdir, JetStreamMetaFile), "TEST", "foo", "bar", "baz", "max_msgs", "max_bytes") // Check a message block. @@ -15465,47 +15466,6 @@ func TestJetStreamStorageReservedBytes(t *testing.T) { } } -func TestJetStreamRecoverStreamWithDeletedMessagesNonCleanShutdown(t *testing.T) { - s := RunBasicJetStreamServer(t) - defer s.Shutdown() - - nc, js := jsClientConnect(t, s) - defer nc.Close() - - _, err := js.AddStream(&nats.StreamConfig{Name: "T"}) - require_NoError(t, err) - - for i := 0; i < 100; i++ { - js.Publish("T", []byte("OK")) - } - - js.DeleteMsg("T", 22) - - // Now we need a non-clean shutdown. - // For this use case that means we do *not* write the fss file. - sd := s.JetStreamConfig().StoreDir - fss := filepath.Join(sd, "$G", "streams", "T", "msgs", "1.fss") - - // Stop current - nc.Close() - s.Shutdown() - - // Remove fss file to simulate a non-clean shutdown. 
- err = os.Remove(fss) - require_NoError(t, err) - - // Restart. - s = RunJetStreamServerOnPort(-1, sd) - defer s.Shutdown() - - nc, js = jsClientConnect(t, s) - defer nc.Close() - - // Make sure we recovered our stream - _, err = js.StreamInfo("T") - require_NoError(t, err) -} - func TestJetStreamRestoreBadStream(t *testing.T) { s := RunBasicJetStreamServer(t) defer s.Shutdown() @@ -20387,9 +20347,8 @@ func TestJetStreamMsgBlkFailOnKernelFault(t *testing.T) { sd := s.JetStreamConfig().StoreDir s.Shutdown() - // Zero out the last block. - err = os.WriteFile(lmbf, nil, defaultFilePerms) - require_NoError(t, err) + // Remove block. + require_NoError(t, os.Remove(lmbf)) s = RunJetStreamServerOnPort(-1, sd) defer s.Shutdown() @@ -21263,14 +21222,8 @@ func TestJetStreamMaxBytesIgnored(t *testing.T) { sd := s.JetStreamConfig().StoreDir s.Shutdown() - // We will remove the idx file and truncate the blk and fss files. + // We will truncate blk file. mdir := filepath.Join(sd, "$G", "streams", "TEST", "msgs") - // Remove idx - err = os.Remove(filepath.Join(mdir, "1.idx")) - require_NoError(t, err) - // Truncate fss - err = os.WriteFile(filepath.Join(mdir, "1.fss"), nil, defaultFilePerms) - require_NoError(t, err) // Truncate blk err = os.WriteFile(filepath.Join(mdir, "1.blk"), nil, defaultFilePerms) require_NoError(t, err) diff --git a/server/norace_test.go b/server/norace_test.go index f43b46ef..6bbf7791 100644 --- a/server/norace_test.go +++ b/server/norace_test.go @@ -3120,10 +3120,9 @@ func TestNoRaceJetStreamFileStoreCompaction(t *testing.T) { for i := 0; i < toSend; i++ { js.PublishAsync(fmt.Sprintf("KV.%d", i+1), data) } - select { case <-js.PublishAsyncComplete(): - case <-time.After(time.Second): + case <-time.After(10 * time.Second): t.Fatalf("Did not receive completion signal") } @@ -5218,6 +5217,10 @@ func TestNoRaceJetStreamClusterDirectAccessAllPeersSubs(t *testing.T) { for { select { case <-qch: + select { + case <-js.PublishAsyncComplete(): + case 
<-time.After(10 * time.Second): + } return default: // Send as fast as we can. @@ -5227,7 +5230,7 @@ func TestNoRaceJetStreamClusterDirectAccessAllPeersSubs(t *testing.T) { }() } - time.Sleep(100 * time.Millisecond) + time.Sleep(200 * time.Millisecond) // Now let's scale up to an R3. cfg.Replicas = 3 @@ -5277,7 +5280,7 @@ func TestNoRaceJetStreamClusterDirectAccessAllPeersSubs(t *testing.T) { t.Fatalf("Expected to see messages increase, got %d", si.State.Msgs) } - checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { + checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { // Make sure they are all the same from a state perspective. // Leader will have the expected state. lmset, err := c.streamLeader("$G", "TEST").GlobalAccount().lookupStream("TEST") @@ -8742,6 +8745,7 @@ func TestNoRaceFilestoreBinaryStreamSnapshotEncodingLargeGaps(t *testing.T) { } fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"zzz"}, Storage: FileStorage}) require_NoError(t, err) + defer fs.Stop() subj, msg := "zzz", bytes.Repeat([]byte("X"), 128) numMsgs := 20_000 diff --git a/server/stream.go b/server/stream.go index f7b8f203..9bb254f3 100644 --- a/server/stream.go +++ b/server/stream.go @@ -971,8 +971,8 @@ func (mset *stream) lastSeqAndCLFS() (uint64, uint64) { } func (mset *stream) clearCLFS() uint64 { - mset.mu.Lock() - defer mset.mu.Unlock() + mset.clMu.Lock() + defer mset.clMu.Unlock() clfs := mset.clfs mset.clfs, mset.clseq = 0, 0 return clfs