nats-server/server/filestore.go

// Copyright 2019-2023 The NATS Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package server

import (
	"archive/tar"
	"bytes"
	"crypto/aes"
	"crypto/cipher"
	"crypto/rand"
	"crypto/sha256"
	"encoding/binary"
	"encoding/hex"
	"encoding/json"
	"errors"
	"fmt"
	"hash"
	"io"
	"math"
	"net"
	"os"
	"path/filepath"
	"sort"
	"sync"
	"sync/atomic"
	"time"

	"github.com/klauspost/compress/s2"
	"github.com/minio/highwayhash"
	"github.com/nats-io/nats-server/v2/server/avl"
	"golang.org/x/crypto/chacha20"
	"golang.org/x/crypto/chacha20poly1305"
)

// FileStoreConfig holds the tunables for a file backed store.
// Zero values for BlockSize, CacheExpire and SyncInterval are replaced
// with defaults when the store is created.
type FileStoreConfig struct {
	// Where the parent directory for all storage will be located.
	StoreDir string
	// BlockSize is the file block size. This also represents the maximum overhead size.
	BlockSize uint64
	// CacheExpire is how long with no activity until we expire the cache.
	CacheExpire time.Duration
	// SyncInterval is how often we sync to disk in the background.
	SyncInterval time.Duration
	// SyncAlways is when the stream should sync all data writes.
	SyncAlways bool
	// AsyncFlush allows async flush to batch write operations.
	AsyncFlush bool
	// Cipher is the cipher to use when encrypting.
	Cipher StoreCipher
	// Compression is the algorithm to use when compressing.
	Compression StoreCompression
}

// FileStreamInfo pairs a StreamConfig with the time the stream was created
// so that the created time is remembered. This is the value that gets
// JSON encoded into the stream meta file.
type FileStreamInfo struct {
	Created time.Time
	StreamConfig
}

// StoreCipher selects the cipher used when encrypting store data.
type StoreCipher int

// Supported ciphers. ChaCha is the zero value.
const (
	// ChaCha reports itself as "ChaCha20-Poly1305".
	ChaCha StoreCipher = iota
	// AES reports itself as "AES-GCM".
	AES
	// NoCipher reports itself as "None".
	NoCipher
)

// String returns the human readable name for the cipher, or
// "Unknown StoreCipher" for values that are not one of the known constants.
func (cipher StoreCipher) String() string {
	if cipher == ChaCha {
		return "ChaCha20-Poly1305"
	}
	if cipher == AES {
		return "AES-GCM"
	}
	if cipher == NoCipher {
		return "None"
	}
	return "Unknown StoreCipher"
}

// StoreCompression selects the algorithm used when compressing store data.
type StoreCompression uint8

// Supported compression algorithms. NoCompression is the zero value.
const (
	// NoCompression is encoded in JSON as "none".
	NoCompression StoreCompression = iota
	// S2Compression is encoded in JSON as "s2".
	S2Compression
)

// String returns the human readable name for the compression algorithm, or
// "Unknown StoreCompression" for values that are not one of the known constants.
func (alg StoreCompression) String() string {
	if alg == NoCompression {
		return "None"
	}
	if alg == S2Compression {
		return "S2"
	}
	return "Unknown StoreCompression"
}

// MarshalJSON encodes the algorithm as a JSON string, "s2" or "none".
// Any other value results in an error.
func (alg StoreCompression) MarshalJSON() ([]byte, error) {
	if alg == S2Compression {
		return json.Marshal("s2")
	}
	if alg == NoCompression {
		return json.Marshal("none")
	}
	return nil, fmt.Errorf("unknown compression algorithm")
}

// UnmarshalJSON decodes a JSON string, "s2" or "none", into the algorithm.
// Input that is not a JSON string, or an unrecognized name, results in an
// error and leaves the receiver untouched.
func (alg *StoreCompression) UnmarshalJSON(b []byte) error {
	var name string
	err := json.Unmarshal(b, &name)
	if err != nil {
		return err
	}
	if name == "s2" {
		*alg = S2Compression
		return nil
	}
	if name == "none" {
		*alg = NoCompression
		return nil
	}
	return fmt.Errorf("unknown compression algorithm")
}

// FileConsumerInfo is used for creating consumer stores. It carries the
// consumer's created time and name alongside its ConsumerConfig.
type FileConsumerInfo struct {
	Created time.Time
	Name    string
	ConsumerConfig
}

// Default file and directory permissions used when the store creates
// directories (0750) and writes files (0640).
const (
	defaultDirPerms  = os.FileMode(0750)
	defaultFilePerms = os.FileMode(0640)
)

// psi is the value type stored in fileStore.psim, which is keyed by string.
// NOTE(review): the names suggest per-subject info with a total count and
// the first/last block index for the subject - confirm against the code
// that populates psim.
type psi struct {
	total uint64
	fblk  uint32
	lblk  uint32
}

// fileStore is the file backed stream store.
//
// Notes on fields as used by the code in this file:
//   - mu guards the store; many helpers document "Lock should be held".
//   - state is the aggregate stream state, rebuilt from the blocks in
//     rebuildStateLocked.
//   - tombs collects delete tombstones found while recovering blocks; they
//     are applied and cleared when the store is created.
//   - ld tracks lost data discovered during recovery/rebuild.
//   - cfg is the stream config plus created time, fcfg the store config.
//   - prf/oldprf are key generators; a nil prf means no encryption.
//   - aek is the asset encryption key used to seal the stream meta file.
//   - blks holds the message blocks in order, lmb is a *msgBlock
//     (presumably the last/current write block - confirm).
//   - bim and psim are lookup maps keyed by uint32 and string respectively.
//   - hh is the hash used for the meta file checksum.
//   - fch, qch and fsld are handed to flushStreamStateLoop.
//   - dirty is bumped when the full stream state needs to be written out.
//   - fip is set to the inverse of FileStoreConfig.AsyncFlush.
type fileStore struct {
	srv         *Server
	mu          sync.RWMutex
	state       StreamState
	tombs       []uint64
	ld          *LostStreamData
	scb         StorageUpdateHandler
	ageChk      *time.Timer
	syncTmr     *time.Timer
	cfg         FileStreamInfo
	fcfg        FileStoreConfig
	prf         keyGen
	oldprf      keyGen
	aek         cipher.AEAD
	lmb         *msgBlock
	blks        []*msgBlock
	bim         map[uint32]*msgBlock
	psim        map[string]*psi
	hh          hash.Hash64
	qch         chan struct{}
	fch         chan struct{}
	fsld        chan struct{}
	cfs         []ConsumerStore
	sips        int
	dirty       int
	closed      bool
	fip         bool
	receivedAny bool
}

// Represents a message store block and its data.
//
// Notes on fields as used by the code in this file:
//   - first/last hold the sequence and timestamp (unix nanos) bounds of the
//     block. They are kept at the top of the struct on purpose, see below.
//   - mu guards the block.
//   - aek, bek, seed and nonce are the block's encryption material, set up
//     when encryption is loaded for the block. A nil bek means the block
//     data is not encrypted.
//   - mfn is the path of the block file, mfd an open handle to it (may be nil).
//   - hh is the per block hash, keyed from the store's hash key for the index.
//   - cexp is the cache expiration taken from FileStoreConfig.CacheExpire.
//   - dmap is the set of deleted sequences inside the block.
//   - lchk is the last checksum of the block file.
//   - noTrack and syncAlways are copied from the store when the block is
//     initialized.
type msgBlock struct {
	// Here for 32bit systems and atomic.
	first      msgId
	last       msgId
	mu         sync.RWMutex
	fs         *fileStore
	aek        cipher.AEAD
	bek        cipher.Stream
	seed       []byte
	nonce      []byte
	mfn        string
	mfd        *os.File
	cmp        StoreCompression // Effective compression at the time of loading the block
	liwsz      int64
	index      uint32
	bytes      uint64 // User visible bytes count.
	rbytes     uint64 // Total bytes (raw) including deleted. Used for rolling to new blk.
	msgs       uint64 // User visible message count.
	fss        map[string]*SimpleState
	kfn        string
	lwts       int64
	llts       int64
	lrts       int64
	llseq      uint64
	hh         hash.Hash64
	cache      *cache
	cloads     uint64
	cexp       time.Duration
	ctmr       *time.Timer
	werr       error
	dmap       avl.SequenceSet
	fch        chan struct{}
	qch        chan struct{}
	lchk       [8]byte
	loading    bool
	flusher    bool
	noTrack    bool
	needSync   bool
	syncAlways bool
	closed     bool

	// Used to mock write failures.
	mockWriteErr bool
}

// Write through caching layer that is also used on loading messages.
// A msgBlock holds at most one of these in its cache field; setting that
// field to nil drops the cache.
// NOTE(review): the individual fields are not described in this part of the
// file. buf is presumably the cached block data and idx a per message index
// into it - confirm against the code that fills the cache.
type cache struct {
	buf  []byte
	off  int
	wp   int
	idx  []uint32
	lrl  uint32
	fseq uint64
	nra  bool
}

// msgId identifies a message by its sequence number and timestamp.
// The timestamp is in nanoseconds since the unix epoch (it is converted
// with time.Unix(0, ts) when rebuilding store state).
type msgId struct {
	seq uint64
	ts  int64
}

// File store wide constants: file and directory names, scan patterns,
// timing defaults and block size limits.
const (
	// Magic is used to identify the file store files.
	magic = uint8(22)
	// Version
	version = uint8(1)
	// New IndexInfo Version
	newVersion = uint8(2)
	// hdrLen
	hdrLen = 2
	// This is where we keep the streams.
	streamsDir = "streams"
	// This is where we keep the message store blocks.
	msgDir = "msgs"
	// This is where we temporarily move the messages dir.
	purgeDir = "__msgs__"
	// used to scan blk file names.
	blkScan = "%d.blk"
	// used for compacted blocks that are staged.
	newScan = "%d.new"
	// used to scan index file names.
	indexScan = "%d.idx"
	// to look for orphans
	indexScanAll = "*.idx"
	// to look for orphans
	fssScanAll = "*.fss"
	// used to store our block encryption key.
	keyScan = "%d.key"
	// to look for orphans
	keyScanAll = "*.key"
	// This is where we keep state on consumers.
	consumerDir = "obs"
	// Index file for a consumer.
	consumerState = "o.dat"
	// The suffix that will be given to a new temporary block during compression.
	compressTmpSuffix = ".tmp"
	// This is where we keep state on templates.
	tmplsDir = "templates"
	// Maximum size of a write buffer we may consider for re-use.
	maxBufReuse = 2 * 1024 * 1024
	// default cache buffer expiration
	defaultCacheBufferExpiration = 5 * time.Second
	// default sync interval
	defaultSyncInterval = 2 * time.Minute
	// default idle timeout to close FDs.
	closeFDsIdle = 30 * time.Second
	// coalesceMinimum
	coalesceMinimum = 16 * 1024
	// maxFlushWait is maximum we will wait to gather messages to flush.
	maxFlushWait = 8 * time.Millisecond

	// Metafiles for streams and consumers.
	JetStreamMetaFile    = "meta.inf"
	JetStreamMetaFileSum = "meta.sum"
	JetStreamMetaFileKey = "meta.key"

	// This is the full snapshotted state for the stream.
	streamStreamStateFile = "index.db"

	// Minimum sizes, in bytes, for encrypted key files.
	minMetaKeySize = 64
	minBlkKeySize  = 64

	// Default stream block size.
	defaultLargeBlockSize = 8 * 1024 * 1024 // 8MB
	// Default for workqueue or interest based.
	defaultMediumBlockSize = 4 * 1024 * 1024 // 4MB
	// For smaller reuse buffers. Usually being generated during contention on the lead write buffer.
	// E.g. mirrors/sources etc.
	defaultSmallBlockSize = 1 * 1024 * 1024 // 1MB
	// Maximum size for the encrypted head block.
	maximumEncryptedBlockSize = 2 * 1024 * 1024 // 2MB
	// Default for KV based
	defaultKVBlockSize = defaultMediumBlockSize
	// max block size for now.
	maxBlockSize = defaultLargeBlockSize
	// Compact minimum threshold.
	compactMinimum = 2 * 1024 * 1024 // 2MB
	// FileStoreMinBlkSize is minimum size we will do for a blk size.
	FileStoreMinBlkSize = 32 * 1000 // 32,000 bytes (32kB, not 32KiB)
	// FileStoreMaxBlkSize is maximum size we will do for a blk size.
	FileStoreMaxBlkSize = maxBlockSize
	// Check for bad record length value due to corrupt data.
	rlBadThresh = 32 * 1024 * 1024
	// Checksum size for hash for msg records.
	recordHashSize = 8
)

// newFileStore creates a file store for the given stream config, stamped
// with the current UTC time as its created time and without any key
// generators (so without encryption).
func newFileStore(fcfg FileStoreConfig, cfg StreamConfig) (*fileStore, error) {
	created := time.Now().UTC()
	return newFileStoreWithCreated(fcfg, cfg, created, nil, nil)
}

// newFileStoreWithCreated creates (or recovers) the file store rooted at
// fcfg.StoreDir for the stream described by cfg.
//
// created is recorded as the stream's created time. prf is the key generator
// used for encryption; a nil prf means the store is not encrypted and, in
// that case, the presence of a meta key file is rejected with errNoMainKey.
// oldprf is kept on the store for use when converting keys.
//
// The steps are, in order: validate config and apply defaults, make sure the
// store directory exists and is writable, create the msgs and consumer
// directories, recover state (full state first, falling back to recovering
// from the message blocks), apply recovered tombstones and enforce limits,
// purge up to cfg.FirstSeq if needed, write the stream meta if missing, and
// finally start the sync timer and the state flush loop.
func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created time.Time, prf, oldprf keyGen) (*fileStore, error) {
	if cfg.Name == _EMPTY_ {
		return nil, fmt.Errorf("name required")
	}
	if cfg.Storage != FileStorage {
		return nil, fmt.Errorf("fileStore requires file storage type in config")
	}
	// Default values.
	if fcfg.BlockSize == 0 {
		fcfg.BlockSize = dynBlkSize(cfg.Retention, cfg.MaxBytes, prf != nil)
	}
	if fcfg.BlockSize > maxBlockSize {
		return nil, fmt.Errorf("filestore max block size is %s", friendlyBytes(maxBlockSize))
	}
	if fcfg.CacheExpire == 0 {
		fcfg.CacheExpire = defaultCacheBufferExpiration
	}
	if fcfg.SyncInterval == 0 {
		fcfg.SyncInterval = defaultSyncInterval
	}

	// Check the directory
	if stat, err := os.Stat(fcfg.StoreDir); os.IsNotExist(err) {
		if err := os.MkdirAll(fcfg.StoreDir, defaultDirPerms); err != nil {
			return nil, fmt.Errorf("could not create storage directory - %v", err)
		}
	} else if stat == nil || !stat.IsDir() {
		return nil, fmt.Errorf("storage directory is not a directory")
	}
	// Probe that we can write into the directory by creating a temp file.
	tmpfile, err := os.CreateTemp(fcfg.StoreDir, "_test_")
	if err != nil {
		return nil, fmt.Errorf("storage directory is not writable")
	}

	tmpfile.Close()
	// A token is taken from dios before the remove and handed back after.
	// NOTE(review): presumably dios bounds concurrent disk I/O - confirm.
	<-dios
	os.Remove(tmpfile.Name())
	dios <- struct{}{}

	fs := &fileStore{
		fcfg:   fcfg,
		psim:   make(map[string]*psi),
		bim:    make(map[uint32]*msgBlock),
		cfg:    FileStreamInfo{Created: created, StreamConfig: cfg},
		prf:    prf,
		oldprf: oldprf,
		qch:    make(chan struct{}),
		fch:    make(chan struct{}, 1),
		fsld:   make(chan struct{}),
	}

	// Flush in place is the inverse of AsyncFlush, so it is on by default.
	fs.fip = !fcfg.AsyncFlush

	// Check if this is a new setup.
	mdir := filepath.Join(fcfg.StoreDir, msgDir)
	odir := filepath.Join(fcfg.StoreDir, consumerDir)
	if err := os.MkdirAll(mdir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create message storage directory - %v", err)
	}
	if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create consumer storage directory - %v", err)
	}

	// Create highway hash for the stream meta. Use sha256 of the stream name as key.
	key := sha256.Sum256([]byte(cfg.Name))
	fs.hh, err = highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}

	keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
	// Make sure we do not have an encrypted store underneath of us but no main key.
	if fs.prf == nil {
		if _, err := os.Stat(keyFile); err == nil {
			return nil, errNoMainKey
		}
	}

	// Attempt to recover our state.
	err = fs.recoverFullState()
	if err != nil {
		// Hold onto state
		prior := fs.state
		// Reset anything that could have been set from above.
		fs.state = StreamState{}
		fs.psim = make(map[string]*psi)
		fs.bim = make(map[uint32]*msgBlock)
		fs.blks = nil
		fs.tombs = nil

		// Recover our message state the old way
		if err := fs.recoverMsgs(); err != nil {
			return nil, err
		}

		// Check if the prior state remembered a last sequence past what we can
		// see now. If so, and we recorded lost data, keep the prior last
		// sequence and persist it with a tombstone in a new block.
		if fs.ld != nil && prior.LastSeq > fs.state.LastSeq {
			fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime
			if lmb, err := fs.newMsgBlockForWrite(); err == nil {
				lmb.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano())
			} else {
				return nil, err
			}
		}
		// Since we recovered here, make sure to kick ourselves to write out our stream state.
		fs.dirty++
		defer fs.kickFlushStateLoop()
	}

	// Also make sure we get rid of old idx and fss files on return.
	// Do this in separate go routine vs inline and at end of processing.
	// NOTE(review): os.RemoveAll does not expand glob patterns, so these
	// calls target a path literally named "*.idx" / "*.fss" and will not
	// remove the actual index files - confirm whether a glob was intended.
	defer func() {
		go func() {
			os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, indexScanAll))
			os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, fssScanAll))
		}()
	}()

	// Lock while do enforcements and removals.
	fs.mu.Lock()

	// Check if we have any left over tombstones to process.
	if len(fs.tombs) > 0 {
		for _, seq := range fs.tombs {
			fs.removeMsg(seq, false, false, false)
			fs.removeFromLostData(seq)
		}
		// Not needed after this phase.
		fs.tombs = nil
	}

	// Limits checks and enforcement.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Do age checks too, make sure to call in place.
	if fs.cfg.MaxAge != 0 {
		fs.expireMsgsOnRecover()
		fs.startAgeChk()
	}

	// If we have max msgs per subject make sure that is also enforced.
	if fs.cfg.MaxMsgsPer > 0 {
		fs.enforceMsgPerSubjectLimit()
	}

	// Grab first sequence for check below while we have lock.
	firstSeq := fs.state.FirstSeq
	fs.mu.Unlock()

	// If the stream has an initial sequence number then make sure we
	// have purged up until that point. We will do this only if the
	// recovered first sequence number is before our configured first
	// sequence. Need to do this locked as by now the age check timer
	// has started.
	if cfg.FirstSeq > 0 && firstSeq <= cfg.FirstSeq {
		if _, err := fs.purge(cfg.FirstSeq); err != nil {
			return nil, err
		}
	}

	// Write our meta data if it does not exist or is zero'd out.
	meta := filepath.Join(fcfg.StoreDir, JetStreamMetaFile)
	fi, err := os.Stat(meta)
	if err != nil && os.IsNotExist(err) || fi != nil && fi.Size() == 0 {
		if err := fs.writeStreamMeta(); err != nil {
			return nil, err
		}
	}

	// If we expect to be encrypted check that what we are restoring is not plaintext.
	// This can happen on snapshot restores or conversions.
	// Writing the stream meta creates the meta key file when it is missing.
	if fs.prf != nil {
		if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
			if err := fs.writeStreamMeta(); err != nil {
				return nil, err
			}
		}
	}

	fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks)

	// Spin up the go routine that will write out our full state stream index.
	go fs.flushStreamStateLoop(fs.fch, fs.qch, fs.fsld)

	return fs, nil
}

// registerServer records the server this store belongs to.
// Takes the store lock for the assignment.
func (fs *fileStore) registerServer(s *Server) {
	fs.mu.Lock()
	fs.srv = s
	fs.mu.Unlock()
}

// lockAllMsgBlocks takes the write lock of every message block, in block order.
// Lock held on entry.
func (fs *fileStore) lockAllMsgBlocks() {
	for i := range fs.blks {
		fs.blks[i].mu.Lock()
	}
}

// unlockAllMsgBlocks releases the write lock of every message block, in block order.
// Lock held on entry.
func (fs *fileStore) unlockAllMsgBlocks() {
	for i := range fs.blks {
		fs.blks[i].mu.Unlock()
	}
}

// UpdateConfig swaps in a new stream config, persists it to the stream meta
// file and then applies the (possibly tighter) limits to the stored data.
//
// It returns ErrStoreClosed if the store is closed and an error if the
// config has no name or is not for file storage. If persisting the meta
// fails the previous config is restored and the error returned.
func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error {
	if fs.isClosed() {
		return ErrStoreClosed
	}
	if cfg.Name == _EMPTY_ {
		return fmt.Errorf("name required")
	}
	if cfg.Storage != FileStorage {
		return fmt.Errorf("fileStore requires file storage type in config")
	}

	fs.mu.Lock()
	newCfg := FileStreamInfo{Created: fs.cfg.Created, StreamConfig: *cfg}
	oldCfg := fs.cfg
	// Messages block reference fs.cfg.Subjects (in subjString) under the
	// mb's lock, not fs' lock. So do the switch here under all existing
	// message blocks' lock in order to silence the DATA RACE detector.
	fs.lockAllMsgBlocks()
	fs.cfg = newCfg
	fs.unlockAllMsgBlocks()
	if err := fs.writeStreamMeta(); err != nil {
		// Could not persist, roll back to the previous config.
		fs.lockAllMsgBlocks()
		fs.cfg = oldCfg
		fs.unlockAllMsgBlocks()
		fs.mu.Unlock()
		return err
	}

	// Limits checks and enforcement.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Do age timers.
	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
		fs.startAgeChk()
	}
	if fs.ageChk != nil && fs.cfg.MaxAge == 0 {
		fs.ageChk.Stop()
		fs.ageChk = nil
	}

	// Enforce the per subject limit when it got tighter. That is the case
	// when it was lowered, but also when it was newly introduced: a
	// non-positive old value means there was no per subject limit before,
	// and a plain "new < old" comparison would miss that transition.
	if fs.cfg.MaxMsgsPer > 0 && (oldCfg.MaxMsgsPer <= 0 || fs.cfg.MaxMsgsPer < oldCfg.MaxMsgsPer) {
		fs.enforceMsgPerSubjectLimit()
	}
	fs.mu.Unlock()

	// expireMsgs is called without the store lock held.
	if cfg.MaxAge != 0 {
		fs.expireMsgs()
	}
	return nil
}

// dynBlkSize picks a block size for a store that did not configure one.
//
// Without a byte limit (maxBytes <= 0) the size depends on encryption and the
// retention policy. With a byte limit, a quarter of maxBytes (rounded up to
// the nearest 100) is used to pick between the minimum, medium and maximum
// block sizes. Encrypted stores are always capped at
// maximumEncryptedBlockSize.
func dynBlkSize(retention RetentionPolicy, maxBytes int64, encrypted bool) uint64 {
	if maxBytes <= 0 {
		if encrypted {
			// In the case of encrypted stores, large blocks can result in worsened perf
			// since many writes on disk involve re-encrypting the entire block. For now,
			// we will enforce a cap on the block size when encryption is enabled to avoid
			// this.
			return maximumEncryptedBlockSize
		}
		// TODO(dlc) - Make the blocksize relative to this if set.
		if retention == LimitsPolicy {
			return defaultLargeBlockSize
		}
		return defaultMediumBlockSize
	}

	// 25% of the limit, plus one.
	size := maxBytes/4 + 1
	// Round up to nearest 100
	if rem := size % 100; rem != 0 {
		size += 100 - rem
	}

	switch {
	case size <= FileStoreMinBlkSize:
		size = FileStoreMinBlkSize
	case size >= FileStoreMaxBlkSize:
		size = FileStoreMaxBlkSize
	default:
		size = defaultMediumBlockSize
	}

	// Same cap for encrypted stores as in the unlimited case above.
	if encrypted && size > maximumEncryptedBlockSize {
		size = maximumEncryptedBlockSize
	}
	return uint64(size)
}

// genEncryptionKey creates the AEAD for the given cipher from seed.
//
// ChaCha uses XChaCha20-Poly1305. AES uses GCM with a nonce size equal to
// the AES block size. Any other cipher yields errUnknownCipher. Errors from
// the underlying constructors (e.g. a seed of the wrong length) are returned
// to the caller with a nil AEAD.
func genEncryptionKey(sc StoreCipher, seed []byte) (ek cipher.AEAD, err error) {
	switch sc {
	case ChaCha:
		ek, err = chacha20poly1305.NewX(seed)
	case AES:
		block, e := aes.NewCipher(seed)
		if e != nil {
			// Report the cipher construction error itself. Returning the
			// (still nil) named result here would hand back a nil AEAD
			// together with a nil error.
			return nil, e
		}
		ek, err = cipher.NewGCMWithNonceSize(block, block.BlockSize())
	default:
		err = errUnknownCipher
	}
	return ek, err
}

// Generate an asset encryption key from the context and server PRF.
//
// The key encryption key (kek) is derived from fs.prf(context). A fresh
// random 32 byte seed is then used to create the asset encryption key (aek)
// and, together with a fresh random nonce, the block encryption key (bek).
//
// Returns the aek, the bek, the plaintext seed and the seed sealed with the
// kek, prefixed by the nonce (the form that is written to key files).
// Returns errNoEncryption if the store has no PRF.
func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cipher.Stream, seed, encrypted []byte, err error) {
	if fs.prf == nil {
		return nil, nil, nil, nil, errNoEncryption
	}
	// Generate key encryption key.
	rb, err := fs.prf([]byte(context))
	if err != nil {
		return nil, nil, nil, nil, err
	}

	sc := fs.fcfg.Cipher

	kek, err := genEncryptionKey(sc, rb)
	if err != nil {
		return nil, nil, nil, nil, err
	}

	// Generate random asset encryption key seed.
	// rand.Read fills the whole slice if and only if it returns a nil
	// error, so checking the error is sufficient.
	const seedSize = 32
	seed = make([]byte, seedSize)
	if _, err = rand.Read(seed); err != nil {
		return nil, nil, nil, nil, err
	}

	aek, err = genEncryptionKey(sc, seed)
	if err != nil {
		return nil, nil, nil, nil, err
	}

	// Generate our nonce. Use same buffer to hold encrypted seed.
	// Do not continue with a nonce we failed to randomize.
	nonce := make([]byte, kek.NonceSize(), kek.NonceSize()+len(seed)+kek.Overhead())
	if _, err = rand.Read(nonce); err != nil {
		return nil, nil, nil, nil, err
	}

	bek, err = genBlockEncryptionKey(sc, seed, nonce)
	if err != nil {
		return nil, nil, nil, nil, err
	}

	return aek, bek, seed, kek.Seal(nonce, nonce, seed, nil), nil
}

// genBlockEncryptionKey creates the stream cipher used for block data from
// the seed and nonce. ChaCha uses an unauthenticated ChaCha20 stream, AES
// uses CTR mode with the nonce as IV. Any other cipher yields
// errUnknownCipher.
func genBlockEncryptionKey(sc StoreCipher, seed, nonce []byte) (cipher.Stream, error) {
	switch sc {
	case ChaCha:
		return chacha20.NewUnauthenticatedCipher(seed, nonce)
	case AES:
		block, err := aes.NewCipher(seed)
		if err != nil {
			return nil, err
		}
		return cipher.NewCTR(block, nonce), nil
	default:
		return nil, errUnknownCipher
	}
}

// recoverAEK loads the asset encryption key from the stream's meta key file.
// It is a no-op when the store is not encrypted or the key is already set.
//
// The key file holds a nonce followed by the sealed seed. A file that is too
// short to hold both yields errBadKeySize instead of being sliced.
// Lock should be held.
func (fs *fileStore) recoverAEK() error {
	if fs.prf == nil || fs.aek != nil {
		return nil
	}
	ekey, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey))
	if err != nil {
		return err
	}
	// Guard against a truncated or empty key file, slicing it by the nonce
	// size below would panic otherwise.
	if len(ekey) < minMetaKeySize {
		return errBadKeySize
	}
	// Recover key encryption key.
	rb, err := fs.prf([]byte(fs.cfg.Name))
	if err != nil {
		return err
	}
	kek, err := genEncryptionKey(fs.fcfg.Cipher, rb)
	if err != nil {
		return err
	}
	ns := kek.NonceSize()
	seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
	if err != nil {
		return err
	}
	aek, err := genEncryptionKey(fs.fcfg.Cipher, seed)
	if err != nil {
		return err
	}
	fs.aek = aek
	return nil
}

// setupAEK generates a new asset encryption key for the stream and writes
// the sealed key to the meta key file. It is a no-op when the store is not
// encrypted or the key is already set.
// Lock should be held.
func (fs *fileStore) setupAEK() error {
	if fs.prf == nil || fs.aek != nil {
		return nil
	}
	aek, _, _, sealed, err := fs.genEncryptionKeys(fs.cfg.Name)
	if err != nil {
		return err
	}
	keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
	// A missing key file is fine, any other stat failure is not.
	if _, serr := os.Stat(keyFile); serr != nil && !os.IsNotExist(serr) {
		return serr
	}
	if werr := os.WriteFile(keyFile, sealed, defaultFilePerms); werr != nil {
		return werr
	}
	// Only remember the key once it has been persisted.
	fs.aek = aek
	return nil
}

// writeStreamMeta writes the JSON encoded stream config to the meta file,
// sealed with the asset encryption key when the store is encrypted, and then
// writes the hex encoded hash of those bytes to the checksum file.
// Lock should be held.
func (fs *fileStore) writeStreamMeta() error {
	if err := fs.setupAEK(); err != nil {
		return err
	}

	metaFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile)
	// A missing meta file is fine, any other stat failure is not.
	_, statErr := os.Stat(metaFile)
	if statErr != nil && !os.IsNotExist(statErr) {
		return statErr
	}

	payload, err := json.Marshal(fs.cfg)
	if err != nil {
		return err
	}

	// Encrypt if needed. The nonce is stored in front of the sealed data.
	if aek := fs.aek; aek != nil {
		ns := aek.NonceSize()
		nonce := make([]byte, ns, ns+len(payload)+aek.Overhead())
		rand.Read(nonce)
		payload = aek.Seal(nonce, nonce, payload, nil)
	}

	if err := os.WriteFile(metaFile, payload, defaultFilePerms); err != nil {
		return err
	}

	// The checksum covers exactly what was written to the meta file.
	fs.hh.Reset()
	fs.hh.Write(payload)
	sumFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileSum)
	checksum := hex.EncodeToString(fs.hh.Sum(nil))
	return os.WriteFile(sumFile, []byte(checksum), defaultFilePerms)
}

// Pools to recycle the blocks to help with memory pressure.
// Buffers are sorted into a pool by capacity when they are recycled.
var blkPoolBig sync.Pool    // cap >= defaultLargeBlockSize (8MB)
var blkPoolMedium sync.Pool // cap >= defaultMediumBlockSize (4MB)
var blkPoolSmall sync.Pool  // cap >= defaultSmallBlockSize (1MB)

// getMsgBlockBuf returns an empty (zero length) buffer for a block of about
// sz bytes. A recycled buffer from the matching pool is preferred. If none
// is available a new one is allocated, with the capacity rounded up to the
// medium or large block size for anything above the small block size.
func getMsgBlockBuf(sz int) (buf []byte) {
	var pooled interface{}
	switch {
	case sz <= defaultSmallBlockSize:
		pooled = blkPoolSmall.Get()
	case sz <= defaultMediumBlockSize:
		pooled = blkPoolMedium.Get()
	default:
		pooled = blkPoolBig.Get()
	}
	if pooled != nil {
		recycled := *(pooled.(*[]byte))
		return recycled[:0]
	}

	// Here we need to make a new blk.
	// If small leave as is..
	switch {
	case sz > defaultMediumBlockSize:
		sz = defaultLargeBlockSize
	case sz > defaultSmallBlockSize:
		sz = defaultMediumBlockSize
	}
	return make([]byte, 0, sz)
}

// recycleMsgBlockBuf hands a block buffer back to the pool matching its
// capacity. Nil buffers and buffers smaller than the small block size are
// dropped.
func recycleMsgBlockBuf(buf []byte) {
	if buf == nil {
		return
	}
	sz := cap(buf)
	if sz < defaultSmallBlockSize {
		return
	}
	// Make sure to reset before placing back into pool.
	buf = buf[:0]

	// We need to make sure the load code gets a block that can fit the maximum for a size block.
	// Otherwise we thrash and actually make things worse by pulling it out, and putting
	// it right back in and making a new []byte.
	// From above we know its already >= defaultSmallBlockSize
	switch {
	case sz < defaultMediumBlockSize:
		blkPoolSmall.Put(&buf)
	case sz < defaultLargeBlockSize:
		blkPoolMedium.Put(&buf)
	default:
		blkPoolBig.Put(&buf)
	}
}

// Sizes, in bytes, that make up a message record.
// emptyRecordLen is the sum of the other two.
// NOTE(review): presumably the length of a record with no subject, headers
// or payload - confirm against the record writer.
const (
	msgHdrSize     = 22
	checksumSize   = 8
	emptyRecordLen = msgHdrSize + checksumSize
)

// noTrackSubjects reports whether there is nothing that requires tracking
// subjects: no entries in psim, no configured subjects, no mirror and no
// sources.
// Lock should be held.
func (fs *fileStore) noTrackSubjects() bool {
	return len(fs.psim) == 0 &&
		len(fs.cfg.Subjects) == 0 &&
		fs.cfg.Mirror == nil &&
		len(fs.cfg.Sources) == 0
}

// initMsgBlock creates the in-memory msgBlock for the given index. It sets
// the block file name inside the msgs directory, copies the relevant store
// settings and creates the block's hash. Nothing is read from or written to
// disk here.
func (fs *fileStore) initMsgBlock(index uint32) *msgBlock {
	mb := &msgBlock{
		fs:         fs,
		index:      index,
		cexp:       fs.fcfg.CacheExpire,
		noTrack:    fs.noTrackSubjects(),
		syncAlways: fs.fcfg.SyncAlways,
	}
	mb.mfn = filepath.Join(fs.fcfg.StoreDir, msgDir, fmt.Sprintf(blkScan, index))

	if mb.hh == nil {
		// The hash is keyed by the sha256 of the store's hash key for this block.
		hkey := sha256.Sum256(fs.hashKeyForBlock(index))
		mb.hh, _ = highwayhash.New64(hkey[:])
	}
	return mb
}

// loadEncryptionForMsgBlock sets up the encryption keys for mb when the
// store is encrypted (it is a no-op otherwise).
//
// If the block's key file can be read, the seed is recovered from it and the
// block's aek and bek are created. If opening the sealed seed fails, a
// cipher conversion is attempted. If the key file cannot be read at all, new
// keys are generated and the block data is converted to encrypted.
// Lock for fs should be held.
func (fs *fileStore) loadEncryptionForMsgBlock(mb *msgBlock) error {
	if fs.prf == nil {
		return nil
	}

	var createdKeys bool
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)))
	if err != nil {
		// We do not seem to have keys even though we should. Could be a plaintext conversion.
		// Create the keys and we will double check below.
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			return err
		}
		createdKeys = true
	} else {
		if len(ekey) < minBlkKeySize {
			return errBadKeySize
		}
		// Recover key encryption key. The PRF context is "<stream name>:<block index>".
		rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
		if err != nil {
			return err
		}

		sc := fs.fcfg.Cipher
		kek, err := genEncryptionKey(sc, rb)
		if err != nil {
			return err
		}
		// The key file is the nonce followed by the sealed seed.
		ns := kek.NonceSize()
		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
		if err != nil {
			// We may be here on a cipher conversion, so attempt to convert.
			// NOTE(review): the code below relies on convertCipher having
			// set mb.seed and mb.nonce - confirm genEncryptionKeysForBlock does so.
			if err = mb.convertCipher(); err != nil {
				return err
			}
		} else {
			mb.seed, mb.nonce = seed, ekey[:ns]
		}
		mb.aek, err = genEncryptionKey(sc, mb.seed)
		if err != nil {
			return err
		}
		if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil {
			return err
		}
	}

	// If we created keys here, let's check the data and if it is plaintext convert here.
	if createdKeys {
		if err := mb.convertToEncrypted(); err != nil {
			return err
		}
	}

	return nil
}

// ensureLastChecksumLoaded fills in mb.lchk from the block file when it is
// still all zeros. An already set checksum is left alone.
// Lock should be held.
func (mb *msgBlock) ensureLastChecksumLoaded() {
	var zero [8]byte
	if mb.lchk == zero {
		copy(mb.lchk[:], mb.lastChecksum())
	}
}

// recoverMsgBlock recovers the message block with the given index from disk
// and adds it to the store.
//
// The block's index info is used as the source of truth when it can be read
// and its last checksum matches the tail of the block file (or both are
// empty). Otherwise the block state is rebuilt from the block file, and any
// lost data and tombstones found are recorded on the store.
// Lock held on entry
func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) {
	mb := fs.initMsgBlock(index)

	// Open up the message file, but we will try to recover from the index file.
	// We will check that the last checksums match.
	file, err := os.Open(mb.mfn)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	if fi, err := file.Stat(); fi != nil {
		mb.rbytes = uint64(fi.Size())
	} else {
		return nil, err
	}

	// Make sure encryption loaded if needed.
	// NOTE(review): the returned error is ignored here, recovery continues
	// with whatever keys were set on mb - confirm this is intentional.
	fs.loadEncryptionForMsgBlock(mb)

	// Grab last checksum from main block file.
	var lchk [8]byte
	if mb.rbytes >= checksumSize {
		if mb.bek != nil {
			// Encrypted, so load and decrypt the block to get at the checksum.
			if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize {
				mb.bek.XORKeyStream(buf, buf)
				copy(lchk[0:], buf[len(buf)-checksumSize:])
			}
		} else {
			file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
		}
	}

	// Done with the file. The deferred Close above will run as well.
	file.Close()

	// Read our index file. Use this as source of truth if possible.
	if err := mb.readIndexInfo(); err == nil {
		// Quick sanity check here.
		// Note this only checks that the message blk file is not newer then this file, or is empty and we expect empty.
		if (mb.rbytes == 0 && mb.msgs == 0) || bytes.Equal(lchk[:], mb.lchk[:]) {
			if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
				fs.populateGlobalPerSubjectInfo(mb)
				// Try to dump any state we needed on recovery.
				mb.tryForceExpireCacheLocked()
			}
			fs.addMsgBlock(mb)
			return mb, nil
		}
	}

	// If we get data loss rebuilding the message block state record that with the fs itself.
	// The rebuild error itself is not propagated.
	ld, tombs, _ := mb.rebuildState()
	if ld != nil {
		fs.addLostData(ld)
	}
	// Collect all tombstones.
	if len(tombs) > 0 {
		fs.tombs = append(fs.tombs, tombs...)
	}

	if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
		fs.populateGlobalPerSubjectInfo(mb)
		// Try to dump any state we needed on recovery.
		mb.tryForceExpireCacheLocked()
	}

	mb.closeFDs()
	fs.addMsgBlock(mb)

	return mb, nil
}

// lostData returns a shallow copy of the store's lost data record, or nil if
// there is none. The Msgs slice of the copy shares its backing array with
// the store's record.
func (fs *fileStore) lostData() *LostStreamData {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if ld := fs.ld; ld != nil {
		snapshot := *ld
		return &snapshot
	}
	return nil
}

// addLostData merges ld into the store's lost data record. If the store has
// no record yet, ld is adopted as is. Otherwise sequences that are not yet
// known are appended, and if any were added the sequences are re-sorted and
// ld.Bytes is added to the total.
// Lock should be held.
func (fs *fileStore) addLostData(ld *LostStreamData) {
	if ld == nil {
		return
	}
	if fs.ld == nil {
		fs.ld = ld
		return
	}

	added := false
	for _, seq := range ld.Msgs {
		if _, known := fs.ld.exists(seq); known {
			continue
		}
		fs.ld.Msgs = append(fs.ld.Msgs, seq)
		added = true
	}
	if !added {
		return
	}

	msgs := fs.ld.Msgs
	sort.Slice(msgs, func(i, j int) bool { return msgs[i] < msgs[j] })
	fs.ld.Bytes += ld.Bytes
}

// Helper to see if we already have this sequence reported in our lost data.
//
// Does a binary search over ld.Msgs, which is expected to be sorted in
// ascending order. Returns the index of seq and true when it is present.
// When it is not present, returns the index at which seq would have to be
// inserted to keep the order, and false.
func (ld *LostStreamData) exists(seq uint64) (int, bool) {
	// sort.Find expects the comparison of the target against the entry:
	// positive while the target is larger than the entry, zero on a match
	// and negative once the target is smaller than the entry.
	return sort.Find(len(ld.Msgs), func(i int) int {
		tseq := ld.Msgs[i]
		if seq > tseq {
			return +1
		}
		if seq < tseq {
			return -1
		}
		return 0
	})
}

// removeFromLostData drops seq from the store's lost data record, if it is
// listed there. The record itself is cleared once no sequences remain.
func (fs *fileStore) removeFromLostData(seq uint64) {
	if fs.ld == nil {
		return
	}
	i, found := fs.ld.exists(seq)
	if !found {
		return
	}
	fs.ld.Msgs = append(fs.ld.Msgs[:i], fs.ld.Msgs[i+1:]...)
	if len(fs.ld.Msgs) == 0 {
		fs.ld = nil
	}
}

// rebuildState records ld as lost data (if any) and recomputes the store
// state from the message blocks. Takes the store lock for the duration.
func (fs *fileStore) rebuildState(ld *LostStreamData) {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	fs.rebuildStateLocked(ld)
}

// rebuildStateLocked records ld as lost data (if any) and recomputes the
// message and byte counts and the first/last sequence and time of the store
// from its message blocks. The last sequence and time are taken from the
// final block in fs.blks.
// Lock should be held.
func (fs *fileStore) rebuildStateLocked(ld *LostStreamData) {
	fs.addLostData(ld)

	st := &fs.state
	st.Msgs, st.Bytes = 0, 0
	st.FirstSeq, st.LastSeq = 0, 0

	for _, mb := range fs.blks {
		mb.mu.RLock()
		st.Msgs += mb.msgs
		st.Bytes += mb.bytes
		// Track the lowest first sequence we see across all blocks.
		if fseq := mb.first.seq; st.FirstSeq == 0 || fseq < st.FirstSeq {
			st.FirstSeq = fseq
			st.FirstTime = time.Unix(0, mb.first.ts).UTC()
		}
		st.LastSeq = mb.last.seq
		st.LastTime = time.Unix(0, mb.last.ts).UTC()
		mb.mu.RUnlock()
	}
}

// Attempt to convert the cipher used for this message block.
//
// The existing key file is tried against every combination of the current
// and old key generator with the configured and the other cipher. For the
// first combination that can open the sealed seed, the block is decrypted
// with the old keys, checked by indexing it, and then re-encrypted with
// freshly generated keys and written back to the block file.
// Returns an error if no combination can recover the keys.
func (mb *msgBlock) convertCipher() error {
	fs := mb.fs
	sc := fs.fcfg.Cipher

	// The "other" cipher, i.e. the one we may be converting from.
	// For any cipher besides ChaCha and AES this stays at the zero value, which is ChaCha.
	var osc StoreCipher
	switch sc {
	case ChaCha:
		osc = AES
	case AES:
		osc = ChaCha
	}

	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)))
	if err != nil {
		return err
	}
	if len(ekey) < minBlkKeySize {
		return errBadKeySize
	}
	type prfWithCipher struct {
		keyGen
		StoreCipher
	}
	// Candidates to try, in order.
	var prfs []prfWithCipher
	if fs.prf != nil {
		prfs = append(prfs, prfWithCipher{fs.prf, sc})
		prfs = append(prfs, prfWithCipher{fs.prf, osc})
	}
	if fs.oldprf != nil {
		prfs = append(prfs, prfWithCipher{fs.oldprf, sc})
		prfs = append(prfs, prfWithCipher{fs.oldprf, osc})
	}

	for _, prf := range prfs {
		// Recover key encryption key.
		rb, err := prf.keyGen([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
		if err != nil {
			continue
		}
		kek, err := genEncryptionKey(prf.StoreCipher, rb)
		if err != nil {
			continue
		}
		ns := kek.NonceSize()
		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
		if err != nil {
			// Not the right combination, try the next one.
			continue
		}
		nonce := ekey[:ns]
		bek, err := genBlockEncryptionKey(prf.StoreCipher, seed, nonce)
		if err != nil {
			return err
		}

		// Decrypt the block in place with the old block key.
		// The load error is not checked here.
		buf, _ := mb.loadBlock(nil)
		bek.XORKeyStream(buf, buf)
		// Make sure we can parse with old cipher and key file.
		if err = mb.indexCacheBuf(buf); err != nil {
			return err
		}
		// Reset the cache since we just read everything in.
		mb.cache = nil

		// Generate new keys. If we error for some reason then we will put
		// the old keyfile back.
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))
			os.WriteFile(keyFile, ekey, defaultFilePerms)
			return err
		}
		// Re-encrypt with the new block key and write the block back out.
		mb.bek.XORKeyStream(buf, buf)
		if err := os.WriteFile(mb.mfn, buf, defaultFilePerms); err != nil {
			return err
		}
		return nil
	}
	return fmt.Errorf("unable to recover keys")
}

// convertToEncrypted encrypts a plaintext block file in place using the
// block's encryption key. It is a no-op when the block has no such key.
// The data must index cleanly as plaintext first, otherwise the error is
// returned and the file is left untouched.
func (mb *msgBlock) convertToEncrypted() error {
	if mb.bek == nil {
		return nil
	}
	buf, err := mb.loadBlock(nil)
	if err != nil {
		return err
	}
	// A failure here likely indicates this was already encrypted or corrupt.
	err = mb.indexCacheBuf(buf)
	// Either way, undo the cache that indexing set up.
	mb.cache = nil
	if err != nil {
		return err
	}
	mb.bek.XORKeyStream(buf, buf)
	return os.WriteFile(mb.mfn, buf, defaultFilePerms)
}

// rebuildState rebuilds the state of the blk from what is on disk in the
// N.blk file while holding the block lock. It returns any lost data and any
// delete tombstones that were encountered.
func (mb *msgBlock) rebuildState() (*LostStreamData, []uint64, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	ld, tombs, err := mb.rebuildStateLocked()
	return ld, tombs, err
}

// rebuildStateLocked rebuilds the state of the blk based on what we have on disk in the N.blk file.
// It walks every record in the block, validating lengths and checksums, and recomputes the
// message/byte counts, first/last sequence and timestamp, the delete map and per subject info.
//
// On a malformed or checksum-failing record the file is truncated at that record and the
// sequences that could not be recovered are reported as lost data.
//
// Returns any lost data, the sequences of any delete tombstones encountered, and an error
// (errBadMsg for a bad record, or the error from loading/decrypting/decompressing the block).
// Lock should be held.
func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) {
	// Remember what we thought last was so we can report what got lost.
	startLastSeq := mb.last.seq

	// Remove the .fss file and clear any cache we have set.
	mb.clearCacheAndOffset()

	buf, err := mb.loadBlock(nil)
	if err != nil || len(buf) == 0 {
		var ld *LostStreamData
		// No data to rebuild from here.
		if mb.msgs > 0 {
			// We need to declare lost data here.
			ld = &LostStreamData{Msgs: make([]uint64, 0, mb.msgs), Bytes: mb.bytes}
			for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
				if !mb.dmap.Exists(seq) {
					ld.Msgs = append(ld.Msgs, seq)
				}
			}
			// Clear invalid state. We will let this blk be added in here.
			mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
			mb.dmap.Empty()
			mb.first.seq = mb.last.seq + 1
		}
		return ld, nil, err
	}

	// Clear state we need to rebuild.
	mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
	mb.last.seq, mb.last.ts = 0, 0
	firstNeedsSet := true

	// Check if we need to decrypt.
	if mb.bek != nil && len(buf) > 0 {
		// Recreate to reset counter.
		mb.bek, err = genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return nil, nil, err
		}
		mb.bek.XORKeyStream(buf, buf)
	}

	// Check for compression.
	if buf, err = mb.decompressIfNeeded(buf); err != nil {
		return nil, nil, err
	}

	mb.rbytes = uint64(len(buf))

	addToDmap := func(seq uint64) {
		if seq == 0 {
			return
		}
		mb.dmap.Insert(seq)
	}

	var le = binary.LittleEndian

	// truncate cuts the block file at index and refreshes the last checksum
	// from the 8 bytes that now end the file. Best effort, errors are ignored.
	truncate := func(index uint32) {
		fd := mb.mfd
		if fd == nil {
			// Use a local error here so we do not clobber the enclosing err.
			f, oerr := os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
			if oerr != nil {
				return
			}
			defer f.Close()
			fd = f
		}
		if err := fd.Truncate(int64(index)); err == nil {
			// Update our checksum.
			if index >= 8 {
				var lchk [8]byte
				fd.ReadAt(lchk[:], int64(index-8))
				copy(mb.lchk[0:], lchk[:])
			}
			fd.Sync()
		}
	}

	// gatherLost reports every sequence after what we managed to recover, up to
	// the last sequence we had on entry, along with the number of bytes dropped.
	gatherLost := func(lb uint32) *LostStreamData {
		var ld LostStreamData
		for seq := mb.last.seq + 1; seq <= startLastSeq; seq++ {
			ld.Msgs = append(ld.Msgs, seq)
		}
		ld.Bytes = uint64(lb)
		return &ld
	}

	// For tombstones that we find and collect.
	var (
		tombstones      []uint64
		minTombstoneSeq uint64
		minTombstoneTs  int64
	)

	for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
		// Not enough left for a record header, treat the remainder as lost.
		if index+msgHdrSize > lbuf {
			truncate(index)
			return gatherLost(lbuf - index), tombstones, nil
		}

		hdr := buf[index : index+msgHdrSize]
		rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])

		hasHeaders := rl&hbit != 0
		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize
		// When headers are present the 4 byte header length follows the subject, so the
		// record needs room for it as well. Without this check the header slicing for the
		// checksum below can panic on a corrupt record.
		shlen := int(slen)
		if hasHeaders {
			shlen += 4
		}
		// Do some quick sanity checks here.
		if dlen < 0 || shlen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
			truncate(index)
			return gatherLost(lbuf - index), tombstones, errBadMsg
		}

		// Check for checksum failures before additional processing.
		data := buf[index+msgHdrSize : index+rl]
		if hh := mb.hh; hh != nil {
			hh.Reset()
			hh.Write(hdr[4:20])
			hh.Write(data[:slen])
			if hasHeaders {
				hh.Write(data[slen+4 : dlen-recordHashSize])
			} else {
				hh.Write(data[slen : dlen-recordHashSize])
			}
			checksum := hh.Sum(nil)
			if !bytes.Equal(checksum, data[len(data)-recordHashSize:]) {
				truncate(index)
				return gatherLost(lbuf - index), tombstones, errBadMsg
			}
			copy(mb.lchk[0:], checksum)
		}

		// Grab our sequence and timestamp.
		seq := le.Uint64(hdr[4:])
		ts := int64(le.Uint64(hdr[12:]))

		// Check if this is a delete tombstone.
		if seq&tbit != 0 {
			seq = seq &^ tbit
			// Need to process this here and make sure we have accounted for this properly.
			tombstones = append(tombstones, seq)
			if minTombstoneSeq == 0 || seq < minTombstoneSeq {
				minTombstoneSeq, minTombstoneTs = seq, ts
			}
			index += rl
			continue
		}

		// This is an old erased message, or a new one that we can track.
		if seq == 0 || seq&ebit != 0 || seq < mb.first.seq {
			seq = seq &^ ebit
			if seq >= mb.first.seq {
				// Only add to dmap if past recorded first seq and non-zero.
				if seq != 0 {
					addToDmap(seq)
				}
				mb.last.seq = seq
				mb.last.ts = ts
				if mb.msgs == 0 {
					mb.first.seq, mb.first.ts = seq+1, 0
				}
			}
			index += rl
			continue
		}

		// This is for when we have index info that adjusts for deleted messages
		// at the head. So the first.seq will be already set here. If this is larger
		// replace what we have with this seq.
		if firstNeedsSet && seq >= mb.first.seq {
			firstNeedsSet, mb.first.seq, mb.first.ts = false, seq, ts
		}

		if !mb.dmap.Exists(seq) {
			mb.msgs++
			mb.bytes += uint64(rl)

			// Rebuild per subject info if needed.
			if slen > 0 {
				if mb.fss == nil {
					mb.fss = make(map[string]*SimpleState)
				}
				// For the lookup, we cast the byte slice and there won't be any copy
				if ss := mb.fss[string(data[:slen])]; ss != nil {
					ss.Msgs++
					ss.Last = seq
				} else {
					// This will either use a subject from the config, or make a copy
					// so we don't reference the underlying buffer.
					subj := mb.subjString(data[:slen])
					mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
				}
			}
		}

		// Always set last
		mb.last.seq = seq
		mb.last.ts = ts

		// Advance to next record.
		index += rl
	}

	// For empty msg blocks make sure we recover last seq correctly based off of first.
	// Or if we seem to have no messages but had a tombstone, which we use to remember
	// sequences and timestamps now, use that to properly setup the first and last.
	if mb.msgs == 0 {
		if mb.first.seq > 0 {
			mb.last.seq = mb.first.seq - 1
		} else if mb.first.seq == 0 && minTombstoneSeq > 0 {
			mb.first.seq, mb.first.ts = minTombstoneSeq+1, 0
			if mb.last.seq == 0 {
				mb.last.seq, mb.last.ts = minTombstoneSeq, minTombstoneTs
			}
		}
	}

	return nil, tombstones, nil
}

// recoverFullState will attempt to recover our last full state and re-process any state changes
// that happened afterwards.
//
// The stream state file is read, its trailing highwayhash is verified, and it is decrypted if
// encryption keys can be recovered. The top level state, per subject info and block summaries are
// then decoded. Finally the block that was last when the state was written is checked against its
// recorded checksum, and that block (if it did not match) plus any later blocks are recovered
// from disk.
//
// Returns errCorruptState (after removing the state file) when the file does not decode cleanly,
// errPriorState when the recorded last block is missing on disk, or any error from reading,
// decrypting or recovering blocks.
func (fs *fileStore) recoverFullState() (rerr error) {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Check for any left over purged messages.
	<-dios
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}

	// Grab our stream state file and load it in.
	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	buf, err := os.ReadFile(fn)
	dios <- struct{}{}

	if err != nil {
		return err
	}

	const minLen = 32
	if len(buf) < minLen {
		os.Remove(fn)
		return errCorruptState
	}

	// The highwayhash will be on the end. Check that it still matches.
	h := buf[len(buf)-highwayhash.Size64:]
	buf = buf[:len(buf)-highwayhash.Size64]
	fs.hh.Reset()
	fs.hh.Write(buf)
	if !bytes.Equal(h, fs.hh.Sum(nil)) {
		os.Remove(fn)
		return errCorruptState
	}

	// Decrypt if needed.
	if fs.prf != nil {
		// We can be setup for encryption but if this is a snapshot restore we will be missing the keyfile
		// since snapshots strip encryption.
		if err := fs.recoverAEK(); err == nil {
			ns := fs.aek.NonceSize()
			buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:], nil)
			if err != nil {
				return err
			}
		}
	}

	// Make sure the header is actually present before we look at it, decryption can shrink buf.
	if len(buf) < hdrLen || buf[0] != fullStateMagic || buf[1] != fullStateVersion {
		os.Remove(fn)
		return errCorruptState
	}

	// bi is our read offset into buf. It is set to -1 on any decode error, after
	// which all reads return a zero (or -1) value and do not advance.
	bi := hdrLen

	readU64 := func() uint64 {
		if bi < 0 {
			return 0
		}
		v, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		return v
	}
	readI64 := func() int64 {
		if bi < 0 {
			return 0
		}
		v, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return v
	}

	// A zero timestamp maps to the zero time, not the unix epoch.
	setTime := func(t *time.Time, ts int64) {
		if ts == 0 {
			*t = time.Time{}
		} else {
			*t = time.Unix(0, ts).UTC()
		}
	}

	var state StreamState
	state.Msgs = readU64()
	state.Bytes = readU64()
	state.FirstSeq = readU64()
	baseTime := readI64()
	setTime(&state.FirstTime, baseTime)
	state.LastSeq = readU64()
	setTime(&state.LastTime, readI64())

	// Check for per subject info.
	if numSubjects := int(readU64()); numSubjects > 0 {
		fs.psim = make(map[string]*psi, numSubjects)
		for i := 0; i < numSubjects; i++ {
			if lsubj := int(readU64()); lsubj > 0 {
				// A positive length means the read succeeded, so bi is a valid offset here.
				// Compare against what remains so a huge length can not overflow the sum.
				if lsubj > len(buf)-bi {
					os.Remove(fn)
					return errCorruptState
				}
				subj := fs.subjString(buf[bi : bi+lsubj])
				bi += lsubj
				info := &psi{total: readU64(), fblk: uint32(readU64())}
				// The last block is only encoded when there is more than one message.
				if info.total > 1 {
					info.lblk = uint32(readU64())
				} else {
					info.lblk = info.fblk
				}
				fs.psim[subj] = info
			}
		}
	}

	if numBlocks := readU64(); numBlocks > 0 {
		fs.blks = make([]*msgBlock, 0, numBlocks)
		for i := 0; i < int(numBlocks); i++ {
			index, nbytes, fseq, fts, lseq, lts, numDeleted := uint32(readU64()), readU64(), readU64(), readI64(), readU64(), readI64(), readU64()
			if bi < 0 {
				break
			}
			mb := fs.initMsgBlock(index)
			mb.first.seq, mb.last.seq, mb.msgs, mb.bytes = fseq, lseq, lseq-fseq+1, nbytes
			// Block timestamps are stored relative to the base time.
			mb.first.ts, mb.last.ts = fts+baseTime, lts+baseTime
			if numDeleted > 0 {
				dmap, n, err := avl.Decode(buf[bi:])
				if err != nil {
					os.Remove(fn)
					return errCorruptState
				}
				mb.dmap = *dmap
				mb.msgs -= numDeleted
				bi += n
			}
			fs.addMsgBlock(mb)
		}
	}

	// Pull in last block index for the block that had last checksum when we wrote the full state.
	blkIndex := uint32(readU64())
	var lchk [8]byte
	// If a prior read failed bi is negative and must not be used as a slice offset.
	if bi < 0 || bi+len(lchk) > len(buf) {
		bi = -1
	} else {
		copy(lchk[0:], buf[bi:bi+len(lchk)])
	}

	// Check if we had any errors.
	if bi < 0 {
		os.Remove(fn)
		return errCorruptState
	}

	// Move into place our state, msgBlks and subject info.
	fs.state = state

	// First let's check the happy path, open the blk file that was the lmb when we created the full state.
	// See if we have the last block available.
	var matched bool
	var mb *msgBlock
	if mb = fs.bim[blkIndex]; mb != nil {
		if _, err := os.Stat(mb.mfn); err != nil && os.IsNotExist(err) {
			// If our saved state is past what we see on disk, fallback and rebuild.
			if ld, _, _ := mb.rebuildState(); ld != nil {
				fs.addLostData(ld)
			}
			return errPriorState
		}

		if matched = bytes.Equal(mb.lastChecksum(), lchk[:]); !matched {
			// Remove the last message block since we will re-process below.
			fs.removeMsgBlockFromList(mb)
		}
	}

	// We may need to check other blocks. Even if we matched last checksum we will see if there is another block.
	// If we did not match we re-process the last block.
	start := blkIndex
	if matched {
		start++
	}

	// Recover blocks in index order until we run out of block files.
	for idx := start; ; idx++ {
		nmb, err := fs.recoverMsgBlock(idx)
		if err != nil {
			if os.IsNotExist(err) {
				return nil
			}
			os.Remove(fn)
			return err
		}
		if nmb != nil {
			// Check if we have to account for a partial message block.
			if !matched && mb != nil && mb.index == nmb.index {
				if err := fs.adjustAccounting(mb, nmb); err != nil {
					return err
				}
			}
			// Update top level accounting.
			if fs.state.FirstSeq == 0 || nmb.first.seq < fs.state.FirstSeq {
				fs.state.FirstSeq = nmb.first.seq
				fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC()
			}
			if nmb.last.seq > fs.state.LastSeq {
				fs.state.LastSeq = nmb.last.seq
				fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC()
			}
			fs.state.Msgs += nmb.msgs
			fs.state.Bytes += nmb.bytes
		}
	}
}

// adjustAccounting is used when the stream state only partially accounted for a
// message block, e.g. more records were appended after the stream state was written.
// mb is the block as described by the stream state, nmb is the same block as recovered
// from disk. The messages the stream state already counted are backed out of the fs
// totals and the per subject info so they are not counted twice.
// Lock should be held.
func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) error {
	nmb.mu.Lock()
	defer nmb.mu.Unlock()

	// The recovered block needs its messages and subject info available for lookups.
	if nmb.cacheNotLoaded() {
		nmb.loadMsgsWithLock()
	}
	nmb.ensurePerSubjectInfoLoaded()

	// backOut finds seq in the recovered block and removes it from fs totals and psim.
	backOut := func(seq uint64) error {
		var smv StoreMsg
		sm, err := nmb.cacheLookup(seq, &smv)
		if err != nil {
			return err
		}
		fs.state.Msgs--
		fs.state.Bytes -= fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
		if fs.psim != nil && len(sm.subj) > 0 {
			fs.removePerSubject(sm.subj)
		}
		return nil
	}

	// Every sequence of the original mb that was part of the stream state.
	for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
		// Already declared deleted, nothing to back out since you can not undelete.
		if mb.dmap.Exists(seq) {
			continue
		}
		if err := backOut(seq); err != nil {
			return err
		}
	}

	// Nothing more to do unless the recovered state mb had a higher first than nmb.
	if nmb.first.seq >= mb.first.seq {
		return nil
	}
	for seq := nmb.first.seq; seq < mb.first.seq; seq++ {
		if err := backOut(seq); err != nil {
			return err
		}
	}
	// Carry the first over to nmb.
	nmb.first = mb.first

	return nil
}

// lastChecksum returns the trailing checksum bytes of the block file on disk,
// decrypting the block first when a block encryption key is in use.
// It also refreshes mb.rbytes from the file size. Returns nil when the file can
// not be opened, is too small, or the encryption keys can not be set up.
func (mb *msgBlock) lastChecksum() []byte {
	f, err := os.Open(mb.mfn)
	if err != nil {
		return nil
	}
	defer f.Close()

	if fi, _ := f.Stat(); fi != nil {
		mb.rbytes = uint64(fi.Size())
	}
	if mb.rbytes < checksumSize {
		return nil
	}

	// We do not load keys on startup anymore so we might need to load them here.
	needKeys := mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil)
	if needKeys {
		if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil {
			return nil
		}
	}

	var lchk [8]byte

	// Plaintext, read the tail directly from the file.
	if mb.bek == nil {
		f.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
		return lchk[:]
	}

	// Encrypted, we need to load and decrypt the whole block to get at the tail.
	buf, _ := mb.loadBlock(nil)
	if len(buf) < checksumSize {
		return lchk[:]
	}
	// Fresh key so the cipher stream starts from the beginning of the block.
	bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
	if err != nil {
		return nil
	}
	mb.bek = bek
	mb.bek.XORKeyStream(buf, buf)
	copy(lchk[0:], buf[len(buf)-checksumSize:])
	return lchk[:]
}

// recoverMsgs scans the message directory for block files, recovers each block in
// index order and accumulates the top level stream state from them. It then selects
// (or creates) the last message block, drops blocks left empty by lost data and
// removes key files that no longer belong to a block.
func (fs *fileStore) recoverMsgs() error {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Check for any left over purged messages.
	<-dios
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	f, err := os.Open(mdir)
	if err != nil {
		dios <- struct{}{}
		return errNotReadable
	}
	entries, err := f.ReadDir(-1)
	f.Close()
	dios <- struct{}{}

	if err != nil {
		return errNotReadable
	}

	// Collect the index of every entry that looks like a block file.
	indices := make([]int, 0, len(entries))
	var blkIdx int
	for _, entry := range entries {
		n, serr := fmt.Sscanf(entry.Name(), blkScan, &blkIdx)
		if serr != nil || n != 1 {
			continue
		}
		indices = append(indices, blkIdx)
	}
	sort.Ints(indices)

	// Recover all of the msg blocks.
	// We now guarantee they are coming in order.
	for _, index := range indices {
		mb, err := fs.recoverMsgBlock(uint32(index))
		if err != nil || mb == nil {
			return err
		}
		// This is a truncate block with possibly no index. If the OS got shutdown
		// out from underneath of us this is possible.
		if mb.first.seq == 0 {
			mb.dirtyCloseWithRemove(true)
			fs.removeMsgBlockFromList(mb)
			continue
		}
		if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq {
			fs.state.FirstSeq = mb.first.seq
			fs.state.FirstTime = time.Time{}
			if mb.first.ts != 0 {
				fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
			}
		}
		if mb.last.seq > fs.state.LastSeq {
			fs.state.LastSeq = mb.last.seq
			fs.state.LastTime = time.Unix(0, mb.last.ts).UTC()
		}
		fs.state.Msgs += mb.msgs
		fs.state.Bytes += mb.bytes
	}

	// Pick the last block, or create one to write to if we have none.
	if nblks := len(fs.blks); nblks > 0 {
		fs.lmb = fs.blks[nblks-1]
	} else {
		_, err = fs.newMsgBlockForWrite()
	}

	// Check if we encountered any lost data.
	if fs.ld != nil {
		// Gather first, removing a block alters fs.blks.
		var emptyBlks []*msgBlock
		for _, mb := range fs.blks {
			if mb.msgs == 0 && mb.rbytes == 0 {
				emptyBlks = append(emptyBlks, mb)
			}
		}
		for _, mb := range emptyBlks {
			fs.removeMsgBlock(mb)
		}
	}

	if err != nil {
		return err
	}

	// Check for keyfiles orphans.
	kms, gerr := filepath.Glob(filepath.Join(mdir, keyScanAll))
	if gerr != nil || len(kms) == 0 {
		return nil
	}
	valid := make(map[uint32]bool)
	for _, mb := range fs.blks {
		valid[mb.index] = true
	}
	for _, fn := range kms {
		var index uint32
		n, serr := fmt.Sscanf(filepath.Base(fn), keyScan, &index)
		if serr == nil && n == 1 && valid[index] {
			// Belongs to a block we know about.
			continue
		}
		os.Remove(fn)
	}

	return nil
}

// Will expire msgs that have aged out on restart.
// We will treat this differently in case we have a recovery
// that will expire a lot of messages on startup.
//
// Whole blocks whose last message is older than the max age are removed outright.
// The first block that is only partially expired has its messages walked and removed
// one by one, after which processing stops. Top level totals, the block list and the
// per subject info are updated to match.
// Should only be called on startup.
func (fs *fileStore) expireMsgsOnRecover() {
	if fs.state.Msgs == 0 {
		return
	}

	var minAge = time.Now().UnixNano() - int64(fs.cfg.MaxAge)
	var purged, bytes uint64
	var deleted int
	var nts int64

	// If we expire all make sure to write out a tombstone. Need to be done by hand here,
	// usually taken care of by fs.removeMsgBlock() but we do not call that here.
	var last msgId

	deleteEmptyBlock := func(mb *msgBlock) {
		// If we are the last keep state to remember first/last sequence.
		// Do this part by hand since not deleting one by one.
		if mb == fs.lmb {
			last = mb.last
		}
		// Make sure we do subject cleanup as well. The global per subject totals are
		// tracked per message, so remove once for every message still accounted for
		// in this block, the same as when we remove interior messages one by one.
		mb.ensurePerSubjectInfoLoaded()
		for subj, ss := range mb.fss {
			for i := uint64(0); i < ss.Msgs; i++ {
				fs.removePerSubject(subj)
			}
		}
		mb.dirtyCloseWithRemove(true)
		deleted++
	}

	for _, mb := range fs.blks {
		mb.mu.Lock()
		// Nothing in this block, or any after it, has expired yet.
		if minAge < mb.first.ts {
			nts = mb.first.ts
			mb.mu.Unlock()
			break
		}
		// Can we remove whole block here?
		if mb.last.ts <= minAge {
			purged += mb.msgs
			bytes += mb.bytes
			deleteEmptyBlock(mb)
			mb.mu.Unlock()
			continue
		}

		// If we are here we have to process the interior messages of this blk.
		if err := mb.loadMsgsWithLock(); err != nil {
			mb.mu.Unlock()
			break
		}

		var smv StoreMsg
		var needNextFirst bool

		// Walk messages and remove if expired.
		mb.ensurePerSubjectInfoLoaded()
		for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
			sm, err := mb.cacheLookup(seq, &smv)
			// Process interior deleted msgs.
			if err == errDeletedMsg {
				// Update dmap.
				if mb.dmap.Exists(seq) {
					mb.dmap.Delete(seq)
				}
				// Keep this updated just in case since we are removing dmap entries.
				mb.first.seq, needNextFirst = seq, true
				continue
			}
			// Break on other errors.
			if err != nil || sm == nil {
				mb.first.seq, needNextFirst = seq, true
				break
			}

			// No error and sm != nil from here onward.

			// Check for done. This message survives and becomes the new first.
			if minAge < sm.ts {
				mb.first.seq, needNextFirst = sm.seq, false
				mb.first.ts = sm.ts
				nts = sm.ts
				break
			}

			// Delete the message here.
			if mb.msgs > 0 {
				mb.first.seq, needNextFirst = seq, true
				sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
				// Do not let the block byte count underflow.
				if sz > mb.bytes {
					sz = mb.bytes
				}
				mb.bytes -= sz
				bytes += sz
				mb.msgs--
				purged++
			}
			// Update fss
			// Make sure we have fss loaded.
			mb.removeSeqPerSubject(sm.subj, seq)
			fs.removePerSubject(sm.subj)
		}
		// Make sure we have a proper next first sequence.
		if needNextFirst {
			mb.selectNextFirst()
		}
		// Check if empty after processing, could happen if tail of messages are all deleted.
		if mb.msgs == 0 {
			deleteEmptyBlock(mb)
		}
		mb.mu.Unlock()
		break
	}

	if nts > 0 {
		// Make sure to set age check based on this value.
		fs.resetAgeChk(nts - minAge)
	}

	if deleted > 0 {
		// Update block map. Deleted blocks are always at the head of the list.
		if fs.bim != nil {
			for _, mb := range fs.blks[:deleted] {
				delete(fs.bim, mb.index)
			}
		}
		// Update blks slice.
		fs.blks = copyMsgBlocks(fs.blks[deleted:])
		if lb := len(fs.blks); lb == 0 {
			fs.lmb = nil
		} else {
			fs.lmb = fs.blks[lb-1]
		}
	}
	// Update top level accounting.
	if purged < fs.state.Msgs {
		fs.state.Msgs -= purged
	} else {
		fs.state.Msgs = 0
	}
	if bytes < fs.state.Bytes {
		fs.state.Bytes -= bytes
	} else {
		fs.state.Bytes = 0
	}
	// Make sure to we properly set the fs first sequence and timestamp.
	fs.selectNextFirst()

	// Check if we have no messages and blocks left.
	if fs.lmb == nil && last.seq != 0 {
		if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil {
			lmb.writeTombstone(last.seq, last.ts)
		}
		// Clear any global subject state.
		fs.psim = make(map[string]*psi)
	}
}

// copyMsgBlocks returns a new slice holding the same block pointers as src, so
// the result does not share a backing array with src. A nil src yields nil.
func copyMsgBlocks(src []*msgBlock) []*msgBlock {
	var dst []*msgBlock
	if src != nil {
		dst = make([]*msgBlock, len(src))
		copy(dst, src)
	}
	return dst
}

// GetSeqFromTime looks for the first sequence number that has
// the message with >= timestamp.
// Returns 0 when the store is closed or no message in the selected block qualifies,
// and last sequence + 1 when no block is selected for the time.
// FIXME(dlc) - inefficient, and dumb really. Make this better.
func (fs *fileStore) GetSeqFromTime(t time.Time) uint64 {
	fs.mu.RLock()
	lastSeq, closed := fs.state.LastSeq, fs.closed
	fs.mu.RUnlock()

	if closed {
		return 0
	}

	mb := fs.selectMsgBlockForStart(t)
	if mb == nil {
		return lastSeq + 1
	}

	// Snapshot the range of the block we will scan.
	mb.mu.RLock()
	fseq, lseq := mb.first.seq, mb.last.seq
	mb.mu.RUnlock()

	// Linear search, hence the dumb part..
	var smv StoreMsg
	target := t.UnixNano()
	for seq := fseq; seq <= lseq; seq++ {
		sm, _, _ := mb.fetchMsg(seq, &smv)
		if sm == nil || sm.ts < target {
			continue
		}
		return sm.seq
	}
	return 0
}

// firstMatching finds the first message in the block, at or after start, whose
// subject matches filter. wc indicates that filter contains wildcards.
// The second result reports whether the returned message is the last one in the
// block and was the last one looked up. Returns ErrStoreMsgNotFound when nothing matches.
func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	fseq := start
	isAll := filter == _EMPTY_ || filter == fwcs
	subs := []string{filter}

	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return nil, false, err
		}
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			return nil, false, err
		}
	}

	// A single tracked subject that equals our filter is the same as wanting all.
	if !isAll && len(mb.fss) == 1 {
		_, isAll = mb.fss[filter]
	}
	// Skip scan of mb.fss if number of messages in the block are less than
	// 1/2 the number of subjects in mb.fss. Or we have a wc and lots of fss entries.
	const linearScanMaxFSS = 32
	doLinearScan := isAll || 2*int(mb.last.seq-start) < len(mb.fss) || (wc && len(mb.fss) > linearScanMaxFSS)

	if !doLinearScan {
		// For a wildcard collect all of the tracked subjects that match.
		if wc {
			subs = subs[:0]
			for subj := range mb.fss {
				if subjectIsSubsetMatch(subj, filter) {
					subs = append(subs, subj)
				}
			}
		}
		// Use the per subject info to find the earliest place worth starting from.
		fseq = mb.last.seq + 1
		for _, subj := range subs {
			ss := mb.fss[subj]
			if ss == nil {
				continue
			}
			if ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			if start > ss.Last || ss.First >= fseq {
				continue
			}
			fseq = ss.First
			if fseq < start {
				fseq = start
			}
		}
	}

	if fseq > mb.last.seq {
		return nil, false, ErrStoreMsgNotFound
	}

	if sm == nil {
		sm = new(StoreMsg)
	}

	for seq := fseq; seq <= mb.last.seq; seq++ {
		// Remember so we can restore it if this one is not a match.
		prevLLSeq := mb.llseq
		fsm, err := mb.cacheLookup(seq, sm)
		if err != nil {
			continue
		}
		expireOk := seq == mb.last.seq && mb.llseq == seq

		var matched bool
		switch {
		case isAll:
			matched = true
		case doLinearScan && wc:
			matched = subjectIsSubsetMatch(fsm.subj, filter)
		case doLinearScan:
			matched = fsm.subj == filter
		default:
			for _, subj := range subs {
				if fsm.subj == subj {
					matched = true
					break
				}
			}
		}
		if matched {
			return fsm, expireOk, nil
		}
		// If we are here we did not match, so put the llseq back.
		mb.llseq = prevLLSeq
	}

	return nil, false, ErrStoreMsgNotFound
}

// filteredPending acquires the block lock, then traverses the message block to
// generate the filtered pending total along with the first and last sequences.
func (mb *msgBlock) filteredPending(subj string, wc bool, seq uint64) (total, first, last uint64) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	total, first, last = mb.filteredPendingLocked(subj, wc, seq)
	return total, first, last
}

// filteredPendingLocked computes, for messages at or after sseq whose subject
// matches filter, the total count plus the first and last matching sequences.
// wc indicates that filter contains wildcards.
// Per subject info is used when it can answer on its own, otherwise the messages
// in the block are scanned.
// Lock should be held.
func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (total, first, last uint64) {
	isAll := filter == _EMPTY_ || filter == fwcs

	// Fast path, we want everything and start at or before this block.
	if isAll && sseq <= mb.first.seq {
		return mb.msgs, mb.first.seq, mb.last.seq
	}

	// Make sure we have fss loaded.
	mb.ensurePerSubjectInfoLoaded()

	// Scratch space for tokenizing.
	tsa := [32]string{}
	fsa := [32]string{}
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)

	isMatch := func(subj string) bool {
		if wc {
			return isSubsetMatchTokenized(tokenizeSubjectIntoSlice(tsa[:0], subj), fts)
		}
		return subj == filter
	}

	// Pass over fss:
	// - a matching subject fully at or past sseq is counted from its meta only.
	// - a matching subject that straddles sseq is a partial and forces a full scan.
	var havePartial bool
	for subj, ss := range mb.fss {
		if !isAll && !isMatch(subj) {
			continue
		}
		if ss.firstNeedsUpdate {
			mb.recalculateFirstForSubj(subj, ss.First, ss)
		}
		if sseq <= ss.First {
			total += ss.Msgs
			if first == 0 || ss.First < first {
				first = ss.First
			}
			if ss.Last > last {
				last = ss.Last
			}
		} else if sseq <= ss.Last {
			havePartial = true
			break
		}
	}

	// Without partials the meta gave us the answer.
	if !havePartial {
		return total, first, last
	}

	// We need to scan the msgs, so start over.
	total, first, last = 0, 0, 0

	// If we load the cache for a linear scan we want to expire that cache upon exit.
	shouldExpire := mb.cacheNotLoaded()
	if shouldExpire {
		mb.loadMsgsWithLock()
	}

	var smv StoreMsg
	for seq := sseq; seq <= mb.last.seq; seq++ {
		sm, _ := mb.cacheLookup(seq, &smv)
		if sm == nil || !(isAll || isMatch(sm.subj)) {
			continue
		}
		total++
		if first == 0 || seq < first {
			first = seq
		}
		if seq > last {
			last = seq
		}
	}
	// If we loaded this block for this operation go ahead and expire it here.
	if shouldExpire {
		mb.tryForceExpireCacheLocked()
	}

	return total, first, last
}

// FilteredState will return the SimpleState associated with the filtered subject and a proposed starting sequence.
// A starting sequence before the first sequence in the store is moved up to it, and
// one past the last sequence yields an empty state.
func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	// Returned state.
	var ss SimpleState

	if sseq < fs.state.FirstSeq {
		sseq = fs.state.FirstSeq
	}
	// If past the end no results.
	if lseq := fs.state.LastSeq; sseq > lseq {
		return ss
	}

	// If we want all msgs that match we can shortcircuit.
	// TODO(dlc) - This can be extended for all cases but would
	// need to be careful on total msgs calculations etc.
	if sseq == fs.state.FirstSeq {
		fs.numFilteredPending(subj, &ss)
		return ss
	}

	wc := subjectHasWildcard(subj)
	// Tracking subject state.
	// TODO(dlc) - Optimize for 2.10 with avl tree and no atomics per block.
	for _, mb := range fs.blks {
		// Blocks that end before our starting sequence can not contribute.
		if atomic.LoadUint64(&mb.last.seq) < sseq {
			continue
		}
		total, first, last := mb.filteredPending(subj, wc, sseq)
		ss.Msgs += total
		if ss.First == 0 || (first > 0 && first < ss.First) {
			ss.First = first
		}
		if last > ss.Last {
			ss.Last = last
		}
	}

	return ss
}

// numFilteredPending fills in ss with the total, first and last sequence of all
// messages in the store matching filter, using the global per subject info to
// avoid scanning every block.
// Lock should be held.
func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) {
	// Wanting everything needs nothing special, the top level state has it all.
	if filter == _EMPTY_ || filter == fwcs {
		ss.First = fs.state.FirstSeq
		ss.Last = fs.state.LastSeq
		ss.Msgs = fs.state.Msgs
		return
	}

	// Scratch space for tokenizing.
	tsa := [32]string{}
	fsa := [32]string{}
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)

	// Total up matches, and track the lowest first block and highest last block
	// index across all matching subjects.
	start, stop := uint32(math.MaxUint32), uint32(0)
	for subj, psi := range fs.psim {
		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
		if !isSubsetMatchTokenized(tts, fts) {
			continue
		}
		ss.Msgs += psi.total
		if psi.fblk < start {
			start = psi.fblk
		}
		if psi.lblk > stop {
			stop = psi.lblk
		}
	}

	// Now figure out the first and last sequences from those blocks.
	wc := subjectHasWildcard(filter)

	// Do start
	if mb := fs.bim[start]; mb != nil {
		_, f, _ := mb.filteredPending(filter, wc, 0)
		ss.First = f
	}
	if ss.First == 0 {
		// This is a miss. This can happen since psi.fblk is lazy, but should be very rare.
		// Walk forward until some block reports a first.
		for i := start + 1; i <= stop; i++ {
			mb := fs.bim[i]
			if mb == nil {
				continue
			}
			if _, f, _ := mb.filteredPending(filter, wc, 0); f > 0 {
				ss.First = f
				break
			}
		}
	}
	// Now last
	if mb := fs.bim[stop]; mb != nil {
		_, _, l := mb.filteredPending(filter, wc, 0)
		ss.Last = l
	}
}

// SubjectsState returns a map of SimpleState for all matching subjects.
// An empty subject or the full wildcard matches everything. Returns nil when the
// store has no messages, or when subject is a literal that is not being tracked.
func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.state.Msgs == 0 || len(fs.blks) == 0 {
		return nil
	}

	start, stop := fs.blks[0], fs.lmb
	// We can short circuit if not a wildcard using psim for start and stop.
	if !subjectHasWildcard(subject) {
		info := fs.psim[subject]
		if info == nil {
			return nil
		}
		start, stop = fs.bim[info.fblk], fs.bim[info.lblk]
		// The recorded blocks may no longer be in the block map, e.g. fblk is updated
		// lazily. Without a start block the walk below would never begin, so widen to
		// the full range instead. Matching is still done per subject so this is safe.
		if start == nil {
			start = fs.blks[0]
		}
		if stop == nil {
			stop = fs.lmb
		}
	}

	// Aggregate fss.
	fss := make(map[string]SimpleState)
	var startFound bool

	for _, mb := range fs.blks {
		// Skip ahead until we reach our starting block.
		if !startFound {
			if mb != start {
				continue
			}
			startFound = true
		}

		mb.mu.Lock()
		// Make sure we have fss loaded.
		mb.ensurePerSubjectInfoLoaded()
		for subj, ss := range mb.fss {
			if subject == _EMPTY_ || subject == fwcs || subjectIsSubsetMatch(subj, subject) {
				if ss.firstNeedsUpdate {
					mb.recalculateFirstForSubj(subj, ss.First, ss)
				}
				oss := fss[subj]
				if oss.First == 0 { // New
					fss[subj] = *ss
				} else {
					// Merge here. Blocks are walked in order so this block has the newer last.
					oss.Last, oss.Msgs = ss.Last, oss.Msgs+ss.Msgs
					fss[subj] = oss
				}
			}
		}
		mb.mu.Unlock()

		if mb == stop {
			break
		}
	}

	return fss
}

// NumPending will return the number of pending messages matching the filter subject starting at sequence.
// Optimized for stream num pending calculations for consumers.
//
// sseq is the first sequence of interest, filter is a literal or wildcard
// subject (empty or the full wildcard means all subjects), and lastPerSubject
// counts at most one message per matching subject. validThrough is the last
// sequence of the store at the time of the call, except on the block load
// failure path near the end, which returns 0, 0.
//
// Two strategies are used: a forward scan from the block holding sseq, or,
// when sseq falls in the first half of the blocks, taking the totals from psim
// and subtracting the matches that sit in front of sseq.
func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) (total, validThrough uint64) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	// This can always be last for these purposes.
	validThrough = fs.state.LastSeq

	if fs.state.Msgs == 0 || sseq > fs.state.LastSeq {
		return 0, validThrough
	}

	// Track starting for both block for the sseq and starting block that matches any subject.
	var seqStart, subjStart int

	// See if we need to figure out starting block per sseq.
	if sseq > fs.state.FirstSeq {
		seqStart, _ = fs.selectMsgBlockWithIndex(sseq)
	}

	// Stack backed scratch space for tokenizing the filter and candidate subjects.
	var tsa, fsa [32]string
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
	isAll := filter == _EMPTY_ || filter == fwcs
	wc := subjectHasWildcard(filter)

	// See if filter was provided but its the only subject.
	if !isAll && !wc && len(fs.psim) == 1 && fs.psim[filter] != nil {
		isAll = true
	}

	// If we are isAll and have no deleted we can do a simpler calculation.
	if isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs {
		if sseq == 0 {
			return fs.state.Msgs, validThrough
		}
		return fs.state.LastSeq - sseq + 1, validThrough
	}

	// Reports whether subj is selected by the filter. Reuses tsa as scratch space.
	isMatch := func(subj string) bool {
		if isAll {
			return true
		}
		if !wc {
			return subj == filter
		}
		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
		return isSubsetMatchTokenized(tts, fts)
	}

	// If we would need to scan more from the beginning, revert back to calculating directly here.
	// TODO(dlc) - Redo properly with sublists etc for subject-based filtering.
	if lastPerSubject || seqStart >= (len(fs.blks)/2) {
		// If we need to track seen for last per subject.
		// When not tracking, seen stays nil and lookups on it simply report false.
		var seen map[string]bool
		if lastPerSubject {
			seen = make(map[string]bool)
		}

		for i := seqStart; i < len(fs.blks); i++ {
			mb := fs.blks[i]
			mb.mu.Lock()
			var t uint64
			// The whole block is at or past sseq and everything matches, so block
			// level metadata is enough.
			if isAll && sseq <= mb.first.seq {
				if lastPerSubject {
					mb.ensurePerSubjectInfoLoaded()
					for subj := range mb.fss {
						if !seen[subj] {
							total++
							seen[subj] = true
						}
					}
				} else {
					total += mb.msgs
				}
				mb.mu.Unlock()
				continue
			}

			// If we are here we need to at least scan the subject fss.
			// Make sure we have fss loaded.
			mb.ensurePerSubjectInfoLoaded()
			var havePartial bool
			for subj, ss := range mb.fss {
				if !seen[subj] && isMatch(subj) {
					if lastPerSubject {
						// Can't have a partials with last by subject.
						if sseq <= ss.Last {
							t++
							seen[subj] = true
						}
					} else {
						if ss.firstNeedsUpdate {
							mb.recalculateFirstForSubj(subj, ss.First, ss)
						}
						if sseq <= ss.First {
							t += ss.Msgs
						} else if sseq <= ss.Last {
							// We matched but its a partial.
							havePartial = true
							break
						}
					}
				}
			}
			// See if we need to scan msgs here.
			if havePartial {
				// Clear on partial.
				t = 0
				// If we load the cache for a linear scan we want to expire that cache upon exit.
				var shouldExpire bool
				if mb.cacheNotLoaded() {
					// NOTE(review): the load error is ignored here, unlike the psim
					// based path below which bails out on it.
					mb.loadMsgsWithLock()
					shouldExpire = true
				}
				var smv StoreMsg
				for seq := sseq; seq <= mb.last.seq; seq++ {
					if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && (isAll || isMatch(sm.subj)) {
						t++
					}
				}
				// If we loaded this block for this operation go ahead and expire it here.
				if shouldExpire {
					mb.tryForceExpireCacheLocked()
				}
			}
			mb.mu.Unlock()
			total += t
		}
		return total, validThrough
	}

	// If we are here its better to calculate totals from psim and adjust downward by scanning less blocks.
	// TODO(dlc) - Eventually when sublist uses generics, make this sublist driven instead.
	// NOTE(review): lastPerSubject is always handled by the branch above, so the
	// lastPerSubject arms below are never taken on this path.
	start := uint32(math.MaxUint32)
	for subj, psi := range fs.psim {
		if isMatch(subj) {
			if lastPerSubject {
				total++
				// Keep track of start index for this subject.
				// Use last block in this case.
				if psi.lblk < start {
					start = psi.lblk
				}
			} else {
				total += psi.total
				// Keep track of start index for this subject.
				if psi.fblk < start {
					start = psi.fblk
				}
			}
		}
	}
	// See if we were asked for all, if so we are done.
	if sseq <= fs.state.FirstSeq {
		return total, validThrough
	}

	// If we are here we need to calculate partials for the first blocks.
	subjStart = int(start)
	firstSubjBlk := fs.bim[uint32(subjStart)]
	var firstSubjBlkFound bool
	var smv StoreMsg

	// Adjust in case not found.
	// With no known first block we cannot skip anything, so scan from the front.
	if firstSubjBlk == nil {
		firstSubjBlkFound = true
	}

	// Track how many we need to adjust against the total.
	var adjust uint64

	for i := 0; i <= seqStart; i++ {
		mb := fs.blks[i]

		// We can skip blks if we know they are below the first one that has any subject matches.
		if !firstSubjBlkFound {
			if mb == firstSubjBlk {
				firstSubjBlkFound = true
			} else {
				continue
			}
		}

		// We need to scan this block.
		var shouldExpire bool
		mb.mu.Lock()
		// Check if we should include all of this block in adjusting. If so work with metadata.
		if sseq > mb.last.seq {
			if isAll && !lastPerSubject {
				adjust += mb.msgs
			} else {
				// We need to adjust for all matches in this block.
				// We will scan fss state vs messages themselves.
				// Make sure we have fss loaded.
				mb.ensurePerSubjectInfoLoaded()
				for subj, ss := range mb.fss {
					if isMatch(subj) {
						if lastPerSubject {
							adjust++
						} else {
							adjust += ss.Msgs
						}
					}
				}
			}
		} else {
			// This is the last block. We need to scan per message here.
			if mb.cacheNotLoaded() {
				if err := mb.loadMsgsWithLock(); err != nil {
					mb.mu.Unlock()
					return 0, 0
				}
				shouldExpire = true
			}

			// Only messages in front of sseq count against the total.
			var last = mb.last.seq
			if sseq < last {
				last = sseq
			}
			for seq := mb.first.seq; seq < last; seq++ {
				sm, _ := mb.cacheLookup(seq, &smv)
				if sm == nil {
					continue
				}
				// Check if it matches our filter.
				if isMatch(sm.subj) && sm.seq < sseq {
					adjust++
				}
			}
		}
		// If we loaded the block try to force expire.
		if shouldExpire {
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()
	}
	// Make final adjustment.
	total -= adjust

	return total, validThrough
}

// SubjectsTotals returns the stored message count per subject for every
// subject selected by filter. An empty filter or the full wildcard selects all
// subjects. Returns nil when no subjects are tracked, otherwise a (possibly
// empty) map keyed by subject.
func (fs *fileStore) SubjectsTotals(filter string) map[string]uint64 {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if len(fs.psim) == 0 {
		return nil
	}

	fst := make(map[string]uint64)
	isAll := filter == _EMPTY_ || filter == fwcs

	// A literal filter can match at most one subject, so a direct lookup
	// replaces the walk over every tracked subject.
	if !isAll && !subjectHasWildcard(filter) {
		if info, ok := fs.psim[filter]; ok {
			fst[filter] = info.total
		}
		return fst
	}

	// Stack backed scratch space for tokenizing the filter and each subject.
	var tsa, fsa [32]string
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)

	for subj, info := range fs.psim {
		if isAll || isSubsetMatchTokenized(tokenizeSubjectIntoSlice(tsa[:0], subj), fts) {
			fst[subj] = info.total
		}
	}
	return fst
}

// RegisterStorageUpdates installs cb as the handler for storage change
// notifications, replacing any previous one.
// If the store already holds bytes and cb is not nil, cb is invoked once
// right away, after the lock is released, with a message delta of zero, the
// current byte total, sequence zero and an empty subject.
func (fs *fileStore) RegisterStorageUpdates(cb StorageUpdateHandler) {
	fs.mu.Lock()
	fs.scb = cb
	current := fs.state.Bytes
	fs.mu.Unlock()

	// Nothing to report for a nil handler or an empty store.
	if cb == nil || current == 0 {
		return
	}
	cb(0, int64(current), 0, _EMPTY_)
}

// hashKeyForBlock builds the hash key material for the message block with the
// given index, formatted as "<stream name>-<index>".
// Lock should be held.
func (fs *fileStore) hashKeyForBlock(index uint32) []byte {
	key := fmt.Sprintf("%s-%d", fs.cfg.Name, index)
	return []byte(key)
}

// setupWriteCache installs a cache backed by buf unless one already exists.
// If the block file can be stat'ed (through the open descriptor when present,
// otherwise by name), the cache offset is set to the current file size.
// It then records the current time in llts and starts the cache expire timer.
func (mb *msgBlock) setupWriteCache(buf []byte) {
	// An existing cache is left untouched.
	if mb.cache != nil {
		return
	}

	mb.cache = &cache{buf: buf}

	// Stat failures are ignored, in which case the offset stays at zero.
	var info os.FileInfo
	switch {
	case mb.mfd != nil:
		info, _ = mb.mfd.Stat()
	case mb.mfn != _EMPTY_:
		info, _ = os.Stat(mb.mfn)
	}
	if info != nil {
		mb.cache.off = int(info.Size())
	}

	mb.llts = time.Now().UnixNano()
	mb.startCacheExpireTimer()
}

// This rolls to a new append msg block.
// The new block takes the index after the current last block (or 1 if there
// is none), gets its hash and backing file created, has encryption keys
// generated when encryption is enabled, and is then registered as the last
// block. On failure no block is registered and an error is returned.
// Lock should be held.
func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) {
	index := uint32(1)
	var rbuf []byte

	if lmb := fs.lmb; lmb != nil {
		index = lmb.index + 1

		// Determine if we can reclaim any resources here.
		if fs.fip {
			lmb.mu.Lock()
			lmb.closeFDsLocked()
			if lmb.cache != nil {
				// Reset write timestamp and see if we can expire this cache.
				// Any buffer handed back is reused for the new block's cache.
				rbuf = lmb.tryExpireWriteCache()
			}
			lmb.mu.Unlock()
		}
	}

	mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects(), syncAlways: fs.fcfg.SyncAlways}

	// Lock should be held to quiet race detector.
	mb.mu.Lock()
	mb.setupWriteCache(rbuf)
	mb.fss = make(map[string]*SimpleState)

	// Set cache time to creation time to start.
	ts := time.Now().UnixNano()
	mb.llts, mb.lwts = 0, ts
	// Remember our last sequence number.
	// first > last marks the block as empty.
	mb.first.seq = fs.state.LastSeq + 1
	mb.last.seq = fs.state.LastSeq
	mb.mu.Unlock()

	// Now do local hash.
	key := sha256.Sum256(fs.hashKeyForBlock(index))
	hh, err := highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}
	mb.hh = hh

	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, mb.index))
	mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
	if err != nil {
		mb.dirtyCloseWithRemove(true)
		return nil, fmt.Errorf("Error creating msg block file [%q]: %v", mb.mfn, err)
	}
	mb.mfd = mfd

	// Check if encryption is enabled.
	if fs.prf != nil {
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			// The block never gets registered, so release the descriptor we just
			// opened and remove the file, same as the open failure path above.
			mb.dirtyCloseWithRemove(true)
			return nil, err
		}
	}

	// If we know we will need this so go ahead and spin up.
	if !fs.fip {
		mb.spinUpFlushLoop()
	}

	// Add to our list of blocks and mark as last.
	fs.addMsgBlock(mb)

	if fs.dirty > 0 {
		fs.kickFlushStateLoop()
	}

	return mb, nil
}

// genEncryptionKeysForBlock generates the encryption keys for mb, stores them
// on the block and writes the encrypted key material to the block's key file.
// A nil block is a no-op. The keys are assigned to the block before the key
// file is checked and written, so they remain set if writing fails.
func (fs *fileStore) genEncryptionKeysForBlock(mb *msgBlock) error {
	if mb == nil {
		return nil
	}

	context := fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)
	aek, bek, seed, encrypted, err := fs.genEncryptionKeys(context)
	if err != nil {
		return err
	}
	mb.aek, mb.bek, mb.seed = aek, bek, seed
	// The nonce is the leading portion of the encrypted key material.
	mb.nonce = encrypted[:aek.NonceSize()]

	dir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	kfn := filepath.Join(dir, fmt.Sprintf(keyScan, mb.index))

	// A missing key file is expected, any other stat failure is not.
	_, statErr := os.Stat(kfn)
	if statErr != nil && !os.IsNotExist(statErr) {
		return statErr
	}
	if writeErr := os.WriteFile(kfn, encrypted, defaultFilePerms); writeErr != nil {
		return writeErr
	}

	mb.kfn = kfn
	return nil
}

// Stores a raw message with expected sequence number and timestamp.
//
// seq must either be the next sequence (last + 1) or zero, in which case the
// next sequence is assigned; anything else yields ErrSequenceMismatch.
// With the DiscardNew policy the message is rejected up front when a limit
// would be exceeded (ErrMaxMsgsPerSubject, ErrMaxMsgs, ErrMaxBytes). Otherwise
// the message is written and the per subject, message count and byte limits
// are enforced afterwards by removing older messages.
// Returns ErrStoreClosed if the store is closed.
// Lock should be held.
func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) (err error) {
	if fs.closed {
		return ErrStoreClosed
	}

	// Per subject max check needed.
	// psmc is the per subject count snapshotted before this message is written.
	mmp := uint64(fs.cfg.MaxMsgsPer)
	var psmc uint64
	psmax := mmp > 0 && len(subj) > 0
	if psmax {
		if info, ok := fs.psim[subj]; ok {
			psmc = info.total
		}
	}

	// First sequence for this subject, resolved only when the subject is at its limit.
	var fseq uint64
	// Check if we are discarding new messages when we reach the limit.
	if fs.cfg.Discard == DiscardNew {
		// asl is set when the subject is at its limit, meaning the oldest message
		// for the subject will be removed below to make room.
		var asl bool
		if psmax && psmc >= mmp {
			// If we are instructed to discard new per subject, this is an error.
			if fs.cfg.DiscardNewPer {
				return ErrMaxMsgsPerSubject
			}
			if fseq, err = fs.firstSeqForSubj(subj); err != nil {
				return err
			}
			asl = true
		}
		if fs.cfg.MaxMsgs > 0 && fs.state.Msgs >= uint64(fs.cfg.MaxMsgs) && !asl {
			return ErrMaxMsgs
		}
		if fs.cfg.MaxBytes > 0 && fs.state.Bytes+uint64(len(msg)+len(hdr)) >= uint64(fs.cfg.MaxBytes) {
			// Even when a message will be replaced, reject if the one being removed
			// does not free more than the new payload needs.
			if !asl || fs.sizeForSeq(fseq) <= len(msg)+len(hdr) {
				return ErrMaxBytes
			}
		}
	}

	// Check sequence.
	if seq != fs.state.LastSeq+1 {
		if seq > 0 {
			return ErrSequenceMismatch
		}
		seq = fs.state.LastSeq + 1
	}

	// Write msg record.
	n, err := fs.writeMsgRecord(seq, ts, subj, hdr, msg)
	if err != nil {
		return err
	}

	// Mark dirty here since we added in a new message.
	// We do not kick the flusher, that happens on new msg block for write or Stop().
	fs.dirty++

	// Adjust top level tracking of per subject msg counts.
	if len(subj) > 0 {
		index := fs.lmb.index
		if info, ok := fs.psim[subj]; ok {
			info.total++
			if index > info.lblk {
				info.lblk = index
			}
		} else {
			fs.psim[subj] = &psi{total: 1, fblk: index, lblk: index}
		}
	}

	// Adjust first if needed.
	now := time.Unix(0, ts).UTC()
	if fs.state.Msgs == 0 {
		fs.state.FirstSeq = seq
		fs.state.FirstTime = now
	}

	fs.state.Msgs++
	fs.state.Bytes += n
	fs.state.LastSeq = seq
	fs.state.LastTime = now

	// Enforce per subject limits.
	// We snapshotted psmc before our actual write, so >= comparison needed.
	if psmax && psmc >= mmp {
		// We may have done this above.
		if fseq == 0 {
			fseq, _ = fs.firstSeqForSubj(subj)
		}
		if ok, _ := fs.removeMsgViaLimits(fseq); ok {
			// Make sure we are below the limit.
			// If the subject was already over the limit keep removing its oldest
			// message until the tracked total is within it.
			if psmc--; psmc >= mmp {
				for info, ok := fs.psim[subj]; ok && info.total > mmp; info, ok = fs.psim[subj] {
					if seq, _ := fs.firstSeqForSubj(subj); seq > 0 {
						if ok, _ := fs.removeMsgViaLimits(seq); !ok {
							break
						}
					} else {
						break
					}
				}
			}
		} else if mb := fs.selectMsgBlock(fseq); mb != nil {
			// If we are here we could not remove fseq from above, so rebuild.
			var ld *LostStreamData
			if ld, _, _ = mb.rebuildState(); ld != nil {
				fs.rebuildStateLocked(ld)
			}
		}
	}

	// Limits checks and enforcement.
	// If they do any deletions they will update the
	// byte count on their own, so no need to compensate.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Check if we have and need the age expiration timer running.
	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
		fs.startAgeChk()
	}

	return nil
}

// StoreRawMsg stores a raw message with expected sequence number and timestamp.
// On success the registered storage update handler, if any, is called outside
// the lock with a delta of one message and the message's stored size.
func (fs *fileStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) error {
	fs.mu.Lock()
	storeErr := fs.storeRawMsg(subj, hdr, msg, seq, ts)
	handler := fs.scb
	// The first message carrying a timestamp may need to expire sooner than the
	// initial replica expiry timer, which was set to MaxAge at initialization.
	// Rather than blocking here by expiring directly, re-arm with a short timeout.
	if firstTimed := !fs.receivedAny && fs.cfg.MaxAge != 0 && ts > 0; firstTimed {
		fs.receivedAny = true
		fs.resetAgeChk(int64(time.Millisecond * 50))
	}
	fs.mu.Unlock()

	if storeErr != nil {
		return storeErr
	}
	if handler != nil {
		handler(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
	}
	return nil
}

// StoreMsg stores a message under the next sequence number with the current
// time as its timestamp. We hold the main filestore lock for any write operation.
// Returns the assigned sequence and timestamp, or zeros and the error on failure.
// On success the storage update handler, if registered, is called outside the lock.
func (fs *fileStore) StoreMsg(subj string, hdr, msg []byte) (uint64, int64, error) {
	fs.mu.Lock()
	seq := fs.state.LastSeq + 1
	ts := time.Now().UnixNano()
	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
	handler := fs.scb
	fs.mu.Unlock()

	if err != nil {
		return 0, 0, err
	}
	if handler != nil {
		handler(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
	}
	return seq, ts, nil
}

// skipMsg records seq as skipped in this message block.
// An empty block only has its first/last metadata moved past seq and its
// flusher kicked. A block that holds messages gets seq added to its delete
// map and an empty record, flagged as erased, written for that sequence.
// A nil block is a no-op.
// fs lock should be held.
func (mb *msgBlock) skipMsg(seq uint64, now time.Time) {
	if mb == nil {
		return
	}

	ts := now.UnixNano()

	mb.mu.Lock()
	empty := mb.msgs == 0
	if empty {
		// Metadata only: last is the skipped sequence, first is the one after.
		mb.first.seq, mb.first.ts = seq+1, ts
		mb.last.seq, mb.last.ts = seq, ts
	} else {
		mb.dmap.Insert(seq)
	}
	mb.mu.Unlock()

	if empty {
		mb.kickFlusher()
		return
	}
	// Placeholder record marking the sequence as used.
	mb.writeMsgRecord(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, ts, true)
}

// SkipMsg consumes the next sequence number without storing a message and
// returns that sequence. If the store is empty, or the skipped sequence is the
// current first sequence, the first sequence moves to the one after it.
func (fs *fileStore) SkipMsg() uint64 {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	now := time.Now().UTC()
	seq := fs.state.LastSeq + 1

	fs.state.LastSeq = seq
	fs.state.LastTime = now

	// Nothing is stored at seq, so the first sequence has to step over it.
	if fs.state.Msgs == 0 || seq == fs.state.FirstSeq {
		fs.state.FirstSeq = seq + 1
		fs.state.FirstTime = now
	}

	fs.lmb.skipMsg(seq, now)
	return seq
}

// rebuildFirst rebuilds the state of the first message block, drops that block
// if it turns out to be empty, then re-selects the first sequence and rebuilds
// the store level state using any lost data reported by the block rebuild.
// Lock should be held.
func (fs *fileStore) rebuildFirst() {
	if len(fs.blks) == 0 || fs.blks[0] == nil {
		return
	}
	first := fs.blks[0]

	ld, _, _ := first.rebuildState()

	first.mu.RLock()
	remaining := first.msgs
	first.mu.RUnlock()

	if remaining == 0 {
		fs.removeMsgBlock(first)
	}
	fs.selectNextFirst()
	fs.rebuildStateLocked(ld)
}

// Optimized helper function to return first sequence.
// subj will always be publish subject here, meaning non-wildcard.
// We assume a fast check that this subj even exists already happened.
//
// The blocks between the subject's tracked first and last block are checked in
// order. If the subject is first found in a later block than psim recorded,
// psim's first block is corrected. Returns 0 when no block has the subject and
// an error only if a block's per subject info could not be loaded.
// Lock should be held.
func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) {
	if len(fs.blks) == 0 {
		return 0, nil
	}

	// See if we can optimize where we start.
	start, stop := fs.blks[0].index, fs.lmb.index
	if info, ok := fs.psim[subj]; ok {
		start, stop = info.fblk, info.lblk
	}

	for i := start; i <= stop; i++ {
		mb := fs.bim[i]
		if mb == nil {
			continue
		}
		mb.mu.Lock()
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			mb.mu.Unlock()
			return 0, err
		}
		ss := mb.fss[subj]
		if ss == nil {
			mb.mu.Unlock()
			continue
		}
		// The per subject state belongs to the block, so refresh and read it
		// while we still hold the block lock.
		if ss.firstNeedsUpdate {
			mb.recalculateFirstForSubj(subj, ss.First, ss)
		}
		first := ss.First
		mb.mu.Unlock()

		// Adjust first if it was not where we thought it should be.
		if i != start {
			if info, ok := fs.psim[subj]; ok {
				info.fblk = i
			}
		}
		return first, nil
	}
	return 0, nil
}

// enforceMsgLimit removes the first message repeatedly until the message count
// is within the configured maximum. A non-positive maximum disables the check.
// If a removal fails or removes nothing, the first block is rebuilt and
// enforcement stops.
// Lock should be held.
func (fs *fileStore) enforceMsgLimit() {
	if fs.cfg.MaxMsgs <= 0 {
		return
	}
	for fs.state.Msgs > uint64(fs.cfg.MaxMsgs) {
		removed, err := fs.deleteFirstMsg()
		if err != nil || !removed {
			fs.rebuildFirst()
			return
		}
	}
}

// enforceBytesLimit removes the first message repeatedly until the stored
// bytes are within the configured maximum. A non-positive maximum disables the
// check. If a removal fails or removes nothing, the first block is rebuilt and
// enforcement stops.
// Lock should be held.
func (fs *fileStore) enforceBytesLimit() {
	if fs.cfg.MaxBytes <= 0 {
		return
	}
	for fs.state.Bytes > uint64(fs.cfg.MaxBytes) {
		removed, err := fs.deleteFirstMsg()
		if err != nil || !removed {
			fs.rebuildFirst()
			return
		}
	}
}

// Will make sure we have limits honored for max msgs per subject on recovery or config update.
// We will make sure to go through all msg blocks etc. but in practice this
// will most likely only be the last one, so can take a more conservative approach.
//
// Storage update callbacks are suppressed for the duration of the call. If the
// per subject totals do not add up to the store's message count, all blocks and
// the global subject state are rebuilt before limits are applied.
// Lock should be held.
func (fs *fileStore) enforceMsgPerSubjectLimit() {
	maxMsgsPer := uint64(fs.cfg.MaxMsgsPer)

	// We want to suppress callbacks from remove during this process
	// since these should have already been deleted and accounted for.
	cb := fs.scb
	fs.scb = nil
	defer func() { fs.scb = cb }()

	var numMsgs uint64

	// collect all that are not correct.
	needAttention := make(map[string]*psi)
	for subj, psi := range fs.psim {
		numMsgs += psi.total
		if psi.total > maxMsgsPer {
			needAttention[subj] = psi
		}
	}

	// We had an issue with a use case where psim (and hence fss) were correct but idx was not and was not properly being caught.
	// So do a quick sanity check here. If we detect a skew do a rebuild then re-check.
	if numMsgs != fs.state.Msgs {
		// Clear any global subject state.
		fs.psim = make(map[string]*psi)
		for _, mb := range fs.blks {
			ld, _, err := mb.rebuildState()
			if err != nil && ld != nil {
				fs.addLostData(ld)
			}
			fs.populateGlobalPerSubjectInfo(mb)
		}
		// Rebuild fs state too.
		fs.rebuildStateLocked(nil)
		// Need to redo blocks that need attention.
		needAttention = make(map[string]*psi)
		for subj, psi := range fs.psim {
			if psi.total > maxMsgsPer {
				needAttention[subj] = psi
			}
		}
	}

	// Collect all the msgBlks we alter.
	blks := make(map[*msgBlock]struct{})

	// For re-use below.
	var sm StoreMsg

	// Walk all subjects that need attention here.
	for subj, info := range needAttention {
		total, start, stop := info.total, info.fblk, info.lblk

		for i := start; i <= stop; i++ {
			mb := fs.bim[i]
			if mb == nil {
				continue
			}
			// Grab the ss entry for this subject in case sparse.
			mb.mu.Lock()
			mb.ensurePerSubjectInfoLoaded()
			ss := mb.fss[subj]
			if ss != nil && ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			mb.mu.Unlock()
			if ss == nil {
				continue
			}
			// Remove the oldest messages for this subject until it is within the limit.
			for seq := ss.First; seq <= ss.Last && total > maxMsgsPer; {
				m, _, err := mb.firstMatching(subj, false, seq, &sm)
				if err == nil {
					seq = m.seq + 1
					if removed, _ := fs.removeMsgViaLimits(m.seq); removed {
						total--
						blks[mb] = struct{}{}
					}
				} else {
					// On error just do single increment.
					seq++
				}
			}
		}
	}

	// Expire the caches of all the msgBlks we altered.
	// The block lock must be held for this call.
	for mb := range blks {
		mb.mu.Lock()
		mb.tryForceExpireCacheLocked()
		mb.mu.Unlock()
	}
}

// deleteFirstMsg removes the message at the store's current first sequence
// as a limits based removal.
// Lock should be held.
func (fs *fileStore) deleteFirstMsg() (bool, error) {
	first := fs.state.FirstSeq
	return fs.removeMsgViaLimits(first)
}

// removeMsgViaLimits removes seq because a limit was hit.
// Removals via limits can always be recovered on a restart, so we do not
// force the system to update the index file. The fs lock is not acquired here.
// Lock should be held.
func (fs *fileStore) removeMsgViaLimits(seq uint64) (bool, error) {
	const (
		secure     = false
		viaLimits  = true
		needFSLock = false
	)
	return fs.removeMsg(seq, secure, viaLimits, needFSLock)
}

// RemoveMsg will remove the message from this store.
// Reports whether a message was removed. This is a user initiated, non-secure
// removal that acquires the fs lock itself.
func (fs *fileStore) RemoveMsg(seq uint64) (bool, error) {
	const (
		secure     = false
		viaLimits  = false
		needFSLock = true
	)
	return fs.removeMsg(seq, secure, viaLimits, needFSLock)
}

// EraseMsg removes the message from this store and requests a secure removal,
// where the stored record is overwritten. Reports whether a message was removed.
// It acquires the fs lock itself.
func (fs *fileStore) EraseMsg(seq uint64) (bool, error) {
	const (
		secure     = true
		viaLimits  = false
		needFSLock = true
	)
	return fs.removeMsg(seq, secure, viaLimits, needFSLock)
}

// removePerSubject decrements the filestore level message total for subj and
// drops the subject's entry once the total reaches zero. An empty or untracked
// subject is ignored.
// Lock should be held.
func (fs *fileStore) removePerSubject(subj string) {
	if subj == "" {
		return
	}
	info, tracked := fs.psim[subj]
	if !tracked {
		return
	}
	// We do not update sense of fblk here but will do so when we resolve during lookup.
	if info.total--; info.total == 0 {
		delete(fs.psim, subj)
	}
}

// Remove a message, optionally rewriting the mb file.
//
// secure requests that the stored record be overwritten (disabled when the
// store is encrypted). viaLimits marks removals done to honor limits, which are
// allowed during snapshots and normally skip the tombstone. needFSLock tells
// whether this call must acquire fs.mu itself; when false the caller holds it.
//
// Returns true when a message was removed. Returns false with a nil error when
// the block is closed, the sequence is below the block's first sequence, or the
// message was already deleted. Errors include ErrStoreMsgNotFound,
// ErrStoreClosed, ErrStoreSnapshotInProgress and ErrStoreEOF.
//
// When a storage update callback is registered, fs.mu is released while the
// callback runs, even when the caller owns the lock, and is reacquired
// afterwards in that case.
func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (bool, error) {
	if seq == 0 {
		return false, ErrStoreMsgNotFound
	}
	// Lock helpers that are no-ops when the caller already holds fs.mu.
	fsLock := func() {
		if needFSLock {
			fs.mu.Lock()
		}
	}
	fsUnlock := func() {
		if needFSLock {
			fs.mu.Unlock()
		}
	}

	fsLock()

	if fs.closed {
		fsUnlock()
		return false, ErrStoreClosed
	}
	if !viaLimits && fs.sips > 0 {
		fsUnlock()
		return false, ErrStoreSnapshotInProgress
	}
	// If in encrypted mode negate secure rewrite here.
	if secure && fs.prf != nil {
		secure = false
	}

	if fs.state.Msgs == 0 {
		var err = ErrStoreEOF
		if seq <= fs.state.LastSeq {
			err = ErrStoreMsgNotFound
		}
		fsUnlock()
		return false, err
	}

	mb := fs.selectMsgBlock(seq)
	if mb == nil {
		var err = ErrStoreEOF
		if seq <= fs.state.LastSeq {
			err = ErrStoreMsgNotFound
		}
		fsUnlock()
		return false, err
	}

	mb.mu.Lock()

	// See if we are closed or the sequence number is still relevant.
	if mb.closed || seq < mb.first.seq {
		mb.mu.Unlock()
		fsUnlock()
		return false, nil
	}

	// Now check dmap if it is there.
	if mb.dmap.Exists(seq) {
		mb.mu.Unlock()
		fsUnlock()
		return false, nil
	}

	// We used to not have to load in the messages except with callbacks or the filtered subject state (which is now always on).
	// Now just load regardless.
	// TODO(dlc) - Figure out a way not to have to load it in, we need subject tracking outside main data block.
	if mb.cacheNotLoaded() {
		// We do not want to block possible activity within another msg block.
		// We have to unlock both locks and acquire the mb lock in the loadMsgs() call to avoid a deadlock if another
		// go routine was trying to get fs then this mb lock at the same time. E.g. another call to remove for same block.
		// NOTE(review): when needFSLock is false fsUnlock is a no-op, so the caller's
		// fs lock stays held across the load.
		mb.mu.Unlock()
		fsUnlock()
		if err := mb.loadMsgs(); err != nil {
			return false, err
		}
		fsLock()
		// We need to check if things changed out from underneath us.
		if fs.closed {
			fsUnlock()
			return false, ErrStoreClosed
		}
		mb.mu.Lock()
		if mb.closed || seq < mb.first.seq {
			mb.mu.Unlock()
			fsUnlock()
			return false, nil
		}
		// cacheLookup below will do dmap check so no need to repeat here.
	}

	var smv StoreMsg
	sm, err := mb.cacheLookup(seq, &smv)
	if err != nil {
		mb.mu.Unlock()
		fsUnlock()
		// Mimic err behavior from above check to dmap. No error returned if already removed.
		if err == errDeletedMsg {
			err = nil
		}
		return false, err
	}
	// Grab size
	msz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)

	// Set cache timestamp for last remove.
	mb.lrts = time.Now().UnixNano()

	// Global stats
	// Counters are clamped at zero rather than allowed to wrap.
	if fs.state.Msgs > 0 {
		fs.state.Msgs--
	}
	if msz < fs.state.Bytes {
		fs.state.Bytes -= msz
	} else {
		fs.state.Bytes = 0
	}

	// Now local mb updates.
	if mb.msgs > 0 {
		mb.msgs--
	}
	if msz < mb.bytes {
		mb.bytes -= msz
	} else {
		mb.bytes = 0
	}

	// Mark as dirty for stream state.
	fs.dirty++

	// If we are tracking subjects here make sure we update that accounting.
	mb.ensurePerSubjectInfoLoaded()

	// If we are tracking multiple subjects here make sure we update that accounting.
	mb.removeSeqPerSubject(sm.subj, seq)
	fs.removePerSubject(sm.subj)

	if secure {
		// Grab record info.
		// NOTE(review): slotInfo's error result is discarded here.
		ri, rl, _, _ := mb.slotInfo(int(seq - mb.cache.fseq))
		mb.eraseMsg(seq, int(ri), int(rl))
	}

	// fifo means we removed the block's current first message.
	fifo := seq == mb.first.seq
	isLastBlock := mb == fs.lmb
	isEmpty := mb.msgs == 0

	if fifo {
		mb.selectNextFirst()
		if !isEmpty {
			// Can update this one in place.
			if seq == fs.state.FirstSeq {
				fs.state.FirstSeq = mb.first.seq // new one.
				if mb.first.ts == 0 {
					fs.state.FirstTime = time.Time{}
				} else {
					fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
				}
			}
		}
	} else if !isEmpty {
		if mb.dmap.IsEmpty() {
			// Mark initial base for delete set.
			mb.dmap.SetInitialMin(mb.first.seq)
		}
		// Out of order delete.
		mb.dmap.Insert(seq)
		// Check if <25% utilization and minimum size met.
		if mb.rbytes > compactMinimum && !isLastBlock {
			// Remove the interior delete records
			rbytes := mb.rbytes - uint64(mb.dmap.Size()*emptyRecordLen)
			if rbytes>>2 > mb.bytes {
				mb.compact()
			}
		}
	}

	if secure {
		if ld, _ := mb.flushPendingMsgsLocked(); ld != nil {
			// We have the mb lock here, this needs the mb locks so do in its own go routine.
			go fs.rebuildState(ld)
		}
	}

	// If empty remove this block and check if we need to update first sequence.
	// We will write a tombstone at the end.
	var firstSeqNeedsUpdate bool
	if isEmpty {
		fs.removeMsgBlock(mb)
		firstSeqNeedsUpdate = seq == fs.state.FirstSeq
	}
	mb.mu.Unlock()

	// If we emptied the current message block and the seq was state.FirstSeq
	// then we need to jump message blocks. We will also write the index so
	// we don't lose track of the first sequence.
	if firstSeqNeedsUpdate {
		fs.selectNextFirst()
	}

	// Check if we need to write a deleted record tombstone.
	// This is for user initiated removes or to hold the first seq
	// when the last block is empty.
	if !viaLimits || (isEmpty && isLastBlock) {
		if lmb := fs.lmb; sm != nil && lmb != nil {
			lmb.writeTombstone(sm.seq, sm.ts)
		}
		fs.kickFlushStateLoop()
	}

	if cb := fs.scb; cb != nil {
		// If we have a callback registered we need to release lock regardless since cb might need it to lookup msg, etc.
		fs.mu.Unlock()
		// Storage updates.
		var subj string
		if sm != nil {
			subj = sm.subj
		}
		delta := int64(msz)
		cb(-1, -delta, seq, subj)

		// The caller owned the lock on entry, so hand it back locked.
		if !needFSLock {
			fs.mu.Lock()
		}
	} else if needFSLock {
		// We acquired it so release it.
		fs.mu.Unlock()
	}

	return true, nil
}

// This will compact and rewrite this block. This should only be called when we know we want to rewrite this block.
// This should not be called on the lmb since we will prune tail deleted messages which could cause issues with
// writing new messages. We will silently bail on any issues with the underlying block and let someone else detect.
//
// The cached block buffer is walked record by record. Live records are copied
// to a new buffer, deleted records in front of the first live record are
// dropped, and deleted records after it are replaced by empty placeholder
// records. The result is written to a temporary file and renamed over the
// block file, after which the delete map is emptied and the block state rebuilt.
// Write lock needs to be held.
func (mb *msgBlock) compact() {
	wasLoaded := mb.cacheAlreadyLoaded()
	if !wasLoaded {
		if err := mb.loadMsgsWithLock(); err != nil {
			return
		}
	}

	buf := mb.cache.buf
	nbuf := make([]byte, 0, len(buf))

	var le = binary.LittleEndian
	var firstSet bool

	// A record counts as deleted if it has no sequence, carries the erase bit,
	// sits below the block's first sequence or is in the delete map.
	isDeleted := func(seq uint64) bool {
		if seq == 0 || seq&ebit != 0 || seq < mb.first.seq {
			return true
		}
		return mb.dmap.Exists(seq)
	}

	// For skip msgs.
	var smh [msgHdrSize]byte

	// NOTE(review): the early returns inside this loop leave any changes already
	// made to mb.first.seq and mb.last.seq in place.
	for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
		if index+msgHdrSize > lbuf {
			return
		}
		hdr := buf[index : index+msgHdrSize]
		// Header fields read here: record length at offset 0, sequence at 4 and
		// subject length at 20.
		rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])
		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize
		// Do some quick sanity checks here.
		if dlen < 0 || int(slen) > dlen || dlen > int(rl) || rl > rlBadThresh || index+rl > lbuf {
			return
		}
		// Only need to process non-deleted messages.
		seq := le.Uint64(hdr[4:])

		if !isDeleted(seq) {
			// Normal message here.
			nbuf = append(nbuf, buf[index:index+rl]...)
			// Do not set based on tombstone.
			if !firstSet && seq&tbit == 0 {
				firstSet = true
				mb.first.seq = seq
			}
		} else if firstSet {
			// This is an interior delete that we need to make sure we have a placeholder for.
			le.PutUint32(smh[0:], emptyRecordLen)
			le.PutUint64(smh[4:], seq|ebit)
			le.PutUint64(smh[12:], 0)
			le.PutUint16(smh[20:], 0)
			nbuf = append(nbuf, smh[:]...)
			// The placeholder's checksum covers the sequence and timestamp fields.
			mb.hh.Reset()
			mb.hh.Write(smh[4:20])
			checksum := mb.hh.Sum(nil)
			nbuf = append(nbuf, checksum...)
		}
		// Always set last.
		mb.last.seq = seq &^ ebit

		// Advance to next record.
		index += rl
	}

	// Check for encryption.
	if mb.bek != nil && len(nbuf) > 0 {
		// Recreate to reset counter.
		rbek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return
		}
		rbek.XORKeyStream(nbuf, nbuf)
	}

	// Close FDs first.
	mb.closeFDsLocked()

	// We will write to a new file and mv/rename it in case of failure.
	mfn := filepath.Join(filepath.Join(mb.fs.fcfg.StoreDir, msgDir), fmt.Sprintf(newScan, mb.index))
	if err := os.WriteFile(mfn, nbuf, defaultFilePerms); err != nil {
		os.Remove(mfn)
		return
	}
	if err := os.Rename(mfn, mb.mfn); err != nil {
		os.Remove(mfn)
		return
	}

	// Remove index file and wipe delete map, then rebuild.
	mb.deleteDmap()
	mb.rebuildStateLocked()

	// If we entered with the msgs loaded make sure to reload them.
	if wasLoaded {
		mb.loadMsgsWithLock()
	}
}

// deleteDmap clears every entry from this block's delete map.
func (mb *msgBlock) deleteDmap() {
	mb.dmap.Empty()
}

// Grab info from a slot.
// Returns the record's index into the cache buffer, the record length and
// whether the record's hash has already been checked.
// Errors: errPartialCache if there is no cache or slot is outside the cache
// index, errDeletedMsg for a deleted slot and errBadMsg if the computed record
// length is smaller than a message header.
// Lock should be held.
func (mb *msgBlock) slotInfo(slot int) (uint32, uint32, bool, error) {
	// A negative slot would panic when indexing below, so treat it like any
	// other slot that is not covered by the cache.
	if mb.cache == nil || slot < 0 || slot >= len(mb.cache.idx) {
		return 0, 0, false, errPartialCache
	}

	bi := mb.cache.idx[slot]
	// The hash-checked flag shares the index entry with the record offset.
	ri, hashChecked := (bi &^ hbit), (bi&hbit) != 0

	// If this is a deleted slot return here.
	if bi == dbit {
		return 0, 0, false, errDeletedMsg
	}

	// Determine record length
	// It is the distance to the next slot, or the cached last record length
	// when this is the final slot.
	var rl uint32
	if len(mb.cache.idx) > slot+1 {
		ni := mb.cache.idx[slot+1] &^ hbit
		rl = ni - ri
	} else {
		rl = mb.cache.lrl
	}
	if rl < msgHdrSize {
		return 0, 0, false, errBadMsg
	}
	return uint32(ri), rl, hashChecked, nil
}

// isClosed reports whether the store has been marked closed.
// It takes the read lock itself.
func (fs *fileStore) isClosed() bool {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	return fs.closed
}

// spinUpFlushLoop starts the flush loop goroutine for this block unless a
// flusher is already running or the block is closed. It creates the flush
// (buffered, size one) and quit channels handed to the loop.
func (mb *msgBlock) spinUpFlushLoop() {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	// Nothing to do when closed or already flushing.
	if mb.closed || mb.flusher {
		return
	}

	fch, qch := make(chan struct{}, 1), make(chan struct{})
	mb.flusher, mb.fch, mb.qch = true, fch, qch

	go mb.flushLoop(fch, qch)
}

// kickFlusher signals a flush loop through fch without ever blocking.
// A nil channel is ignored, and the signal is dropped when the channel cannot
// accept it right away (for example when one is already pending).
func kickFlusher(fch chan struct{}) {
	if fch == nil {
		return
	}
	select {
	case fch <- struct{}{}:
		// Signal delivered.
	default:
		// Would block, skip.
	}
}

// kickFlusher signals this message block's flush loop, if it has a flush
// channel, without blocking. The signal is sent while holding the read lock.
func (mb *msgBlock) kickFlusher() {
	mb.mu.RLock()
	kickFlusher(mb.fch)
	mb.mu.RUnlock()
}

// setInFlusher marks this block as having an active flusher.
// Acquires mb.mu.
func (mb *msgBlock) setInFlusher() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.flusher = true
}

// clearInFlusher marks this block as no longer having an active flusher.
// Acquires mb.mu.
func (mb *msgBlock) clearInFlusher() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.flusher = false
}

// flushLoop is the body of the background flusher goroutine for a block.
//
// It waits for kicks on fch and flushes pending writes, and exits when qch
// fires. The flusher flag is set on entry and cleared on exit. After each
// flush, if this block is no longer the filestore's last message block and
// its file descriptors can be closed, the loop also exits.
func (mb *msgBlock) flushLoop(fch, qch chan struct{}) {
	mb.setInFlusher()
	defer mb.clearInFlusher()

	for {
		select {
		case <-fch:
			// If we have pending messages process them first.
			if waiting := mb.pendingWriteSize(); waiting != 0 {
				ts := 1 * time.Millisecond
				var waited time.Duration

				// Try to coalesce small writes: while below coalesceMinimum, sleep with
				// a doubling backoff as long as more data keeps arriving.
				for waiting < coalesceMinimum {
					time.Sleep(ts)
					// Bail out right away if we were asked to quit while sleeping.
					select {
					case <-qch:
						return
					default:
					}
					newWaiting := mb.pendingWriteSize()
					// Stop waiting once we exceeded maxFlushWait or nothing new was added.
					if waited = waited + ts; waited > maxFlushWait || newWaiting <= waiting {
						break
					}
					waiting = newWaiting
					ts *= 2
				}
				mb.flushPendingMsgs()
				// Check if we are no longer the last message block. If we are
				// not we can close FDs and exit.
				mb.fs.mu.RLock()
				notLast := mb != mb.fs.lmb
				mb.fs.mu.RUnlock()
				if notLast {
					// closeFDs fails if data is still pending, in which case we keep looping.
					if err := mb.closeFDs(); err == nil {
						return
					}
				}
			}
		case <-qch:
			return
		}
	}
}

// eraseMsg overwrites the record for seq, located at offset ri with length rl,
// with a record of the same length whose payload is random bytes. The new
// record carries seq with the ebit flag set, a zero timestamp, a zero subject
// length and a freshly computed hash.
//
// The cache is updated when ri falls at or after the cache offset, and the
// file on disk is updated (and synced) when the record lies below the
// cache's offset plus write pointer.
//
// Lock should be held.
func (mb *msgBlock) eraseMsg(seq uint64, ri, rl int) error {
	// Build the replacement header.
	var hdr [msgHdrSize]byte
	le := binary.LittleEndian
	le.PutUint32(hdr[0:], uint32(rl))
	le.PutUint64(hdr[4:], seq|ebit)
	le.PutUint64(hdr[12:], 0)
	le.PutUint16(hdr[20:], 0)

	// Random payload that fills the rest of the record.
	data := make([]byte, rl-emptyRecordLen)
	rand.Read(data)

	// Hash covers sequence and timestamp from the header plus the payload.
	mb.hh.Reset()
	mb.hh.Write(hdr[4:20])
	mb.hh.Write(data)
	checksum := mb.hh.Sum(nil)

	// Assemble the full record: header, payload, hash.
	var rec bytes.Buffer
	rec.Write(hdr[:])
	rec.Write(data)
	rec.Write(checksum)
	nbytes := rec.Bytes()

	// Overwrite in the cache if the record is covered by it.
	if off := mb.cache.off; ri >= off {
		start := ri - off
		copy(mb.cache.buf[start:start+rl], nbytes)
	}

	// Nothing more to do if this record is not below the cache's write position.
	if mb.cache.off+mb.cache.wp <= ri {
		return nil
	}

	// Overwrite on disk.
	mfd, err := os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
	if err != nil {
		return err
	}
	defer mfd.Close()

	if _, err = mfd.WriteAt(nbytes, int64(ri)); err != nil {
		return err
	}
	mfd.Sync()
	return nil
}

// truncate cuts this message block down so that sm becomes its last message.
//
// Every sequence above sm.seq is dropped: entries present in the delete map
// are removed from the map, all others are subtracted from the block's
// message and byte accounting. The backing file is then shortened so that it
// ends right after sm's record, the last checksum (mb.lchk) is refreshed,
// the cache and per subject info are reset, and the messages are loaded again.
//
// Compressed blocks can not be truncated in place, so the whole block is
// loaded, decrypted if needed, decompressed, cut, recompressed, re-encrypted
// and written back out.
//
// Returns the number of messages and bytes removed. mb.mu is acquired here
// and is released on every return path, including all error paths.
func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) {
	// Make sure we are loaded to process messages etc.
	if err := mb.loadMsgs(); err != nil {
		return 0, 0, err
	}

	// Calculate new eof using slot info from our new last sm.
	ri, rl, _, err := mb.slotInfo(int(sm.seq - mb.cache.fseq))
	if err != nil {
		return 0, 0, err
	}
	// Calculate new eof.
	eof := int64(ri + rl)

	var purged, bytes uint64

	mb.mu.Lock()

	// fail releases the block lock before reporting an error, so that no
	// error path below can leave mb.mu locked.
	fail := func(err error) (uint64, uint64, error) {
		mb.mu.Unlock()
		return 0, 0, err
	}

	checkDmap := mb.dmap.Size() > 0
	var smv StoreMsg

	for seq := mb.last.seq; seq > sm.seq; seq-- {
		if checkDmap {
			if mb.dmap.Exists(seq) {
				// Delete and skip to next.
				mb.dmap.Delete(seq)
				checkDmap = !mb.dmap.IsEmpty()
				continue
			}
		}
		// We should have a valid msg to calculate removal stats.
		if m, err := mb.cacheLookup(seq, &smv); err == nil {
			if mb.msgs > 0 {
				rl := fileStoreMsgSize(m.subj, m.hdr, m.msg)
				mb.msgs--
				if rl > mb.bytes {
					rl = mb.bytes
				}
				mb.bytes -= rl
				mb.rbytes -= rl
				// For return accounting.
				purged++
				bytes += uint64(rl)
			}
		}
	}

	// If the block is compressed then we have to load it into memory
	// and decompress it, truncate it and then write it back out.
	// Otherwise, truncate the file itself and close the descriptor.
	if mb.cmp != NoCompression {
		buf, err := mb.loadBlock(nil)
		if err != nil {
			return fail(fmt.Errorf("failed to load block from disk: %w", err))
		}
		if mb.bek != nil && len(buf) > 0 {
			// Regenerate the key stream so we decrypt from the start of the block.
			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
			if err != nil {
				return fail(err)
			}
			mb.bek = bek
			mb.bek.XORKeyStream(buf, buf)
		}
		buf, err = mb.decompressIfNeeded(buf)
		if err != nil {
			return fail(fmt.Errorf("failed to decompress block: %w", err))
		}
		// Make sure the new eof is usable before slicing.
		if n := int(eof); n < checksumSize || n > len(buf) {
			return fail(fmt.Errorf("truncate offset %d out of range for block of %d bytes", eof, len(buf)))
		}
		buf = buf[:eof]
		// Update our checksum, which is the trailing hash of the new last
		// record. This matches what is read from eof-8 for uncompressed blocks.
		copy(mb.lchk[0:], buf[len(buf)-checksumSize:])
		buf, err = mb.cmp.Compress(buf)
		if err != nil {
			return fail(fmt.Errorf("failed to recompress block: %w", err))
		}
		meta := &CompressionInfo{
			Algorithm:    mb.cmp,
			OriginalSize: uint64(eof),
		}
		buf = append(meta.MarshalMetadata(), buf...)
		if mb.bek != nil && len(buf) > 0 {
			// Fresh key stream again since we rewrite the block from offset 0.
			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
			if err != nil {
				return fail(err)
			}
			mb.bek = bek
			mb.bek.XORKeyStream(buf, buf)
		}
		n, err := mb.writeAt(buf, 0)
		if err != nil {
			return fail(fmt.Errorf("failed to rewrite compressed block: %w", err))
		}
		if n != len(buf) {
			return fail(fmt.Errorf("short write (%d != %d)", n, len(buf)))
		}
		mb.mfd.Truncate(int64(len(buf)))
		mb.mfd.Sync()
	} else if mb.mfd != nil {
		mb.mfd.Truncate(eof)
		mb.mfd.Sync()
		// Update our checksum.
		var lchk [8]byte
		mb.mfd.ReadAt(lchk[:], eof-8)
		copy(mb.lchk[0:], lchk[:])
	} else {
		return fail(fmt.Errorf("failed to truncate msg block %d, file not open", mb.index))
	}

	// Update our last msg.
	mb.last.seq = sm.seq
	mb.last.ts = sm.ts

	// Clear our cache.
	mb.clearCacheAndOffset()

	// Redo per subject info for this block.
	mb.resetPerSubjectInfo()

	mb.mu.Unlock()

	// Load msgs again.
	mb.loadMsgs()

	return purged, bytes, nil
}

// isEmpty reports whether this block holds no sequences, which is the case
// when its first sequence is greater than its last sequence.
// Lock should be held.
func (mb *msgBlock) isEmpty() bool {
	return mb.first.seq > mb.last.seq
}

// selectNextFirst advances this block's first sequence to the next sequence
// that is not present in the delete map, removing the delete map entries it
// moves past, and then refreshes the first timestamp.
//
// If the block becomes empty the first timestamp is zeroed. Otherwise the
// timestamp is taken from the cache when possible, falling back to fetchMsg.
// Note that the lock is released and re-acquired around that fallback.
//
// Lock should be held.
func (mb *msgBlock) selectNextFirst() {
	// Skip over deleted sequences, cleaning up the delete map as we go.
	seq := mb.first.seq + 1
	for ; seq <= mb.last.seq && mb.dmap.Exists(seq); seq++ {
		mb.dmap.Delete(seq)
	}
	// Set new first sequence.
	mb.first.seq = seq

	// Nothing left in this block.
	if mb.isEmpty() {
		mb.first.ts = 0
		return
	}

	// Need the timestamp of the new first message, try the cache directly.
	var smv StoreMsg
	sm, _ := mb.cacheLookup(seq, &smv)
	if sm == nil {
		// Slow path, fetchMsg needs the lock so drop it for the call.
		mb.mu.Unlock()
		sm, _, _ = mb.fetchMsg(seq, &smv)
		mb.mu.Lock()
	}

	var ts int64
	if sm != nil {
		ts = sm.ts
	}
	mb.first.ts = ts
}

// selectNextFirst refreshes the stream state's first sequence and first time
// from the first message block. With no blocks left this is treated like a
// purge: the first sequence becomes LastSeq+1 and the first time is zeroed.
// Lock should be held.
func (fs *fileStore) selectNextFirst() {
	if len(fs.blks) == 0 {
		// Could not find anything, so treat like purge.
		fs.state.FirstSeq = fs.state.LastSeq + 1
		fs.state.FirstTime = time.Time{}
		return
	}

	mb := fs.blks[0]
	mb.mu.RLock()
	seq, ts := mb.first.seq, mb.first.ts
	mb.mu.RUnlock()

	fs.state.FirstSeq = seq
	fs.state.FirstTime = time.Unix(0, ts).UTC()
}

// resetCacheExpireTimer (re)arms the cache expiration timer to fire after td.
// A td of zero means use the block's configured cache expiration (mb.cexp).
// The timer is created on first use and invokes mb.expireCache.
// Lock should be held.
func (mb *msgBlock) resetCacheExpireTimer(td time.Duration) {
	if td == 0 {
		td = mb.cexp
	}
	if mb.ctmr != nil {
		mb.ctmr.Reset(td)
		return
	}
	mb.ctmr = time.AfterFunc(td, mb.expireCache)
}

// startCacheExpireTimer arms the cache expiration timer using the block's
// default expiration (a zero duration makes resetCacheExpireTimer use mb.cexp).
// Lock should be held.
func (mb *msgBlock) startCacheExpireTimer() {
	mb.resetCacheExpireTimer(0)
}

// clearCacheAndOffset resets the linear scan tracker and the cache's offset
// and write pointer, and then clears the cache itself.
// Used when we load in a message block.
// Lock should be held.
func (mb *msgBlock) clearCacheAndOffset() {
	// Reset linear scan tracker.
	mb.llseq = 0
	if c := mb.cache; c != nil {
		c.off, c.wp = 0, 0
	}
	mb.clearCache()
}

// clearCache releases the block's message cache and recycles its buffer.
//
// The expiration timer is stopped only when no per subject info (mb.fss) is
// being held. When the cache offset is zero the cache is dropped entirely,
// otherwise only the buffer, index and write pointer are reset.
//
// Lock should be held.
func (mb *msgBlock) clearCache() {
	if mb.fss == nil && mb.ctmr != nil {
		mb.ctmr.Stop()
		mb.ctmr = nil
	}

	c := mb.cache
	if c == nil {
		return
	}

	buf := c.buf
	if c.off != 0 {
		// Clear msgs and index but keep the cache so the offset survives.
		c.buf, c.idx, c.wp = nil, nil, 0
	} else {
		mb.cache = nil
	}
	recycleMsgBlockBuf(buf)
}

// Called to possibly expire a message block cache.
// This is the callback used by the cache expiration timer. It acquires mb.mu
// and delegates to expireCacheLocked.
func (mb *msgBlock) expireCache() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.expireCacheLocked()
}

// tryForceExpireCache acquires mb.mu and attempts to force expiration of the
// cache via tryForceExpireCacheLocked.
func (mb *msgBlock) tryForceExpireCache() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.tryForceExpireCacheLocked()
}

// We will attempt to force expire this by temporarily clearing the last load time.
// The last load time (mb.llts) is restored afterwards. The last write time is
// left untouched, so recent writes can still prevent the expiration.
// Lock should be held.
func (mb *msgBlock) tryForceExpireCacheLocked() {
	llts := mb.llts
	mb.llts = 0
	mb.expireCacheLocked()
	mb.llts = llts
}

// This is for expiration of the write cache, which will be partial with fip.
// So we want to bypass the Pools here.
//
// Returns the cache's buffer, reset to zero length, when the cache buffer was
// released and the block had never been loaded (llts == 0), so the caller can
// reuse it. Returns nil otherwise.
// Lock should be held.
func (mb *msgBlock) tryExpireWriteCache() []byte {
	if mb.cache == nil {
		return nil
	}
	// Remember current values so we can restore them after the expire attempt.
	lwts, buf, llts, nra := mb.lwts, mb.cache.buf, mb.llts, mb.cache.nra
	// Clear the last write time so it does not block expiration, and set nra so
	// that expireCacheLocked does not recycle the buffer we may hand back.
	mb.lwts, mb.cache.nra = 0, true
	mb.expireCacheLocked()
	mb.lwts = lwts
	// The cache may have been dropped entirely by the expire.
	if mb.cache != nil {
		mb.cache.nra = nra
	}
	// We could check for a certain time since last load, but to be safe just reuse if no loads at all.
	if llts == 0 && (mb.cache == nil || mb.cache.buf == nil) {
		// Clear last write time since we now are about to move on to a new lmb.
		mb.lwts = 0
		return buf[:0]
	}
	return nil
}

// expireCacheLocked expires the block's cache and per subject info if nothing
// prevents it.
//
// The expiration is postponed, by re-arming the timer, when there is still
// unwritten data in the cache or when the last load or write happened within
// the cache expiration window (mb.cexp). Otherwise the message buffer is
// released, the per subject info is dropped and the cache is cleared.
//
// Lock should be held.
func (mb *msgBlock) expireCacheLocked() {
	// Nothing is held, just make sure the timer is stopped.
	if mb.cache == nil && mb.fss == nil {
		if mb.ctmr != nil {
			mb.ctmr.Stop()
			mb.ctmr = nil
		}
		return
	}

	// Can't expire if we still have pending.
	if mb.cache != nil && len(mb.cache.buf)-int(mb.cache.wp) > 0 {
		mb.resetCacheExpireTimer(mb.cexp)
		return
	}

	// Grab timestamp to compare.
	tns := time.Now().UnixNano()

	// For the core buffer of messages, we care about reads and writes, but not removes.
	// Use the most recent of last load and last write.
	bufts := mb.llts
	if mb.lwts > bufts {
		bufts = mb.lwts
	}

	// Check for activity on the cache that would prevent us from expiring.
	// If so, re-arm the timer for the remainder of the expiration window.
	if tns-bufts <= int64(mb.cexp) {
		mb.resetCacheExpireTimer(mb.cexp - time.Duration(tns-bufts))
		return
	}

	// If we are here we will at least expire the core msg buffer.
	// We need to capture offset in case we do a write next before a full load.
	if mb.cache != nil {
		mb.cache.off += len(mb.cache.buf)
		// Only recycle the buffer here if recycling is allowed for this cache.
		if !mb.cache.nra {
			recycleMsgBlockBuf(mb.cache.buf)
		}
		mb.cache.buf = nil
		mb.cache.wp = 0
	}

	// Check if we can clear out our fss and idx unless under force expire.
	// We used to hold onto the idx longer but removes need buf now so no point.
	mb.fss = nil
	mb.clearCache()
}

// startAgeChk arms the message age check timer to fire after MaxAge.
// It does nothing when a timer already exists or no MaxAge is configured.
func (fs *fileStore) startAgeChk() {
	if fs.ageChk != nil || fs.cfg.MaxAge == 0 {
		return
	}
	fs.ageChk = time.AfterFunc(fs.cfg.MaxAge, fs.expireMsgs)
}

// resetAgeChk (re)arms the age check timer.
// The timer fires after delta nanoseconds when delta is positive and shorter
// than MaxAge, and after MaxAge otherwise. No-op when MaxAge is not set.
// Lock should be held.
func (fs *fileStore) resetAgeChk(delta int64) {
	maxAge := fs.cfg.MaxAge
	if maxAge == 0 {
		return
	}

	fireIn := maxAge
	if d := time.Duration(delta); delta > 0 && d < maxAge {
		fireIn = d
	}

	if fs.ageChk == nil {
		fs.ageChk = time.AfterFunc(fireIn, fs.expireMsgs)
		return
	}
	fs.ageChk.Reset(fireIn)
}

// cancelAgeChk stops and discards the age check timer, if there is one.
// Lock should be held.
func (fs *fileStore) cancelAgeChk() {
	if fs.ageChk == nil {
		return
	}
	fs.ageChk.Stop()
	fs.ageChk = nil
}

// expireMsgs removes messages that are older than MaxAge and then re-arms,
// or cancels, the age check timer. This is the age check timer callback.
func (fs *fileStore) expireMsgs() {
	// We need to delete one by one here and can not optimize for the time being.
	// Reason is that we need more information to adjust ack pending in consumers.
	var smv StoreMsg

	fs.mu.RLock()
	maxAge := int64(fs.cfg.MaxAge)
	minAge := time.Now().UnixNano() - maxAge
	fs.mu.RUnlock()

	// Keep removing the first message for as long as it is too old.
	sm, _ := fs.msgForSeq(0, &smv)
	for sm != nil && sm.ts <= minAge {
		fs.mu.Lock()
		fs.removeMsgViaLimits(sm.seq)
		fs.mu.Unlock()
		// Recalculate in case we are expiring a bunch.
		minAge = time.Now().UnixNano() - maxAge
		sm, _ = fs.msgForSeq(0, &smv)
	}

	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Only cancel if no message left, not on potential lookup error that would result in sm == nil.
	switch {
	case fs.state.Msgs == 0:
		fs.cancelAgeChk()
	case sm == nil:
		fs.resetAgeChk(0)
	default:
		// Fire again when the current first message is due to expire.
		fs.resetAgeChk(sm.ts - minAge)
	}
}

// checkAndFlushAllBlocks flushes every block that has pending writes and
// rebuilds the filestore state whenever a flush reports lost data.
// Lock should be held.
func (fs *fileStore) checkAndFlushAllBlocks() {
	for _, mb := range fs.blks {
		if mb.pendingWriteSize() <= 0 {
			continue
		}
		// Since fs lock is held need to pull this apart in case we need to rebuild state.
		mb.mu.Lock()
		ld, _ := mb.flushPendingMsgsLocked()
		mb.mu.Unlock()
		if ld != nil {
			fs.rebuildStateLocked(ld)
		}
	}
}

// checkMsgs flushes all blocks, rebuilds the state of each block and the
// global per subject info, and returns the filestore's lost data record
// (fs.ld).
// Acquires the filestore lock.
func (fs *fileStore) checkMsgs() *LostStreamData {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	fs.checkAndFlushAllBlocks()

	// Start over with empty global subject state.
	fs.psim = make(map[string]*psi)

	for _, mb := range fs.blks {
		// Make sure encryption loaded if needed for the block.
		fs.loadEncryptionForMsgBlock(mb)
		// FIXME(dlc) - check tombstones here too?
		ld, _, err := mb.rebuildState()
		if err != nil && ld != nil {
			// Rebuild fs state too.
			mb.fs.rebuildStateLocked(ld)
		}
		fs.populateGlobalPerSubjectInfo(mb)
	}

	return fs.ld
}

// enableForWriting makes sure the block's file is open for writing, creating
// it if needed, and starts the background flush loop unless fip is set.
//
// Returns errNoMsgBlk for a nil block and a wrapped error if the file can not
// be opened. It is a no-op when the file descriptor is already open.
//
// Lock should be held.
func (mb *msgBlock) enableForWriting(fip bool) error {
	if mb == nil {
		return errNoMsgBlk
	}
	if mb.mfd != nil {
		return nil
	}
	mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
	if err != nil {
		return fmt.Errorf("error opening msg block file [%q]: %w", mb.mfn, err)
	}
	mb.mfd = mfd

	// Spin up our flusher loop if needed.
	// We are called with mb.mu held, so we can not call spinUpFlushLoop() here
	// since it acquires mb.mu itself and the mutex is not reentrant, which
	// would deadlock. Do the same work inline under the lock we already hold.
	if !fip && !mb.flusher && !mb.closed {
		mb.flusher = true
		mb.fch = make(chan struct{}, 1)
		mb.qch = make(chan struct{})
		go mb.flushLoop(mb.fch, mb.qch)
	}

	return nil
}

// Helper function to place a delete tombstone.
// The tombstone is written as an empty record (emptyRecordLen) whose sequence
// has the tbit flag set, with no subject, headers or payload, and is flushed
// right away.
// NOTE(review): writeMsgRecord acquires mb.mu itself, so "lock" below
// presumably refers to the filestore lock and not mb.mu - confirm.
// Lock should be held.
func (mb *msgBlock) writeTombstone(seq uint64, ts int64) error {
	return mb.writeMsgRecord(emptyRecordLen, seq|tbit, _EMPTY_, nil, nil, ts, true)
}

// Will write the message record to the underlying message block.
//
// The record is appended to the block's write through cache, per subject
// info and accounting are updated, and the record is either flushed to disk
// right away (when flush is set or a previous write error is outstanding) or
// the background flusher is kicked.
//
// rl is the total record length, seq may carry the ebit or tbit flags, and ts
// is the message timestamp. Tombstones (tbit set) are not indexed and do not
// affect accounting.
//
// This acquires mb.mu. filestore lock will be held.
func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush bool) error {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	// Enable for writing if our mfd is not open.
	if mb.mfd == nil {
		if err := mb.enableForWriting(flush); err != nil {
			return err
		}
	}

	// Make sure we have a cache setup.
	if mb.cache == nil {
		mb.setupWriteCache(nil)
	}

	// Check if we are tracking per subject for our simple state.
	// Do this before changing the cache that would trigger a flush pending msgs call
	// if we needed to regenerate the per subject info.
	if len(subj) > 0 && !mb.noTrack {
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			return err
		}
		if ss := mb.fss[subj]; ss != nil {
			ss.Msgs++
			ss.Last = seq
		} else {
			mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
		}
	}

	// Indexing
	// Position of this record: current cache length plus the cache offset.
	index := len(mb.cache.buf) + int(mb.cache.off)

	// Formats
	// Format with no header
	// total_len(4) sequence(8) timestamp(8) subj_len(2) subj msg hash(8)
	// With headers, high bit on total length will be set.
	// total_len(4) sequence(8) timestamp(8) subj_len(2) subj hdr_len(4) hdr msg hash(8)

	// First write header, etc.
	var le = binary.LittleEndian
	var hdr [msgHdrSize]byte

	l := uint32(rl)
	hasHeaders := len(mhdr) > 0
	if hasHeaders {
		l |= hbit
	}

	le.PutUint32(hdr[0:], l)
	le.PutUint64(hdr[4:], seq)
	le.PutUint64(hdr[12:], uint64(ts))
	le.PutUint16(hdr[20:], uint16(len(subj)))

	// Now write to underlying buffer.
	mb.cache.buf = append(mb.cache.buf, hdr[:]...)
	mb.cache.buf = append(mb.cache.buf, subj...)

	if hasHeaders {
		var hlen [4]byte
		le.PutUint32(hlen[0:], uint32(len(mhdr)))
		mb.cache.buf = append(mb.cache.buf, hlen[:]...)
		mb.cache.buf = append(mb.cache.buf, mhdr...)
	}
	mb.cache.buf = append(mb.cache.buf, msg...)

	// Calculate hash.
	// Covers sequence and timestamp from the header, subject, headers and msg.
	mb.hh.Reset()
	mb.hh.Write(hdr[4:20])
	mb.hh.Write([]byte(subj))
	if hasHeaders {
		mb.hh.Write(mhdr)
	}
	mb.hh.Write(msg)
	checksum := mb.hh.Sum(nil)
	// Grab last checksum
	copy(mb.lchk[0:], checksum)

	// Update write through cache.
	// Write to msg record.
	mb.cache.buf = append(mb.cache.buf, checksum...)
	mb.cache.lrl = uint32(rl)

	// Set cache timestamp for last store.
	mb.lwts = ts

	// Only update index and do accounting if not a delete tombstone.
	if seq&tbit == 0 {
		// Accounting, do this before stripping ebit, it is ebit aware.
		mb.updateAccounting(seq, ts, rl)
		// Strip ebit if set.
		seq = seq &^ ebit
		if mb.cache.fseq == 0 {
			mb.cache.fseq = seq
		}
		// Write index
		// The hbit flag is set on the slot; slotInfo reports it as the hash being checked.
		mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit)
	}

	fch, werr := mb.fch, mb.werr

	// If we should be flushing, or had a write error, do so here.
	if flush || werr != nil {
		ld, err := mb.flushPendingMsgsLocked()
		if ld != nil && mb.fs != nil {
			// We have the mb lock here, this needs the mb locks so do in its own go routine.
			go mb.fs.rebuildState(ld)
		}
		if err != nil {
			return err
		}
	} else {
		// Kick the flusher here.
		kickFlusher(fch)
	}

	return nil
}

// pendingWriteSize returns the number of bytes in the cache that still have
// to be written to disk for this message block. A nil block has none.
// Takes the block's read lock.
func (mb *msgBlock) pendingWriteSize() int {
	if mb == nil {
		return 0
	}
	mb.mu.RLock()
	pending := mb.pendingWriteSizeLocked()
	mb.mu.RUnlock()
	return pending
}

// pendingWriteSizeLocked returns the number of cached bytes beyond the write
// pointer. It reports zero for a nil or closed block, or when there is no
// open file or no cache.
// Lock should be held.
func (mb *msgBlock) pendingWriteSizeLocked() int {
	if mb == nil || mb.closed || mb.mfd == nil || mb.cache == nil {
		return 0
	}
	return len(mb.cache.buf) - int(mb.cache.wp)
}

// Try to close our FDs if we can.
// Acquires mb.mu and delegates to closeFDsLocked, which refuses to close
// while there is still pending data to be written.
func (mb *msgBlock) closeFDs() error {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.closeFDsLocked()
}

// closeFDsLocked closes the block's file descriptor unless there is still
// data pending to be written, in which case errPendingData is returned.
// Lock should be held.
func (mb *msgBlock) closeFDsLocked() error {
	pending, _ := mb.bytesPending()
	if len(pending) > 0 {
		return errPendingData
	}
	mb.closeFDsLockedNoCheck()
	return nil
}

// closeFDsLockedNoCheck closes and clears the block's file descriptor, if
// open, without checking for pending data.
// Lock should be held.
func (mb *msgBlock) closeFDsLockedNoCheck() {
	if mb.mfd == nil {
		return
	}
	mb.mfd.Close()
	mb.mfd = nil
}

// bytesPending returns the part of the cache buffer that still has to be
// written to the underlying file, which is everything past the write pointer.
//
// It returns errNoPending when there is no block, no open file or nothing
// past the write pointer, and errNoCache when there is no cache.
// Lock should be held.
func (mb *msgBlock) bytesPending() ([]byte, error) {
	switch {
	case mb == nil || mb.mfd == nil:
		return nil, errNoPending
	case mb.cache == nil:
		return nil, errNoCache
	case len(mb.cache.buf) <= mb.cache.wp:
		return nil, errNoPending
	}
	// The buffer is longer than the write pointer, so this is never empty.
	return mb.cache.buf[mb.cache.wp:], nil
}

// blkSize returns the block's raw byte count (mb.rbytes), which includes
// deleted msgs etc. Takes the block's read lock.
func (mb *msgBlock) blkSize() uint64 {
	mb.mu.RLock()
	defer mb.mu.RUnlock()
	return mb.rbytes
}

// updateAccounting updates the block's first/last sequence and timestamp and
// its byte and message counters for a newly written record.
//
// seq may carry the ebit flag, marking the record as erased: such records
// only count towards the raw byte total, not towards bytes and msgs.
// Lock should be held.
func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) {
	// Remember whether this is an erased record, then strip the flag.
	erased := seq&ebit != 0
	seq &^= ebit

	// Set first if it has not been properly established yet.
	if firstUnset := mb.first.seq == 0 || mb.first.ts == 0; firstUnset && seq >= mb.first.seq {
		mb.first.seq, mb.first.ts = seq, ts
	}

	// Need atomics here for selectMsgBlock speed.
	atomic.StoreUint64(&mb.last.seq, seq)
	mb.last.ts = ts

	mb.rbytes += rl
	if erased {
		return
	}
	mb.bytes += rl
	mb.msgs++
}

// writeMsgRecord stores a message in the current last message block, rolling
// over to a new block first when there is no last block or when the record
// would push a non-empty block past the configured block size.
//
// When rolling over with compression enabled, the block being left behind is
// recompressed on disk in the background.
//
// Returns the record length, or ErrMsgTooLarge if the record size collides
// with the headers bit.
// Lock should be held.
func (fs *fileStore) writeMsgRecord(seq uint64, ts int64, subj string, hdr, msg []byte) (uint64, error) {
	// Get size for this message.
	rl := fileStoreMsgSize(subj, hdr, msg)
	if rl&hbit != 0 {
		return 0, ErrMsgTooLarge
	}

	// Grab our current last message block.
	mb := fs.lmb

	// Mark as dirty for stream state.
	fs.dirty++

	needNewBlock := mb == nil || (mb.msgs > 0 && mb.blkSize()+rl > fs.fcfg.BlockSize)
	if needNewBlock {
		// We've now reached the end of this message block, if we want
		// to compress blocks then now's the time to do it.
		if mb != nil && fs.fcfg.Compression != NoCompression {
			go mb.recompressOnDiskIfNeeded()
		}
		var err error
		if mb, err = fs.newMsgBlockForWrite(); err != nil {
			return 0, err
		}
	}

	// Ask msg block to store in write through cache.
	err := mb.writeMsgRecord(rl, seq, subj, hdr, msg, ts, fs.fip)
	return rl, err
}

// recompressOnDiskIfNeeded rewrites the block's file on disk so that it is
// stored with the filestore's configured compression algorithm.
//
// The block is read in full, decrypted if needed, decompressed with whatever
// algorithm its metadata header names, recompressed with the configured
// algorithm, re-encrypted if needed, written to a temporary file and finally
// renamed over the original. If the block is already stored with the
// configured algorithm nothing is done.
//
// The temporary file is only created once the new contents are fully
// prepared and is removed again on any failure, so an error never leaves an
// open descriptor or a stray temporary file behind.
//
// This waits for a disk I/O slot (dios) and acquires mb.mu.
func (mb *msgBlock) recompressOnDiskIfNeeded() error {
	// Wait for disk I/O slots to become available. This prevents us from
	// running away with system resources.
	<-dios
	defer func() {
		dios <- struct{}{}
	}()

	alg := mb.fs.fcfg.Compression
	mb.mu.Lock()
	defer mb.mu.Unlock()

	origFN := mb.mfn                    // The original message block on disk.
	tmpFN := mb.mfn + compressTmpSuffix // The compressed block will be written here.

	// Open up the file block and read in the entire contents into memory.
	// One of two things will happen:
	// 1. The block will be compressed already and have a valid metadata
	//    header, in which case we do nothing.
	// 2. The block will be uncompressed, in which case we will compress it
	//    and then write it back out to disk, reencrypting if necessary.
	origBuf, err := os.ReadFile(origFN)
	if err != nil {
		return fmt.Errorf("failed to read original block from disk: %w", err)
	}

	// If the block is encrypted then we will need to decrypt it before
	// doing anything. We always encrypt after compressing because then the
	// compression can be as efficient as possible on the raw data, whereas
	// the encrypted ciphertext will not compress anywhere near as well.
	// The block encryption also covers the optional compression metadata.
	if mb.bek != nil && len(origBuf) > 0 {
		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return err
		}
		mb.bek = bek
		mb.bek.XORKeyStream(origBuf, origBuf)
	}

	meta := &CompressionInfo{}
	n, err := meta.UnmarshalMetadata(origBuf)
	if err != nil {
		// An error is only returned here if there's a problem with parsing
		// the metadata. If the file has no metadata at all, no error is
		// returned and the algorithm defaults to no compression.
		return fmt.Errorf("failed to read existing metadata header: %w", err)
	}
	if meta.Algorithm == alg {
		// The block is already compressed with the chosen algorithm so there
		// is nothing else to do. This is not a common case, it is here only
		// to ensure we don't do unnecessary work in case something asked us
		// to recompress an already compressed block with the same algorithm.
		return nil
	}
	// Get back to the raw block contents using the algorithm the block is
	// currently stored with, regardless of the one we are moving to. The n
	// metadata bytes are not part of the compressed payload so they are
	// skipped, the same way decompressIfNeeded does it. For a block without
	// metadata n is zero and the algorithm is no compression.
	if origBuf, err = meta.Algorithm.Decompress(origBuf[n:]); err != nil {
		return fmt.Errorf("failed to decompress original block: %w", err)
	}

	// The original buffer at this point is uncompressed, so we will now compress
	// it if needed. Note that if the selected algorithm is NoCompression, the
	// Compress function will just return the input buffer unmodified.
	cmpBuf, err := alg.Compress(origBuf)
	if err != nil {
		return fmt.Errorf("failed to compress block: %w", err)
	}

	// We only need to write out the metadata header if compression is enabled.
	// If we're trying to uncompress the file on disk at this point, don't bother
	// writing metadata.
	if alg != NoCompression {
		hdr := &CompressionInfo{
			Algorithm:    alg,
			OriginalSize: uint64(len(origBuf)),
		}
		cmpBuf = append(hdr.MarshalMetadata(), cmpBuf...)
	}

	// Re-encrypt the block if necessary.
	if mb.bek != nil && len(cmpBuf) > 0 {
		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return err
		}
		mb.bek = bek
		mb.bek.XORKeyStream(cmpBuf, cmpBuf)
	}

	// Rather than modifying the existing block on disk (which is a dangerous
	// operation if something goes wrong), create a new temporary file. We will
	// write out the new block here and then swap the files around afterwards
	// once everything else has succeeded correctly.
	tmpFD, err := os.OpenFile(tmpFN, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, defaultFilePerms)
	if err != nil {
		return fmt.Errorf("failed to create temporary file: %w", err)
	}

	// Write the new block data (which might be compressed or encrypted) to the
	// temporary file.
	errorCleanup := func(err error) error {
		tmpFD.Close()
		os.Remove(tmpFN)
		return err
	}
	if n, err := tmpFD.Write(cmpBuf); err != nil {
		return errorCleanup(fmt.Errorf("failed to write to temporary file: %w", err))
	} else if n != len(cmpBuf) {
		return errorCleanup(fmt.Errorf("short write to temporary file (%d != %d)", n, len(cmpBuf)))
	}
	if err := tmpFD.Sync(); err != nil {
		return errorCleanup(fmt.Errorf("failed to sync temporary file: %w", err))
	}
	if err := tmpFD.Close(); err != nil {
		return errorCleanup(fmt.Errorf("failed to close temporary file: %w", err))
	}

	// Now replace the original file with the newly updated temp file.
	if err := os.Rename(tmpFN, origFN); err != nil {
		// Do not leave the temporary file behind.
		os.Remove(tmpFN)
		return fmt.Errorf("failed to move temporary file into place: %w", err)
	}

	// Since the message block might be retained in memory, make sure the
	// compression algorithm is up-to-date, since this will be needed when
	// compacting or truncating.
	mb.cmp = alg
	return nil
}

// decompressIfNeeded returns the uncompressed contents of a block buffer.
//
// If the buffer starts with a compression metadata header, the remainder is
// decompressed with the algorithm named in that header. A buffer without a
// metadata header is returned as-is. An error is returned when the header
// can not be parsed or decompression fails.
func (mb *msgBlock) decompressIfNeeded(buf []byte) ([]byte, error) {
	var meta CompressionInfo
	n, err := meta.UnmarshalMetadata(buf)
	switch {
	case err != nil:
		// There was a problem parsing the metadata header of the block.
		return nil, err
	case n == 0:
		// There were no metadata bytes, so we assume the block is not
		// compressed and return it as-is.
		return buf, nil
	}
	// Metadata was present so it's quite likely the block contents are
	// compressed. If by any chance the metadata claims that the block is
	// uncompressed, then the input slice is just returned unmodified.
	return meta.Algorithm.Decompress(buf[n:])
}

// Sync msg and index files as needed. This is called from a timer.
//
// Every block that is flagged as needing a sync has its pending data flushed
// and its file synced. Blocks whose file descriptor has been idle for longer
// than closeFDsIdle are closed. The sync timer is then re-armed and, unless
// the store runs in sync always mode, the stream state file is synced too.
func (fs *fileStore) syncBlocks() {
	fs.mu.RLock()
	if fs.closed {
		fs.mu.RUnlock()
		return
	}
	// Work on a copy of the block list so we do not hold the fs lock during I/O.
	blks := append([]*msgBlock(nil), fs.blks...)
	fs.mu.RUnlock()

	for _, mb := range blks {
		// Do actual sync. Hold lock for consistency.
		mb.mu.Lock()
		if mb.closed {
			mb.mu.Unlock()
			continue
		}
		if mb.needSync {
			// Flush anything that may be pending.
			if mb.pendingWriteSizeLocked() > 0 {
				mb.flushPendingMsgsLocked()
			}
			if mb.mfd != nil {
				mb.mfd.Sync()
			} else {
				// File is not open, so open it just to sync it.
				fd, err := os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
				if err != nil {
					// needSync stays set so this block is retried on the next run.
					mb.mu.Unlock()
					continue
				}
				fd.Sync()
				fd.Close()
			}
			mb.needSync = false
		}
		// See if we can close FDs due to being idle.
		if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle {
			mb.dirtyCloseWithRemove(false)
		}
		mb.mu.Unlock()
	}

	// Re-arm the timer for the next run.
	fs.mu.Lock()
	fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks)
	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	syncAlways := fs.fcfg.SyncAlways
	fs.mu.Unlock()

	// Sync the stream state file as well, best effort.
	if !syncAlways {
		if fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms); fd != nil {
			fd.Sync()
			fd.Close()
		}
	}
}

// Select the message block where this message should be found.
// Return nil if not in the set.
// This is selectMsgBlockWithIndex without the block's position.
// Read lock should be held.
func (fs *fileStore) selectMsgBlock(seq uint64) *msgBlock {
	_, mb := fs.selectMsgBlockWithIndex(seq)
	return mb
}

// selectMsgBlockWithIndex returns the message block that should contain seq
// along with its position in fs.blks, or -1 and nil if seq is outside of the
// stream's first/last range or no block matches.
//
// A small number of blocks is scanned linearly, comparing only against each
// block's last sequence. Larger sets use a binary search over the blocks'
// first and last sequences.
// Read lock should be held.
func (fs *fileStore) selectMsgBlockWithIndex(seq uint64) (int, *msgBlock) {
	// Check for out of range.
	if seq < fs.state.FirstSeq || seq > fs.state.LastSeq {
		return -1, nil
	}

	const linearThresh = 32
	hi := len(fs.blks) - 1

	if hi < linearThresh {
		for i := range fs.blks {
			if mb := fs.blks[i]; seq <= atomic.LoadUint64(&mb.last.seq) {
				return i, mb
			}
		}
		return -1, nil
	}

	// Do traditional binary search here since we know the blocks are sorted by sequence first and last.
	lo := 0
	for lo <= hi {
		mid := (lo + hi) / 2
		mb := fs.blks[mid]
		// Right now these atomic loads do not factor in, so fine to leave. Was considering
		// uplifting these to fs scope to avoid atomic load but not needed.
		first, last := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
		switch {
		case seq > last:
			lo = mid + 1
		case seq < first:
			hi = mid - 1
		default:
			return mid, mb
		}
	}

	return -1, nil
}

// selectMsgBlockForStart returns the first message block whose last
// timestamp is at or after minTime, or nil if there is no such block.
// Takes the filestore read lock.
func (fs *fileStore) selectMsgBlockForStart(minTime time.Time) *msgBlock {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	start := minTime.UnixNano()
	for _, mb := range fs.blks {
		mb.mu.RLock()
		lastTs := mb.last.ts
		mb.mu.RUnlock()
		if start <= lastTs {
			return mb
		}
	}
	return nil
}

// Index a raw msg buffer.
//
// Walks the records in buf and builds the cache index (one entry per
// sequence holding the record's offset), and rebuilds the per subject info
// when tracking is enabled. If a cache already exists, buf is appended to the
// existing cache buffer and indexing continues from the end of that buffer.
//
// Tombstone records are skipped. Gaps between consecutive sequences are
// filled with dbit markers in the index and added to the delete map.
// Returns errCorruptState if a record fails the sanity checks.
//
// Lock should be held.
func (mb *msgBlock) indexCacheBuf(buf []byte) error {
	var le = binary.LittleEndian

	var fseq, pseq uint64
	var idx []uint32
	var index uint32

	if mb.cache == nil {
		// Approximation, may adjust below.
		fseq = mb.first.seq
		idx = make([]uint32, 0, mb.msgs)
		mb.cache = &cache{}
	} else {
		fseq = mb.cache.fseq
		idx = mb.cache.idx
		if len(idx) == 0 {
			idx = make([]uint32, 0, mb.msgs)
		}
		// Continue right after what is already in the cache buffer.
		index = uint32(len(mb.cache.buf))
		buf = append(mb.cache.buf, buf...)
	}

	// Create FSS if we should track.
	if !mb.noTrack {
		mb.fss = make(map[string]*SimpleState)
	}

	lbuf := uint32(len(buf))
	for index < lbuf {
		// Need at least a full header.
		if index+msgHdrSize > lbuf {
			return errCorruptState
		}
		hdr := buf[index : index+msgHdrSize]
		rl, seq, slen := le.Uint32(hdr[0:]), le.Uint64(hdr[4:]), int(le.Uint16(hdr[20:]))

		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize

		// Do some quick sanity checks here.
		if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
			// This means something is off.
			// TODO(dlc) - Add into bad list?
			return errCorruptState
		}

		// Check for tombstones which we can skip in terms of indexing.
		if seq&tbit != 0 {
			index += rl
			continue
		}

		// Clear any erase bits.
		erased := seq&ebit != 0
		seq = seq &^ ebit

		// We defer checksum checks to individual msg cache lookups to amortize costs and
		// not introduce latency for first message from a newly loaded block.
		if seq >= mb.first.seq {
			// Track that we do not have holes.
			// Not expected but did see it in the field.
			if pseq > 0 && seq != pseq+1 {
				// Mark each missing sequence as deleted so index slots stay aligned with sequences.
				for dseq := pseq + 1; dseq < seq; dseq++ {
					idx = append(idx, dbit)
					mb.dmap.Insert(dseq)
				}
			}
			pseq = seq
			// Add to our index.
			idx = append(idx, index)
			mb.cache.lrl = uint32(rl)
			// Adjust if we guessed wrong.
			if seq != 0 && seq < fseq {
				fseq = seq
			}

			// Handle FSS inline here.
			// Erased and deleted messages do not count towards per subject info.
			if slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) {
				bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)]
				if ss := mb.fss[string(bsubj)]; ss != nil {
					ss.Msgs++
					ss.Last = seq
				} else {
					subj := mb.subjString(bsubj)
					mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
				}
			}
		}
		index += rl
	}

	mb.cache.buf = buf
	mb.cache.idx = idx
	mb.cache.fseq = fseq
	mb.cache.wp += int(lbuf)

	return nil
}

// flushPendingMsgs writes out any pending messages for this message block.
// It acquires mb.mu for the flush itself. If the flush reports lost data the
// filestore state is rebuilt, after the block lock has been released.
func (mb *msgBlock) flushPendingMsgs() error {
	mb.mu.Lock()
	ld, err := mb.flushPendingMsgsLocked()
	fs := mb.fs
	mb.mu.Unlock()

	// No lost data, or no filestore to rebuild.
	if ld == nil || fs == nil {
		return err
	}
	// Signals us that we need to rebuild filestore state too.
	fs.rebuildState(ld)
	return err
}

// writeAt writes buf to the block's file at offset woff.
// When mockWriteErr is set, a single mocked write failure is reported
// instead and the flag is cleared.
// mb.mfd should not be nil.
// Lock should be held.
func (mb *msgBlock) writeAt(buf []byte, woff int64) (int, error) {
	if !mb.mockWriteErr {
		return mb.mfd.WriteAt(buf, woff)
	}
	// Used to mock write failures. Reset on trip.
	mb.mockWriteErr = false
	return 0, errors.New("mock write error")
}

// flushPendingMsgsLocked writes out any messages for this message block.
// Lock should be held.
//
// The pending bytes reported by mb.bytesPending are written to the block file
// starting at offset mb.cache.off + mb.cache.wp. When a block encryption key is
// set the data is encrypted into a separate buffer first so the cached copy is
// left untouched.
//
// On a write error the block is dirty closed, its state is rebuilt, mb.werr is
// set, and the lost data returned by the rebuild (if any) is handed back with
// the error so the caller can rebuild the filestore state. On success the cache
// write pointer is advanced or, when there has been no recent load activity,
// the flushed part of the cache buffer is dropped.
func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) {
	// Signals us that we need to rebuild filestore state.
	// Note this is never assigned below, so the success paths return nil here.
	var fsLostData *LostStreamData

	// Nothing to do without a cache or an open file descriptor.
	if mb.cache == nil || mb.mfd == nil {
		return nil, nil
	}

	buf, err := mb.bytesPending()
	// If we got an error back return here.
	if err != nil {
		// No pending data to be written is not an error.
		if err == errNoPending || err == errNoCache {
			err = nil
		}
		return nil, err
	}

	// File offset to write at, and the number of bytes we intend to flush.
	woff := int64(mb.cache.off + mb.cache.wp)
	lob := len(buf)

	// TODO(dlc) - Normally we would not hold the lock across I/O so we can improve performance.
	// We will hold to stabilize the code base, as we have had a few anomalies with partial cache errors
	// under heavy load.

	// Check if we need to encrypt.
	if mb.bek != nil && lob > 0 {
		// Need to leave original alone.
		// Use a pooled buffer for regular sizes, and a one-off allocation for larger ones.
		var dst []byte
		if lob <= defaultLargeBlockSize {
			dst = getMsgBlockBuf(lob)[:lob]
		} else {
			dst = make([]byte, lob)
		}
		mb.bek.XORKeyStream(dst, buf)
		buf = dst
	}

	// Append new data to the message block file.
	// Loop to handle partial writes, advancing through buf until all is written.
	for lbb := lob; lbb > 0; lbb = len(buf) {
		n, err := mb.writeAt(buf, woff)
		if err != nil {
			// Close the block without removing it and rebuild its state.
			// Any lost data is returned so the filestore can be rebuilt by the caller.
			mb.dirtyCloseWithRemove(false)
			ld, _, _ := mb.rebuildStateLocked()
			mb.werr = err
			return ld, err
		}
		// Update our write offset.
		woff += int64(n)
		// Partial write.
		if n != lbb {
			buf = buf[n:]
		} else {
			// Done.
			break
		}
	}

	// Clear any error.
	mb.werr = nil

	// Cache may be gone.
	if mb.cache == nil || mb.mfd == nil {
		return fsLostData, mb.werr
	}

	// Check if we are in sync always mode.
	// Otherwise just mark that a sync is needed.
	if mb.syncAlways {
		mb.mfd.Sync()
	} else {
		mb.needSync = true
	}

	// Check for additional writes while we were writing to the disk.
	moreBytes := len(mb.cache.buf) - mb.cache.wp - lob

	// Decide what we want to do with the buffer in hand. If we have load interest
	// we will hold onto the whole thing, otherwise empty the buffer, possibly reusing it.
	if ts := time.Now().UnixNano(); ts < mb.llts || (ts-mb.llts) <= int64(mb.cexp) {
		// Recent load activity (within the cache expiration), keep the data and just advance.
		mb.cache.wp += lob
	} else {
		// Reuse the existing buffer when small enough, otherwise hand it back for recycling.
		if cap(mb.cache.buf) <= maxBufReuse {
			buf = mb.cache.buf[:0]
		} else {
			recycleMsgBlockBuf(mb.cache.buf)
			buf = nil
		}
		if moreBytes > 0 {
			// Preserve the bytes that were not part of this flush.
			nbuf := mb.cache.buf[len(mb.cache.buf)-moreBytes:]
			// If the remainder is most of the buffer keep it in place, otherwise copy it down.
			if moreBytes > (len(mb.cache.buf)/4*3) && cap(nbuf) <= maxBufReuse {
				buf = nbuf
			} else {
				buf = append(buf, nbuf...)
			}
		}
		// Update our cache offset.
		mb.cache.off = int(woff)
		// Reset write pointer.
		mb.cache.wp = 0
		// Place buffer back in the cache structure.
		mb.cache.buf = buf
		// Mark fseq to 0
		mb.cache.fseq = 0
	}

	return fsLostData, mb.werr
}

// clearLoading resets the loading flag for this message block.
// Lock should be held.
func (mb *msgBlock) clearLoading() {
	mb.loading = false
}

// loadMsgs will load msgs from disk.
// It acquires the block lock and delegates to loadMsgsWithLock, returning its error.
func (mb *msgBlock) loadMsgs() error {
	// We hold the lock here the whole time by design.
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.loadMsgsWithLock()
}

// cacheAlreadyLoaded reports whether the cache holds the full block: it must
// exist, start at offset zero, have a known first sequence and a non-empty
// buffer, and its index must have one slot for every live message, every
// deleted message and every sequence between the cache's first sequence and the
// block's first sequence.
// Lock should be held.
func (mb *msgBlock) cacheAlreadyLoaded() bool {
	c := mb.cache
	switch {
	case c == nil:
		return false
	case c.off != 0, c.fseq == 0, len(c.buf) == 0:
		return false
	}
	expected := mb.msgs + uint64(mb.dmap.Size()) + (mb.first.seq - c.fseq)
	return uint64(len(c.idx)) == expected
}

// cacheNotLoaded is the negation of cacheAlreadyLoaded, reporting whether the
// block's cache still needs to be loaded.
// Lock should be held.
func (mb *msgBlock) cacheNotLoaded() bool {
	return !mb.cacheAlreadyLoaded()
}

// loadBlock reads the entire block file into memory and returns the bytes read.
// A nil buf asks for a pooled buffer; a supplied buf is reused when it has the
// capacity for the file, otherwise a new buffer is allocated. A missing file is
// reported as errNoBlkData, and a file whose size does not fit in an int as
// errMsgBlkTooBig. On a successful read mb.rbytes is set to the bytes read.
// Lock should be held and all conditionals satisfied prior.
func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) {
	fd, err := os.Open(mb.mfn)
	switch {
	case err == nil:
	case os.IsNotExist(err):
		return nil, errNoBlkData
	default:
		return nil, err
	}
	defer fd.Close()

	// Size of the file if we can stat it, zero otherwise.
	var size int
	if fi, serr := fd.Stat(); serr == nil {
		fsz := fi.Size()
		// Make sure the size survives the conversion to int.
		if int64(int(fsz)) != fsz {
			return nil, errMsgBlkTooBig
		}
		size = int(fsz)
	}

	if buf == nil {
		if pooled := getMsgBlockBuf(size); size <= cap(pooled) {
			buf = pooled
		} else {
			// Too small, we know we will make a new one so just recycle for now.
			recycleMsgBlockBuf(pooled)
		}
	}

	if size > cap(buf) {
		buf = make([]byte, size)
	} else {
		buf = buf[:size]
	}

	nr, rerr := io.ReadFull(fd, buf)
	// On success capture raw bytes size.
	if rerr == nil {
		mb.rbytes = uint64(nr)
	}
	return buf[:nr], rerr
}

// loadMsgsWithLock loads the block's messages from disk into the cache and indexes them.
// Lock should be held.
//
// The steps are: load encryption keys if needed, bail out if a load is already
// in progress or the cache is already full, flush any pending writes, read the
// whole block file, decrypt and decompress as needed, and finally index the
// buffer. After a flush or a recoverable indexing failure the cache check is
// retried, up to 8 times, before errCorruptState is returned.
func (mb *msgBlock) loadMsgsWithLock() error {
	// Check for encryption, we do not load keys on startup anymore so might need to load them here.
	if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) {
		if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil {
			return err
		}
	}

	// Check to see if we are loading already.
	if mb.loading {
		return nil
	}

	// Set loading status.
	// Cleared again when we return.
	mb.loading = true
	defer mb.clearLoading()

	// Counts the passes through checkCache so we can not loop forever.
	var nchecks int

checkCache:
	nchecks++
	if nchecks > 8 {
		return errCorruptState
	}

	// Check to see if we have a full cache.
	if mb.cacheAlreadyLoaded() {
		return nil
	}

	// Record load activity.
	mb.llts = time.Now().UnixNano()

	// FIXME(dlc) - We could be smarter here.
	// If we have pending data flush it first, then re-check the cache.
	if buf, _ := mb.bytesPending(); len(buf) > 0 {
		ld, err := mb.flushPendingMsgsLocked()
		if ld != nil && mb.fs != nil {
			// We do not know if fs is locked or not at this point.
			// This should be an exceptional condition so do so in Go routine.
			go mb.fs.rebuildState(ld)
		}
		if err != nil {
			return err
		}
		goto checkCache
	}

	// Load in the whole block.
	// We want to hold the mb lock here to avoid any changes to state.
	buf, err := mb.loadBlock(nil)
	if err != nil {
		if err == errNoBlkData {
			// The block file is missing, so rebuild our state.
			// Note the inner err shadows the outer one, which is what gets returned below.
			if ld, _, err := mb.rebuildStateLocked(); err != nil && ld != nil {
				// Rebuild fs state too.
				go mb.fs.rebuildState(ld)
			}
		}
		return err
	}

	// Reset the cache since we just read everything in.
	// Make sure this is cleared in case we had a partial when we started.
	mb.clearCacheAndOffset()

	// Check if we need to decrypt.
	if mb.bek != nil && len(buf) > 0 {
		// Generate a fresh key from the seed and nonce before decrypting in place.
		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return err
		}
		mb.bek = bek
		mb.bek.XORKeyStream(buf, buf)
	}

	// Check for compression.
	if buf, err = mb.decompressIfNeeded(buf); err != nil {
		return err
	}

	if err := mb.indexCacheBuf(buf); err != nil {
		if err == errCorruptState {
			// Try to recover by rebuilding our state. If the rebuild succeeds
			// err is cleared here and we retry from checkCache below.
			var ld *LostStreamData
			if ld, _, err = mb.rebuildStateLocked(); ld != nil {
				// We do not know if fs is locked or not at this point.
				// This should be an exceptional condition so do so in Go routine.
				go mb.fs.rebuildState(ld)
			}
		}
		if err != nil {
			return err
		}
		goto checkCache
	}

	// Only count the load and start the cache expiration timer if we loaded data.
	if len(buf) > 0 {
		mb.cloads++
		mb.startCacheExpireTimer()
	}

	return nil
}

// fetchMsg returns the message for seq from this block, loading the block into
// the cache first if it is not already loaded.
// We assume the block was selected and is correct, so we do not do range checks.
// The boolean result is true when seq is the block's last sequence and also the
// last sequence recorded by the cache lookup (mb.llseq).
func (mb *msgBlock) fetchMsg(seq uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	if mb.cacheNotLoaded() {
		if lerr := mb.loadMsgsWithLock(); lerr != nil {
			return nil, false, lerr
		}
	}

	msg, lerr := mb.cacheLookup(seq, sm)
	if lerr != nil {
		return nil, false, lerr
	}

	atTail := seq == mb.last.seq && mb.llseq == seq
	return msg, atTail, nil
}

// Sentinel errors used by the filestore and its message blocks.
// These are compared by identity by the code in this file.
var (
	errNoCache       = errors.New("no message cache")
	errBadMsg        = errors.New("malformed or corrupt message")
	errDeletedMsg    = errors.New("deleted message")
	errPartialCache  = errors.New("partial cache")
	errNoPending     = errors.New("message block does not have pending data")
	errNotReadable   = errors.New("storage directory not readable")
	errCorruptState  = errors.New("corrupt state file")
	errPriorState    = errors.New("prior state file")
	errPendingData   = errors.New("pending data still present")
	errNoEncryption  = errors.New("encryption not enabled")
	errBadKeySize    = errors.New("encryption bad key size")
	errNoMsgBlk      = errors.New("no message block")
	errMsgBlkTooBig  = errors.New("message block size exceeded int capacity")
	errUnknownCipher = errors.New("unknown cipher")
	errNoMainKey     = errors.New("encrypted store encountered with no main key")
	errNoBlkData     = errors.New("message block data missing")
)

// Bit flags stored in record lengths, sequences and cache index entries.
const (
	// hbit has two uses. On a cache index entry it marks messages that have
	// had their checksums checked. On a record length it signals a message
	// record with headers.
	hbit = 1 << 31
	// Used for marking erased messages sequences.
	ebit = 1 << 63
	// Used for marking tombstone sequences.
	tbit = 1 << 62
	// Used to mark a bad index as deleted.
	dbit = 1 << 30
)

// cacheLookup will do a lookup from cache for the message with sequence seq.
// Lock should be held.
//
// Returns ErrStoreMsgNotFound if seq is outside of the block's range,
// errDeletedMsg if the message has been deleted, errNoCache if there is no
// usable cache, and errPartialCache if the cache does not cover seq. If sm is
// non-nil it is passed on to msgFromBuf for reuse.
func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	if seq < mb.first.seq || seq > mb.last.seq {
		return nil, ErrStoreMsgNotFound
	}

	// If we have a delete map check it.
	if mb.dmap.Exists(seq) {
		// Still counts as cache activity.
		mb.llts = time.Now().UnixNano()
		return nil, errDeletedMsg
	}

	// Detect no cache loaded.
	if mb.cache == nil || mb.cache.fseq == 0 || len(mb.cache.idx) == 0 || len(mb.cache.buf) == 0 {
		return nil, errNoCache
	}
	// Check partial cache status.
	if seq < mb.cache.fseq {
		return nil, errPartialCache
	}

	// Look up the index slot for this sequence, relative to the cache's first sequence.
	bi, _, hashChecked, err := mb.slotInfo(int(seq - mb.cache.fseq))
	if err != nil {
		return nil, err
	}

	// Update cache activity.
	mb.llts = time.Now().UnixNano()
	// The llseq signals us when we can expire a cache at the end of a linear scan.
	// We want to only update when we know the last reads (multiple consumers) are sequential.
	if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 {
		mb.llseq = seq
	}

	// Translate the index into an offset within the cache buffer.
	// NOTE(review): only the upper bound is checked here, this assumes bi is never
	// below mb.cache.off - confirm.
	li := int(bi) - mb.cache.off
	if li >= len(mb.cache.buf) {
		return nil, errPartialCache
	}
	buf := mb.cache.buf[li:]

	// We use the high bit to denote we have already checked the checksum.
	var hh hash.Hash64
	if !hashChecked {
		hh = mb.hh // This will force the hash check in msgFromBuf.
	}

	// Parse from the raw buffer.
	fsm, err := mb.msgFromBuf(buf, sm, hh)
	if err != nil || fsm == nil {
		return nil, err
	}

	// Deleted messages that are decoded return a 0 for sequence.
	if fsm.seq == 0 {
		return nil, errDeletedMsg
	}

	// The index pointed us at a different message, so drop the cache buffer.
	if seq != fsm.seq {
		recycleMsgBlockBuf(mb.cache.buf)
		mb.cache.buf = nil
		return nil, fmt.Errorf("sequence numbers for cache load did not match, %d vs %d", seq, fsm.seq)
	}

	// Set the check bit here after we know all is good, so that we do not
	// verify the checksum again for this message.
	if !hashChecked {
		mb.cache.idx[seq-mb.cache.fseq] = (bi | hbit)
	}

	return fsm, nil
}

// sizeForSeq returns the stored record size of the message with sequence seq,
// or 0 if seq is 0 or the message can not be found or fetched.
// Used when we are checking if discarding a message due to max msgs per subject
// will give us enough room for a max bytes condition.
// Lock should be already held.
func (fs *fileStore) sizeForSeq(seq uint64) int {
	if seq == 0 {
		return 0
	}
	mb := fs.selectMsgBlock(seq)
	if mb == nil {
		return 0
	}
	var smv StoreMsg
	sm, _, _ := mb.fetchMsg(seq, &smv)
	if sm == nil {
		return 0
	}
	return int(fileStoreMsgSize(sm.subj, sm.hdr, sm.msg))
}

// msgForSeq returns the message for the given sequence number. A seq of 0 asks
// for the first message in the store. If no block holds seq the result is
// ErrStoreMsgNotFound when seq is at or below the last sequence, and
// ErrStoreEOF when it is beyond it.
func (fs *fileStore) msgForSeq(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	// TODO(dlc) - Since Store, Remove, Skip all hold the write lock on fs this will
	// be stalled. Need another lock if want to happen in parallel.
	fs.mu.RLock()
	if fs.closed {
		fs.mu.RUnlock()
		return nil, ErrStoreClosed
	}
	// Zero indicates we want the first msg.
	if seq == 0 {
		seq = fs.state.FirstSeq
	}
	// Snapshot what we need while we hold the lock.
	mb := fs.selectMsgBlock(seq)
	lastBlk := fs.lmb
	lastSeq := fs.state.LastSeq
	fs.mu.RUnlock()

	if mb == nil {
		if seq <= lastSeq {
			return nil, ErrStoreMsgNotFound
		}
		return nil, ErrStoreEOF
	}

	msg, canExpire, err := mb.fetchMsg(seq, sm)
	if err != nil {
		return nil, err
	}

	// We detected a linear scan and access to the last message.
	// If we are not the last message block we can try to expire the cache.
	if canExpire && mb != lastBlk {
		mb.tryForceExpireCache()
	}

	return msg, nil
}

// msgFromBuf is an internal function to return msg parts from a raw buffer.
// Lock should be held.
//
// buf must start at the beginning of a message record. If sm is nil a new
// StoreMsg is allocated, otherwise sm is cleared and reused. If hh is non-nil
// the record checksum is verified with it. The header, message and subject are
// copied out of buf so the result never references the underlying buffer.
//
// errBadMsg is returned for records that are too short, whose lengths are not
// consistent with buf (including the header length for records with headers),
// or that fail the checksum test. Erased messages are returned with a sequence of 0.
func (mb *msgBlock) msgFromBuf(buf []byte, sm *StoreMsg, hh hash.Hash64) (*StoreMsg, error) {
	if len(buf) < emptyRecordLen {
		return nil, errBadMsg
	}
	var le = binary.LittleEndian

	hdr := buf[:msgHdrSize]
	rl := le.Uint32(hdr[0:])
	hasHeaders := rl&hbit != 0
	rl &^= hbit // clear header bit
	dlen := int(rl) - msgHdrSize
	slen := int(le.Uint16(hdr[20:]))
	// Simple sanity check.
	if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || int(rl) > len(buf) {
		return nil, errBadMsg
	}
	data := buf[msgHdrSize : msgHdrSize+dlen]
	// End of the payload within data, the trailing bytes are the record hash.
	end := dlen - 8

	// Records with headers carry a 4 byte header length right after the subject.
	// Validate it up front so a corrupt record is reported as errBadMsg instead of
	// causing an out of range slice below.
	var hl int
	if hasHeaders {
		bi := slen + 4
		if bi > dlen-recordHashSize || bi > end {
			return nil, errBadMsg
		}
		hl64 := int64(le.Uint32(data[slen:]))
		if hl64 > int64(end-bi) {
			return nil, errBadMsg
		}
		hl = int(hl64)
	}

	// Do checksum tests here if requested.
	if hh != nil {
		hh.Reset()
		hh.Write(hdr[4:20])
		hh.Write(data[:slen])
		// The header length field itself is not part of the checksum.
		if hasHeaders {
			hh.Write(data[slen+4 : dlen-recordHashSize])
		} else {
			hh.Write(data[slen : dlen-recordHashSize])
		}
		if !bytes.Equal(hh.Sum(nil), data[len(data)-8:]) {
			return nil, errBadMsg
		}
	}
	seq := le.Uint64(hdr[4:])
	// Erased messages report a sequence of 0.
	if seq&ebit != 0 {
		seq = 0
	}
	ts := int64(le.Uint64(hdr[12:]))

	// Create a StoreMsg if needed.
	if sm == nil {
		sm = new(StoreMsg)
	} else {
		sm.clear()
	}
	// To recycle the large blocks we can never pass back a reference, so need to copy for the upper
	// layers and for us to be safe to expire, and recycle, the large msgBlocks.
	if hasHeaders {
		bi := slen + 4
		sm.buf = append(sm.buf, data[bi:end]...)
		// Headers come first and are capped so appends can not run into the message.
		sm.hdr = sm.buf[0:hl:hl]
		sm.msg = sm.buf[hl : end-bi]
	} else {
		sm.buf = append(sm.buf, data[slen:end]...)
		sm.msg = sm.buf[0 : end-slen]
	}
	sm.seq, sm.ts = seq, ts
	// Treat subject a bit different to not reference underlying buf.
	if slen > 0 {
		sm.subj = mb.subjString(data[:slen])
	}

	return sm, nil
}

// subjPool holds maps used to intern subject strings.
// Based on idea from https://github.com/josharian/intern/blob/master/intern.go
var subjPool = sync.Pool{
	New: func() any {
		return make(map[string]string)
	},
}

// subjFromBytes returns an interned string for the given byte slice. The
// string is looked up in a map taken from subjPool and added to it when not
// yet present. The map is placed back in the pool before returning.
func subjFromBytes(b []byte) string {
	interned := subjPool.Get().(map[string]string)
	// The conversion in the map index expression does not allocate.
	s, found := interned[string(b)]
	if !found {
		s = string(b)
		interned[s] = s
	}
	subjPool.Put(interned)
	return s
}

// subjString returns the subject for the `skey` byte slice as a string while
// trying to minimize memory allocations. If skey matches one of the configured
// subjects that configured string is returned, otherwise an interned string is
// returned. A nil filestore or an empty skey yields the empty string.
// Lock should be held.
func (fs *fileStore) subjString(skey []byte) string {
	if fs == nil || len(skey) == 0 {
		return _EMPTY_
	}

	for i := range fs.cfg.Subjects {
		// The cast for the comparison does not make a copy.
		if cfgSubj := fs.cfg.Subjects[i]; cfgSubj == string(skey) {
			return cfgSubj
		}
	}
	return subjFromBytes(skey)
}

// Given the `skey` byte slice, this function will return the subject
// as an interned string of `skey` or a configured subject as to minimize memory allocations.
// This simply delegates to the parent filestore's subjString, which also handles a nil filestore.
// Lock should be held.
func (mb *msgBlock) subjString(skey []byte) string {
	return mb.fs.subjString(skey)
}

// LoadMsg will lookup the message by sequence number and return it if found.
// It delegates to msgForSeq, so a seq of 0 loads the first message, and sm is
// passed through for reuse.
func (fs *fileStore) LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	return fs.msgForSeq(seq, sm)
}

// loadLast will load the last message for a subject. Subject should be non empty and not ">".
// The subject may contain wildcards. Blocks are walked from newest to oldest and the
// first block that reports a matching last sequence is used for the lookup.
// Returns ErrStoreClosed if the store is closed or has no last block, and
// ErrStoreMsgNotFound if there are no blocks or a literal subject is not being tracked.
func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.closed || fs.lmb == nil {
		return nil, ErrStoreClosed
	}

	if len(fs.blks) == 0 {
		return nil, ErrStoreMsgNotFound
	}

	// By default consider all blocks, from the last block down to the first.
	start, stop := fs.lmb.index, fs.blks[0].index
	wc := subjectHasWildcard(subj)
	// If literal subject check for presence.
	if !wc {
		if info := fs.psim[subj]; info == nil {
			return nil, ErrStoreMsgNotFound
		} else {
			// Narrow the walk to the block range recorded for this subject.
			start, stop = info.lblk, info.fblk
		}
	}

	// Walk blocks backwards.
	// NOTE(review): this relies on stop being greater than zero if the index is
	// unsigned, since i-- at zero would wrap around - confirm block indexes start above 0.
	for i := start; i >= stop; i-- {
		mb := fs.bim[i]
		if mb == nil {
			continue
		}
		mb.mu.Lock()
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			mb.mu.Unlock()
			return nil, err
		}
		// Last sequence matching the subject in this block, 0 if none.
		var l uint64
		// Optimize if subject is not a wildcard.
		if !wc {
			if ss := mb.fss[subj]; ss != nil {
				l = ss.Last
			}
		}
		// Fall back to the filtered lookup.
		if l == 0 {
			_, _, l = mb.filteredPendingLocked(subj, wc, mb.first.seq)
		}
		if l > 0 {
			if mb.cacheNotLoaded() {
				if err := mb.loadMsgsWithLock(); err != nil {
					mb.mu.Unlock()
					return nil, err
				}
			}
			lsm, err = mb.cacheLookup(l, sm)
		}
		mb.mu.Unlock()
		// We found a match in this block so we are done, regardless of lookup outcome.
		if l > 0 {
			break
		}
	}
	return lsm, err
}

// LoadLastMsg will return the last message we have that matches a given subject.
// The subject can be a wildcard.
// An empty subject or the full wildcard loads the last message in the store.
// If no message is returned the error is ErrStoreMsgNotFound.
func (fs *fileStore) LoadLastMsg(subject string, smv *StoreMsg) (sm *StoreMsg, err error) {
	if subject == _EMPTY_ || subject == fwcs {
		sm, err = fs.msgForSeq(fs.lastSeq(), smv)
	} else {
		sm, err = fs.loadLast(subject, smv)
	}
	// NOTE(review): msgForSeq and loadLast return a nil message along with
	// ErrStoreClosed, so the sm == nil test replaces that error as well and the
	// ErrStoreClosed exclusion has no effect for them - confirm this is intended.
	if sm == nil || (err != nil && err != ErrStoreClosed) {
		err = ErrStoreMsgNotFound
	}
	return sm, err
}

// LoadNextMsg returns the first message at or after start that matches filter,
// along with its sequence. wc is passed through to the blocks' firstMatching
// along with the filter. A start below the store's first sequence is moved up
// to it. If nothing matches, the store's last sequence and ErrStoreEOF are returned.
func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, uint64, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.closed {
		return nil, 0, ErrStoreClosed
	}
	if start < fs.state.FirstSeq {
		start = fs.state.FirstSeq
	}

	bi, _ := fs.selectMsgBlockWithIndex(start)
	if bi < 0 {
		return nil, fs.state.LastSeq, ErrStoreEOF
	}

	// Walk forward from the block that holds start.
	for i := bi; i < len(fs.blks); i++ {
		mb := fs.blks[i]
		found, expireOk, err := mb.firstMatching(filter, wc, start, sm)
		switch {
		case err == nil:
			// Try to expire the cache unless this is the last block.
			if mb != fs.lmb && expireOk {
				mb.tryForceExpireCache()
			}
			return found, found.seq, nil
		case err != ErrStoreMsgNotFound:
			return nil, 0, err
		}
		// Not found in this block, move on to the next one.
	}

	return nil, fs.state.LastSeq, ErrStoreEOF
}

// Type returns the type of the underlying store, which is always FileStorage.
func (fs *fileStore) Type() StorageType {
	return FileStorage
}

// numSubjects returns number of subjects in this store, which is the number
// of entries in the per subject info map (fs.psim).
// Lock should be held.
func (fs *fileStore) numSubjects() int {
	return len(fs.psim)
}

// FastState will fill in state with only the following.
// Msgs, Bytes, First and Last Sequence and Time and NumDeleted.
// Consumers and NumSubjects are filled in as well. NumDeleted is only
// written when the last sequence is greater than the first sequence.
func (fs *fileStore) FastState(state *StreamState) {
	fs.mu.RLock()
	state.Msgs, state.Bytes = fs.state.Msgs, fs.state.Bytes
	state.FirstSeq, state.FirstTime = fs.state.FirstSeq, fs.state.FirstTime
	state.LastSeq, state.LastTime = fs.state.LastSeq, fs.state.LastTime
	if state.LastSeq > state.FirstSeq {
		// Sequences spanned minus messages present, never negative.
		nd := int((state.LastSeq - state.FirstSeq + 1) - state.Msgs)
		if nd < 0 {
			nd = 0
		}
		state.NumDeleted = nd
	}
	state.Consumers = len(fs.cfs)
	state.NumSubjects = fs.numSubjects()
	fs.mu.RUnlock()
}

// State returns the current state of the stream.
// Unlike FastState this also fills in the sorted list of deleted sequences
// and any lost data.
func (fs *fileStore) State() StreamState {
	fs.mu.RLock()
	// Start from a copy of our state.
	state := fs.state
	state.Consumers = len(fs.cfs)
	state.NumSubjects = fs.numSubjects()
	state.Deleted = nil // make sure.

	// Only collect deleted sequences if the sequence range holds more than Msgs.
	if numDeleted := int((state.LastSeq - state.FirstSeq + 1) - state.Msgs); numDeleted > 0 {
		state.Deleted = make([]uint64, 0, numDeleted)
		// Tracks the next sequence we expect to see as a block's first.
		cur := fs.state.FirstSeq

		for _, mb := range fs.blks {
			mb.mu.Lock()
			fseq := mb.first.seq
			// Account for messages missing from the head.
			if fseq > cur {
				for seq := cur; seq < fseq; seq++ {
					state.Deleted = append(state.Deleted, seq)
				}
			}
			cur = mb.last.seq + 1 // Expected next first.

			// Add in the block's delete map, removing any stale entries that
			// are below the block's first sequence.
			mb.dmap.Range(func(seq uint64) bool {
				if seq < fseq {
					mb.dmap.Delete(seq)
				} else {
					state.Deleted = append(state.Deleted, seq)
				}
				return true
			})
			mb.mu.Unlock()
		}
	}
	fs.mu.RUnlock()

	state.Lost = fs.lostData()

	// Can not be guaranteed to be sorted.
	if len(state.Deleted) > 0 {
		sort.Slice(state.Deleted, func(i, j int) bool {
			return state.Deleted[i] < state.Deleted[j]
		})
		state.NumDeleted = len(state.Deleted)
	}
	return state
}

// Utilization sums block usage across all message blocks. total is the sum of
// the blocks' raw bytes (rbytes) and reported is the sum of the blocks'
// accounted bytes. The returned error is always nil.
func (fs *fileStore) Utilization() (total, reported uint64, err error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	for i := range fs.blks {
		blk := fs.blks[i]
		blk.mu.RLock()
		used, raw := blk.bytes, blk.rbytes
		blk.mu.RUnlock()
		reported += used
		total += raw
	}
	return total, reported, nil
}

// fileStoreMsgSize returns the size of the stored record for a message with
// the given subject, headers and payload.
func fileStoreMsgSize(subj string, hdr, msg []byte) uint64 {
	// length of the message record (4bytes) + seq(8) + ts(8) + subj_len(2) + subj + msg + hash(8)
	size := 22 + len(subj) + len(msg) + 8
	// Records with headers also store hdr_len(4) + hdr.
	if len(hdr) > 0 {
		size += 4 + len(hdr)
	}
	return uint64(size)
}

// fileStoreMsgSizeEstimate returns an estimate of the stored record size for a
// message with a subject of length slen and a payload of maxPayload bytes.
// It is the empty record length plus slen, 4 and maxPayload.
// NOTE(review): the 4 presumably accounts for the header length field - confirm.
func fileStoreMsgSizeEstimate(slen, maxPayload int) uint64 {
	return uint64(emptyRecordLen + slen + 4 + maxPayload)
}

// sinceLastWriteActivity returns the time elapsed since the later of the
// block's last write (lwts) and last remove (lrts) timestamps. A closed block
// reports 0.
// Read lock should be held.
func (mb *msgBlock) sinceLastWriteActivity() time.Duration {
	if mb.closed {
		return 0
	}
	// Pick the most recent of the two timestamps.
	ts := mb.lrts
	if mb.lwts >= ts {
		ts = mb.lwts
	}
	return time.Since(time.Unix(0, ts).UTC())
}

// checkNewHeader validates the two byte header of an index or state buffer.
// The first byte must be the magic value and the second one of the known
// versions, otherwise errCorruptState is returned.
func checkNewHeader(hdr []byte) error {
	// A nil slice has a length of 0, so this covers nil as well.
	if len(hdr) < 2 || hdr[0] != magic {
		return errCorruptState
	}
	if v := hdr[1]; v == version || v == newVersion {
		return nil
	}
	return errCorruptState
}

// readIndexInfo will read in the index information for the message block.
// It reads and, if needed, decrypts the block's index file, then populates the
// block's msgs, bytes, first and last sequence and timestamp, last checksum and
// delete map from it. The index file is removed if its header is bad, if it is
// short, or if its accounting is inconsistent, and an error is returned.
func (mb *msgBlock) readIndexInfo() error {
	ifn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, mb.index))
	buf, err := os.ReadFile(ifn)
	if err != nil {
		return err
	}

	// Set if first time.
	if mb.liwsz == 0 {
		mb.liwsz = int64(len(buf))
	}

	// Decrypt if needed.
	if mb.aek != nil {
		buf, err = mb.aek.Open(buf[:0], mb.nonce, buf, nil)
		if err != nil {
			return err
		}
	}

	if err := checkNewHeader(buf); err != nil {
		defer os.Remove(ifn)
		return fmt.Errorf("bad index file")
	}

	// Current read position in buf, starting just past the header.
	bi := hdrLen

	// Helpers, will set bi to -1 on error.
	// Once bi is negative all subsequent reads return without consuming anything.
	readSeq := func() uint64 {
		if bi < 0 {
			return 0
		}
		seq, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		// Strip the erase bit.
		return seq &^ ebit
	}
	readCount := readSeq
	readTimeStamp := func() int64 {
		if bi < 0 {
			return 0
		}
		ts, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return ts
	}
	// The order of the reads here is the order of the fields in the index file.
	mb.msgs = readCount()
	mb.bytes = readCount()
	mb.first.seq = readSeq()
	mb.first.ts = readTimeStamp()
	mb.last.seq = readSeq()
	mb.last.ts = readTimeStamp()
	dmapLen := readCount()

	// Check if this is a short write index file.
	if bi < 0 || bi+checksumSize > len(buf) {
		os.Remove(ifn)
		return fmt.Errorf("short index file")
	}

	// Check for consistency if accounting. If something is off bail and we will rebuild.
	if mb.msgs != (mb.last.seq-mb.first.seq+1)-dmapLen {
		os.Remove(ifn)
		return fmt.Errorf("accounting inconsistent")
	}

	// Checksum
	copy(mb.lchk[0:], buf[bi:bi+checksumSize])
	bi += checksumSize

	// Now check for presence of a delete map
	if dmapLen > 0 {
		// New version is encoded avl seqset.
		if buf[1] == newVersion {
			dmap, _, err := avl.Decode(buf[bi:])
			if err != nil {
				return fmt.Errorf("could not decode avl dmap: %v", err)
			}
			mb.dmap = *dmap
		} else {
			// This is the old version.
			// Entries are stored relative to the block's first sequence.
			for i := 0; i < int(dmapLen); i++ {
				seq := readSeq()
				if seq == 0 {
					break
				}
				mb.dmap.Insert(seq + mb.first.seq)
			}
		}
	}

	return nil
}

// cacheLoads will return total number of cache loads across all message blocks.
// Each block's counter is read under that block's read lock since it is
// updated with the block lock held when messages are loaded.
func (fs *fileStore) cacheLoads() uint64 {
	var tl uint64
	fs.mu.RLock()
	for _, mb := range fs.blks {
		mb.mu.RLock()
		tl += mb.cloads
		mb.mu.RUnlock()
	}
	fs.mu.RUnlock()
	return tl
}

// cacheSize will return total number of cached bytes, which is the sum of the
// cache buffer lengths of all message blocks that currently have a cache.
func (fs *fileStore) cacheSize() uint64 {
	var total uint64
	fs.mu.RLock()
	for i := range fs.blks {
		blk := fs.blks[i]
		blk.mu.RLock()
		if c := blk.cache; c != nil {
			total += uint64(len(c.buf))
		}
		blk.mu.RUnlock()
	}
	fs.mu.RUnlock()
	return total
}

// dmapEntries will return total number of dmapEntries for all msg blocks.
// Each block's delete map is read under that block's read lock since it is
// modified with the block lock held.
func (fs *fileStore) dmapEntries() int {
	var total int
	fs.mu.RLock()
	for _, mb := range fs.blks {
		mb.mu.RLock()
		total += mb.dmap.Size()
		mb.mu.RUnlock()
	}
	fs.mu.RUnlock()
	return total
}

// Fixed helper for iterating.
// subjectsEqual reports whether the two subjects are identical. It is the
// comparison returned by compareFn for literal subjects.
func subjectsEqual(a, b string) bool {
	return a == b
}

// subjectsAll ignores its arguments and always reports a match. It is the
// comparison returned by compareFn for an empty subject or the full wildcard.
func subjectsAll(a, b string) bool {
	return true
}

// compareFn selects the comparison function to use for the given subject
// filter: match everything for an empty subject or the full wildcard, a subset
// match for subjects with wildcards, and an equality check otherwise.
func compareFn(subject string) func(string, string) bool {
	switch {
	case subject == _EMPTY_, subject == fwcs:
		return subjectsAll
	case subjectHasWildcard(subject):
		return subjectIsSubsetMatch
	default:
		return subjectsEqual
	}
}

// PurgeEx will remove messages based on subject filters, sequence and number of messages to keep.
// Will return the number of purged messages.
//
// With an empty subject or the full wildcard this delegates to Purge when there
// is no keep and sequence is 0 or 1, and to Compact when sequence is greater
// than 1. Otherwise matching messages are removed block by block. When sequence
// is greater than 1 only messages below it are considered, and when keep is set
// purging stops once all but keep of the matching messages have been removed.
func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint64, err error) {
	if subject == _EMPTY_ || subject == fwcs {
		if keep == 0 && (sequence == 0 || sequence == 1) {
			return fs.Purge()
		}
		if sequence > 1 {
			return fs.Compact(sequence)
		}
	}

	eq, wc := compareFn(subject), subjectHasWildcard(subject)
	var firstSeqNeedsUpdate bool
	// Total bytes removed, reported through the storage callback.
	var bytes uint64

	// If we have a "keep" designation need to get full filtered state so we know how many to purge.
	var maxp uint64
	if keep > 0 {
		ss := fs.FilteredState(1, subject)
		if keep >= ss.Msgs {
			return 0, nil
		}
		maxp = ss.Msgs - keep
	}

	var smv StoreMsg

	fs.mu.Lock()
	// We may remove blocks as we purge, so don't range directly on fs.blks
	// otherwise we may jump over some (see https://github.com/nats-io/nats-server/issues/3528)
	for i := 0; i < len(fs.blks); i++ {
		mb := fs.blks[i]
		mb.mu.Lock()
		// Skip blocks for which we can not load the per subject info.
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			mb.mu.Unlock()
			continue
		}
		// Total matching messages in this block and the first and last matching sequences.
		t, f, l := mb.filteredPendingLocked(subject, wc, mb.first.seq)
		if t == 0 {
			mb.mu.Unlock()
			continue
		}

		var shouldExpire bool
		if mb.cacheNotLoaded() {
			// Note the load error is not checked here, the lookups below will
			// simply not return messages if the cache is not usable.
			mb.loadMsgsWithLock()
			shouldExpire = true
		}
		// Do not go past the requested sequence.
		if sequence > 1 && sequence <= l {
			l = sequence - 1
		}

		for seq := f; seq <= l; seq++ {
			if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && eq(sm.subj, subject) {
				rl := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
				// Do fast in place remove.
				// Stats
				if mb.msgs > 0 {
					// Msgs
					fs.state.Msgs--
					mb.msgs--
					// Bytes, make sure to not go negative.
					if rl > fs.state.Bytes {
						rl = fs.state.Bytes
					}
					if rl > mb.bytes {
						rl = mb.bytes
					}
					fs.state.Bytes -= rl
					mb.bytes -= rl
					// Totals
					purged++
					bytes += rl
				}
				// FSS updates.
				mb.removeSeqPerSubject(sm.subj, seq)
				fs.removePerSubject(sm.subj)

				// Check for first message.
				if seq == mb.first.seq {
					mb.selectNextFirst()
					if mb.isEmpty() {
						// The block is removed from fs.blks, so step the index
						// back to not skip the block that takes its place.
						fs.removeMsgBlock(mb)
						i--
						// keep flag set, if set previously
						firstSeqNeedsUpdate = firstSeqNeedsUpdate || seq == fs.state.FirstSeq
					} else if seq == fs.state.FirstSeq {
						fs.state.FirstSeq = mb.first.seq // new one.
						fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
					}
				} else {
					// Out of order delete.
					mb.dmap.Insert(seq)
				}

				// Stop once we have purged as many as we are allowed to.
				if maxp > 0 && purged >= maxp {
					break
				}
			}
		}
		// Expire if we were responsible for loading.
		if shouldExpire {
			// Expire this cache before moving on.
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()

		// Check if we should break out of top level too.
		if maxp > 0 && purged >= maxp {
			break
		}
	}
	if firstSeqNeedsUpdate {
		fs.selectNextFirst()
	}

	fs.dirty++
	cb := fs.scb
	fs.mu.Unlock()

	fs.kickFlushStateLoop()

	// Report the removed messages and bytes outside of the lock.
	if cb != nil {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return purged, nil
}

// Purge will remove all messages from this store.
// Will return the number of purged messages.
// This is purge with a first sequence of 0, meaning the next first sequence
// follows the current last sequence.
func (fs *fileStore) Purge() (uint64, error) {
	return fs.purge(0)
}

// purge removes all messages from the store and returns the number of purged messages.
// All blocks are closed, the msgs directory is moved out of the way and removed
// in the background, and a new block is created for writing. If fseq is greater
// than the resulting first sequence, the first sequence is set to fseq and the
// last sequence to fseq-1. Returns ErrStoreClosed if the store is closed.
func (fs *fileStore) purge(fseq uint64) (uint64, error) {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return 0, ErrStoreClosed
	}

	// Capture what we are removing for the return value and the storage callback.
	purged := fs.state.Msgs
	rbytes := int64(fs.state.Bytes)

	// The store is now empty, with the first sequence right after the last.
	fs.state.FirstSeq = fs.state.LastSeq + 1
	fs.state.FirstTime = time.Time{}

	fs.state.Bytes = 0
	fs.state.Msgs = 0

	for _, mb := range fs.blks {
		mb.dirtyClose()
	}

	fs.blks = nil
	fs.lmb = nil
	fs.bim = make(map[uint32]*msgBlock)
	// Clear any per subject tracking.
	fs.psim = make(map[string]*psi)
	// Mark dirty
	fs.dirty++

	// Move the msgs directory out of the way, will delete out of band.
	// FIXME(dlc) - These can error and we need to change api above to propagate?
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	// If purge directory still exists then we need to wait
	// in place and remove since rename would fail.
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}
	os.Rename(mdir, pdir)

	// Remove the old data in the background.
	go os.RemoveAll(pdir)

	// Create new one.
	os.MkdirAll(mdir, defaultDirPerms)

	// Make sure we have a lmb to write to.
	if _, err := fs.newMsgBlockForWrite(); err != nil {
		fs.mu.Unlock()
		return purged, err
	}

	// Check if we need to set the first seq to a new number.
	if fseq > fs.state.FirstSeq {
		fs.state.FirstSeq = fseq
		fs.state.LastSeq = fseq - 1
	}

	// Have the new last block reflect our state.
	lmb := fs.lmb
	lmb.first.seq = fs.state.FirstSeq
	lmb.last.seq = fs.state.LastSeq
	lmb.last.ts = fs.state.LastTime.UnixNano()

	if fs.lmb.last.seq > 1 {
		// Leave a tombstone so we can remember our starting sequence in case
		// full state becomes corrupted.
		lmb.writeTombstone(fs.lmb.last.seq, fs.lmb.last.ts)
	}

	cb := fs.scb
	fs.mu.Unlock()

	// Report the removed messages and bytes outside of the lock.
	if cb != nil {
		cb(-int64(purged), -rbytes, 0, _EMPTY_)
	}

	return purged, nil
}

// Compact will remove all messages from this store up to
// but not including the seq parameter.
// Will return the number of purged messages.
//
// A seq of 0, or a seq beyond the last sequence, is the same as purging all
// messages. Otherwise all blocks ahead of the block that holds seq are removed,
// and the messages below seq in that block are removed individually. If that
// block has enough unused space its file is rewritten starting at the new first
// message; that part is best effort only.
func (fs *fileStore) Compact(seq uint64) (uint64, error) {
	if seq == 0 {
		return fs.purge(seq)
	}

	var purged, bytes uint64

	fs.mu.Lock()
	// Same as purge all.
	if lseq := fs.state.LastSeq; seq > lseq {
		fs.mu.Unlock()
		return fs.purge(seq)
	}
	// We have to delete interior messages.
	smb := fs.selectMsgBlock(seq)
	if smb == nil {
		fs.mu.Unlock()
		return 0, nil
	}

	// All msgblocks up to this one can be thrown away.
	var deleted int
	for _, mb := range fs.blks {
		if mb == smb {
			break
		}
		mb.mu.Lock()
		purged += mb.msgs
		bytes += mb.bytes
		// Make sure we do subject cleanup as well.
		mb.ensurePerSubjectInfoLoaded()
		for subj := range mb.fss {
			fs.removePerSubject(subj)
		}
		// Now close.
		mb.dirtyCloseWithRemove(true)
		mb.mu.Unlock()
		deleted++
	}

	var smv StoreMsg
	var err error
	var isEmpty bool

	smb.mu.Lock()
	if smb.first.seq == seq {
		// Nothing to remove from this block, but blocks ahead of it may have been
		// dropped above, so make sure the store's first sequence and time reflect
		// this block's first message.
		fs.state.FirstSeq = smb.first.seq
		fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC()
		goto SKIP
	}

	// Make sure we have the messages loaded.
	if smb.cacheNotLoaded() {
		if err = smb.loadMsgsWithLock(); err != nil {
			goto SKIP
		}
	}
	for mseq := smb.first.seq; mseq < seq; mseq++ {
		sm, err := smb.cacheLookup(mseq, &smv)
		if err == errDeletedMsg {
			// Update dmap.
			// The sequence we are passing over is the one to drop from the delete
			// map, not the compaction target.
			if !smb.dmap.IsEmpty() {
				smb.dmap.Delete(mseq)
			}
		} else if sm != nil {
			sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
			if smb.msgs > 0 {
				smb.msgs--
				// Make sure to not go negative.
				if sz > smb.bytes {
					sz = smb.bytes
				}
				smb.bytes -= sz
				bytes += sz
				purged++
			}
			// Update fss
			smb.removeSeqPerSubject(sm.subj, mseq)
			fs.removePerSubject(sm.subj)
		}
	}

	// Check if empty after processing, could happen if tail of messages are all deleted.
	isEmpty = smb.msgs == 0
	if isEmpty {
		smb.dirtyCloseWithRemove(true)
		// Update fs first here as well.
		fs.state.FirstSeq = smb.last.seq + 1
		fs.state.FirstTime = time.Time{}
		deleted++
	} else {
		// Make sure to sync changes.
		smb.needSync = true
		// Update fs first seq and time.
		smb.first.seq = seq - 1 // Just for start condition for selectNextFirst.
		smb.selectNextFirst()

		fs.state.FirstSeq = smb.first.seq
		fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC()

		// Check if we should reclaim the head space from this block.
		// This will be optimistic only, so don't continue if we encounter any errors here.
		if smb.rbytes > compactMinimum && smb.bytes*2 < smb.rbytes {
			// Offset of the new first message within the cache buffer.
			var moff uint32
			moff, _, _, err = smb.slotInfo(int(smb.first.seq - smb.cache.fseq))
			if err != nil || moff >= uint32(len(smb.cache.buf)) {
				goto SKIP
			}
			buf := smb.cache.buf[moff:]
			// Don't reuse, copy to new recycled buf.
			nbuf := getMsgBlockBuf(len(buf))
			nbuf = append(nbuf, buf...)
			smb.closeFDsLockedNoCheck()
			// Check for encryption.
			if smb.bek != nil && len(nbuf) > 0 {
				// Recreate to reset counter.
				bek, err := genBlockEncryptionKey(smb.fs.fcfg.Cipher, smb.seed, smb.nonce)
				if err != nil {
					goto SKIP
				}
				// For future writes make sure to set smb.bek to keep counter correct.
				smb.bek = bek
				smb.bek.XORKeyStream(nbuf, nbuf)
			}
			// Recompress if necessary (smb.cmp contains the algorithm used when
			// the block was loaded from disk, or defaults to NoCompression if not)
			if nbuf, err = smb.cmp.Compress(nbuf); err != nil {
				goto SKIP
			}
			if err = os.WriteFile(smb.mfn, nbuf, defaultFilePerms); err != nil {
				goto SKIP
			}
			// Make sure to remove fss state.
			smb.fss = nil
			smb.clearCacheAndOffset()
			smb.rbytes = uint64(len(nbuf))
		}
	}

SKIP:
	smb.mu.Unlock()

	if deleted > 0 {
		// Update block map.
		if fs.bim != nil {
			for _, mb := range fs.blks[:deleted] {
				delete(fs.bim, mb.index)
			}
		}
		// Update blks slice.
		fs.blks = copyMsgBlocks(fs.blks[deleted:])
		if lb := len(fs.blks); lb == 0 {
			fs.lmb = nil
		} else {
			fs.lmb = fs.blks[lb-1]
		}
	}

	// Update top level accounting.
	// Make sure to not go negative.
	if purged > fs.state.Msgs {
		purged = fs.state.Msgs
	}
	fs.state.Msgs -= purged

	if bytes > fs.state.Bytes {
		bytes = fs.state.Bytes
	}
	fs.state.Bytes -= bytes

	fs.dirty++
	fs.kickFlushStateLoop()

	cb := fs.scb
	fs.mu.Unlock()

	// Report the removed messages and bytes outside of the lock.
	if cb != nil && purged > 0 {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return purged, err
}

// reset wipes the store: every message block is closed and removed, the
// stream state is zeroed and the block and subject indexes are replaced with
// empty maps. It returns ErrStoreClosed when the store is closed and
// ErrStoreSnapshotInProgress while a snapshot is in progress. The storage
// update callback, if registered, is invoked after the lock is released.
func (fs *fileStore) reset() error {
	fs.mu.Lock()
	switch {
	case fs.closed:
		fs.mu.Unlock()
		return ErrStoreClosed
	case fs.sips > 0:
		fs.mu.Unlock()
		return ErrStoreSnapshotInProgress
	}

	// Tally what we drop while closing and removing each block.
	var nmsgs, nbytes uint64
	update := fs.scb

	for _, mb := range fs.blks {
		mb.mu.Lock()
		nmsgs, nbytes = nmsgs+mb.msgs, nbytes+mb.bytes
		mb.dirtyCloseWithRemove(true)
		mb.mu.Unlock()
	}

	// Zero the stream state, LastTime is stamped with the time of the reset.
	fs.state.FirstSeq, fs.state.LastSeq = 0, 0
	fs.state.FirstTime = time.Time{}
	fs.state.LastTime = time.Now().UTC()
	fs.state.Msgs, fs.state.Bytes = 0, 0

	// Drop all blocks along with the subject and block index mappings.
	fs.blks = nil
	fs.lmb = nil
	fs.psim = make(map[string]*psi)
	fs.bim = make(map[uint32]*msgBlock)

	fs.mu.Unlock()

	if update != nil {
		update(-int64(nmsgs), -int64(nbytes), 0, _EMPTY_)
	}
	return nil
}

// Truncate will truncate a stream store up to seq. Sequence needs to be valid.
// A seq of zero is treated as a request to reset the store completely.
//
// It returns ErrStoreClosed if the store is closed, ErrStoreSnapshotInProgress
// while a snapshot is running, and ErrInvalidSequence when no block holds seq
// or the message at seq cannot be fetched. On success the registered storage
// update callback, if any, is invoked with the removed message and byte counts
// after the store lock has been released.
func (fs *fileStore) Truncate(seq uint64) error {
	// Check for request to reset.
	if seq == 0 {
		return fs.reset()
	}

	fs.mu.Lock()

	if fs.closed {
		fs.mu.Unlock()
		return ErrStoreClosed
	}
	if fs.sips > 0 {
		fs.mu.Unlock()
		return ErrStoreSnapshotInProgress
	}

	nlmb := fs.selectMsgBlock(seq)
	if nlmb == nil {
		fs.mu.Unlock()
		return ErrInvalidSequence
	}
	lsm, _, _ := nlmb.fetchMsg(seq, nil)
	if lsm == nil {
		fs.mu.Unlock()
		return ErrInvalidSequence
	}

	// Set lmb to nlmb and make sure writeable.
	fs.lmb = nlmb
	if err := nlmb.enableForWriting(fs.fip); err != nil {
		// Every exit path must release the store lock, otherwise all
		// subsequent operations on this store would block forever.
		fs.mu.Unlock()
		return err
	}

	var purged, bytes uint64

	// Truncate our new last message block.
	nmsgs, nbytes, err := nlmb.truncate(lsm)
	if err != nil {
		fs.mu.Unlock()
		return fmt.Errorf("nlmb.truncate: %w", err)
	}
	// Account for the truncated msgs and bytes.
	purged += nmsgs
	bytes += nbytes

	// Remove any left over msg blocks, working backwards from the tail until
	// the new last block is the final entry.
	getLastMsgBlock := func() *msgBlock { return fs.blks[len(fs.blks)-1] }
	for mb := getLastMsgBlock(); mb != nlmb; mb = getLastMsgBlock() {
		mb.mu.Lock()
		purged += mb.msgs
		bytes += mb.bytes
		fs.removeMsgBlock(mb)
		mb.mu.Unlock()
	}

	// Reset last.
	fs.state.LastSeq = lsm.seq
	fs.state.LastTime = time.Unix(0, lsm.ts).UTC()
	// Update msgs and bytes, clamping so the unsigned counters can not wrap.
	if purged > fs.state.Msgs {
		purged = fs.state.Msgs
	}
	fs.state.Msgs -= purged
	if bytes > fs.state.Bytes {
		bytes = fs.state.Bytes
	}
	fs.state.Bytes -= bytes

	// Reset our subject lookup info.
	fs.resetGlobalPerSubjectInfo()

	fs.dirty++
	fs.kickFlushStateLoop()

	cb := fs.scb
	fs.mu.Unlock()

	if cb != nil {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return nil
}

// lastSeq reports the last sequence recorded in the stream state.
// It takes the store read lock itself.
func (fs *fileStore) lastSeq() uint64 {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	return fs.state.LastSeq
}

// numMsgBlocks reports how many message blocks the store currently tracks.
// It takes the store read lock itself.
func (fs *fileStore) numMsgBlocks() int {
	fs.mu.RLock()
	n := len(fs.blks)
	fs.mu.RUnlock()
	return n
}

// addMsgBlock appends mb to the block list, makes it the last message block
// and registers it in the block index map under its index.
// Lock should be held.
func (fs *fileStore) addMsgBlock(mb *msgBlock) {
	fs.blks, fs.lmb = append(fs.blks, mb), mb
	fs.bim[mb.index] = mb
}

// removeMsgBlockFromList drops mb from fs.blks and from the block index map.
// It is a no-op when mb is not in the list.
//
// The replacement list is built in a fresh backing array. The previous array
// is left untouched so that slices captured earlier under the lock (for
// example the `blks := fs.blks` view that streamSnapshot iterates after
// releasing the lock) keep seeing the blocks they captured instead of
// elements shifted underneath them by an in-place delete.
// Both locks should be held.
func (fs *fileStore) removeMsgBlockFromList(mb *msgBlock) {
	for i, omb := range fs.blks {
		if mb != omb {
			continue
		}
		nblks := make([]*msgBlock, 0, len(fs.blks)-1)
		nblks = append(nblks, fs.blks[:i]...)
		nblks = append(nblks, fs.blks[i+1:]...)
		fs.blks = nblks
		if fs.bim != nil {
			delete(fs.bim, mb.index)
		}
		return
	}
}

// removeMsgBlock closes mb, removes its files and takes it out of the block list.
// When mb was the last message block, a new write block is created and a
// tombstone carrying mb's last sequence and timestamp is written to it.
// Both locks should be held.
func (fs *fileStore) removeMsgBlock(mb *msgBlock) {
	mb.dirtyCloseWithRemove(true)
	fs.removeMsgBlockFromList(mb)

	// Nothing more to do unless we just removed the last message block.
	if fs.lmb != mb {
		return
	}

	lseq, lts := mb.last.seq, mb.last.ts
	// Creating a new message write block requires that the lmb lock is not held,
	// so drop it for the duration and re-acquire before returning to the caller.
	mb.mu.Unlock()
	if nmb, _ := fs.newMsgBlockForWrite(); nmb != nil {
		// Remember the last sequence and timestamp since this was the last block.
		nmb.writeTombstone(lseq, lts)
	}
	mb.mu.Lock()
}

// dirtyClose drops the cache and closes the file descriptors of this block
// without removing its files. Called by purge.
// Lock should not be held.
func (mb *msgBlock) dirtyClose() {
	mb.mu.Lock()
	mb.dirtyCloseWithRemove(false)
	mb.mu.Unlock()
}

// dirtyCloseWithRemove tears down the in-memory side of the block: it stops the
// cache expiration timer, drops per-subject tracking and the cache, signals the
// block's loops to quit and closes the message file descriptor. When remove is
// true it also deletes the message and key files and kicks the state flusher.
// A nil receiver is a no-op.
// Should be called with lock held.
func (mb *msgBlock) dirtyCloseWithRemove(remove bool) {
	if mb == nil {
		return
	}
	// Cache expiration timer goes first.
	if tmr := mb.ctmr; tmr != nil {
		tmr.Stop()
		mb.ctmr = nil
	}
	// Per-subject tracking and the cache.
	mb.fss = nil
	mb.clearCacheAndOffset()
	// Tell our loops to exit.
	if qch := mb.qch; qch != nil {
		close(qch)
		mb.qch = nil
	}
	if fd := mb.mfd; fd != nil {
		fd.Close()
		mb.mfd = nil
	}
	if !remove {
		return
	}
	if mb.mfn != _EMPTY_ {
		os.Remove(mb.mfn)
		mb.mfn = _EMPTY_
	}
	if mb.kfn != _EMPTY_ {
		os.Remove(mb.kfn)
	}
	// A block went away, so have the state flusher pick that up.
	mb.fs.kickFlushStateLoop()
}

// removeSeqPerSubject accounts for seq being removed from subj in this block's
// per-subject state. The entry is deleted when it held a single message; when
// exactly one message remains First and Last are collapsed onto the survivor;
// otherwise the first sequence is flagged for lazy recalculation if needed.
// Lock should be held.
func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) {
	mb.ensurePerSubjectInfoLoaded()

	ss := mb.fss[subj]
	switch {
	case ss == nil:
		return
	case ss.Msgs == 1:
		delete(mb.fss, subj)
		return
	}

	ss.Msgs--

	if ss.Msgs != 1 {
		// We can lazily calculate the first sequence when needed.
		ss.firstNeedsUpdate = ss.firstNeedsUpdate || seq == ss.First
		return
	}

	// A single message is left, so first and last are the same sequence.
	if seq != ss.Last {
		ss.First = ss.Last
	} else {
		ss.Last = ss.First
	}
	ss.firstNeedsUpdate = false
}

// Will recalculate the first sequence for this subject in this block.
// Will avoid slower path message lookups and scan the cache directly instead.
//
// The scan starts at the slot after startSeq and walks the cache index forward
// until it finds a message on subj that is not before mb.first.seq, does not
// have ebit set in its stored sequence, and is not in the delete map. That
// sequence becomes ss.First. If the scan start or an index entry falls beyond
// the cached data, ss.First is set to ss.Last instead. If the loop ends without
// a match ss.First is left unchanged.
// NOTE(review): no lock is taken here and loadMsgsWithLock is called, so the
// block lock is presumably expected to be held by the caller - confirm.
func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *SimpleState) {
	// Need to make sure messages are loaded.
	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			// Without a cache we can not scan, firstNeedsUpdate stays as it was.
			return
		}
	}
	// Mark first as updated.
	ss.firstNeedsUpdate = false
	startSeq++

	// Translate the sequence into a slot of the cache index.
	startSlot := int(startSeq - mb.cache.fseq)
	if startSlot >= len(mb.cache.idx) {
		ss.First = ss.Last
		return
	} else if startSlot < 0 {
		startSlot = 0
	}

	var le = binary.LittleEndian
	for slot := startSlot; slot < len(mb.cache.idx); slot++ {
		// Index entries have hbit masked off and are made relative to the cache offset.
		li := int(mb.cache.idx[slot]&^hbit) - mb.cache.off
		if li >= len(mb.cache.buf) {
			ss.First = ss.Last
			return
		}
		buf := mb.cache.buf[li:]
		hdr := buf[:msgHdrSize]
		// Subject length is read from header offset 20, the subject follows the header.
		slen := int(le.Uint16(hdr[20:]))
		if subj == string(buf[msgHdrSize:msgHdrSize+slen]) {
			// Sequence is read from header offset 4.
			seq := le.Uint64(hdr[4:])
			if seq < mb.first.seq || seq&ebit != 0 {
				continue
			}
			if mb.dmap.Exists(seq) {
				continue
			}
			ss.First = seq
			return
		}
	}
}

// resetGlobalPerSubjectInfo throws away the global per-subject index and
// rebuilds it by visiting every message block in order.
// Lock should be held.
func (fs *fileStore) resetGlobalPerSubjectInfo() {
	fs.psim = make(map[string]*psi)
	blks := fs.blks
	for i := range blks {
		fs.populateGlobalPerSubjectInfo(blks[i])
	}
}

// resetPerSubjectInfo discards this block's per-subject state and regenerates
// it, returning any error from the regeneration.
// Lock should be held.
func (mb *msgBlock) resetPerSubjectInfo() error {
	mb.fss = nil
	err := mb.generatePerSubjectInfo()
	return err
}

// generatePerSubjectInfo will generate the per subject info via the raw msg block.
// It loads the block's messages if the cache is not loaded, then walks every
// sequence from first to last and records message count plus first and last
// sequence per subject in mb.fss. Messages that are missing or deleted are
// skipped. A non-nil error is returned only when loading fails or a lookup
// fails with something other than not-found, deleted or no-cache.
// Lock should be held.
func (mb *msgBlock) generatePerSubjectInfo() error {
	// Check if this mb is empty. This can happen when its the last one and we are holding onto it for seq and timestamp info.
	if mb.msgs == 0 {
		return nil
	}

	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return err
		}
		// indexCacheBuf can produce fss now, so if non-nil we are good.
		if mb.fss != nil {
			return nil
		}
	}

	// Create new one regardless.
	mb.fss = make(map[string]*SimpleState)

	// Reused across lookups so we do not allocate a StoreMsg per sequence.
	var smv StoreMsg
	fseq, lseq := mb.first.seq, mb.last.seq
	for seq := fseq; seq <= lseq; seq++ {
		sm, err := mb.cacheLookup(seq, &smv)
		if err != nil {
			// Since we are walking by sequence we can ignore some errors that are benign to rebuilding our state.
			if err == ErrStoreMsgNotFound || err == errDeletedMsg {
				continue
			}
			// The cache went away underneath us, stop and keep what we gathered so far.
			if err == errNoCache {
				return nil
			}
			return err
		}
		// Only messages that carry a subject are tracked.
		if sm != nil && len(sm.subj) > 0 {
			if ss := mb.fss[sm.subj]; ss != nil {
				ss.Msgs++
				ss.Last = seq
			} else {
				mb.fss[sm.subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
			}
		}
	}

	if len(mb.fss) > 0 {
		// Make sure we run the cache expire timer.
		mb.llts = time.Now().UnixNano()
		mb.startCacheExpireTimer()
	}
	return nil
}

// ensurePerSubjectInfoLoaded makes sure mb.fss is available when subjects are
// being tracked. Nothing is done if it is already present or tracking is off,
// an empty block simply gets an empty map, anything else is generated from
// the block's messages.
// Lock should be held
func (mb *msgBlock) ensurePerSubjectInfoLoaded() error {
	switch {
	case mb.fss != nil, mb.noTrack:
		return nil
	case mb.msgs == 0:
		mb.fss = make(map[string]*SimpleState)
		return nil
	default:
		return mb.generatePerSubjectInfo()
	}
}

// populateGlobalPerSubjectInfo folds the per-subject state of mb into the
// global psim index. Called on recovery and when the index is rebuilt.
// The block lock is taken here. If the block's subject info can not be loaded
// the block is skipped silently.
// Lock should be held.
func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	if mb.ensurePerSubjectInfoLoaded() != nil {
		return
	}

	for subj, ss := range mb.fss {
		if len(subj) == 0 {
			continue
		}
		info, known := fs.psim[subj]
		if !known {
			// First time we see this subject, this block is both first and last.
			fs.psim[subj] = &psi{total: ss.Msgs, fblk: mb.index, lblk: mb.index}
			continue
		}
		info.total += ss.Msgs
		if info.lblk < mb.index {
			info.lblk = mb.index
		}
	}
}

// close shuts the message block down: the cache expiration timer is stopped,
// per-subject state and the cache are dropped, the block's loops are told to
// quit and the message file is closed, being synced first when sync is true.
// Calling it on a nil or already closed block is a no-op.
func (mb *msgBlock) close(sync bool) {
	if mb == nil {
		return
	}
	mb.mu.Lock()
	defer mb.mu.Unlock()

	if mb.closed {
		return
	}

	// Cache expiration timer goes first.
	if tmr := mb.ctmr; tmr != nil {
		tmr.Stop()
		mb.ctmr = nil
	}

	// Per-subject tracking and the cache.
	mb.fss = nil
	mb.clearCacheAndOffset()

	// Tell our loops to exit.
	if qch := mb.qch; qch != nil {
		close(qch)
		mb.qch = nil
	}
	if fd := mb.mfd; fd != nil {
		if sync {
			fd.Sync()
		}
		fd.Close()
		mb.mfd = nil
	}
	mb.closed = true
}

// closeAllMsgBlocks closes every message block of the store, passing sync
// through to each block's close.
func (fs *fileStore) closeAllMsgBlocks(sync bool) {
	blks := fs.blks
	for i := range blks {
		blks[i].close(sync)
	}
}

// Delete purges and stops the store and removes its directory from disk.
// If the store was already closed the directory is still removed, any
// remaining message and byte counts are reported to the storage update
// callback once, and ErrStoreClosed is returned. Removal of the directory is
// retried for up to a second before the last error is returned.
func (fs *fileStore) Delete() error {
	storeDir := fs.fcfg.StoreDir

	if fs.isClosed() {
		// Always attempt to remove since we could have been closed beforehand.
		os.RemoveAll(storeDir)
		// Report whatever we still accounted for, then zero the counters so a
		// second call does not account for it again.
		fs.mu.Lock()
		update := fs.scb
		nmsgs, nbytes := int64(fs.state.Msgs), int64(fs.state.Bytes)
		fs.state.Msgs, fs.state.Bytes = 0, 0
		fs.mu.Unlock()
		if nmsgs > 0 && update != nil {
			update(-nmsgs, -nbytes, 0, _EMPTY_)
		}
		return ErrStoreClosed
	}

	// A left over purge directory would make the rename done by Purge fail,
	// so remove it in place first.
	pdir := filepath.Join(storeDir, purgeDir)
	if _, serr := os.Stat(pdir); serr == nil {
		os.RemoveAll(pdir)
	}

	// Do Purge() since if we have lots of blocks uses a mv/rename.
	fs.Purge()

	if serr := fs.Stop(); serr != nil {
		return serr
	}

	// Remove the directory, retrying every 10ms for up to a second on failure.
	err := os.RemoveAll(storeDir)
	for deadline := time.Now().Add(time.Second); err != nil && time.Now().Before(deadline); {
		time.Sleep(10 * time.Millisecond)
		err = os.RemoveAll(storeDir)
	}
	return err
}

// cancelSyncTimer stops and clears the background sync timer if one is set.
// Lock should be held.
func (fs *fileStore) cancelSyncTimer() {
	tmr := fs.syncTmr
	if tmr == nil {
		return
	}
	tmr.Stop()
	fs.syncTmr = nil
}

// Header bytes of the full stream state encoding. writeFullState stores the
// magic in byte 0 and the version in byte 1 of the encoded buffer.
const (
	fullStateMagic   = uint8(11)
	fullStateVersion = uint8(1)
)

// flushStreamStateLoop is the go routine that writes out our full stream state
// index whenever it receives a kick on fch. Kicks happen when we create a new
// block or when we delete a block in general. A signal on qch ends the loop,
// and done is closed on the way out so that a waiter knows the loop has exited.
func (fs *fileStore) flushStreamStateLoop(fch, qch, done chan struct{}) {
	defer close(done)
	for {
		select {
		case <-qch:
			return
		case <-fch:
			fs.writeFullState()
		}
	}
}

// kickFlushStateLoop kicks the stream state flusher through the store's
// flush channel.
func (fs *fileStore) kickFlushStateLoop() {
	fch := fs.fch
	kickFlusher(fch)
}

// timestampNormalized converts t to Unix nanoseconds, mapping the zero time to
// 0 since UnixNano of the zero time is undefined.
func timestampNormalized(t time.Time) int64 {
	var ts int64
	if !t.IsZero() {
		ts = t.UnixNano()
	}
	return ts
}

// This will write the full binary state for the stream.
// This plus everything new since last hash will be the total recovered state.
// This state dump will have the following.
// 1. Stream summary - Msgs, Bytes, First and Last (Sequence and Timestamp)
// 2. PSIM - Per Subject Index Map - Tracks first and last blocks with subjects present.
// 3. MBs - Index, Bytes, First and Last Sequence and Timestamps, and the deleted map (avl.seqset).
// 4. Last block index and hash of record inclusive to this stream state.
//
// Nothing is written when the store is closed or has no dirty changes. The
// state is encoded (and encrypted when a key generator is configured) under the
// store lock, written to a temp file without the lock, and then renamed into
// place under the lock again. An error is returned if setting up encryption,
// creating or writing the temp file, or the rename fails.
func (fs *fileStore) writeFullState() error {
	fs.mu.Lock()

	if fs.closed || fs.dirty == 0 {
		fs.mu.Unlock()
		return nil
	}

	// Start with a fixed size array as backing, append grows it if the state is larger.
	var _buf [32 * 1024]byte
	_buf[0], _buf[1] = fullStateMagic, fullStateVersion
	buf := _buf[:hdrLen]

	// 1. Stream summary.
	buf = binary.AppendUvarint(buf, fs.state.Msgs)
	buf = binary.AppendUvarint(buf, fs.state.Bytes)
	buf = binary.AppendUvarint(buf, fs.state.FirstSeq)
	buf = binary.AppendVarint(buf, timestampNormalized(fs.state.FirstTime))
	buf = binary.AppendUvarint(buf, fs.state.LastSeq)
	buf = binary.AppendVarint(buf, timestampNormalized(fs.state.LastTime))

	// Do per subject information map if applicable.
	numSubjects := len(fs.psim)
	buf = binary.AppendUvarint(buf, uint64(numSubjects))

	if numSubjects > 0 {
		for subj, psi := range fs.psim {
			buf = binary.AppendUvarint(buf, uint64(len(subj)))
			buf = append(buf, subj...)
			buf = binary.AppendUvarint(buf, psi.total)
			buf = binary.AppendUvarint(buf, uint64(psi.fblk))
			// The last block is only encoded when the subject has more than one message.
			if psi.total > 1 {
				buf = binary.AppendUvarint(buf, uint64(psi.lblk))
			}
		}
	}

	// Now walk all blocks and write out first and last and optional dmap encoding.
	var lbi uint32
	var lchk [8]byte

	nb := len(fs.blks)
	buf = binary.AppendUvarint(buf, uint64(nb))

	// Use basetime to save some space.
	baseTime := timestampNormalized(fs.state.FirstTime)

	for _, mb := range fs.blks {
		mb.mu.RLock()
		buf = binary.AppendUvarint(buf, uint64(mb.index))
		buf = binary.AppendUvarint(buf, mb.bytes)
		buf = binary.AppendUvarint(buf, mb.first.seq)
		buf = binary.AppendVarint(buf, mb.first.ts-baseTime)
		buf = binary.AppendUvarint(buf, mb.last.seq)
		buf = binary.AppendVarint(buf, mb.last.ts-baseTime)

		numDeleted := mb.dmap.Size()
		buf = binary.AppendUvarint(buf, uint64(numDeleted))
		if numDeleted > 0 {
			var scratch [8 * 1024]byte
			// NOTE(review): the error from Encode is dropped here - confirm it can not fail for a non-empty set.
			dmap, _ := mb.dmap.Encode(scratch[:0])
			buf = append(buf, dmap...)
		}
		// If this is the last one grab the last checksum and the block index, e.g. 22.blk, 22 is the block index.
		// We use this to quickly open this file on recovery.
		if mb == fs.lmb {
			lbi = mb.index
			// NOTE(review): called while holding only the block read lock - confirm
			// ensureLastChecksumLoaded does not need the write lock.
			mb.ensureLastChecksumLoaded()
			copy(lchk[0:], mb.lchk[:])
		}
		mb.mu.RUnlock()
	}

	// Place block index and hash onto the end.
	buf = binary.AppendUvarint(buf, uint64(lbi))
	buf = append(buf, lchk[:]...)

	// Encrypt if needed.
	if fs.prf != nil {
		if err := fs.setupAEK(); err != nil {
			fs.mu.Unlock()
			return err
		}
		// The nonce slice is given enough capacity to hold nonce + ciphertext + overhead,
		// the sealed output is appended directly after the nonce.
		nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(buf)+fs.aek.Overhead())
		rand.Read(nonce)
		buf = fs.aek.Seal(nonce, nonce, buf, nil)
	}

	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)

	// Append the hash of what we have so far, computed after any encryption.
	fs.hh.Reset()
	fs.hh.Write(buf)
	buf = fs.hh.Sum(buf)

	// Snapshot prior dirty count.
	priorDirty := fs.dirty
	// Release lock.
	fs.mu.Unlock()

	// Write to a tmp file and rename.
	const tmpPre = streamStreamStateFile + tsep
	f, err := os.CreateTemp(filepath.Join(fs.fcfg.StoreDir, msgDir), tmpPre)
	if err != nil {
		return err
	}
	tmpName := f.Name()
	// After a successful rename the tmp name no longer exists and this remove is a no-op.
	defer os.Remove(tmpName)
	if _, err = f.Write(buf); err == nil && fs.fcfg.SyncAlways {
		f.Sync()
	}
	f.Close()
	if err != nil {
		return err
	}

	// Rename into position under our lock, clear prior dirty pending on success.
	fs.mu.Lock()
	if !fs.closed {
		if err := os.Rename(tmpName, fn); err != nil {
			fs.mu.Unlock()
			return err
		}
		// Only subtract what we captured, changes made while we were writing stay pending.
		fs.dirty -= priorDirty
	}
	fs.mu.Unlock()

	return nil
}

// Stop the current filestore.
// Pending block data is flushed and all message blocks are closed, the sync
// and age check timers are cancelled, the state flush loop is shut down and
// the full state is written one last time before the store is marked closed.
// All registered consumer stores are stopped as well, and the storage update
// callback, if any, is told about the bytes that are no longer accounted for.
// Returns ErrStoreClosed if the store was already closed.
func (fs *fileStore) Stop() error {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return ErrStoreClosed
	}

	fs.checkAndFlushAllBlocks()
	fs.closeAllMsgBlocks(false)

	fs.cancelSyncTimer()
	fs.cancelAgeChk()

	// Release the state flusher loop.
	close(fs.qch)

	// Wait for the state flush loop to exit.
	// The lock is dropped while waiting, and writeFullState takes the lock itself.
	fsld := fs.fsld
	fs.mu.Unlock()
	<-fsld
	// Write full state if needed. If not dirty this is a no-op.
	fs.writeFullState()
	fs.mu.Lock()

	// Mark as closed.
	fs.closed = true
	fs.lmb = nil

	// We should update the upper usage layer on a stop.
	cb, bytes := fs.scb, int64(fs.state.Bytes)

	// Copy the consumer stores so they can be stopped without holding our lock.
	var _cfs [256]ConsumerStore
	cfs := append(_cfs[:0], fs.cfs...)
	fs.cfs = nil
	fs.mu.Unlock()

	for _, o := range cfs {
		o.Stop()
	}

	if bytes > 0 && cb != nil {
		cb(0, -bytes, 0, _EMPTY_)
	}

	return nil
}

// errFile is the name of the tar entry streamSnapshot writes when it has to
// abort, the content of the entry is the error text.
const errFile = "errors.txt"

// Stream our snapshot through S2 compression and tar.
//
// The tar stream contains, in order: the stream meta file and its checksum, the
// stream state file if one can be read (decrypted first when encrypted), one
// entry per message block with the block content decrypted and decompressed,
// and, when includeConsumers is set, the meta, checksum and encoded state of
// every file based consumer. On a failure to gather data an errFile entry with
// the error text is written and streaming stops. A failed write to the tar
// stream stops streaming without an error entry.
//
// w is closed on return, and the in-progress snapshot counter (fs.sips) is
// decremented on return. The state parameter is not referenced in this
// function body.
func (fs *fileStore) streamSnapshot(w io.WriteCloser, state *StreamState, includeConsumers bool) {
	// Deferred calls run in reverse order: sips is decremented first, then the
	// tar writer, the S2 writer and finally w are closed.
	defer w.Close()

	enc := s2.NewWriter(w)
	defer enc.Close()

	tw := tar.NewWriter(enc)
	defer tw.Close()

	defer func() {
		fs.mu.Lock()
		fs.sips--
		fs.mu.Unlock()
	}()

	// All entries share one modification time.
	modTime := time.Now().UTC()

	// writeFile adds one regular file entry with the given name and content to the tar stream.
	writeFile := func(name string, buf []byte) error {
		hdr := &tar.Header{
			Name:    name,
			Mode:    0600,
			ModTime: modTime,
			Uname:   "nats",
			Gname:   "nats",
			Size:    int64(len(buf)),
			Format:  tar.FormatPAX,
		}
		if err := tw.WriteHeader(hdr); err != nil {
			return err
		}
		if _, err := tw.Write(buf); err != nil {
			return err
		}
		return nil
	}

	// writeErr records an error for the receiving side, best effort.
	writeErr := func(err string) {
		writeFile(errFile, []byte(err))
	}

	fs.mu.Lock()
	// Capture the current block list, it is iterated below without the store lock.
	blks := fs.blks
	// Grab our general meta data.
	// We do this now instead of pulling from files since they could be encrypted.
	meta, err := json.Marshal(fs.cfg)
	if err != nil {
		fs.mu.Unlock()
		writeErr(fmt.Sprintf("Could not gather stream meta file: %v", err))
		return
	}
	// hh is the same hasher as fs.hh, not a copy.
	hh := fs.hh
	hh.Reset()
	hh.Write(meta)
	sum := []byte(hex.EncodeToString(fs.hh.Sum(nil)))
	fs.mu.Unlock()

	// Meta first.
	if writeFile(JetStreamMetaFile, meta) != nil {
		return
	}
	if writeFile(JetStreamMetaFileSum, sum) != nil {
		return
	}

	// Can't use join path here, tar only recognizes relative paths with forward slashes.
	msgPre := msgDir + "/"
	// Reused across blocks as the load buffer.
	var bbuf []byte

	// The state file is optional, it is skipped if it can not be read, is too
	// short, or fails to decrypt.
	const minLen = 32
	sfn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	if buf, err := os.ReadFile(sfn); err == nil && len(buf) >= minLen {
		if fs.aek != nil {
			ns := fs.aek.NonceSize()
			// Layout is nonce, ciphertext, then the trailing hash which is not part of the ciphertext.
			buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:len(buf)-highwayhash.Size64], nil)
			if err == nil {
				// Redo hash checksum at end on plaintext.
				// NOTE(review): hh is fs.hh and the store lock is not held here, while
				// writeFullState uses fs.hh under the lock - confirm this can not race.
				hh.Reset()
				hh.Write(buf)
				buf = fs.hh.Sum(buf)
			}
		}
		if err == nil && writeFile(msgPre+streamStreamStateFile, buf) != nil {
			return
		}
	}

	// Now do messages themselves.
	for _, mb := range blks {
		if mb.pendingWriteSize() > 0 {
			mb.flushPendingMsgs()
		}
		mb.mu.Lock()
		// We could stream but don't want to hold the lock and prevent changes, so just read in and
		// release the lock for now.
		bbuf, err = mb.loadBlock(bbuf)
		if err != nil {
			mb.mu.Unlock()
			writeErr(fmt.Sprintf("Could not read message block [%d]: %v", mb.index, err))
			return
		}
		// Check for encryption.
		if mb.bek != nil && len(bbuf) > 0 {
			// A fresh key is generated from the block's seed and nonce rather than using mb.bek.
			rbek, err := genBlockEncryptionKey(fs.fcfg.Cipher, mb.seed, mb.nonce)
			if err != nil {
				mb.mu.Unlock()
				writeErr(fmt.Sprintf("Could not create encryption key for message block [%d]: %v", mb.index, err))
				return
			}
			rbek.XORKeyStream(bbuf, bbuf)
		}
		// Check for compression.
		if bbuf, err = mb.decompressIfNeeded(bbuf); err != nil {
			mb.mu.Unlock()
			writeErr(fmt.Sprintf("Could not decompress message block [%d]: %v", mb.index, err))
			return
		}
		mb.mu.Unlock()

		// Do this one unlocked.
		if writeFile(msgPre+fmt.Sprintf(blkScan, mb.index), bbuf) != nil {
			return
		}
	}

	// Bail if no consumers requested.
	if !includeConsumers {
		return
	}

	// Do consumers' state last.
	fs.mu.RLock()
	cfs := fs.cfs
	fs.mu.RUnlock()

	for _, cs := range cfs {
		// Only file based consumer stores are included.
		o, ok := cs.(*consumerFileStore)
		if !ok {
			continue
		}
		o.mu.Lock()
		// Grab our general meta data.
		// We do this now instead of pulling from files since they could be encrypted.
		meta, err := json.Marshal(o.cfg)
		if err != nil {
			o.mu.Unlock()
			writeErr(fmt.Sprintf("Could not gather consumer meta file for %q: %v", o.name, err))
			return
		}
		o.hh.Reset()
		o.hh.Write(meta)
		sum := []byte(hex.EncodeToString(o.hh.Sum(nil)))

		// We can have the running state directly encoded now.
		// Note this shadows the state parameter for the rest of the iteration.
		state, err := o.encodeState()
		if err != nil {
			o.mu.Unlock()
			writeErr(fmt.Sprintf("Could not encode consumer state for %q: %v", o.name, err))
			return
		}
		// NOTE(review): filepath.Join yields OS specific separators, while the comment
		// on msgPre above says tar wants forward slashes - confirm for non-slash platforms.
		odirPre := filepath.Join(consumerDir, o.name)
		o.mu.Unlock()

		// Write all the consumer files.
		if writeFile(filepath.Join(odirPre, JetStreamMetaFile), meta) != nil {
			return
		}
		if writeFile(filepath.Join(odirPre, JetStreamMetaFileSum), sum) != nil {
			return
		}
		// A failure here is not checked, the loop moves on to the next consumer.
		writeFile(filepath.Join(odirPre, consumerState), state)
	}
}

// Snapshot creates a snapshot of this stream and its consumer's state along with messages.
//
// Only one snapshot may run at a time. The snapshot is streamed from a separate
// go routine into a pipe whose read side is returned in the SnapshotResult
// together with the stream state captured at the start.
//
// deadline, when positive, is applied as a write deadline on the pipe.
// checkMsgs runs a message check first and fails the snapshot if any bad
// messages are detected. includeConsumers adds consumer state to the snapshot.
//
// Returns ErrStoreClosed if the store is closed and ErrStoreSnapshotInProgress
// if another snapshot is still running.
func (fs *fileStore) Snapshot(deadline time.Duration, checkMsgs, includeConsumers bool) (*SnapshotResult, error) {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return nil, ErrStoreClosed
	}
	// Only allow one at a time.
	if fs.sips > 0 {
		fs.mu.Unlock()
		return nil, ErrStoreSnapshotInProgress
	}
	// Mark us as snapshotting
	fs.sips += 1
	fs.mu.Unlock()

	if checkMsgs {
		ld := fs.checkMsgs()
		if ld != nil && len(ld.Msgs) > 0 {
			// The in-progress marker is normally cleared by streamSnapshot when it
			// finishes. Since we never start it on this path we must clear the
			// marker here, otherwise the store would report a snapshot in
			// progress forever.
			fs.mu.Lock()
			fs.sips--
			fs.mu.Unlock()
			return nil, fmt.Errorf("snapshot check detected %d bad messages", len(ld.Msgs))
		}
	}

	pr, pw := net.Pipe()

	// Set a write deadline here to protect ourselves.
	if deadline > 0 {
		pw.SetWriteDeadline(time.Now().Add(deadline))
	}

	// We can add to our stream while snapshotting but not "user" delete anything.
	var state StreamState
	fs.FastState(&state)

	// Stream in separate Go routine.
	go fs.streamSnapshot(pw, &state, includeConsumers)

	return &SnapshotResult{pr, state}, nil
}

// fileStoreConfig returns a copy of the file store configuration, read under
// the store read lock.
func (fs *fileStore) fileStoreConfig() FileStoreConfig {
	fs.mu.RLock()
	fcfg := fs.fcfg
	fs.mu.RUnlock()
	return fcfg
}

// readLockAllMsgBlocks takes the read lock of every message block, in list order.
// Lock held on entry.
func (fs *fileStore) readLockAllMsgBlocks() {
	blks := fs.blks
	for i := range blks {
		blks[i].mu.RLock()
	}
}

// readUnlockAllMsgBlocks releases the read lock of every message block, in list order.
// Lock held on entry.
func (fs *fileStore) readUnlockAllMsgBlocks() {
	blks := fs.blks
	for i := range blks {
		blks[i].mu.RUnlock()
	}
}

// EncodedStreamState returns the binary encoded state snapshot, >= v2.10 server.
// The encoding is a two byte header (magic and version) followed by the uvarint
// encoded Msgs, Bytes, FirstSeq, LastSeq, failed and number of deleted messages,
// and then, if anything is deleted, the encoded delete blocks.
func (fs *fileStore) EncodedStreamState(failed uint64) ([]byte, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	// Work out how many sequences in our range are gone, never negative.
	var numDeleted int64
	if first, last := fs.state.FirstSeq, fs.state.LastSeq; last > first {
		if nd := int64(last-first+1) - int64(fs.state.Msgs); nd > 0 {
			numDeleted = nd
		}
	}

	// Header first, then the fixed set of counters.
	var hdr [1024]byte
	hdr[0], hdr[1] = streamStateMagic, streamStateVersion
	b := hdr[:hdrLen]
	for _, v := range [...]uint64{
		fs.state.Msgs,
		fs.state.Bytes,
		fs.state.FirstSeq,
		fs.state.LastSeq,
		failed,
		uint64(numDeleted),
	} {
		b = binary.AppendUvarint(b, v)
	}

	if numDeleted <= 0 {
		return b, nil
	}

	// Delete blocks need all message blocks at least read locked.
	var scratch [4 * 1024]byte
	fs.readLockAllMsgBlocks()
	defer fs.readUnlockAllMsgBlocks()

	for _, db := range fs.deleteBlocks() {
		switch db := db.(type) {
		case *DeleteRange:
			// Run length form: magic byte, first sequence, number of sequences.
			first, _, num := db.State()
			rl := append(scratch[:0], runLengthMagic)
			rl = binary.AppendUvarint(rl, first)
			rl = binary.AppendUvarint(rl, num)
			b = append(b, rl...)
		case *avl.SequenceSet:
			enc, err := db.Encode(scratch[:0])
			if err != nil {
				return nil, err
			}
			b = append(b, enc...)
		default:
			return nil, errors.New("no impl")
		}
	}

	return b, nil
}

// deleteBlocks collects the delete state of the stream in block order: a
// DeleteRange for every sequence gap between two consecutive blocks, and the
// delete map of every block that has interior deletes.
// We used to be more sophisticated to save memory, but speed is more important.
// All blocks should be at least read locked.
func (fs *fileStore) deleteBlocks() DeleteBlocks {
	var (
		dbs  DeleteBlocks
		prev uint64
	)
	for _, mb := range fs.blks {
		// A gap between the previous block's last and this block's first is a deleted range.
		if first := mb.first.seq; prev > 0 && first != prev+1 {
			dbs = append(dbs, &DeleteRange{First: prev + 1, Num: first - prev - 1})
		}
		if mb.dmap.Size() > 0 {
			dbs = append(dbs, &mb.dmap)
		}
		prev = mb.last.seq
	}
	return dbs
}

// SyncDeleted will make sure this stream has same deleted state as dbs.
// Each incoming delete block is compared by position against our own delete
// blocks; those that have no counterpart or differ in first, last or count have
// every one of their sequences removed from the store.
func (fs *fileStore) SyncDeleted(dbs DeleteBlocks) {
	if len(dbs) == 0 {
		return
	}

	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Figure out which of the incoming blocks we do not already match.
	var pending DeleteBlocks

	fs.readLockAllMsgBlocks()
	ours := fs.deleteBlocks()
	for i, db := range dbs {
		differs := i >= len(ours)
		if !differs {
			first, last, num := db.State()
			ofirst, olast, onum := ours[i].State()
			differs = first != ofirst || last != olast || num != onum
		}
		if differs {
			pending = append(pending, db)
		}
	}
	fs.readUnlockAllMsgBlocks()

	// Apply the deletes for everything that did not match.
	for _, db := range pending {
		db.Range(func(dseq uint64) bool {
			fs.removeMsg(dseq, false, true, false)
			return true
		})
	}
}

////////////////////////////////////////////////////////////////////////////////
// Consumers
////////////////////////////////////////////////////////////////////////////////

// consumerFileStore is the file backed store for a single consumer of a
// file based stream. It is created by fileStore.ConsumerStore.
type consumerFileStore struct {
	// mu guards the fields below.
	mu      sync.Mutex
	// fs is the parent stream store.
	fs      *fileStore
	// cfg is the consumer info that gets written out as the consumer meta data.
	cfg     *FileConsumerInfo
	// prf is the key generator taken from the parent store, non-nil when encryption is expected.
	prf     keyGen
	// aek is the AEAD recovered from the consumer key file when encryption is in use.
	aek     cipher.AEAD
	// name is the consumer name.
	name    string
	// odir is the consumer's directory under the stream's store directory.
	odir    string
	// ifn is the full path of the consumer state file inside odir.
	ifn     string
	// hh is the hasher keyed from the stream and consumer name.
	hh      hash.Hash64
	// state is the in-memory consumer state.
	state   ConsumerState
	// fch is used to kick the flush loop, qch tells it to quit.
	fch     chan struct{}
	qch     chan struct{}
	// flusher is true while flushLoop is running.
	flusher bool
	// NOTE(review): writing is not used in the code visible here - presumably set while a state write is in flight, confirm.
	writing bool
	// dirty is set whenever the flusher gets kicked.
	dirty   bool
	// closed makes the flush loop exit once set.
	closed  bool
}

// ConsumerStore creates the store for the consumer with the given name and
// config and registers it with this stream store.
//
// When cfg.MemoryStorage is set a memory based consumer store is returned
// instead. Otherwise the consumer directory is created if needed, encryption
// keys are recovered when a key file exists (attempting a cipher conversion if
// they can not be opened with the configured cipher), meta data is written if
// not present, the flush loop is started and existing state is loaded.
//
// Returns an error for a nil store, a closed store (ErrStoreClosed), a nil cfg
// or empty name, and for any failure creating the directory, hash, keys or
// meta data.
func (fs *fileStore) ConsumerStore(name string, cfg *ConsumerConfig) (ConsumerStore, error) {
	if fs == nil {
		return nil, fmt.Errorf("filestore is nil")
	}
	if fs.isClosed() {
		return nil, ErrStoreClosed
	}
	if cfg == nil || name == _EMPTY_ {
		return nil, fmt.Errorf("bad consumer config")
	}

	// We now allow overrides from a stream being a filestore type and forcing a consumer to be memory store.
	if cfg.MemoryStorage {
		// Create directly here.
		o := &consumerMemStore{ms: fs, cfg: *cfg}
		fs.AddConsumer(o)
		return o, nil
	}

	odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, name)
	if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create consumer directory - %v", err)
	}
	csi := &FileConsumerInfo{Name: name, Created: time.Now().UTC(), ConsumerConfig: *cfg}
	o := &consumerFileStore{
		fs:   fs,
		cfg:  csi,
		prf:  fs.prf,
		name: name,
		odir: odir,
		ifn:  filepath.Join(odir, consumerState),
	}
	// The hash is keyed from the stream and consumer name.
	key := sha256.Sum256([]byte(fs.cfg.Name + "/" + name))
	hh, err := highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}
	o.hh = hh

	// Check for encryption.
	if o.prf != nil {
		// Keys are only recovered if a key file already exists and can be read.
		if ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey)); err == nil {
			if len(ekey) < minBlkKeySize {
				return nil, errBadKeySize
			}
			// Recover key encryption key.
			rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
			if err != nil {
				return nil, err
			}

			sc := fs.fcfg.Cipher
			kek, err := genEncryptionKey(sc, rb)
			if err != nil {
				return nil, err
			}
			// The key file holds the nonce followed by the encrypted seed.
			ns := kek.NonceSize()
			nonce := ekey[:ns]
			seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
			if err != nil {
				// We may be here on a cipher conversion, so attempt to convert.
				if err = o.convertCipher(); err != nil {
					return nil, err
				}
			} else {
				o.aek, err = genEncryptionKey(sc, seed)
			}
			// Catches a failure to generate the key from the recovered seed.
			if err != nil {
				return nil, err
			}
		}
	}

	// Track if we are creating the directory so that we can clean up if we encounter an error.
	var didCreate bool

	// Write our meta data iff does not exist.
	meta := filepath.Join(odir, JetStreamMetaFile)
	if _, err := os.Stat(meta); err != nil && os.IsNotExist(err) {
		didCreate = true
		csi.Created = time.Now().UTC()
		if err := o.writeConsumerMeta(); err != nil {
			os.RemoveAll(odir)
			return nil, err
		}
	}

	// If we expect to be encrypted check that what we are restoring is not plaintext.
	// This can happen on snapshot restores or conversions.
	if o.prf != nil {
		keyFile := filepath.Join(odir, JetStreamMetaFileKey)
		if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
			// No key file yet, so write the meta data again.
			// NOTE(review): presumably writeConsumerMeta generates the keys and key file - confirm.
			if err := o.writeConsumerMeta(); err != nil {
				if didCreate {
					os.RemoveAll(odir)
				}
				return nil, err
			}
			// Redo the state file as well here if we have one and we can tell it was plaintext.
			if buf, err := os.ReadFile(o.ifn); err == nil {
				// If it decodes as is, it was not encrypted.
				if _, err := decodeConsumerState(buf); err == nil {
					if err := os.WriteFile(o.ifn, o.encryptState(buf), defaultFilePerms); err != nil {
						if didCreate {
							os.RemoveAll(odir)
						}
						return nil, err
					}
				}
			}
		}
	}

	// Create channels to control our flush go routine.
	o.fch = make(chan struct{}, 1)
	o.qch = make(chan struct{})
	go o.flushLoop(o.fch, o.qch)

	// Make sure to load in our state from disk if needed.
	o.loadState()

	// Assign to filestore.
	fs.AddConsumer(o)

	return o, nil
}

// convertCipher re-encrypts the consumer's keys and state when the configured
// cipher differs from the one the files on disk were written with.
//
// It opens the key file and the state file with the cipher that is NOT the
// configured one, and if that succeeds writes the meta data again and then
// writes the recovered state back out.
//
// Returns an error if the key or state file can not be read, is too short, or
// can not be opened with the other cipher, or if writing the meta data or
// state fails.
func (o *consumerFileStore) convertCipher() error {
	fs := o.fs
	odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, o.name)

	ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey))
	if err != nil {
		return err
	}
	if len(ekey) < minBlkKeySize {
		return errBadKeySize
	}
	// Recover key encryption key.
	rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
	if err != nil {
		return err
	}

	// Do these in reverse since converting.
	sc := fs.fcfg.Cipher
	osc := AES
	if sc == AES {
		osc = ChaCha
	}
	kek, err := genEncryptionKey(osc, rb)
	if err != nil {
		return err
	}
	// The key file holds the nonce followed by the encrypted seed.
	ns := kek.NonceSize()
	nonce := ekey[:ns]
	seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
	if err != nil {
		return err
	}
	aek, err := genEncryptionKey(osc, seed)
	if err != nil {
		return err
	}
	// Now read in and decode our state using the old cipher.
	buf, err := os.ReadFile(o.ifn)
	if err != nil {
		return err
	}
	// A truncated state file can not even hold the nonce. Report that as an
	// error instead of panicking on the slice expressions below.
	if len(buf) < ns {
		return fmt.Errorf("consumer state file too short: %d bytes", len(buf))
	}
	buf, err = aek.Open(nil, buf[:ns], buf[ns:], nil)
	if err != nil {
		return err
	}

	// Since we are here we recovered our old state.
	// Now write our meta, which will generate the new keys with the new cipher.
	if err := o.writeConsumerMeta(); err != nil {
		return err
	}

	// Now write out our state with the new cipher.
	return o.writeState(buf)
}

// kickFlusher nudges this consumer's flush loop without blocking, the kick is
// dropped if one is already pending, and marks the store as dirty.
// Lock should be held.
func (o *consumerFileStore) kickFlusher() {
	if fch := o.fch; fch != nil {
		select {
		case fch <- struct{}{}:
		default:
			// A kick is already queued.
		}
	}
	o.dirty = true
}

// setInFlusher records that the flush loop is running.
func (o *consumerFileStore) setInFlusher() {
	o.mu.Lock()
	defer o.mu.Unlock()
	o.flusher = true
}

// clearInFlusher records that the flush loop is no longer running.
func (o *consumerFileStore) clearInFlusher() {
	o.mu.Lock()
	defer o.mu.Unlock()
	o.flusher = false
}

// inFlusher reports whether the flush loop is currently running.
func (o *consumerFileStore) inFlusher() bool {
	o.mu.Lock()
	running := o.flusher
	o.mu.Unlock()
	return running
}

// flushLoop watches for consumer updates and the quit channel.
// Each kick on fch encodes the current state and writes it out, with writes
// spaced at least minTime apart. The loop exits when qch fires, when the store
// is found closed, or when encoding the state fails. The in-flusher status is
// set for the lifetime of the loop.
func (o *consumerFileStore) flushLoop(fch, qch chan struct{}) {

	o.setInFlusher()
	defer o.clearInFlusher()

	// Maintain approximately 10 updates per second per consumer under load.
	const minTime = 100 * time.Millisecond
	var lastWrite time.Time
	var dt *time.Timer

	// setDelayTimer arms the single reusable delay timer, creating it on first use.
	setDelayTimer := func(addWait time.Duration) {
		if dt == nil {
			dt = time.NewTimer(addWait)
			return
		}
		// If the timer already fired drain its channel without blocking before resetting.
		if !dt.Stop() {
			select {
			case <-dt.C:
			default:
			}
		}
		dt.Reset(addWait)
	}

	for {
		select {
		case <-fch:
			// Too soon since the last successful write, wait out the remainder unless we are told to quit.
			if ts := time.Since(lastWrite); ts < minTime {
				setDelayTimer(minTime - ts)
				select {
				case <-dt.C:
				case <-qch:
					return
				}
			}
			o.mu.Lock()
			if o.closed {
				o.mu.Unlock()
				return
			}
			buf, err := o.encodeState()
			o.mu.Unlock()
			if err != nil {
				return
			}
			// TODO(dlc) - if we error should start failing upwards.
			// lastWrite only advances on success, so a failed write is not rate limited.
			if err := o.writeState(buf); err == nil {
				lastWrite = time.Now()
			}
		case <-qch:
			return
		}
	}
}

// SetStarting sets our starting stream sequence, then encodes the state and
// writes it out. The write happens after the lock has been released.
func (o *consumerFileStore) SetStarting(sseq uint64) error {
	o.mu.Lock()
	o.state.Delivered.Stream = sseq
	encoded, err := o.encodeState()
	o.mu.Unlock()

	if err == nil {
		err = o.writeState(encoded)
	}
	return err
}

// HasState reports whether a state file exists on disk for this consumer.
// Any stat failure is treated as "no state".
func (o *consumerFileStore) HasState() bool {
	o.mu.Lock()
	defer o.mu.Unlock()
	if _, err := os.Stat(o.ifn); err != nil {
		return false
	}
	return true
}

// UpdateDelivered is called whenever a new message has been delivered.
// dseq is the consumer delivery sequence, sseq the stream sequence, dc the
// delivery count for this message and ts the timestamp stored with the
// pending record.
//
// Returns ErrNoAckPolicy when a redelivery (dc != 1) is reported for an
// AckNone consumer. Deliveries at or below the consumer ack floor are ignored.
// On success the state is marked dirty and the flusher is kicked.
func (o *consumerFileStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	if dc != 1 && o.cfg.AckPolicy == AckNone {
		return ErrNoAckPolicy
	}

	// On restarts the old leader may get a replay from the raft logs that are old.
	if dseq <= o.state.AckFloor.Consumer {
		return nil
	}

	// See if we expect an ack for this.
	if o.cfg.AckPolicy != AckNone {
		// Need to create pending records here.
		if o.state.Pending == nil {
			o.state.Pending = make(map[uint64]*Pending)
		}
		var p *Pending
		// Check for an update to a message already delivered.
		if sseq <= o.state.Delivered.Stream {
			// Only refresh an existing record, an already acked message is not re-added.
			if p = o.state.Pending[sseq]; p != nil {
				p.Sequence, p.Timestamp = dseq, ts
			}
		} else {
			// Add to pending.
			o.state.Pending[sseq] = &Pending{dseq, ts}
		}
		// Update delivered as needed.
		if dseq > o.state.Delivered.Consumer {
			o.state.Delivered.Consumer = dseq
		}
		if sseq > o.state.Delivered.Stream {
			o.state.Delivered.Stream = sseq
		}

		if dc > 1 {
			// A MaxDeliver of zero or less disables this limit check.
			if maxdc := uint64(o.cfg.MaxDeliver); maxdc > 0 && dc > maxdc {
				// Make sure to remove from pending.
				delete(o.state.Pending, sseq)
			}
			if o.state.Redelivered == nil {
				o.state.Redelivered = make(map[uint64]uint64)
			}
			// Only update if greater then what we already have.
			// The stored value is the number of redeliveries, i.e. dc-1.
			if o.state.Redelivered[sseq] < dc-1 {
				o.state.Redelivered[sseq] = dc - 1
			}
		}
	} else {
		// For AckNone just update delivered and ackfloor at the same time.
		if dseq > o.state.Delivered.Consumer {
			o.state.Delivered.Consumer = dseq
			o.state.AckFloor.Consumer = dseq
		}
		if sseq > o.state.Delivered.Stream {
			o.state.Delivered.Stream = sseq
			o.state.AckFloor.Stream = sseq
		}
	}
	// Make sure we flush to disk.
	o.kickFlusher()

	return nil
}

// UpdateAcks is called whenever a consumer with explicit ack or ack all acks a message.
// dseq is the consumer delivery sequence and sseq the stream sequence being acked.
//
// Returns ErrNoAckPolicy for AckNone consumers and ErrStoreMsgNotFound when
// sseq has no pending record. Acks at or below the consumer ack floor are
// ignored. On success the state is marked dirty and the flusher is kicked.
func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	if o.cfg.AckPolicy == AckNone {
		return ErrNoAckPolicy
	}

	// On restarts the old leader may get a replay from the raft logs that are old.
	if dseq <= o.state.AckFloor.Consumer {
		return nil
	}

	if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil {
		return ErrStoreMsgNotFound
	}

	// Check for AckAll here.
	if o.cfg.AckPolicy == AckAll {
		floor := o.state.AckFloor.Stream
		sgap := sseq - floor
		o.state.AckFloor.Consumer = dseq
		o.state.AckFloor.Stream = sseq
		if sgap > uint64(len(o.state.Pending)+len(o.state.Redelivered)) {
			// The gap is larger than what we track, so walking the maps is
			// cheaper than stepping through every sequence in the gap while
			// holding the lock. The same keys, those in (floor, sseq], are removed.
			for seq := range o.state.Pending {
				if seq > floor && seq <= sseq {
					delete(o.state.Pending, seq)
				}
			}
			for seq := range o.state.Redelivered {
				if seq > floor && seq <= sseq {
					delete(o.state.Redelivered, seq)
				}
			}
		} else {
			for seq := sseq; seq > sseq-sgap; seq-- {
				delete(o.state.Pending, seq)
				if len(o.state.Redelivered) > 0 {
					delete(o.state.Redelivered, seq)
				}
			}
		}
		o.kickFlusher()
		return nil
	}

	// AckExplicit

	// First delete from our pending state.
	if p, ok := o.state.Pending[sseq]; ok {
		delete(o.state.Pending, sseq)
		dseq = p.Sequence // Use the original.
	}
	if len(o.state.Pending) == 0 {
		// Nothing outstanding, the floor catches up with delivered.
		o.state.AckFloor.Consumer = o.state.Delivered.Consumer
		o.state.AckFloor.Stream = o.state.Delivered.Stream
	} else if dseq == o.state.AckFloor.Consumer+1 {
		o.state.AckFloor.Consumer = dseq
		o.state.AckFloor.Stream = sseq

		if o.state.Delivered.Consumer > dseq {
			// Advance the floor to just below the next entry that is still pending.
			for ss := sseq + 1; ss <= o.state.Delivered.Stream; ss++ {
				if p, ok := o.state.Pending[ss]; ok {
					if p.Sequence > 0 {
						o.state.AckFloor.Consumer = p.Sequence - 1
						o.state.AckFloor.Stream = ss - 1
					}
					break
				}
			}
		}
	}
	// We do these regardless.
	delete(o.state.Redelivered, sseq)

	o.kickFlusher()
	return nil
}

// seqsHdrSize is hdrLen plus room for six maximum-length varints.
// NOTE(review): presumably sizes the fixed leading portion of the encoded
// consumer state, confirm against encodeConsumerState.
const seqsHdrSize = 6*binary.MaxVarintLen64 + hdrLen

// EncodedState returns the consumer state in its encoded form.
// It takes the store lock itself and delegates to encodeState.
func (o *consumerFileStore) EncodedState() ([]byte, error) {
	o.mu.Lock()
	defer o.mu.Unlock()
	return o.encodeState()
}

// encodeState encodes the current consumer state.
// The state is fetched through stateWithCopyLocked rather than read from
// o.state directly, so it gets loaded from disk first when needed.
// Lock should be held.
func (o *consumerFileStore) encodeState() ([]byte, error) {
	state, err := o.stateWithCopyLocked(false)
	if err == nil {
		return encodeConsumerState(state), nil
	}
	return nil, err
}

// UpdateConfig replaces the stored consumer configuration with *cfg and
// rewrites the consumer meta file.
func (o *consumerFileStore) UpdateConfig(cfg *ConsumerConfig) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	// Mostly unchecked here, the upper layers are assumed to have validated cfg.
	// o.cfg is a pointer, so this updates the store's own config in place.
	o.cfg.ConsumerConfig = *cfg

	return o.writeConsumerMeta()
}

// Update replaces the in-memory consumer state with a deep copy of state and
// kicks the flusher. Updates that would move the delivered consumer sequence
// or the stream ack floor backwards are silently ignored. An error is returned
// when an ack floor exceeds its delivered counterpart or when a pending entry
// lies outside (AckFloor.Stream, Delivered.Stream].
func (o *consumerFileStore) Update(state *ConsumerState) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	// Drop outdated updates.
	switch {
	case state.Delivered.Consumer < o.state.Delivered.Consumer:
		return nil
	case state.AckFloor.Stream < o.state.AckFloor.Stream:
		return nil
	}

	// Sanity checks.
	switch {
	case state.AckFloor.Consumer > state.Delivered.Consumer:
		return fmt.Errorf("bad ack floor for consumer")
	case state.AckFloor.Stream > state.Delivered.Stream:
		return fmt.Errorf("bad ack floor for stream")
	}

	// Build private copies first so our state is untouched if validation fails.
	var (
		pending     map[uint64]*Pending
		redelivered map[uint64]uint64
	)
	if n := len(state.Pending); n > 0 {
		pending = make(map[uint64]*Pending, n)
		floor, last := state.AckFloor.Stream, state.Delivered.Stream
		for seq, p := range state.Pending {
			pending[seq] = &Pending{Sequence: p.Sequence, Timestamp: p.Timestamp}
			if seq <= floor || seq > last {
				return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq)
			}
		}
	}
	if n := len(state.Redelivered); n > 0 {
		redelivered = make(map[uint64]uint64, n)
		for seq, count := range state.Redelivered {
			redelivered[seq] = count
		}
	}

	o.state.Delivered, o.state.AckFloor = state.Delivered, state.AckFloor
	o.state.Pending, o.state.Redelivered = pending, redelivered

	o.kickFlusher()

	return nil
}

// encryptState seals buf with the asset key and returns nonce||ciphertext.
// When no asset key is configured buf is returned untouched.
// Lock should be held.
func (o *consumerFileStore) encryptState(buf []byte) []byte {
	aek := o.aek
	if aek == nil {
		return buf
	}
	// TODO(dlc) - Optimize on space usage a bit?
	// Reserve capacity for the whole output so Seal can append in place.
	ns := aek.NonceSize()
	sealed := make([]byte, ns, ns+len(buf)+aek.Overhead())
	rand.Read(sealed)
	return aek.Seal(sealed, sealed, buf, nil)
}

// dios bounds the number of disk IO calls in flight, since each of them could
// be blocking an OS thread. A caller takes a token by receiving from the
// channel and hands it back by sending.
// https://github.com/nats-io/nats-server/issues/2742
var dios chan struct{}

// init builds the counting semaphore as a buffered channel pre-filled with
// tokens. golang.org's semaphore seemed a bit heavy.
func init() {
	// Allow at most 4 blocking IO calls at any one time.
	const maxBlockingIO = 4
	dios = make(chan struct{}, maxBlockingIO)
	// Start out with every token available.
	for len(dios) < cap(dios) {
		dios <- struct{}{}
	}
}

// writeState writes an encoded state buffer to the consumer's state file,
// encrypting it first when an asset key is set. It returns nil without doing
// anything when buf is empty or another write is already in progress.
// The store lock is taken internally and released around the disk write.
func (o *consumerFileStore) writeState(buf []byte) error {
	o.mu.Lock()
	if len(buf) == 0 || o.writing {
		o.mu.Unlock()
		return nil
	}

	// Check on encryption.
	if o.aek != nil {
		buf = o.encryptState(buf)
	}

	path := o.ifn
	o.writing, o.dirty = true, false
	o.mu.Unlock()

	// Lock not held here but we do limit number of outstanding calls that could block OS threads.
	<-dios
	err := os.WriteFile(path, buf, defaultFilePerms)
	dios <- struct{}{}

	o.mu.Lock()
	o.writing = false
	if err != nil {
		// The write did not land, so the state still needs flushing.
		o.dirty = true
	}
	o.mu.Unlock()

	return err
}

// updateConfig replaces the stored config with a fresh FileConsumerInfo built
// from cfg and rewrites the consumer meta file.
// Only used when recovering ephemerals.
func (o *consumerFileStore) updateConfig(cfg ConsumerConfig) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	info := &FileConsumerInfo{ConsumerConfig: cfg}
	o.cfg = info
	return o.writeConsumerMeta()
}

// writeConsumerMeta writes the consumer configuration (o.cfg) as JSON to the
// meta file in the consumer directory, followed by a checksum file for it.
// When a prf is configured and no asset key exists yet, encryption keys are
// generated and the encrypted key is written to the key file first. The meta
// contents are sealed whenever an asset key is present.
// Lock should be held.
func (o *consumerFileStore) writeConsumerMeta() error {
	metaFile := filepath.Join(o.odir, JetStreamMetaFile)
	if _, err := os.Stat(metaFile); err != nil && !os.IsNotExist(err) {
		return err
	}

	// Generate our keys on first use when encryption is enabled.
	if o.prf != nil && o.aek == nil {
		key, _, _, encrypted, err := o.fs.genEncryptionKeys(o.fs.cfg.Name + tsep + o.name)
		if err != nil {
			return err
		}
		o.aek = key
		keyPath := filepath.Join(o.odir, JetStreamMetaFileKey)
		if _, err := os.Stat(keyPath); err != nil && !os.IsNotExist(err) {
			return err
		}
		if err := os.WriteFile(keyPath, encrypted, defaultFilePerms); err != nil {
			return err
		}
	}

	payload, err := json.Marshal(o.cfg)
	if err != nil {
		return err
	}
	// Encrypt if needed, the nonce is stored as a prefix of the sealed data.
	if aek := o.aek; aek != nil {
		ns := aek.NonceSize()
		nonce := make([]byte, ns, ns+len(payload)+aek.Overhead())
		rand.Read(nonce)
		payload = aek.Seal(nonce, nonce, payload, nil)
	}

	if err := os.WriteFile(metaFile, payload, defaultFilePerms); err != nil {
		return err
	}

	// The checksum covers exactly the bytes written to the meta file.
	o.hh.Reset()
	o.hh.Write(payload)
	digest := hex.EncodeToString(o.hh.Sum(nil))
	sumFile := filepath.Join(o.odir, JetStreamMetaFileSum)
	return os.WriteFile(sumFile, []byte(digest), defaultFilePerms)
}

// checkConsumerHeader validates the leading bytes of an encoded consumer state
// and returns its version. It returns errCorruptState when hdr has fewer than
// two bytes or does not start with the magic byte, and an "unsupported version"
// error for any version other than 1 or 2.
func checkConsumerHeader(hdr []byte) (uint8, error) {
	// A nil slice has length zero, so the length test covers it as well.
	if len(hdr) < 2 || hdr[0] != magic {
		return 0, errCorruptState
	}
	version := hdr[1]
	if version == 1 || version == 2 {
		return version, nil
	}
	return 0, fmt.Errorf("unsupported version: %d", version)
}

// copyPending returns a deep copy of the pending map. The result is always
// non-nil, even when there is nothing pending.
func (o *consumerFileStore) copyPending() map[uint64]*Pending {
	src := o.state.Pending
	dup := make(map[uint64]*Pending, len(src))
	for seq, entry := range src {
		dup[seq] = &Pending{Sequence: entry.Sequence, Timestamp: entry.Timestamp}
	}
	return dup
}

// copyRedelivered returns a copy of the redelivered map. The result is always
// non-nil, even when nothing has been redelivered.
func (o *consumerFileStore) copyRedelivered() map[uint64]uint64 {
	src := o.state.Redelivered
	dup := make(map[uint64]uint64, len(src))
	for seq, count := range src {
		dup[seq] = count
	}
	return dup
}

// Type reports the storage type of the underlying store, always FileStorage.
func (o *consumerFileStore) Type() StorageType {
	return FileStorage
}

// State returns the consumer state with its own copies of the pending and
// redelivered maps, loading from the state file when nothing is held in memory.
// This is not expected to be called in high performance code, only on startup.
func (o *consumerFileStore) State() (*ConsumerState, error) {
	o.mu.Lock()
	defer o.mu.Unlock()
	return o.stateWithCopyLocked(true)
}

// BorrowState returns the consumer state without copying pending or
// redelivered, so the maps are shared with the store. It should only be used
// under the consumer owner's lock.
func (o *consumerFileStore) BorrowState() (*ConsumerState, error) {
	o.mu.Lock()
	defer o.mu.Unlock()
	return o.stateWithCopyLocked(false)
}

// stateWithCopy takes the store lock and returns the consumer state, copying
// the pending and redelivered maps only when doCopy is set.
func (o *consumerFileStore) stateWithCopy(doCopy bool) (*ConsumerState, error) {
	o.mu.Lock()
	defer o.mu.Unlock()
	state, err := o.stateWithCopyLocked(doCopy)
	return state, err
}

// stateWithCopyLocked returns the consumer state. When a running state is held
// in memory (either delivered sequence is non-zero) it is returned directly,
// otherwise the state file is read, decrypted if an asset key is set, decoded
// and installed as the in-memory state. A missing or empty state file yields an
// empty state.
//
// With doCopy set, the returned state does not share the pending/redelivered
// maps with the store; without it the maps are shared.
//
// Returns ErrStoreClosed when the store is closed, errCorruptState when an
// encrypted state file is too short to hold a nonce, and any read, decrypt or
// decode error otherwise.
// Lock should be held.
func (o *consumerFileStore) stateWithCopyLocked(doCopy bool) (*ConsumerState, error) {
	if o.closed {
		return nil, ErrStoreClosed
	}

	state := &ConsumerState{}

	// See if we have a running state or if we need to read in from disk.
	if o.state.Delivered.Consumer != 0 || o.state.Delivered.Stream != 0 {
		state.Delivered = o.state.Delivered
		state.AckFloor = o.state.AckFloor
		if len(o.state.Pending) > 0 {
			if doCopy {
				state.Pending = o.copyPending()
			} else {
				state.Pending = o.state.Pending
			}
		}
		if len(o.state.Redelivered) > 0 {
			if doCopy {
				state.Redelivered = o.copyRedelivered()
			} else {
				state.Redelivered = o.state.Redelivered
			}
		}
		return state, nil
	}

	// Read the state in here from disk..
	buf, err := os.ReadFile(o.ifn)
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}

	if len(buf) == 0 {
		return state, nil
	}

	// Check on encryption.
	if o.aek != nil {
		ns := o.aek.NonceSize()
		// A truncated file that cannot even hold the nonce is corrupt. Without
		// this check the slicing below would panic.
		if len(buf) < ns {
			return nil, errCorruptState
		}
		buf, err = o.aek.Open(nil, buf[:ns], buf[ns:], nil)
		if err != nil {
			return nil, err
		}
	}

	state, err = decodeConsumerState(buf)
	if err != nil {
		return nil, err
	}

	// Copy this state into our own.
	o.state.Delivered = state.Delivered
	o.state.AckFloor = state.AckFloor
	if len(state.Pending) > 0 {
		if doCopy {
			o.state.Pending = make(map[uint64]*Pending, len(state.Pending))
			for seq, p := range state.Pending {
				o.state.Pending[seq] = &Pending{p.Sequence, p.Timestamp}
			}
		} else {
			o.state.Pending = state.Pending
		}
	}
	if len(state.Redelivered) > 0 {
		if doCopy {
			o.state.Redelivered = make(map[uint64]uint64, len(state.Redelivered))
			for seq, dc := range state.Redelivered {
				o.state.Redelivered[seq] = dc
			}
		} else {
			o.state.Redelivered = state.Redelivered
		}
	}

	return state, nil
}

// loadState pulls the consumer state in from disk when a state file exists.
// Load errors are deliberately ignored here.
// Lock should be held. Called at startup.
func (o *consumerFileStore) loadState() {
	if _, err := os.Stat(o.ifn); err != nil {
		return
	}
	// This will load our state in from disk.
	o.stateWithCopyLocked(false)
}

// decodeConsumerState decodes an encoded consumer state, version 1 or 2.
//
// After the header the buffer holds four uvarints (ack floor consumer/stream,
// delivered consumer/stream), then an optional pending section (count, base
// timestamp, entries) and an optional redelivered section (count, entries).
// Pending and redelivered stream sequences are stored relative to the stream
// ack floor, and version 2 additionally stores each pending entry's delivery
// sequence relative to the consumer ack floor.
//
// Returns errCorruptState when the header or sequences cannot be read, or when
// the pending section is truncated, claims more entries than the remaining
// bytes could hold, or yields a zero stream sequence. Unreadable or zero
// redelivered entries are skipped rather than treated as an error.
func decodeConsumerState(buf []byte) (*ConsumerState, error) {
	version, err := checkConsumerHeader(buf)
	if err != nil {
		return nil, err
	}
	// Make sure slicing at hdrLen below cannot go out of range.
	if len(buf) < hdrLen {
		return nil, errCorruptState
	}

	bi := hdrLen
	// Helpers, will set bi to -1 on error.
	readSeq := func() uint64 {
		if bi < 0 {
			return 0
		}
		seq, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		return seq
	}
	readTimeStamp := func() int64 {
		if bi < 0 {
			return 0
		}
		ts, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return ts
	}
	// Just for clarity below.
	readLen := readSeq
	readCount := readSeq

	state := &ConsumerState{}
	state.AckFloor.Consumer = readSeq()
	state.AckFloor.Stream = readSeq()
	state.Delivered.Consumer = readSeq()
	state.Delivered.Stream = readSeq()

	if bi == -1 {
		return nil, errCorruptState
	}
	if version == 1 {
		// Adjust back. Version 1 also stored delivered as next to be delivered,
		// so adjust that back down here.
		if state.AckFloor.Consumer > 1 {
			state.Delivered.Consumer += state.AckFloor.Consumer - 1
		}
		if state.AckFloor.Stream > 1 {
			state.Delivered.Stream += state.AckFloor.Stream - 1
		}
	}

	// We have additional stuff.
	if numPending := readLen(); numPending > 0 {
		mints := readTimeStamp()
		// Every pending entry takes at least one byte, so a count beyond the
		// bytes that are left (or a failed read) can only be corruption.
		// Reject it before sizing the map from an untrusted count.
		if bi < 0 || numPending > uint64(len(buf)-bi) {
			return nil, errCorruptState
		}
		state.Pending = make(map[uint64]*Pending, numPending)
		for i := 0; i < int(numPending); i++ {
			sseq := readSeq()
			var dseq uint64
			if version == 2 {
				dseq = readSeq()
			}
			ts := readTimeStamp()
			// Check the state machine for corruption, not the value which could be -1.
			if bi == -1 {
				return nil, errCorruptState
			}
			// Adjust seq back.
			sseq += state.AckFloor.Stream
			if sseq == 0 {
				return nil, errCorruptState
			}
			if version == 2 {
				dseq += state.AckFloor.Consumer
			}
			// Adjust the timestamp back.
			if version == 1 {
				ts = (ts + mints) * int64(time.Second)
			} else {
				ts = (mints - ts) * int64(time.Second)
			}
			// Store in pending.
			state.Pending[sseq] = &Pending{dseq, ts}
		}
	}

	// We have redelivered entries here.
	if numRedelivered := readLen(); numRedelivered > 0 {
		// Never size the map beyond what the remaining bytes could hold.
		// A non-zero count means the read succeeded, so bi is valid here.
		hint := numRedelivered
		if left := uint64(len(buf) - bi); hint > left {
			hint = left
		}
		state.Redelivered = make(map[uint64]uint64, hint)
		// Once a read fails every later read yields zero and stores nothing,
		// so stop early instead of spinning through a bogus count.
		for i := 0; i < int(numRedelivered) && bi >= 0; i++ {
			if seq, n := readSeq(), readCount(); seq > 0 && n > 0 {
				// Adjust seq back.
				seq += state.AckFloor.Stream
				state.Redelivered[seq] = n
			}
		}
	}

	return state, nil
}

// Stop the processing of the consumers's state.
// Closes the quit channel, marks the store closed, removes it from the parent
// file store and, if the state was dirty, writes the final state to disk.
// Calling Stop on an already closed store is a no-op returning nil.
// Returns the encode error or the error from the final write, if any.
func (o *consumerFileStore) Stop() error {
	o.mu.Lock()
	if o.closed {
		o.mu.Unlock()
		return nil
	}
	// Closing qch signals the flush loop to exit.
	if o.qch != nil {
		close(o.qch)
		o.qch = nil
	}

	var err error
	var buf []byte

	if o.dirty {
		// Make sure to write this out..
		// Encode (and encrypt) while still holding the lock and before closed
		// is set, since encoding fails on a closed store.
		if buf, err = o.encodeState(); err == nil && len(buf) > 0 {
			if o.aek != nil {
				buf = o.encryptState(buf)
			}
		}
	}

	o.odir = _EMPTY_
	o.closed = true
	// Capture what we need after the lock is released.
	ifn, fs := o.ifn, o.fs
	o.mu.Unlock()

	fs.RemoveConsumer(o)

	if len(buf) > 0 {
		// Give the flush loop a chance to exit before the final write.
		o.waitOnFlusher()
		// Take a disk IO token for the duration of the write.
		<-dios
		err = os.WriteFile(ifn, buf, defaultFilePerms)
		dios <- struct{}{}
	}
	return err
}

// waitOnFlusher blocks until the flush loop is no longer marked as running,
// polling every 10ms and giving up after roughly 100ms.
func (o *consumerFileStore) waitOnFlusher() {
	if !o.inFlusher() {
		return
	}

	const (
		maxWait  = 100 * time.Millisecond
		pollWait = 10 * time.Millisecond
	)
	for deadline := time.Now().Add(maxWait); time.Now().Before(deadline); time.Sleep(pollWait) {
		if !o.inFlusher() {
			return
		}
	}
}

// Delete closes the consumer store for the case where the owning stream
// itself is not being deleted.
func (o *consumerFileStore) Delete() error {
	const streamDeleted = false
	return o.delete(streamDeleted)
}

// StreamDelete closes the consumer store for the case where the owning stream
// is being deleted.
func (o *consumerFileStore) StreamDelete() error {
	const streamDeleted = true
	return o.delete(streamDeleted)
}

// delete closes the consumer store and signals the flush loop to quit.
// Unless the stream itself was deleted, it also removes the consumer directory
// from disk and detaches the store from its parent file store, returning any
// error from the directory removal. Deleting a closed store is a no-op.
func (o *consumerFileStore) delete(streamDeleted bool) error {
	o.mu.Lock()
	if o.closed {
		o.mu.Unlock()
		return nil
	}
	if o.qch != nil {
		close(o.qch)
		o.qch = nil
	}
	dir, parent := o.odir, o.fs
	o.odir, o.closed = _EMPTY_, true
	o.mu.Unlock()

	// When the stream was deleted there is nothing left for us to clean up.
	if streamDeleted {
		return nil
	}

	var err error
	if dir != _EMPTY_ {
		// Take a disk IO token for the duration of the removal.
		<-dios
		err = os.RemoveAll(dir)
		dios <- struct{}{}
	}
	parent.RemoveConsumer(o)

	return err
}

// AddConsumer appends o to the consumer stores tracked by this file store.
// It always returns nil.
func (fs *fileStore) AddConsumer(o ConsumerStore) error {
	fs.mu.Lock()
	fs.cfs = append(fs.cfs, o)
	fs.mu.Unlock()
	return nil
}

// RemoveConsumer drops the first occurrence of o from the consumer stores
// tracked by this file store, if present. It always returns nil.
func (fs *fileStore) RemoveConsumer(o ConsumerStore) error {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	for i := range fs.cfs {
		if fs.cfs[i] != o {
			continue
		}
		fs.cfs = append(fs.cfs[:i], fs.cfs[i+1:]...)
		break
	}
	return nil
}

////////////////////////////////////////////////////////////////////////////////
// Templates
////////////////////////////////////////////////////////////////////////////////

// templateFileStore persists stream templates on disk, one directory per
// template under dir.
type templateFileStore struct {
	// dir is the parent directory holding all template directories.
	dir string
	// hh is the hasher used to checksum the stored template meta.
	hh hash.Hash64
}

// newTemplateFileStore returns a template store rooted at the templates
// directory under storeDir. Its hasher is keyed with the SHA-256 of
// "templates". Returns nil when the hasher cannot be created.
func newTemplateFileStore(storeDir string) *templateFileStore {
	seed := sha256.Sum256([]byte("templates"))
	hasher, err := highwayhash.New64(seed[:])
	if err != nil {
		return nil
	}
	ts := &templateFileStore{hh: hasher}
	ts.dir = filepath.Join(storeDir, tmplsDir)
	return ts
}

// Store writes the template t as JSON to the meta file in its own directory
// (created as needed), followed by a checksum file. An existing meta file is
// left untouched and nil is returned.
func (ts *templateFileStore) Store(t *streamTemplate) error {
	tdir := filepath.Join(ts.dir, t.Name)
	if err := os.MkdirAll(tdir, defaultDirPerms); err != nil {
		return fmt.Errorf("could not create templates storage directory for %q- %v", t.Name, err)
	}

	metaFile := filepath.Join(tdir, JetStreamMetaFile)
	// Never overwrite an existing template (err is nil and we return nil), and
	// surface any stat failure other than the file not existing.
	if _, err := os.Stat(metaFile); err == nil || !os.IsNotExist(err) {
		return err
	}

	// Marshal under the template lock.
	t.mu.Lock()
	encoded, err := json.Marshal(t)
	t.mu.Unlock()
	if err != nil {
		return err
	}
	if err := os.WriteFile(metaFile, encoded, defaultFilePerms); err != nil {
		return err
	}

	// The checksum covers exactly the bytes written to the meta file.
	ts.hh.Reset()
	ts.hh.Write(encoded)
	digest := hex.EncodeToString(ts.hh.Sum(nil))
	sumFile := filepath.Join(tdir, JetStreamMetaFileSum)
	return os.WriteFile(sumFile, []byte(digest), defaultFilePerms)
}

// Delete removes the template's directory and everything in it.
func (ts *templateFileStore) Delete(t *streamTemplate) error {
	tdir := filepath.Join(ts.dir, t.Name)
	return os.RemoveAll(tdir)
}

////////////////////////////////////////////////////////////////////////////////
// Compression
////////////////////////////////////////////////////////////////////////////////

// CompressionInfo is the compression metadata serialized by MarshalMetadata
// and parsed by UnmarshalMetadata.
type CompressionInfo struct {
	// Algorithm is the compression algorithm, stored as a single byte.
	Algorithm StoreCompression
	// OriginalSize is stored as a uvarint.
	// NOTE(review): presumably the size before compression, confirm at the call sites.
	OriginalSize uint64
}

// MarshalMetadata serializes the compression info as the marker "cmp", one
// byte for the algorithm and the original size as a uvarint.
func (c *CompressionInfo) MarshalMetadata() []byte {
	// 3 marker bytes + 1 algorithm byte, then up to 10 bytes of uvarint.
	const prefixLen = 4
	out := make([]byte, prefixLen+binary.MaxVarintLen64)
	copy(out, "cmp")
	out[3] = byte(c.Algorithm)
	used := prefixLen + binary.PutUvarint(out[prefixLen:], c.OriginalSize)
	return out[:used]
}

// UnmarshalMetadata parses compression metadata from the start of b and
// returns the number of bytes it occupies. When b is too short or does not
// begin with the "cmp" marker, c is reset to NoCompression with a zero size
// and (0, nil) is returned. An unreadable size uvarint yields an error.
func (c *CompressionInfo) UnmarshalMetadata(b []byte) (int, error) {
	c.Algorithm, c.OriginalSize = NoCompression, 0

	// 3 marker bytes + 1 algorithm byte, followed by at least 1 uvarint byte.
	const prefixLen = 4
	if len(b) <= prefixLen || !bytes.HasPrefix(b, []byte("cmp")) {
		return 0, nil
	}

	c.Algorithm = StoreCompression(b[3])
	size, n := binary.Uvarint(b[prefixLen:])
	c.OriginalSize = size
	if n <= 0 {
		return 0, fmt.Errorf("metadata incomplete")
	}
	return prefixLen + n, nil
}

// Compress compresses the body of buf with the receiver's algorithm. The
// trailing checksumSize bytes are not compressed and are appended to the
// output as-is. With NoCompression buf itself is returned. An error is
// returned when buf is shorter than the checksum, the algorithm is unknown, or
// compression fails.
func (alg StoreCompression) Compress(buf []byte) ([]byte, error) {
	if len(buf) < checksumSize {
		return nil, fmt.Errorf("uncompressed buffer is too short")
	}
	if alg == NoCompression {
		return buf, nil
	}
	if alg != S2Compression {
		return nil, fmt.Errorf("compression algorithm not known")
	}

	split := len(buf) - checksumSize
	body, checksum := buf[:split], buf[split:]
	bodyLen := int64(split)

	var output bytes.Buffer
	writer := s2.NewWriter(&output)

	// Compress the block content, but don't compress the checksum.
	// We will preserve it at the end of the block as-is.
	copied, err := io.CopyN(writer, bytes.NewReader(body), bodyLen)
	switch {
	case err != nil:
		return nil, fmt.Errorf("error writing to compression writer: %w", err)
	case copied != bodyLen:
		return nil, fmt.Errorf("short write on body (%d != %d)", copied, bodyLen)
	}
	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("error closing compression writer: %w", err)
	}

	// Now add the checksum back onto the end of the block.
	written, err := output.Write(checksum)
	switch {
	case err != nil:
		return nil, fmt.Errorf("error writing checksum: %w", err)
	case written != checksumSize:
		return nil, fmt.Errorf("short write on checksum (%d != %d)", written, checksumSize)
	}

	return output.Bytes(), nil
}

// Decompress reverses Compress: the body of buf is decompressed with the
// receiver's algorithm and the trailing checksumSize bytes, which are stored
// uncompressed, are appended to the result as-is. With NoCompression buf
// itself is returned. An error is returned when buf is shorter than the
// checksum, the algorithm is unknown, or decompression fails.
func (alg StoreCompression) Decompress(buf []byte) ([]byte, error) {
	if len(buf) < checksumSize {
		return nil, fmt.Errorf("compressed buffer is too short")
	}
	switch alg {
	case NoCompression:
		return buf, nil
	case S2Compression:
		// Handled below.
	default:
		return nil, fmt.Errorf("compression algorithm not known")
	}

	split := len(buf) - checksumSize
	body, checksum := buf[:split], buf[split:]

	// Decompress the block content. The checksum isn't compressed so
	// we can preserve it from the end of the block as-is.
	plain, err := io.ReadAll(s2.NewReader(bytes.NewReader(body)))
	if err != nil {
		return nil, fmt.Errorf("error reading compression reader: %w", err)
	}
	return append(plain, checksum...), nil
}