Files
nats-server/server/filestore.go
Derek Collison dba03dbc2f Optimizations to reduce contention for high connections in a JetStream enabled account with high API usage.
Several strategies which are listed below.

1. Checking a RaftNode to see if it is the leader now uses atomics.
2. Checking if we are the JetStream meta leader from the server now uses an atomic.
3. Accessing the JetStream context no longer requires a server lock, uses atomic.Pointer.
4. Filestore syncBlocks would hold msgBlock locks during sync, now does not.

Signed-off-by: Derek Collison <derek@nats.io>
2023-09-30 14:52:15 -07:00

8592 lines
209 KiB
Go

// Copyright 2019-2023 The NATS Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package server
import (
"archive/tar"
"bytes"
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"crypto/sha256"
"encoding/binary"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"hash"
"io"
"math"
"net"
"os"
"path/filepath"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/klauspost/compress/s2"
"github.com/minio/highwayhash"
"github.com/nats-io/nats-server/v2/server/avl"
"golang.org/x/crypto/chacha20"
"golang.org/x/crypto/chacha20poly1305"
)
// FileStoreConfig holds the configuration for a file based stream store.
type FileStoreConfig struct {
	// Where the parent directory for all storage will be located.
	StoreDir string
	// BlockSize is the file block size. This also represents the maximum overhead size.
	BlockSize uint64
	// CacheExpire is how long with no activity until we expire the cache.
	CacheExpire time.Duration
	// SyncInterval is how often we sync to disk in the background.
	SyncInterval time.Duration
	// SyncAlways is when the stream should sync all data writes.
	SyncAlways bool
	// AsyncFlush allows async flush to batch write operations.
	AsyncFlush bool
	// Cipher is the cipher to use when encrypting.
	Cipher StoreCipher
	// Compression is the algorithm to use when compressing.
	Compression StoreCompression
	// Internal reference to our server.
	srv *Server
}
// FileStreamInfo allows us to remember the created time alongside the
// stream configuration when persisting stream metadata.
type FileStreamInfo struct {
	// Created is the time the stream was created.
	Created time.Time
	StreamConfig
}
// StoreCipher selects the cipher used for encryption at rest.
type StoreCipher int

const (
	// ChaCha selects ChaCha20-Poly1305.
	ChaCha StoreCipher = iota
	// AES selects AES-GCM.
	AES
	// NoCipher disables encryption.
	NoCipher
)

// String returns a human readable name for the cipher.
func (cipher StoreCipher) String() string {
	names := [...]string{"ChaCha20-Poly1305", "AES-GCM", "None"}
	if cipher < 0 || int(cipher) >= len(names) {
		return "Unknown StoreCipher"
	}
	return names[cipher]
}
// StoreCompression selects the algorithm used to compress message blocks.
type StoreCompression uint8

const (
	// NoCompression stores blocks uncompressed.
	NoCompression StoreCompression = iota
	// S2Compression uses the S2 (Snappy-compatible) algorithm.
	S2Compression
)

// String returns a human readable name for the compression algorithm.
func (alg StoreCompression) String() string {
	switch alg {
	case NoCompression:
		return "None"
	case S2Compression:
		return "S2"
	}
	return "Unknown StoreCompression"
}

// MarshalJSON encodes the algorithm as its lowercase string form.
func (alg StoreCompression) MarshalJSON() ([]byte, error) {
	var name string
	switch alg {
	case NoCompression:
		name = "none"
	case S2Compression:
		name = "s2"
	default:
		return nil, fmt.Errorf("unknown compression algorithm")
	}
	return json.Marshal(name)
}

// UnmarshalJSON decodes the lowercase string form back into the algorithm.
func (alg *StoreCompression) UnmarshalJSON(b []byte) error {
	var name string
	if err := json.Unmarshal(b, &name); err != nil {
		return err
	}
	switch name {
	case "none":
		*alg = NoCompression
	case "s2":
		*alg = S2Compression
	default:
		return fmt.Errorf("unknown compression algorithm")
	}
	return nil
}
// FileConsumerInfo is used for creating consumer stores. It remembers the
// creation time and name alongside the consumer configuration.
type FileConsumerInfo struct {
	Created time.Time
	Name    string
	ConsumerConfig
}
// Default file and directory permissions.
const (
	defaultDirPerms  = os.FileMode(0750) // rwxr-x---
	defaultFilePerms = os.FileMode(0640) // rw-r-----
)
// psi holds per-subject information tracked across message blocks.
// NOTE(review): from usage, fblk/lblk look like the first and last block
// indexes where the subject appears — confirm against populateGlobalPerSubjectInfo.
type psi struct {
	total uint64 // Total message count for this subject.
	fblk  uint32
	lblk  uint32
}
// fileStore is the file based backing store for a stream.
// Access is serialized by mu; individual message blocks carry their own locks.
type fileStore struct {
	srv     *Server
	mu      sync.RWMutex
	state   StreamState
	tombs   []uint64 // Tombstone sequences collected during recovery, processed then cleared.
	ld      *LostStreamData
	scb     StorageUpdateHandler
	ageChk  *time.Timer // Timer driving MaxAge expiration checks.
	syncTmr *time.Timer // Timer driving background syncBlocks.
	cfg     FileStreamInfo
	fcfg    FileStoreConfig
	prf     keyGen      // Server PRF used to derive encryption keys (nil means no encryption).
	oldprf  keyGen      // Previous PRF, used during cipher/key conversions.
	aek     cipher.AEAD // Asset encryption key for stream metadata.
	lmb     *msgBlock   // Last (current write) message block.
	blks    []*msgBlock
	bim     map[uint32]*msgBlock // Block index -> message block.
	psim    map[string]*psi      // Per-subject info across all blocks.
	hh      hash.Hash64          // HighwayHash used for metadata checksums.
	qch     chan struct{}        // Quit channel for background loops.
	fch     chan struct{}        // Kick channel for the state flush loop.
	fsld    chan struct{}        // Signals flushStreamStateLoop completion.
	cfs     []ConsumerStore
	sips    int
	dirty   int // Count of changes since last full state write.
	closed  bool
	fip     bool // Flush in place (set when AsyncFlush is disabled).
	receivedAny bool
}
// msgBlock represents a message store block and its data.
type msgBlock struct {
	// Here for 32bit systems and atomic.
	first msgId
	last  msgId
	mu    sync.RWMutex
	fs    *fileStore
	aek   cipher.AEAD   // Per-block asset encryption key (AEAD).
	bek   cipher.Stream // Stream cipher used to encrypt/decrypt block contents.
	seed  []byte        // Key seed recovered from or written to the block key file.
	nonce []byte        // Nonce paired with seed for the block stream cipher.
	mfn   string        // Message file name (path to this block's N.blk file).
	mfd   *os.File      // Open file descriptor for the block file, if any.
	cmp   StoreCompression // Effective compression at the time of loading the block
	liwsz int64
	index  uint32 // Block index; used in blk/key file names and fs.bim.
	bytes  uint64 // User visible bytes count.
	rbytes uint64 // Total bytes (raw) including deleted. Used for rolling to new blk.
	msgs   uint64 // User visible message count.
	fss    map[string]*SimpleState // Per-subject state within this block.
	kfn    string
	lwts   int64
	llts   int64
	lrts   int64
	llseq  uint64
	hh     hash.Hash64 // HighwayHash used for per-record checksums.
	cache  *cache      // Write-through cache of block contents.
	cloads uint64
	cexp   time.Duration // Cache expiration interval (from FileStoreConfig.CacheExpire).
	ctmr   *time.Timer
	werr   error // NOTE(review): appears to hold a sticky write error — confirm against writers.
	dmap   avl.SequenceSet // Set of deleted sequences within this block.
	fch    chan struct{}
	qch    chan struct{}
	lchk   [8]byte // Last checksum (trailing 8 bytes of the block file).
	loading    bool
	flusher    bool
	noTrack    bool // When set, per-subject tracking (fss/psim) is skipped.
	needSync   bool
	syncAlways bool // Mirror of FileStoreConfig.SyncAlways.
	closed     bool
	// Used to mock write failures.
	mockWriteErr bool
}
// cache is the write-through caching layer, also used when loading messages.
// NOTE(review): field semantics inferred from names; confirm against the
// cache load/flush code: off = offset of buf within the block file,
// wp = write position, idx = per-message offsets, lrl = last record length,
// fseq = first sequence held in buf.
type cache struct {
	buf  []byte
	off  int
	wp   int
	idx  []uint32
	lrl  uint32
	fseq uint64
	nra  bool
}
// msgId pairs a message sequence with its timestamp (UnixNano, see
// time.Unix(0, ts) usage in rebuildStateLocked).
type msgId struct {
	seq uint64
	ts  int64
}
const (
	// Magic is used to identify the file store files.
	magic = uint8(22)
	// Version
	version = uint8(1)
	// New IndexInfo Version
	newVersion = uint8(2)
	// hdrLen is the length of the magic+version header.
	hdrLen = 2
	// This is where we keep the streams.
	streamsDir = "streams"
	// This is where we keep the message store blocks.
	msgDir = "msgs"
	// This is where we temporarily move the messages dir.
	purgeDir = "__msgs__"
	// used to scan blk file names.
	blkScan = "%d.blk"
	// used for compacted blocks that are staged.
	newScan = "%d.new"
	// used to scan index file names.
	indexScan = "%d.idx"
	// to look for orphans
	indexScanAll = "*.idx"
	// to look for orphans
	fssScanAll = "*.fss"
	// used to store our block encryption key.
	keyScan = "%d.key"
	// to look for orphans
	keyScanAll = "*.key"
	// This is where we keep state on consumers.
	consumerDir = "obs"
	// Index file for a consumer.
	consumerState = "o.dat"
	// The suffix that will be given to a new temporary block during compression.
	compressTmpSuffix = ".tmp"
	// This is where we keep state on templates.
	tmplsDir = "templates"
	// Maximum size of a write buffer we may consider for re-use.
	maxBufReuse = 2 * 1024 * 1024
	// default cache buffer expiration
	defaultCacheBufferExpiration = 5 * time.Second
	// default sync interval
	defaultSyncInterval = 2 * time.Minute
	// default idle timeout to close FDs.
	closeFDsIdle = 30 * time.Second
	// coalesceMinimum
	coalesceMinimum = 16 * 1024
	// maxFlushWait is maximum we will wait to gather messages to flush.
	maxFlushWait = 8 * time.Millisecond
	// Metafiles for streams and consumers.
	JetStreamMetaFile    = "meta.inf"
	JetStreamMetaFileSum = "meta.sum"
	JetStreamMetaFileKey = "meta.key"
	// This is the full snapshotted state for the stream.
	streamStreamStateFile = "index.db"
	// AEK key sizes
	minMetaKeySize = 64
	minBlkKeySize  = 64
	// Default stream block size.
	defaultLargeBlockSize = 8 * 1024 * 1024 // 8MB
	// Default for workqueue or interest based.
	defaultMediumBlockSize = 4 * 1024 * 1024 // 4MB
	// For smaller reuse buffers. Usually being generated during contention on the lead write buffer.
	// E.g. mirrors/sources etc.
	defaultSmallBlockSize = 1 * 1024 * 1024 // 1MB
	// Maximum size for the encrypted head block.
	maximumEncryptedBlockSize = 2 * 1024 * 1024 // 2MB
	// Default for KV based
	defaultKVBlockSize = defaultMediumBlockSize
	// max block size for now.
	maxBlockSize = defaultLargeBlockSize
	// Compact minimum threshold.
	compactMinimum = 2 * 1024 * 1024 // 2MB
	// FileStoreMinBlkSize is minimum size we will do for a blk size.
	// NOTE(review): value is 32*1000 (32,000 bytes), not 32 KiB as the
	// original comment implied.
	FileStoreMinBlkSize = 32 * 1000
	// FileStoreMaxBlkSize is maximum size we will do for a blk size.
	FileStoreMaxBlkSize = maxBlockSize
	// Check for bad record length value due to corrupt data.
	rlBadThresh = 32 * 1024 * 1024
	// Checksum size for hash for msg records.
	recordHashSize = 8
)
// newFileStore creates a new file based store using the current UTC time as
// the created time and no key generation functions (encryption disabled).
func newFileStore(fcfg FileStoreConfig, cfg StreamConfig) (*fileStore, error) {
	return newFileStoreWithCreated(fcfg, cfg, time.Now().UTC(), nil, nil)
}
// newFileStoreWithCreated creates (or recovers) a file based stream store
// rooted at fcfg.StoreDir with the given created time. prf/oldprf are the
// optional key generation functions enabling encryption at rest. It applies
// config defaults, recovers prior state (preferring the full state snapshot,
// falling back to per-block recovery), enforces limits, and starts the
// background sync timer and state flush loop.
func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created time.Time, prf, oldprf keyGen) (*fileStore, error) {
	if cfg.Name == _EMPTY_ {
		return nil, fmt.Errorf("name required")
	}
	if cfg.Storage != FileStorage {
		return nil, fmt.Errorf("fileStore requires file storage type in config")
	}
	// Default values.
	if fcfg.BlockSize == 0 {
		fcfg.BlockSize = dynBlkSize(cfg.Retention, cfg.MaxBytes, prf != nil)
	}
	if fcfg.BlockSize > maxBlockSize {
		return nil, fmt.Errorf("filestore max block size is %s", friendlyBytes(maxBlockSize))
	}
	if fcfg.CacheExpire == 0 {
		fcfg.CacheExpire = defaultCacheBufferExpiration
	}
	if fcfg.SyncInterval == 0 {
		fcfg.SyncInterval = defaultSyncInterval
	}

	// Check the directory, creating it if it does not exist.
	if stat, err := os.Stat(fcfg.StoreDir); os.IsNotExist(err) {
		if err := os.MkdirAll(fcfg.StoreDir, defaultDirPerms); err != nil {
			return nil, fmt.Errorf("could not create storage directory - %v", err)
		}
	} else if stat == nil || !stat.IsDir() {
		return nil, fmt.Errorf("storage directory is not a directory")
	}

	// Probe writability by creating (then removing) a temp file.
	tmpfile, err := os.CreateTemp(fcfg.StoreDir, "_test_")
	if err != nil {
		return nil, fmt.Errorf("storage directory is not writable")
	}
	tmpfile.Close()
	// NOTE(review): dios appears to be a global disk-I/O semaphore — confirm.
	<-dios
	os.Remove(tmpfile.Name())
	dios <- struct{}{}

	fs := &fileStore{
		fcfg:   fcfg,
		psim:   make(map[string]*psi),
		bim:    make(map[uint32]*msgBlock),
		cfg:    FileStreamInfo{Created: created, StreamConfig: cfg},
		prf:    prf,
		oldprf: oldprf,
		qch:    make(chan struct{}),
		fch:    make(chan struct{}, 1),
		fsld:   make(chan struct{}),
		srv:    fcfg.srv,
	}

	// Set flush in place to AsyncFlush which by default is false.
	fs.fip = !fcfg.AsyncFlush

	// Check if this is a new setup.
	mdir := filepath.Join(fcfg.StoreDir, msgDir)
	odir := filepath.Join(fcfg.StoreDir, consumerDir)
	if err := os.MkdirAll(mdir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create message storage directory - %v", err)
	}
	if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create consumer storage directory - %v", err)
	}

	// Create highway hash for message blocks. Use sha256 of directory as key.
	key := sha256.Sum256([]byte(cfg.Name))
	fs.hh, err = highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}

	keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
	// Make sure we do not have an encrypted store underneath of us but no main key.
	if fs.prf == nil {
		if _, err := os.Stat(keyFile); err == nil {
			return nil, errNoMainKey
		}
	}

	// Attempt to recover our state.
	err = fs.recoverFullState()
	if err != nil {
		// Hold onto state
		prior := fs.state
		// Reset anything that could have been set from above.
		fs.state = StreamState{}
		fs.psim = make(map[string]*psi)
		fs.bim = make(map[uint32]*msgBlock)
		fs.blks = nil
		fs.tombs = nil
		// Recover our message state the old way
		if err := fs.recoverMsgs(); err != nil {
			return nil, err
		}
		// Check if our prior state remembered a last sequence past what we
		// just recovered; if so restore it and record a tombstone for it.
		if fs.ld != nil && prior.LastSeq > fs.state.LastSeq {
			fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime
			if lmb, err := fs.newMsgBlockForWrite(); err == nil {
				lmb.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano())
			} else {
				return nil, err
			}
		}
		// Since we recovered here, make sure to kick ourselves to write out our stream state.
		fs.dirty++
		defer fs.kickFlushStateLoop()
	}

	// Also make sure we get rid of old idx and fss files on return.
	// Do this in separate go routine vs inline and at end of processing.
	defer func() {
		go func() {
			os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, indexScanAll))
			os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, fssScanAll))
		}()
	}()

	// Lock while we do enforcements and removals.
	fs.mu.Lock()

	// Check if we have any left over tombstones to process.
	if len(fs.tombs) > 0 {
		for _, seq := range fs.tombs {
			fs.removeMsg(seq, false, false, false)
			fs.removeFromLostData(seq)
		}
		// Not needed after this phase.
		fs.tombs = nil
	}

	// Limits checks and enforcement.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Do age checks too, make sure to call in place.
	if fs.cfg.MaxAge != 0 {
		fs.expireMsgsOnRecover()
		fs.startAgeChk()
	}

	// If we have max msgs per subject make sure that is also enforced.
	if fs.cfg.MaxMsgsPer > 0 {
		fs.enforceMsgPerSubjectLimit()
	}
	// Grab first sequence for check below while we have lock.
	firstSeq := fs.state.FirstSeq
	fs.mu.Unlock()

	// If the stream has an initial sequence number then make sure we
	// have purged up until that point. We will do this only if the
	// recovered first sequence number is before our configured first
	// sequence. Need to do this locked as by now the age check timer
	// has started.
	if cfg.FirstSeq > 0 && firstSeq <= cfg.FirstSeq {
		if _, err := fs.purge(cfg.FirstSeq); err != nil {
			return nil, err
		}
	}

	// Write our meta data if it does not exist or is zero'd out.
	meta := filepath.Join(fcfg.StoreDir, JetStreamMetaFile)
	fi, err := os.Stat(meta)
	if err != nil && os.IsNotExist(err) || fi != nil && fi.Size() == 0 {
		if err := fs.writeStreamMeta(); err != nil {
			return nil, err
		}
	}

	// If we expect to be encrypted check that what we are restoring is not plaintext.
	// This can happen on snapshot restores or conversions.
	if fs.prf != nil {
		if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
			if err := fs.writeStreamMeta(); err != nil {
				return nil, err
			}
		}
	}

	// Kick off the background sync of message blocks.
	fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks)

	// Spin up the go routine that will write out our full state stream index.
	go fs.flushStreamStateLoop(fs.fch, fs.qch, fs.fsld)

	return fs, nil
}
// Lock all existing message blocks, in slice order.
// Lock held on entry.
func (fs *fileStore) lockAllMsgBlocks() {
	for i := range fs.blks {
		fs.blks[i].mu.Lock()
	}
}
// Unlock all existing message blocks, in slice order.
// Lock held on entry.
func (fs *fileStore) unlockAllMsgBlocks() {
	for i := range fs.blks {
		fs.blks[i].mu.Unlock()
	}
}
// UpdateConfig swaps in a new stream configuration, persists the metadata,
// and re-applies limit and age enforcement. On a metadata write failure the
// old configuration is restored and the error returned.
func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error {
	if fs.isClosed() {
		return ErrStoreClosed
	}
	if cfg.Name == _EMPTY_ {
		return fmt.Errorf("name required")
	}
	if cfg.Storage != FileStorage {
		return fmt.Errorf("fileStore requires file storage type in config")
	}

	fs.mu.Lock()
	new_cfg := FileStreamInfo{Created: fs.cfg.Created, StreamConfig: *cfg}
	old_cfg := fs.cfg
	// Messages block reference fs.cfg.Subjects (in subjString) under the
	// mb's lock, not fs' lock. So do the switch here under all existing
	// message blocks' lock in order to silence the DATA RACE detector.
	fs.lockAllMsgBlocks()
	fs.cfg = new_cfg
	fs.unlockAllMsgBlocks()
	if err := fs.writeStreamMeta(); err != nil {
		// Roll back to the previous config on failure, again under all block locks.
		fs.lockAllMsgBlocks()
		fs.cfg = old_cfg
		fs.unlockAllMsgBlocks()
		fs.mu.Unlock()
		return err
	}

	// Limits checks and enforcement.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Do age timers.
	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
		fs.startAgeChk()
	}
	if fs.ageChk != nil && fs.cfg.MaxAge == 0 {
		fs.ageChk.Stop()
		fs.ageChk = nil
	}
	// Only re-run the per-subject limit enforcement when the limit tightened.
	if fs.cfg.MaxMsgsPer > 0 && fs.cfg.MaxMsgsPer < old_cfg.MaxMsgsPer {
		fs.enforceMsgPerSubjectLimit()
	}
	fs.mu.Unlock()

	if cfg.MaxAge != 0 {
		fs.expireMsgs()
	}
	return nil
}
// dynBlkSize picks a block size for a stream. With a MaxBytes limit the size
// is derived from the limit (25% plus rounding, clamped to the min/max
// bounds, otherwise the medium default); without one it falls back to a
// default keyed off encryption and retention policy.
func dynBlkSize(retention RetentionPolicy, maxBytes int64, encrypted bool) uint64 {
	if maxBytes > 0 {
		sz := maxBytes/4 + 1 // 25% overhead.
		// Round up to the nearest 100.
		if rem := sz % 100; rem != 0 {
			sz += 100 - rem
		}
		switch {
		case sz <= FileStoreMinBlkSize:
			sz = FileStoreMinBlkSize
		case sz >= FileStoreMaxBlkSize:
			sz = FileStoreMaxBlkSize
		default:
			sz = defaultMediumBlockSize
		}
		// Encrypted stores re-encrypt whole blocks on write, so cap the size.
		if encrypted && sz > maximumEncryptedBlockSize {
			sz = maximumEncryptedBlockSize
		}
		return uint64(sz)
	}

	// No byte limit configured.
	if encrypted {
		// In the case of encrypted stores, large blocks can result in worsened perf
		// since many writes on disk involve re-encrypting the entire block. For now,
		// we will enforce a cap on the block size when encryption is enabled to avoid
		// this.
		return maximumEncryptedBlockSize
	}
	if retention == LimitsPolicy {
		// TODO(dlc) - Make the blocksize relative to this if set.
		return defaultLargeBlockSize
	}
	// TODO(dlc) - Make the blocksize relative to this if set.
	return defaultMediumBlockSize
}
// genEncryptionKey creates an AEAD for the given cipher from seed:
// XChaCha20-Poly1305 for ChaCha, AES-GCM (with a block-size nonce) for AES.
// Any other cipher yields errUnknownCipher.
func genEncryptionKey(sc StoreCipher, seed []byte) (ek cipher.AEAD, err error) {
	if sc == ChaCha {
		ek, err = chacha20poly1305.NewX(seed)
	} else if sc == AES {
		block, e := aes.NewCipher(seed)
		if e != nil {
			// BUG FIX: previously returned `err`, which is always nil here,
			// silently swallowing the AES cipher creation failure.
			return nil, e
		}
		ek, err = cipher.NewGCMWithNonceSize(block, block.BlockSize())
	} else {
		err = errUnknownCipher
	}
	return ek, err
}
// genEncryptionKeys generates an asset encryption key (AEAD), a block stream
// cipher, the random seed, and the seed sealed under a key-encryption-key
// derived from the server PRF for the given context. The sealed output is
// laid out as nonce||ciphertext.
func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cipher.Stream, seed, encrypted []byte, err error) {
	if fs.prf == nil {
		return nil, nil, nil, nil, errNoEncryption
	}
	// Generate key encryption key.
	rb, err := fs.prf([]byte(context))
	if err != nil {
		return nil, nil, nil, nil, err
	}
	sc := fs.fcfg.Cipher
	kek, err := genEncryptionKey(sc, rb)
	if err != nil {
		return nil, nil, nil, nil, err
	}
	// Generate random asset encryption key seed.
	const seedSize = 32
	seed = make([]byte, seedSize)
	// NOTE(review): crypto/rand.Read returns n == len(seed) whenever err is
	// nil, so the short-read branch (which would return a nil error) should
	// be unreachable — confirm.
	if n, err := rand.Read(seed); err != nil || n != seedSize {
		return nil, nil, nil, nil, err
	}
	aek, err = genEncryptionKey(sc, seed)
	if err != nil {
		return nil, nil, nil, nil, err
	}
	// Generate our nonce. Use same buffer to hold encrypted seed.
	nonce := make([]byte, kek.NonceSize(), kek.NonceSize()+len(seed)+kek.Overhead())
	rand.Read(nonce)
	bek, err = genBlockEncryptionKey(sc, seed[:], nonce)
	if err != nil {
		return nil, nil, nil, nil, err
	}
	// Seal appends the ciphertext to nonce, yielding nonce||ciphertext.
	return aek, bek, seed, kek.Seal(nonce, nonce, seed, nil), nil
}
// genBlockEncryptionKey generates the block (stream) cipher for the given
// store cipher from the seed and nonce: ChaCha20 for ChaCha, AES-CTR for AES.
func genBlockEncryptionKey(sc StoreCipher, seed, nonce []byte) (cipher.Stream, error) {
	switch sc {
	case ChaCha:
		return chacha20.NewUnauthenticatedCipher(seed, nonce)
	case AES:
		block, err := aes.NewCipher(seed)
		if err != nil {
			return nil, err
		}
		return cipher.NewCTR(block, nonce), nil
	}
	return nil, errUnknownCipher
}
// recoverAEK recovers the stream metadata asset encryption key from the
// on-disk key file, unsealing it with a KEK derived from the server PRF.
// No-op when encryption is off or the key is already loaded.
// Lock should be held.
func (fs *fileStore) recoverAEK() error {
	if fs.prf != nil && fs.aek == nil {
		ekey, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey))
		if err != nil {
			return err
		}
		// Derive the key encryption key from the stream name.
		rb, err := fs.prf([]byte(fs.cfg.Name))
		if err != nil {
			return err
		}
		kek, err := genEncryptionKey(fs.fcfg.Cipher, rb)
		if err != nil {
			return err
		}
		// Key file layout is nonce||ciphertext.
		ns := kek.NonceSize()
		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
		if err != nil {
			return err
		}
		aek, err := genEncryptionKey(fs.fcfg.Cipher, seed)
		if err != nil {
			return err
		}
		fs.aek = aek
	}
	return nil
}
// setupAEK generates a fresh metadata asset encryption key and writes the
// sealed key file. No-op when encryption is off or the key is already set.
// Lock should be held.
func (fs *fileStore) setupAEK() error {
	if fs.prf != nil && fs.aek == nil {
		key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name)
		if err != nil {
			return err
		}
		keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
		// Any stat error other than "does not exist" is fatal here.
		if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
			return err
		}
		if err := os.WriteFile(keyFile, encrypted, defaultFilePerms); err != nil {
			return err
		}
		// Set our aek.
		fs.aek = key
	}
	return nil
}
// writeStreamMeta writes out the stream metadata (fs.cfg) and its checksum
// file, encrypting the metadata when an AEK is configured.
// Lock should be held.
func (fs *fileStore) writeStreamMeta() error {
	if err := fs.setupAEK(); err != nil {
		return err
	}

	meta := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile)
	// Any stat error other than "does not exist" is fatal here.
	if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) {
		return err
	}
	b, err := json.Marshal(fs.cfg)
	if err != nil {
		return err
	}
	// Encrypt if needed. Sealed layout is nonce||ciphertext.
	if fs.aek != nil {
		nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(b)+fs.aek.Overhead())
		rand.Read(nonce)
		b = fs.aek.Seal(nonce, nonce, b, nil)
	}

	if err := os.WriteFile(meta, b, defaultFilePerms); err != nil {
		return err
	}
	// The checksum covers the bytes as written (possibly encrypted).
	fs.hh.Reset()
	fs.hh.Write(b)
	checksum := hex.EncodeToString(fs.hh.Sum(nil))
	sum := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileSum)
	if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil {
		return err
	}
	return nil
}
// Pools to recycle the blocks to help with memory pressure.
// NOTE(review): the size labels below disagree with the current block size
// constants (defaultLargeBlockSize=8MB, defaultMediumBlockSize=4MB,
// defaultSmallBlockSize=1MB) — confirm which size classes these pools hold.
var blkPoolBig sync.Pool    // 16MB
var blkPoolMedium sync.Pool // 8MB
var blkPoolSmall sync.Pool  // 2MB
// getMsgBlockBuf returns a zero-length buffer for a message block of the
// estimated size sz, reusing a pooled buffer from the matching size class
// when available and allocating one rounded up to the class size otherwise.
func getMsgBlockBuf(sz int) (buf []byte) {
	var pooled interface{}
	switch {
	case sz <= defaultSmallBlockSize:
		pooled = blkPoolSmall.Get()
	case sz <= defaultMediumBlockSize:
		pooled = blkPoolMedium.Get()
	default:
		pooled = blkPoolBig.Get()
	}
	if pooled != nil {
		return (*pooled.(*[]byte))[:0]
	}
	// Nothing pooled; allocate fresh. Round mid-size requests up to their
	// class size; small requests keep their exact size.
	if sz > defaultMediumBlockSize {
		sz = defaultLargeBlockSize
	} else if sz > defaultSmallBlockSize {
		sz = defaultMediumBlockSize
	}
	return make([]byte, 0, sz)
}
// recycleMsgBlockBuf returns a message block buffer to the matching size
// pool. Buffers smaller than the small block class are dropped.
func recycleMsgBlockBuf(buf []byte) {
	// A nil slice has cap 0, so this also filters nil.
	if cap(buf) < defaultSmallBlockSize {
		return
	}
	// Reset length before placing back into a pool.
	buf = buf[:0]
	// Pick the pool by capacity so a later Get for a size class always
	// receives a buffer that can hold that class; otherwise we would thrash,
	// pulling buffers out only to discard them and allocate anew.
	switch sz := cap(buf); {
	case sz < defaultMediumBlockSize:
		blkPoolSmall.Put(&buf)
	case sz < defaultLargeBlockSize:
		blkPoolMedium.Put(&buf)
	default:
		blkPoolBig.Put(&buf)
	}
}
// Sizes for the on-disk message record framing.
const (
	msgHdrSize     = 22 // Fixed record header size.
	checksumSize   = 8  // Trailing per-record checksum size.
	emptyRecordLen = msgHdrSize + checksumSize
)
// noTrackSubjects reports whether per-subject tracking can be skipped:
// no per-subject info, no configured subjects, and no mirror or sources.
// Lock should be held.
func (fs *fileStore) noTrackSubjects() bool {
	return len(fs.psim) == 0 && len(fs.cfg.Subjects) == 0 && fs.cfg.Mirror == nil && len(fs.cfg.Sources) == 0
}
// initMsgBlock will init the basics for a message block: the backing file
// path for the given index and the per-record hash.
func (fs *fileStore) initMsgBlock(index uint32) *msgBlock {
	mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects(), syncAlways: fs.fcfg.SyncAlways}

	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, index))

	// hh is always nil on a freshly constructed block; the guard is defensive.
	if mb.hh == nil {
		// Use sha256 of the per-block hash key as the highwayhash key.
		key := sha256.Sum256(fs.hashKeyForBlock(index))
		mb.hh, _ = highwayhash.New64(key[:])
	}
	return mb
}
// loadEncryptionForMsgBlock loads (or creates) the encryption keys for a
// message block. If keys had to be created for an existing plaintext block,
// the block contents are converted to encrypted form afterwards.
// Lock for fs should be held.
func (fs *fileStore) loadEncryptionForMsgBlock(mb *msgBlock) error {
	if fs.prf == nil {
		return nil
	}

	var createdKeys bool
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)))
	if err != nil {
		// We do not seem to have keys even though we should. Could be a plaintext conversion.
		// Create the keys and we will double check below.
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			return err
		}
		createdKeys = true
	} else {
		if len(ekey) < minBlkKeySize {
			return errBadKeySize
		}
		// Recover key encryption key, derived per-block from "name:index".
		rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
		if err != nil {
			return err
		}
		sc := fs.fcfg.Cipher
		kek, err := genEncryptionKey(sc, rb)
		if err != nil {
			return err
		}
		// Key file layout is nonce||ciphertext.
		ns := kek.NonceSize()
		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
		if err != nil {
			// We may be here on a cipher conversion, so attempt to convert.
			// NOTE(review): on success, mb.seed/mb.nonce are assumed to have
			// been refreshed by the conversion before the regeneration below
			// — confirm against convertCipher/genEncryptionKeysForBlock.
			if err = mb.convertCipher(); err != nil {
				return err
			}
		} else {
			mb.seed, mb.nonce = seed, ekey[:ns]
		}
		mb.aek, err = genEncryptionKey(sc, mb.seed)
		if err != nil {
			return err
		}
		if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil {
			return err
		}
	}
	// If we created keys here, let's check the data and if it is plaintext convert here.
	if createdKeys {
		if err := mb.convertToEncrypted(); err != nil {
			return err
		}
	}
	return nil
}
// ensureLastChecksumLoaded loads the last checksum from the block file if it
// has not been populated yet (all-zero lchk means unset).
// Lock should be held.
func (mb *msgBlock) ensureLastChecksumLoaded() {
	var zero [8]byte
	if mb.lchk == zero {
		copy(mb.lchk[:], mb.lastChecksum())
	}
}
// recoverMsgBlock recovers a single message block by index. It prefers the
// index file as the source of truth when its stored last checksum matches
// the block file, and otherwise falls back to a full rebuild from the block
// contents, recording any lost data and tombstones with the fileStore.
// Lock held on entry.
func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) {
	mb := fs.initMsgBlock(index)

	// Open up the message file, but we will try to recover from the index file.
	// We will check that the last checksums match.
	file, err := os.Open(mb.mfn)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	if fi, err := file.Stat(); fi != nil {
		mb.rbytes = uint64(fi.Size())
	} else {
		return nil, err
	}

	// Make sure encryption loaded if needed.
	// NOTE(review): error deliberately ignored here — confirm best-effort intent.
	fs.loadEncryptionForMsgBlock(mb)

	// Grab last checksum from main block file.
	var lchk [8]byte
	if mb.rbytes >= checksumSize {
		if mb.bek != nil {
			// Encrypted block: decrypt the full block to reach the trailing checksum.
			if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize {
				mb.bek.XORKeyStream(buf, buf)
				copy(lchk[0:], buf[len(buf)-checksumSize:])
			}
		} else {
			file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
		}
	}
	// Explicit close; the deferred Close above then returns an ignored error.
	file.Close()

	// Read our index file. Use this as source of truth if possible.
	if err := mb.readIndexInfo(); err == nil {
		// Quick sanity check here.
		// Note this only checks that the message blk file is not newer then this file, or is empty and we expect empty.
		if (mb.rbytes == 0 && mb.msgs == 0) || bytes.Equal(lchk[:], mb.lchk[:]) {
			if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
				fs.populateGlobalPerSubjectInfo(mb)
				// Try to dump any state we needed on recovery.
				mb.tryForceExpireCacheLocked()
			}
			fs.addMsgBlock(mb)
			return mb, nil
		}
	}

	// If we get data loss rebuilding the message block state record that with the fs itself.
	ld, tombs, _ := mb.rebuildState()
	if ld != nil {
		fs.addLostData(ld)
	}
	// Collect all tombstones.
	if len(tombs) > 0 {
		fs.tombs = append(fs.tombs, tombs...)
	}

	if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
		fs.populateGlobalPerSubjectInfo(mb)
		// Try to dump any state we needed on recovery.
		mb.tryForceExpireCacheLocked()
	}

	mb.closeFDs()
	fs.addMsgBlock(mb)
	return mb, nil
}
// lostData returns a copy of the lost stream data record, or nil if none.
// Note this is a shallow copy; the Msgs slice header is shared.
func (fs *fileStore) lostData() *LostStreamData {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	if fs.ld == nil {
		return nil
	}
	cp := *fs.ld
	return &cp
}
// addLostData merges the provided lost data into our existing record,
// keeping the sequence list sorted and de-duplicated.
// Lock should be held.
func (fs *fileStore) addLostData(ld *LostStreamData) {
	if ld == nil {
		return
	}
	if fs.ld == nil {
		fs.ld = ld
		return
	}
	// Fold in any sequences we do not already track.
	var added bool
	for _, seq := range ld.Msgs {
		if _, ok := fs.ld.exists(seq); !ok {
			fs.ld.Msgs = append(fs.ld.Msgs, seq)
			added = true
		}
	}
	if added {
		msgs := fs.ld.Msgs
		sort.Slice(msgs, func(i, j int) bool { return msgs[i] < msgs[j] })
		fs.ld.Bytes += ld.Bytes
	}
}
// exists reports whether seq is already present in our (sorted) lost data,
// returning its index (or insertion point) and a found flag.
func (ld *LostStreamData) exists(seq uint64) (int, bool) {
	cmp := func(i int) int {
		tseq := ld.Msgs[i]
		switch {
		case tseq < seq:
			return -1
		case tseq > seq:
			return +1
		default:
			return 0
		}
	}
	return sort.Find(len(ld.Msgs), cmp)
}
// removeFromLostData removes seq from the lost data accounting, clearing
// the record entirely once no sequences remain.
func (fs *fileStore) removeFromLostData(seq uint64) {
	ld := fs.ld
	if ld == nil {
		return
	}
	i, found := ld.exists(seq)
	if !found {
		return
	}
	ld.Msgs = append(ld.Msgs[:i], ld.Msgs[i+1:]...)
	if len(ld.Msgs) == 0 {
		fs.ld = nil
	}
}
// rebuildState recomputes the stream state from the message blocks, folding
// in any provided lost data. Acquires the fileStore lock.
func (fs *fileStore) rebuildState(ld *LostStreamData) {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	fs.rebuildStateLocked(ld)
}
// rebuildStateLocked recomputes fs.state (msgs, bytes, first/last sequence
// and times) by summing over all message blocks, after folding in any
// provided lost data.
// Lock should be held.
func (fs *fileStore) rebuildStateLocked(ld *LostStreamData) {
	fs.addLostData(ld)

	fs.state.Msgs, fs.state.Bytes = 0, 0
	fs.state.FirstSeq, fs.state.LastSeq = 0, 0

	for _, mb := range fs.blks {
		mb.mu.RLock()
		fs.state.Msgs += mb.msgs
		fs.state.Bytes += mb.bytes
		// Track the lowest first sequence seen across blocks.
		if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq {
			fs.state.FirstSeq = mb.first.seq
			fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
		}
		// Last visited block supplies LastSeq — assumes fs.blks is ordered
		// oldest to newest.
		fs.state.LastSeq = mb.last.seq
		fs.state.LastTime = time.Unix(0, mb.last.ts).UTC()
		mb.mu.RUnlock()
	}
}
// convertCipher attempts to convert this message block from the "other"
// cipher to the currently configured one. It tries each candidate PRF
// (current and old) paired with both ciphers to recover the key; on success
// it decrypts the block, verifies it parses, regenerates keys under the
// configured cipher, and rewrites the block re-encrypted.
func (mb *msgBlock) convertCipher() error {
	fs := mb.fs
	sc := fs.fcfg.Cipher

	// The "other" cipher we may be converting from.
	var osc StoreCipher
	switch sc {
	case ChaCha:
		osc = AES
	case AES:
		osc = ChaCha
	}

	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)))
	if err != nil {
		return err
	}
	if len(ekey) < minBlkKeySize {
		return errBadKeySize
	}

	type prfWithCipher struct {
		keyGen
		StoreCipher
	}
	// Candidate PRF/cipher combinations to try for key recovery.
	var prfs []prfWithCipher
	if fs.prf != nil {
		prfs = append(prfs, prfWithCipher{fs.prf, sc})
		prfs = append(prfs, prfWithCipher{fs.prf, osc})
	}
	if fs.oldprf != nil {
		prfs = append(prfs, prfWithCipher{fs.oldprf, sc})
		prfs = append(prfs, prfWithCipher{fs.oldprf, osc})
	}

	for _, prf := range prfs {
		// Recover key encryption key.
		rb, err := prf.keyGen([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
		if err != nil {
			continue
		}
		kek, err := genEncryptionKey(prf.StoreCipher, rb)
		if err != nil {
			continue
		}
		// Key file layout is nonce||ciphertext; an Open failure means this
		// candidate is wrong, so try the next one.
		ns := kek.NonceSize()
		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
		if err != nil {
			continue
		}
		nonce := ekey[:ns]

		// From here on failures are fatal rather than trying the next candidate.
		bek, err := genBlockEncryptionKey(prf.StoreCipher, seed, nonce)
		if err != nil {
			return err
		}

		buf, _ := mb.loadBlock(nil)
		// Decrypt under the recovered (old) key.
		bek.XORKeyStream(buf, buf)
		// Make sure we can parse with old cipher and key file.
		if err = mb.indexCacheBuf(buf); err != nil {
			return err
		}
		// Reset the cache since we just read everything in.
		mb.cache = nil

		// Generate new keys. If we error for some reason then we will put
		// the old keyfile back.
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))
			os.WriteFile(keyFile, ekey, defaultFilePerms)
			return err
		}
		// Re-encrypt under the new block key and rewrite the block file.
		mb.bek.XORKeyStream(buf, buf)
		if err := os.WriteFile(mb.mfn, buf, defaultFilePerms); err != nil {
			return err
		}
		return nil
	}
	return fmt.Errorf("unable to recover keys")
}
// convertToEncrypted converts a plaintext block file to encrypted form in
// place. If the contents do not parse as plaintext (already encrypted or
// corrupt), the parse error is returned and nothing is rewritten.
func (mb *msgBlock) convertToEncrypted() error {
	if mb.bek == nil {
		return nil
	}
	buf, err := mb.loadBlock(nil)
	if err != nil {
		return err
	}
	if err := mb.indexCacheBuf(buf); err != nil {
		// This likely indicates this was already encrypted or corrupt.
		mb.cache = nil
		return err
	}
	// Undo cache from above for later.
	mb.cache = nil
	// Encrypt and rewrite the block file.
	mb.bek.XORKeyStream(buf, buf)
	if err := os.WriteFile(mb.mfn, buf, defaultFilePerms); err != nil {
		return err
	}
	return nil
}
// rebuildState rebuilds the state of the blk based on what we have on disk
// in the N.blk file. It returns any lost data and any delete tombstones
// encountered. Acquires the block lock.
func (mb *msgBlock) rebuildState() (*LostStreamData, []uint64, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.rebuildStateLocked()
}
// Rebuild the state of the blk based on what we have on disk in the N.blk file.
// Returns any lost data detected, the tombstone sequences encountered, and an error.
// Lock should be held.
func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) {
	// Remember last seq before we clear state so we can report lost tail sequences.
	startLastSeq := mb.last.seq
	// Remove the .fss file and clear any cache we have set.
	mb.clearCacheAndOffset()
	buf, err := mb.loadBlock(nil)
	if err != nil || len(buf) == 0 {
		var ld *LostStreamData
		// No data to rebuild from here.
		if mb.msgs > 0 {
			// We need to declare lost data here.
			ld = &LostStreamData{Msgs: make([]uint64, 0, mb.msgs), Bytes: mb.bytes}
			for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
				// Anything not already recorded as deleted is now lost.
				if !mb.dmap.Exists(seq) {
					ld.Msgs = append(ld.Msgs, seq)
				}
			}
			// Clear invalid state. We will let this blk be added in here.
			mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
			mb.dmap.Empty()
			mb.first.seq = mb.last.seq + 1
		}
		return ld, nil, err
	}
	// Clear state we need to rebuild.
	mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
	mb.last.seq, mb.last.ts = 0, 0
	firstNeedsSet := true
	// Check if we need to decrypt.
	if mb.bek != nil && len(buf) > 0 {
		// Recreate to reset counter.
		mb.bek, err = genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return nil, nil, err
		}
		mb.bek.XORKeyStream(buf, buf)
	}
	// Check for compression.
	if buf, err = mb.decompressIfNeeded(buf); err != nil {
		return nil, nil, err
	}
	mb.rbytes = uint64(len(buf))

	// Helper to track deleted sequences, ignoring the zero sequence.
	addToDmap := func(seq uint64) {
		if seq == 0 {
			return
		}
		mb.dmap.Insert(seq)
	}
	var le = binary.LittleEndian

	// truncate chops the underlying file at index, refreshing our last
	// checksum from the new tail of the file.
	truncate := func(index uint32) {
		var fd *os.File
		if mb.mfd != nil {
			fd = mb.mfd
		} else {
			fd, err = os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
			if err == nil {
				defer fd.Close()
			}
		}
		if fd == nil {
			return
		}
		if err := fd.Truncate(int64(index)); err == nil {
			// Update our checksum.
			if index >= 8 {
				var lchk [8]byte
				fd.ReadAt(lchk[:], int64(index-8))
				copy(mb.lchk[0:], lchk[:])
			}
			fd.Sync()
		}
	}

	// gatherLost collects all sequences past what we managed to recover,
	// reporting lb as the number of bytes dropped.
	gatherLost := func(lb uint32) *LostStreamData {
		var ld LostStreamData
		for seq := mb.last.seq + 1; seq <= startLastSeq; seq++ {
			ld.Msgs = append(ld.Msgs, seq)
		}
		ld.Bytes = uint64(lb)
		return &ld
	}

	// For tombstones that we find and collect.
	var (
		tombstones      []uint64
		minTombstoneSeq uint64
		minTombstoneTs  int64
	)

	// Walk the block one record at a time.
	for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
		if index+msgHdrSize > lbuf {
			// Partial header at the tail; drop it and report what follows as lost.
			truncate(index)
			return gatherLost(lbuf - index), tombstones, nil
		}
		hdr := buf[index : index+msgHdrSize]
		rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])
		hasHeaders := rl&hbit != 0
		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize
		// Do some quick sanity checks here.
		if dlen < 0 || int(slen) > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
			truncate(index)
			return gatherLost(lbuf - index), tombstones, errBadMsg
		}
		// Check for checksum failures before additional processing.
		data := buf[index+msgHdrSize : index+rl]
		if hh := mb.hh; hh != nil {
			hh.Reset()
			hh.Write(hdr[4:20])
			hh.Write(data[:slen])
			if hasHeaders {
				hh.Write(data[slen+4 : dlen-recordHashSize])
			} else {
				hh.Write(data[slen : dlen-recordHashSize])
			}
			checksum := hh.Sum(nil)
			if !bytes.Equal(checksum, data[len(data)-recordHashSize:]) {
				truncate(index)
				return gatherLost(lbuf - index), tombstones, errBadMsg
			}
			copy(mb.lchk[0:], checksum)
		}
		// Grab our sequence and timestamp.
		seq := le.Uint64(hdr[4:])
		ts := int64(le.Uint64(hdr[12:]))
		// Check if this is a delete tombstone.
		if seq&tbit != 0 {
			seq = seq &^ tbit
			// Need to process this here and make sure we have accounted for this properly.
			tombstones = append(tombstones, seq)
			if minTombstoneSeq == 0 || seq < minTombstoneSeq {
				minTombstoneSeq, minTombstoneTs = seq, ts
			}
			index += rl
			continue
		}
		// This is an old erased message, or a new one that we can track.
		if seq == 0 || seq&ebit != 0 || seq < mb.first.seq {
			seq = seq &^ ebit
			if seq >= mb.first.seq {
				// Only add to dmap if past recorded first seq and non-zero.
				if seq != 0 {
					addToDmap(seq)
				}
				mb.last.seq = seq
				mb.last.ts = ts
				if mb.msgs == 0 {
					mb.first.seq, mb.first.ts = seq+1, 0
				}
			}
			index += rl
			continue
		}
		// This is for when we have index info that adjusts for deleted messages
		// at the head. So the first.seq will be already set here. If this is larger
		// replace what we have with this seq.
		if firstNeedsSet && seq >= mb.first.seq {
			firstNeedsSet, mb.first.seq, mb.first.ts = false, seq, ts
		}
		if !mb.dmap.Exists(seq) {
			mb.msgs++
			mb.bytes += uint64(rl)
			// Rebuild per subject info if needed.
			if slen > 0 {
				if mb.fss == nil {
					mb.fss = make(map[string]*SimpleState)
				}
				// For the lookup, we cast the byte slice and there won't be any copy
				if ss := mb.fss[string(data[:slen])]; ss != nil {
					ss.Msgs++
					ss.Last = seq
				} else {
					// This will either use a subject from the config, or make a copy
					// so we don't reference the underlying buffer.
					subj := mb.subjString(data[:slen])
					mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
				}
			}
		}
		// Always set last
		mb.last.seq = seq
		mb.last.ts = ts
		// Advance to next record.
		index += rl
	}
	// For empty msg blocks make sure we recover last seq correctly based off of first.
	// Or if we seem to have no messages but had a tombstone, which we use to remember
	// sequences and timestamps now, use that to properly setup the first and last.
	if mb.msgs == 0 {
		if mb.first.seq > 0 {
			mb.last.seq = mb.first.seq - 1
		} else if mb.first.seq == 0 && minTombstoneSeq > 0 {
			mb.first.seq, mb.first.ts = minTombstoneSeq+1, 0
			if mb.last.seq == 0 {
				mb.last.seq, mb.last.ts = minTombstoneSeq, minTombstoneTs
			}
		}
	}
	return nil, tombstones, nil
}
// For doing warn logging. Prefixes the message with the filestore (stream) name.
// Lock should be held.
func (fs *fileStore) warn(format string, args ...any) {
	// No-op if no server configured.
	if srv := fs.srv; srv != nil {
		srv.Warnf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...)
	}
}
// recoverFullState will attempt to recover our last full state and re-process any state changes
// that happened afterwards. Returns errCorruptState/errPriorState when the stream state file
// can not be trusted, in which case the caller is expected to fall back to a full rebuild.
func (fs *fileStore) recoverFullState() (rerr error) {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	// Check for any left over purged messages.
	// Take an I/O token to bound concurrent disk access; returned after the reads below.
	<-dios
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}
	// Grab our stream state file and load it in.
	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	buf, err := os.ReadFile(fn)
	dios <- struct{}{}
	if err != nil {
		if !os.IsNotExist(err) {
			fs.warn("Could not read stream state file: %v", err)
		}
		return err
	}
	// Minimum sane size: header + hash.
	const minLen = 32
	if len(buf) < minLen {
		os.Remove(fn)
		fs.warn("Stream state too short (%d bytes)", len(buf))
		return errCorruptState
	}
	// The highwayhash will be on the end. Check that it still matches.
	h := buf[len(buf)-highwayhash.Size64:]
	buf = buf[:len(buf)-highwayhash.Size64]
	fs.hh.Reset()
	fs.hh.Write(buf)
	if !bytes.Equal(h, fs.hh.Sum(nil)) {
		os.Remove(fn)
		fs.warn("Stream state checksum did not match")
		return errCorruptState
	}
	// Decrypt if needed.
	if fs.prf != nil {
		// We can be setup for encryption but if this is a snapshot restore we will be missing the keyfile
		// since snapshots strip encryption.
		if err := fs.recoverAEK(); err == nil {
			ns := fs.aek.NonceSize()
			buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:], nil)
			if err != nil {
				fs.warn("Stream state error reading encryption key: %v", err)
				return err
			}
		}
	}
	if buf[0] != fullStateMagic || buf[1] != fullStateVersion {
		os.Remove(fn)
		fs.warn("Stream state magic and version mismatch")
		return errCorruptState
	}
	// bi is our read offset into buf; a negative bi signals a varint decode error
	// and turns the readers below into no-ops.
	bi := hdrLen
	readU64 := func() uint64 {
		if bi < 0 {
			return 0
		}
		v, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		return v
	}
	readI64 := func() int64 {
		if bi < 0 {
			return 0
		}
		v, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return v
	}
	// setTime maps a zero timestamp to the zero time.Time value.
	setTime := func(t *time.Time, ts int64) {
		if ts == 0 {
			*t = time.Time{}
		} else {
			*t = time.Unix(0, ts).UTC()
		}
	}
	var state StreamState
	state.Msgs = readU64()
	state.Bytes = readU64()
	state.FirstSeq = readU64()
	baseTime := readI64()
	setTime(&state.FirstTime, baseTime)
	state.LastSeq = readU64()
	setTime(&state.LastTime, readI64())
	// Check for per subject info.
	if numSubjects := int(readU64()); numSubjects > 0 {
		fs.psim = make(map[string]*psi, numSubjects)
		for i := 0; i < numSubjects; i++ {
			if lsubj := int(readU64()); lsubj > 0 {
				if bi+lsubj > len(buf) {
					os.Remove(fn)
					fs.warn("Stream state bad subject len (%d)", lsubj)
					return errCorruptState
				}
				subj := fs.subjString(buf[bi : bi+lsubj])
				bi += lsubj
				psi := &psi{total: readU64(), fblk: uint32(readU64())}
				// lblk is only written out when more than one message exists.
				if psi.total > 1 {
					psi.lblk = uint32(readU64())
				} else {
					psi.lblk = psi.fblk
				}
				fs.psim[subj] = psi
			}
		}
	}
	if numBlocks := readU64(); numBlocks > 0 {
		lastIndex := int(numBlocks - 1)
		fs.blks = make([]*msgBlock, 0, numBlocks)
		for i := 0; i < int(numBlocks); i++ {
			index, nbytes, fseq, fts, lseq, lts, numDeleted := uint32(readU64()), readU64(), readU64(), readI64(), readU64(), readI64(), readU64()
			if bi < 0 {
				break
			}
			mb := fs.initMsgBlock(index)
			// Block timestamps were written as deltas from baseTime.
			mb.first.seq, mb.last.seq, mb.msgs, mb.bytes = fseq, lseq, lseq-fseq+1, nbytes
			mb.first.ts, mb.last.ts = fts+baseTime, lts+baseTime
			if numDeleted > 0 {
				dmap, n, err := avl.Decode(buf[bi:])
				if err != nil {
					os.Remove(fn)
					fs.warn("Stream state error decoding avl dmap: %v", err)
					return errCorruptState
				}
				mb.dmap = *dmap
				if mb.msgs > numDeleted {
					mb.msgs -= numDeleted
				} else {
					mb.msgs = 0
				}
				bi += n
			}
			// Only add in if not empty or the lmb.
			if mb.msgs > 0 || i == lastIndex {
				fs.addMsgBlock(mb)
			} else {
				// Mark dirty to cleanup.
				fs.dirty++
			}
		}
	}
	// Pull in last block index for the block that had last checksum when we wrote the full state.
	blkIndex := uint32(readU64())
	var lchk [8]byte
	if bi+len(lchk) > len(buf) {
		bi = -1
	} else {
		copy(lchk[0:], buf[bi:bi+len(lchk)])
	}
	// Check if we had any errors.
	if bi < 0 {
		os.Remove(fn)
		fs.warn("Stream state has no checksum present")
		return errCorruptState
	}
	// Move into place our state, msgBlks and subject info.
	fs.state = state
	// First let's check the happy path, open the blk file that was the lmb when we created the full state.
	// See if we have the last block available.
	var matched bool
	var mb *msgBlock
	if mb = fs.bim[blkIndex]; mb != nil {
		if _, err := os.Stat(mb.mfn); err != nil && os.IsNotExist(err) {
			// If our saved state is past what we see on disk, fallback and rebuild.
			if ld, _, _ := mb.rebuildState(); ld != nil {
				fs.addLostData(ld)
			}
			fs.warn("Stream state detected prior state, could not locate msg block %d", blkIndex)
			return errPriorState
		}
		if matched = bytes.Equal(mb.lastChecksum(), lchk[:]); !matched {
			// Remove the last message block since we will re-process below.
			fs.removeMsgBlockFromList(mb)
		}
	}
	// We may need to check other blocks. Even if we matched last checksum we will see if there is another block.
	// If we did not match we re-process the last block.
	start := blkIndex
	if matched {
		start++
	}
	// NOTE: this bi intentionally shadows the buffer offset above; here it is a block index.
	for bi := start; ; bi++ {
		nmb, err := fs.recoverMsgBlock(bi)
		if err != nil {
			if os.IsNotExist(err) {
				// No more blocks on disk, we are done.
				return nil
			}
			os.Remove(fn)
			fs.warn("Stream state could not recover msg block %d", bi)
			return err
		}
		if nmb != nil {
			// Check if we have to account for a partial message block.
			if !matched && mb != nil && mb.index == nmb.index {
				if err := fs.adjustAccounting(mb, nmb); err != nil {
					fs.warn("Stream state could not adjust accounting: %v", err)
					return err
				}
			}
			// Update top level accounting.
			if fs.state.FirstSeq == 0 || nmb.first.seq < fs.state.FirstSeq {
				fs.state.FirstSeq = nmb.first.seq
				fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC()
			}
			if nmb.last.seq > fs.state.LastSeq {
				fs.state.LastSeq = nmb.last.seq
				fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC()
			}
			fs.state.Msgs += nmb.msgs
			fs.state.Bytes += nmb.bytes
		}
	}
}
// adjustAccounting will be called when a stream state was only partially accounted for
// with a message block, e.g. additional records were added after the stream state.
// It walks the sequences already counted by the saved state and backs them out of the
// top-level fs totals and psim, since the re-processed block nmb re-adds them.
// Lock should be held.
func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) error {
	nmb.mu.Lock()
	defer nmb.mu.Unlock()
	// First make sure the new block is loaded.
	if nmb.cacheNotLoaded() {
		nmb.loadMsgsWithLock()
	}
	nmb.ensurePerSubjectInfoLoaded()
	// lookupAndAdjust subtracts the message at seq from the fs totals and
	// the per-subject index.
	lookupAndAdjust := func(seq uint64) error {
		var smv StoreMsg
		// Lookup the message.
		sm, err := nmb.cacheLookup(seq, &smv)
		if err != nil {
			return err
		}
		// Since we found it we just need to adjust fs totals and psim.
		fs.state.Msgs--
		fs.state.Bytes -= fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
		if len(sm.subj) > 0 && fs.psim != nil {
			fs.removePerSubject(sm.subj)
		}
		return nil
	}
	// Walk all the original mb's sequences that were included in the stream state.
	for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
		// If we had already declared it deleted we can move on since you can not undelete.
		if mb.dmap.Exists(seq) {
			continue
		}
		// Lookup the message.
		if err := lookupAndAdjust(seq); err != nil {
			return err
		}
	}
	// Now check to see if we had a higher first for the recovered state mb vs nmb.
	if nmb.first.seq < mb.first.seq {
		// Back out anything before the saved state's first sequence as well.
		for seq := nmb.first.seq; seq < mb.first.seq; seq++ {
			// Lookup the message.
			if err := lookupAndAdjust(seq); err != nil {
				return err
			}
		}
		// Now set first for nmb.
		nmb.first = mb.first
	}
	return nil
}
// Grabs last checksum for the named block file.
// Takes into account encryption etc. Returns nil if the file can not be read
// or is too short to contain a checksum. Also refreshes mb.rbytes as a side effect.
func (mb *msgBlock) lastChecksum() []byte {
	f, err := os.Open(mb.mfn)
	if err != nil {
		return nil
	}
	defer f.Close()
	var lchk [8]byte
	if fi, _ := f.Stat(); fi != nil {
		mb.rbytes = uint64(fi.Size())
	}
	if mb.rbytes < checksumSize {
		return nil
	}
	// Encrypted?
	// Check for encryption, we do not load keys on startup anymore so might need to load them here.
	if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) {
		if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil {
			return nil
		}
	}
	if mb.bek != nil {
		// Stream cipher: must decrypt the whole block to read the tail bytes.
		if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize {
			// Recreate the key to reset the stream counter before decrypting.
			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
			if err != nil {
				return nil
			}
			mb.bek = bek
			mb.bek.XORKeyStream(buf, buf)
			copy(lchk[0:], buf[len(buf)-checksumSize:])
		}
	} else {
		// Plaintext: read the trailing checksum bytes directly.
		f.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
	}
	return lchk[:]
}
// recoverMsgs recovers all message blocks from the message directory on disk,
// rebuilding the top-level stream accounting, and cleans up orphaned key files.
// Used when no usable full stream state file was available.
func (fs *fileStore) recoverMsgs() error {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	// Check for any left over purged messages.
	// Take an I/O token to bound concurrent disk access for the directory scan.
	<-dios
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	f, err := os.Open(mdir)
	if err != nil {
		dios <- struct{}{}
		return errNotReadable
	}
	dirs, err := f.ReadDir(-1)
	f.Close()
	dios <- struct{}{}
	if err != nil {
		return errNotReadable
	}
	// Collect the numeric block indexes from the file names.
	indices := make(sort.IntSlice, 0, len(dirs))
	var index int
	for _, fi := range dirs {
		if n, err := fmt.Sscanf(fi.Name(), blkScan, &index); err == nil && n == 1 {
			indices = append(indices, index)
		}
	}
	indices.Sort()
	// Recover all of the msg blocks.
	// We now guarantee they are coming in order.
	for _, index := range indices {
		if mb, err := fs.recoverMsgBlock(uint32(index)); err == nil && mb != nil {
			// This is a truncate block with possibly no index. If the OS got shutdown
			// out from underneath of us this is possible.
			if mb.first.seq == 0 {
				mb.dirtyCloseWithRemove(true)
				fs.removeMsgBlockFromList(mb)
				continue
			}
			if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq {
				fs.state.FirstSeq = mb.first.seq
				if mb.first.ts == 0 {
					fs.state.FirstTime = time.Time{}
				} else {
					fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
				}
			}
			if mb.last.seq > fs.state.LastSeq {
				fs.state.LastSeq = mb.last.seq
				fs.state.LastTime = time.Unix(0, mb.last.ts).UTC()
			}
			fs.state.Msgs += mb.msgs
			fs.state.Bytes += mb.bytes
		} else {
			return err
		}
	}
	if len(fs.blks) > 0 {
		fs.lmb = fs.blks[len(fs.blks)-1]
	} else {
		// No blocks recovered, start a fresh one for writes.
		_, err = fs.newMsgBlockForWrite()
	}
	// Check if we encountered any lost data.
	if fs.ld != nil {
		// Remove blocks that ended up completely empty after the rebuild.
		var emptyBlks []*msgBlock
		for _, mb := range fs.blks {
			if mb.msgs == 0 && mb.rbytes == 0 {
				emptyBlks = append(emptyBlks, mb)
			}
		}
		for _, mb := range emptyBlks {
			// Need the mb lock here.
			mb.mu.Lock()
			fs.removeMsgBlock(mb)
			mb.mu.Unlock()
		}
	}
	if err != nil {
		return err
	}
	// Check for keyfiles orphans.
	// Remove any encryption key files that no longer have a matching block.
	if kms, err := filepath.Glob(filepath.Join(mdir, keyScanAll)); err == nil && len(kms) > 0 {
		valid := make(map[uint32]bool)
		for _, mb := range fs.blks {
			valid[mb.index] = true
		}
		for _, fn := range kms {
			var index uint32
			shouldRemove := true
			if n, err := fmt.Sscanf(filepath.Base(fn), keyScan, &index); err == nil && n == 1 && valid[index] {
				shouldRemove = false
			}
			if shouldRemove {
				os.Remove(fn)
			}
		}
	}
	return nil
}
// Will expire msgs that have aged out on restart.
// We will treat this differently in case we have a recovery
// that will expire a lot of messages on startup.
// Should only be called on startup.
func (fs *fileStore) expireMsgsOnRecover() {
	if fs.state.Msgs == 0 {
		return
	}

	var minAge = time.Now().UnixNano() - int64(fs.cfg.MaxAge)
	var purged, bytes uint64
	var deleted int
	var nts int64

	// If we expire all make sure to write out a tombstone. Need to be done by hand here,
	// usually taken care of by fs.removeMsgBlock() but we do not call that here.
	var last msgId

	// deleteEmptyBlock removes an entirely-expired block, cleaning up the
	// per-subject accounting it contributed to the global psim state.
	deleteEmptyBlock := func(mb *msgBlock) {
		// If we are the last keep state to remember first/last sequence.
		// Do this part by hand since not deleting one by one.
		if mb == fs.lmb {
			last = mb.last
		}
		// Make sure we do subject cleanup as well.
		// NOTE: do this exactly once per block; removePerSubject decrements
		// global psim totals, so running it twice would over-count removals.
		mb.ensurePerSubjectInfoLoaded()
		for subj := range mb.fss {
			fs.removePerSubject(subj)
		}
		mb.dirtyCloseWithRemove(true)
		deleted++
	}

	// Blocks are ordered by sequence (and hence time), so we can stop at the
	// first block whose first message is still within the age limit.
	for _, mb := range fs.blks {
		mb.mu.Lock()
		if minAge < mb.first.ts {
			nts = mb.first.ts
			mb.mu.Unlock()
			break
		}
		// Can we remove whole block here?
		if mb.last.ts <= minAge {
			purged += mb.msgs
			bytes += mb.bytes
			deleteEmptyBlock(mb)
			mb.mu.Unlock()
			continue
		}
		// If we are here we have to process the interior messages of this blk.
		if err := mb.loadMsgsWithLock(); err != nil {
			mb.mu.Unlock()
			break
		}
		var smv StoreMsg
		var needNextFirst bool
		// Walk messages and remove if expired.
		mb.ensurePerSubjectInfoLoaded()
		for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
			sm, err := mb.cacheLookup(seq, &smv)
			// Process interior deleted msgs.
			if err == errDeletedMsg {
				// Update dmap.
				if mb.dmap.Exists(seq) {
					mb.dmap.Delete(seq)
				}
				// Keep this updated just in case since we are removing dmap entries.
				mb.first.seq, needNextFirst = seq, true
				continue
			}
			// Break on other errors.
			if err != nil || sm == nil {
				mb.first.seq, needNextFirst = seq, true
				break
			}
			// No error and sm != nil from here onward.
			// Check for done.
			if minAge < sm.ts {
				mb.first.seq, needNextFirst = sm.seq, false
				mb.first.ts = sm.ts
				nts = sm.ts
				break
			}
			// Delete the message here.
			if mb.msgs > 0 {
				mb.first.seq, needNextFirst = seq, true
				sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
				// Guard against underflow on inconsistent accounting.
				if sz > mb.bytes {
					sz = mb.bytes
				}
				mb.bytes -= sz
				bytes += sz
				mb.msgs--
				purged++
			}
			// Update fss (loaded above) and global subject accounting.
			mb.removeSeqPerSubject(sm.subj, seq)
			fs.removePerSubject(sm.subj)
		}

		// Make sure we have a proper next first sequence.
		if needNextFirst {
			mb.selectNextFirst()
		}
		// Check if empty after processing, could happen if tail of messages are all deleted.
		if mb.msgs == 0 {
			deleteEmptyBlock(mb)
		}
		mb.mu.Unlock()
		break
	}

	if nts > 0 {
		// Make sure to set age check based on this value.
		fs.resetAgeChk(nts - minAge)
	}

	if deleted > 0 {
		// Update block map. Deleted blocks are always a prefix of fs.blks
		// since we stop at the first partially-retained block.
		if fs.bim != nil {
			for _, mb := range fs.blks[:deleted] {
				delete(fs.bim, mb.index)
			}
		}
		// Update blks slice.
		fs.blks = copyMsgBlocks(fs.blks[deleted:])
		if lb := len(fs.blks); lb == 0 {
			fs.lmb = nil
		} else {
			fs.lmb = fs.blks[lb-1]
		}
	}
	// Update top level accounting.
	if purged < fs.state.Msgs {
		fs.state.Msgs -= purged
	} else {
		fs.state.Msgs = 0
	}
	if bytes < fs.state.Bytes {
		fs.state.Bytes -= bytes
	} else {
		fs.state.Bytes = 0
	}
	// Make sure to we properly set the fs first sequence and timestamp.
	fs.selectNextFirst()
	// Check if we have no messages and blocks left.
	if fs.lmb == nil && last.seq != 0 {
		// Remember the last sequence/timestamp via a tombstone in a fresh block.
		if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil {
			lmb.writeTombstone(last.seq, last.ts)
		}
		// Clear any global subject state.
		fs.psim = make(map[string]*psi)
	}
	// If we purged anything, make sure we kick flush state loop.
	if purged > 0 {
		fs.dirty++
		fs.kickFlushStateLoop()
	}
}
// copyMsgBlocks returns a shallow copy of src, preserving nil-ness of the input.
func copyMsgBlocks(src []*msgBlock) []*msgBlock {
	if src == nil {
		return nil
	}
	return append(make([]*msgBlock, 0, len(src)), src...)
}
// GetSeqFromTime looks for the first sequence number that has
// the message with >= timestamp.
// FIXME(dlc) - inefficient, and dumb really. Make this better.
func (fs *fileStore) GetSeqFromTime(t time.Time) uint64 {
	fs.mu.RLock()
	closed, lastSeq := fs.closed, fs.state.LastSeq
	fs.mu.RUnlock()

	if closed {
		return 0
	}

	mb := fs.selectMsgBlockForStart(t)
	if mb == nil {
		// Nothing at or after t, point past the end.
		return lastSeq + 1
	}

	mb.mu.RLock()
	first, last := mb.first.seq, mb.last.seq
	mb.mu.RUnlock()

	// Linear search, hence the dumb part..
	target := t.UnixNano()
	var smv StoreMsg
	for seq := first; seq <= last; seq++ {
		if sm, _, _ := mb.fetchMsg(seq, &smv); sm != nil && sm.ts >= target {
			return sm.seq
		}
	}
	return 0
}
// Find the first matching message for the given filter at or after start.
// Returns the message, whether the cache may be safely expired (we hit the
// block's last message and it was the last one loaded), and an error.
func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	fseq, isAll, subs := start, filter == _EMPTY_ || filter == fwcs, []string{filter}
	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return nil, false, err
		}
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			return nil, false, err
		}
	}
	// If we only have 1 subject currently and it matches our filter we can also set isAll.
	if !isAll && len(mb.fss) == 1 {
		_, isAll = mb.fss[filter]
	}
	// Skip scan of mb.fss if number of messages in the block are less than
	// 1/2 the number of subjects in mb.fss. Or we have a wc and lots of fss entries.
	const linearScanMaxFSS = 32
	doLinearScan := isAll || 2*int(mb.last.seq-start) < len(mb.fss) || (wc && len(mb.fss) > linearScanMaxFSS)
	if !doLinearScan {
		// If we have a wildcard match against all tracked subjects we know about.
		if wc {
			subs = subs[:0]
			for subj := range mb.fss {
				if subjectIsSubsetMatch(subj, filter) {
					subs = append(subs, subj)
				}
			}
		}
		// Use the per-subject state to compute the lowest candidate sequence >= start.
		fseq = mb.last.seq + 1
		for _, subj := range subs {
			ss := mb.fss[subj]
			if ss != nil && ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			if ss == nil || start > ss.Last || ss.First >= fseq {
				continue
			}
			if ss.First < start {
				fseq = start
			} else {
				fseq = ss.First
			}
		}
	}
	if fseq > mb.last.seq {
		return nil, false, ErrStoreMsgNotFound
	}
	// Caller may supply scratch storage to avoid an allocation.
	if sm == nil {
		sm = new(StoreMsg)
	}
	for seq := fseq; seq <= mb.last.seq; seq++ {
		// Remember llseq so a non-matching lookup does not disturb cache expiry tracking.
		llseq := mb.llseq
		fsm, err := mb.cacheLookup(seq, sm)
		if err != nil {
			continue
		}
		expireOk := seq == mb.last.seq && mb.llseq == seq
		if isAll {
			return fsm, expireOk, nil
		}
		if doLinearScan {
			if wc && subjectIsSubsetMatch(fsm.subj, filter) {
				return fsm, expireOk, nil
			} else if !wc && fsm.subj == filter {
				return fsm, expireOk, nil
			}
		} else {
			for _, subj := range subs {
				if fsm.subj == subj {
					return fsm, expireOk, nil
				}
			}
		}
		// If we are here we did not match, so put the llseq back.
		mb.llseq = llseq
	}
	return nil, false, ErrStoreMsgNotFound
}
// This will traverse a message block and generate the filtered pending.
// Acquires the block lock and delegates to filteredPendingLocked.
func (mb *msgBlock) filteredPending(subj string, wc bool, seq uint64) (total, first, last uint64) {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	total, first, last = mb.filteredPendingLocked(subj, wc, seq)
	return total, first, last
}
// This will traverse a message block and generate the filtered pending.
// Returns the total matching messages and the first/last matching sequences.
// Lock should be held.
func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (total, first, last uint64) {
	isAll := filter == _EMPTY_ || filter == fwcs

	// First check if we can optimize this part.
	// This means we want all and the starting sequence was before this block.
	if isAll && sseq <= mb.first.seq {
		return mb.msgs, mb.first.seq, mb.last.seq
	}

	// Fold a per-subject SimpleState into the running totals.
	update := func(ss *SimpleState) {
		total += ss.Msgs
		if first == 0 || ss.First < first {
			first = ss.First
		}
		if ss.Last > last {
			last = ss.Last
		}
	}

	// Make sure we have fss loaded.
	mb.ensurePerSubjectInfoLoaded()

	tsa := [32]string{}
	fsa := [32]string{}
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)

	// 1. See if we match any subs from fss.
	// 2. If we match and the sseq is past ss.Last then we can use meta only.
	// 3. If we match and we need to do a partial, break and clear any totals and do a full scan like num pending.
	isMatch := func(subj string) bool {
		if !wc {
			return subj == filter
		}
		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
		return isSubsetMatchTokenized(tts, fts)
	}

	var havePartial bool
	for subj, ss := range mb.fss {
		if isAll || isMatch(subj) {
			if ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			if sseq <= ss.First {
				update(ss)
			} else if sseq <= ss.Last {
				// We matched but its a partial.
				havePartial = true
				break
			}
		}
	}

	// If we did not encounter any partials we can return here.
	if !havePartial {
		return total, first, last
	}

	// If we are here we need to scan the msgs.
	// Clear what we had.
	total, first, last = 0, 0, 0

	// If we load the cache for a linear scan we want to expire that cache upon exit.
	var shouldExpire bool
	if mb.cacheNotLoaded() {
		mb.loadMsgsWithLock()
		shouldExpire = true
	}

	var smv StoreMsg
	for seq := sseq; seq <= mb.last.seq; seq++ {
		sm, _ := mb.cacheLookup(seq, &smv)
		if sm == nil {
			continue
		}
		if isAll || isMatch(sm.subj) {
			total++
			if first == 0 || seq < first {
				first = seq
			}
			if seq > last {
				last = seq
			}
		}
	}
	// If we loaded this block for this operation go ahead and expire it here.
	if shouldExpire {
		mb.tryForceExpireCacheLocked()
	}

	return total, first, last
}
// FilteredState will return the SimpleState associated with the filtered subject and a proposed starting sequence.
func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	lseq := fs.state.LastSeq
	// Clamp the starting sequence up to our first sequence.
	if sseq < fs.state.FirstSeq {
		sseq = fs.state.FirstSeq
	}

	// Returned state.
	var ss SimpleState

	// If past the end no results.
	if sseq > lseq {
		return ss
	}

	// If we want all msgs that match we can shortcircuit.
	// TODO(dlc) - This can be extended for all cases but would
	// need to be careful on total msgs calculations etc.
	if sseq == fs.state.FirstSeq {
		fs.numFilteredPending(subj, &ss)
	} else {
		wc := subjectHasWildcard(subj)
		// Tracking subject state.
		// TODO(dlc) - Optimize for 2.10 with avl tree and no atomics per block.
		for _, mb := range fs.blks {
			// Skip blocks that are less than our starting sequence.
			if sseq > atomic.LoadUint64(&mb.last.seq) {
				continue
			}
			t, f, l := mb.filteredPending(subj, wc, sseq)
			ss.Msgs += t
			if ss.First == 0 || (f > 0 && f < ss.First) {
				ss.First = f
			}
			if l > ss.Last {
				ss.Last = l
			}
		}
	}

	return ss
}
// Optimized way for getting all num pending matching a filter subject.
// Fills in ss with the total, first and last matching sequences.
// Lock should be held.
func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) {
	// If the filter matches everything we can answer directly from top level state.
	if filter == _EMPTY_ || filter == fwcs {
		ss.First = fs.state.FirstSeq
		ss.Last = fs.state.LastSeq
		ss.Msgs = fs.state.Msgs
		return
	}

	tsa := [32]string{}
	fsa := [32]string{}
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)

	// Walk the per-subject info, accumulating totals for matching subjects and
	// tracking the lowest first-block and highest last-block indexes that could
	// contain them. (No isAll check needed here; we returned above for that case.)
	start, stop := uint32(math.MaxUint32), uint32(0)
	for subj, psi := range fs.psim {
		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
		if isSubsetMatchTokenized(tts, fts) {
			ss.Msgs += psi.total
			// Keep track of start and stop indexes for this subject.
			if psi.fblk < start {
				start = psi.fblk
			}
			if psi.lblk > stop {
				stop = psi.lblk
			}
		}
	}

	// We now need to figure out the first and last matching sequences.
	wc := subjectHasWildcard(filter)
	// Do start
	mb := fs.bim[start]
	if mb != nil {
		_, f, _ := mb.filteredPending(filter, wc, 0)
		ss.First = f
	}
	if ss.First == 0 {
		// This is a miss. This can happen since psi.fblk is lazy, but should be very rare.
		for i := start + 1; i <= stop; i++ {
			mb := fs.bim[i]
			if mb == nil {
				continue
			}
			if _, f, _ := mb.filteredPending(filter, wc, 0); f > 0 {
				ss.First = f
				break
			}
		}
	}
	// Now last
	if mb = fs.bim[stop]; mb != nil {
		_, _, l := mb.filteredPending(filter, wc, 0)
		ss.Last = l
	}
}
// SubjectsState returns a map of SimpleState for all matching subjects.
func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.state.Msgs == 0 {
		return nil
	}

	start, stop := fs.blks[0], fs.lmb
	// We can short circuit if not a wildcard using psim for start and stop.
	if !subjectHasWildcard(subject) {
		info := fs.psim[subject]
		if info == nil {
			return nil
		}
		// Narrow the block range to where this subject can appear.
		start, stop = fs.bim[info.fblk], fs.bim[info.lblk]
	}

	// Aggregate fss.
	fss := make(map[string]SimpleState)
	var startFound bool

	for _, mb := range fs.blks {
		// Skip forward until we reach the starting block.
		if !startFound {
			if mb != start {
				continue
			}
			startFound = true
		}

		mb.mu.Lock()
		// Make sure we have fss loaded.
		mb.ensurePerSubjectInfoLoaded()
		for subj, ss := range mb.fss {
			if subject == _EMPTY_ || subject == fwcs || subjectIsSubsetMatch(subj, subject) {
				if ss.firstNeedsUpdate {
					mb.recalculateFirstForSubj(subj, ss.First, ss)
				}
				oss := fss[subj]
				if oss.First == 0 { // New
					fss[subj] = *ss
				} else {
					// Merge here.
					oss.Last, oss.Msgs = ss.Last, oss.Msgs+ss.Msgs
					fss[subj] = oss
				}
			}
		}
		mb.mu.Unlock()

		if mb == stop {
			break
		}
	}

	return fss
}
// NumPending will return the number of pending messages matching the filter subject starting at sequence.
// Optimized for stream num pending calculations for consumers.
// When lastPerSubject is set, counts only the last message per matching subject.
func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) (total, validThrough uint64) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	// This can always be last for these purposes.
	validThrough = fs.state.LastSeq

	if fs.state.Msgs == 0 || sseq > fs.state.LastSeq {
		return 0, validThrough
	}

	// Track starting for both block for the sseq and starting block that matches any subject.
	var seqStart, subjStart int

	// See if we need to figure out starting block per sseq.
	if sseq > fs.state.FirstSeq {
		// This should not, but can return -1, so make sure we check to avoid panic below.
		if seqStart, _ = fs.selectMsgBlockWithIndex(sseq); seqStart < 0 {
			seqStart = 0
		}
	}

	var tsa, fsa [32]string
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
	isAll := filter == _EMPTY_ || filter == fwcs
	wc := subjectHasWildcard(filter)

	// See if filter was provided but its the only subject.
	if !isAll && !wc && len(fs.psim) == 1 && fs.psim[filter] != nil {
		isAll = true
	}
	// If we are isAll and have no deleted we can do a simpler calculation.
	if isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs {
		if sseq == 0 {
			return fs.state.Msgs, validThrough
		}
		return fs.state.LastSeq - sseq + 1, validThrough
	}

	isMatch := func(subj string) bool {
		if isAll {
			return true
		}
		if !wc {
			return subj == filter
		}
		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
		return isSubsetMatchTokenized(tts, fts)
	}

	// If we would need to scan more from the beginning, revert back to calculating directly here.
	// TODO(dlc) - Redo properly with sublists etc for subject-based filtering.
	if lastPerSubject || seqStart >= (len(fs.blks)/2) {
		// If we need to track seen for last per subject.
		var seen map[string]bool
		if lastPerSubject {
			seen = make(map[string]bool)
		}
		// Forward scan from the block containing sseq.
		for i := seqStart; i < len(fs.blks); i++ {
			mb := fs.blks[i]
			mb.mu.Lock()
			var t uint64
			if isAll && sseq <= mb.first.seq {
				// Whole block counts; use metadata only.
				if lastPerSubject {
					mb.ensurePerSubjectInfoLoaded()
					for subj := range mb.fss {
						if !seen[subj] {
							total++
							seen[subj] = true
						}
					}
				} else {
					total += mb.msgs
				}
				mb.mu.Unlock()
				continue
			}
			// If we are here we need to at least scan the subject fss.
			// Make sure we have fss loaded.
			mb.ensurePerSubjectInfoLoaded()
			var havePartial bool
			for subj, ss := range mb.fss {
				if !seen[subj] && isMatch(subj) {
					if lastPerSubject {
						// Can't have a partials with last by subject.
						if sseq <= ss.Last {
							t++
							seen[subj] = true
						}
					} else {
						if ss.firstNeedsUpdate {
							mb.recalculateFirstForSubj(subj, ss.First, ss)
						}
						if sseq <= ss.First {
							t += ss.Msgs
						} else if sseq <= ss.Last {
							// We matched but its a partial.
							havePartial = true
							break
						}
					}
				}
			}
			// See if we need to scan msgs here.
			if havePartial {
				// Clear on partial.
				t = 0
				// If we load the cache for a linear scan we want to expire that cache upon exit.
				var shouldExpire bool
				if mb.cacheNotLoaded() {
					mb.loadMsgsWithLock()
					shouldExpire = true
				}
				var smv StoreMsg
				for seq := sseq; seq <= mb.last.seq; seq++ {
					if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && (isAll || isMatch(sm.subj)) {
						t++
					}
				}
				// If we loaded this block for this operation go ahead and expire it here.
				if shouldExpire {
					mb.tryForceExpireCacheLocked()
				}
			}
			mb.mu.Unlock()
			total += t
		}
		return total, validThrough
	}

	// If we are here its better to calculate totals from psim and adjust downward by scanning less blocks.
	// TODO(dlc) - Eventually when sublist uses generics, make this sublist driven instead.
	start := uint32(math.MaxUint32)
	for subj, psi := range fs.psim {
		if isMatch(subj) {
			if lastPerSubject {
				total++
				// Keep track of start index for this subject.
				// Use last block in this case.
				if psi.lblk < start {
					start = psi.lblk
				}
			} else {
				total += psi.total
				// Keep track of start index for this subject.
				if psi.fblk < start {
					start = psi.fblk
				}
			}
		}
	}
	// See if we were asked for all, if so we are done.
	if sseq <= fs.state.FirstSeq {
		return total, validThrough
	}

	// If we are here we need to calculate partials for the first blocks.
	subjStart = int(start)
	firstSubjBlk := fs.bim[uint32(subjStart)]
	var firstSubjBlkFound bool
	var smv StoreMsg

	// Adjust in case not found.
	if firstSubjBlk == nil {
		firstSubjBlkFound = true
	}

	// Track how many we need to adjust against the total.
	var adjust uint64

	for i := 0; i <= seqStart; i++ {
		mb := fs.blks[i]

		// We can skip blks if we know they are below the first one that has any subject matches.
		if !firstSubjBlkFound {
			if mb == firstSubjBlk {
				firstSubjBlkFound = true
			} else {
				continue
			}
		}

		// We need to scan this block.
		var shouldExpire bool
		mb.mu.Lock()
		// Check if we should include all of this block in adjusting. If so work with metadata.
		if sseq > mb.last.seq {
			if isAll && !lastPerSubject {
				adjust += mb.msgs
			} else {
				// We need to adjust for all matches in this block.
				// We will scan fss state vs messages themselves.
				// Make sure we have fss loaded.
				mb.ensurePerSubjectInfoLoaded()
				for subj, ss := range mb.fss {
					if isMatch(subj) {
						if lastPerSubject {
							adjust++
						} else {
							adjust += ss.Msgs
						}
					}
				}
			}
		} else {
			// This is the last block. We need to scan per message here.
			if mb.cacheNotLoaded() {
				if err := mb.loadMsgsWithLock(); err != nil {
					mb.mu.Unlock()
					return 0, 0
				}
				shouldExpire = true
			}
			var last = mb.last.seq
			if sseq < last {
				last = sseq
			}
			for seq := mb.first.seq; seq < last; seq++ {
				sm, _ := mb.cacheLookup(seq, &smv)
				if sm == nil {
					continue
				}
				// Check if it matches our filter.
				if isMatch(sm.subj) && sm.seq < sseq {
					adjust++
				}
			}
		}
		// If we loaded the block try to force expire.
		if shouldExpire {
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()
	}
	// Make final adjustment.
	total -= adjust

	return total, validThrough
}
// SubjectsTotals returns the message total per subject for all subjects
// matching the given filter. An empty filter or ">" matches everything.
func (fs *fileStore) SubjectsTotals(filter string) map[string]uint64 {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if len(fs.psim) == 0 {
		return nil
	}

	matchAll := filter == _EMPTY_ || filter == fwcs
	hasWC := subjectHasWildcard(filter)

	// Scratch token storage for tokenized matching.
	var fsa, tsa [32]string
	fTokens := tokenizeSubjectIntoSlice(fsa[:0], filter)

	matches := func(subj string) bool {
		// Literal filters can be compared directly.
		if !hasWC {
			return subj == filter
		}
		return isSubsetMatchTokenized(tokenizeSubjectIntoSlice(tsa[:0], subj), fTokens)
	}

	totals := make(map[string]uint64)
	for subj, psi := range fs.psim {
		if matchAll || matches(subj) {
			totals[subj] = psi.total
		}
	}
	return totals
}
// RegisterStorageUpdates registers a callback for updates to storage changes.
// It will present number of messages and bytes as a signed integer and an
// optional sequence number of the message if a single.
func (fs *fileStore) RegisterStorageUpdates(cb StorageUpdateHandler) {
	// Install the callback and snapshot current byte usage under the lock.
	fs.mu.Lock()
	fs.scb = cb
	totalBytes := fs.state.Bytes
	fs.mu.Unlock()

	// Report existing usage once, outside the lock, so the callback may
	// safely call back into the store.
	if cb == nil || totalBytes == 0 {
		return
	}
	cb(0, int64(totalBytes), 0, _EMPTY_)
}
// Helper to get hash key for specific message block.
// Key is "<stream name>-<block index>".
// Lock should be held
func (fs *fileStore) hashKeyForBlock(index uint32) []byte {
	return fmt.Appendf(nil, "%s-%d", fs.cfg.Name, index)
}
// setupWriteCache installs a write cache on this block, optionally reusing
// a recycled buffer. No-op if a cache is already present.
// Lock should be held.
func (mb *msgBlock) setupWriteCache(buf []byte) {
	// Nothing to do if a cache is already in place.
	if mb.cache != nil {
		return
	}

	// Setup simple cache.
	mb.cache = &cache{buf: buf}

	// Seed the cache offset from any data already on disk so appends
	// land at the right position.
	var fi os.FileInfo
	switch {
	case mb.mfd != nil:
		fi, _ = mb.mfd.Stat()
	case mb.mfn != _EMPTY_:
		fi, _ = os.Stat(mb.mfn)
	}
	if fi != nil {
		mb.cache.off = int(fi.Size())
	}

	mb.llts = time.Now().UnixNano()
	mb.startCacheExpireTimer()
}
// This rolls to a new append msg block.
// Creates the backing file, sets up the write cache (possibly reusing the
// previous last block's buffer), generates per-block encryption keys when
// encryption is enabled, and registers the block as the new last block.
// Lock should be held.
func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) {
	index := uint32(1)
	var rbuf []byte

	if lmb := fs.lmb; lmb != nil {
		index = lmb.index + 1
		// Determine if we can reclaim any resources here.
		if fs.fip {
			lmb.mu.Lock()
			lmb.closeFDsLocked()
			if lmb.cache != nil {
				// Reset write timestamp and see if we can expire this cache.
				// On success rbuf holds the old block's buffer for reuse below.
				rbuf = lmb.tryExpireWriteCache()
			}
			lmb.mu.Unlock()
		}
	}

	mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects(), syncAlways: fs.fcfg.SyncAlways}

	// Lock should be held to quiet race detector.
	mb.mu.Lock()
	mb.setupWriteCache(rbuf)
	mb.fss = make(map[string]*SimpleState)

	// Set cache time to creation time to start.
	ts := time.Now().UnixNano()
	mb.llts, mb.lwts = 0, ts
	// Remember our last sequence number.
	// Note first = last+1 marks the block empty until a message is stored.
	mb.first.seq = fs.state.LastSeq + 1
	mb.last.seq = fs.state.LastSeq
	mb.mu.Unlock()

	// Now do local hash.
	key := sha256.Sum256(fs.hashKeyForBlock(index))
	hh, err := highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}
	mb.hh = hh

	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, mb.index))
	mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
	if err != nil {
		mb.dirtyCloseWithRemove(true)
		return nil, fmt.Errorf("Error creating msg block file [%q]: %v", mb.mfn, err)
	}
	mb.mfd = mfd

	// Check if encryption is enabled.
	if fs.prf != nil {
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			return nil, err
		}
	}

	// If we know we will need this so go ahead and spin up.
	if !fs.fip {
		mb.spinUpFlushLoop()
	}

	// Add to our list of blocks and mark as last.
	fs.addMsgBlock(mb)

	// Kick the state flusher if we have pending dirty state.
	if fs.dirty > 0 {
		fs.kickFlushStateLoop()
	}

	return mb, nil
}
// Generate the keys for this message block and write them out.
// Populates mb.aek/bek/seed/nonce and persists the encrypted key file.
func (fs *fileStore) genEncryptionKeysForBlock(mb *msgBlock) error {
	if mb == nil {
		return nil
	}

	key, bek, seed, encrypted, err := fs.genEncryptionKeys(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))
	if err != nil {
		return err
	}
	mb.aek, mb.bek, mb.seed, mb.nonce = key, bek, seed, encrypted[:key.NonceSize()]

	// Persist the encrypted key material alongside the block file.
	keyFile := filepath.Join(fs.fcfg.StoreDir, msgDir, fmt.Sprintf(keyScan, mb.index))
	if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
		return err
	}
	if err := os.WriteFile(keyFile, encrypted, defaultFilePerms); err != nil {
		return err
	}
	mb.kfn = keyFile
	return nil
}
// Stores a raw message with expected sequence number and timestamp.
// Handles discard-new policy checks up front, writes the record, updates
// global and per-subject accounting, then enforces per-subject, total-msg
// and total-byte limits.
// Lock should be held.
func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) (err error) {
	if fs.closed {
		return ErrStoreClosed
	}

	// Per subject max check needed.
	mmp := uint64(fs.cfg.MaxMsgsPer)
	var psmc uint64
	psmax := mmp > 0 && len(subj) > 0
	if psmax {
		if info, ok := fs.psim[subj]; ok {
			psmc = info.total
		}
	}

	var fseq uint64
	// Check if we are discarding new messages when we reach the limit.
	if fs.cfg.Discard == DiscardNew {
		var asl bool
		if psmax && psmc >= mmp {
			// If we are instructed to discard new per subject, this is an error.
			if fs.cfg.DiscardNewPer {
				return ErrMaxMsgsPerSubject
			}
			// asl: at subject limit, so an old message for this subject
			// will be displaced rather than rejecting the new one.
			if fseq, err = fs.firstSeqForSubj(subj); err != nil {
				return err
			}
			asl = true
		}
		if fs.cfg.MaxMsgs > 0 && fs.state.Msgs >= uint64(fs.cfg.MaxMsgs) && !asl {
			return ErrMaxMsgs
		}
		if fs.cfg.MaxBytes > 0 && fs.state.Bytes+uint64(len(msg)+len(hdr)) >= uint64(fs.cfg.MaxBytes) {
			// Only allowed if removing the displaced message frees enough room.
			if !asl || fs.sizeForSeq(fseq) <= len(msg)+len(hdr) {
				return ErrMaxBytes
			}
		}
	}

	// Check sequence. seq == 0 means "assign next".
	if seq != fs.state.LastSeq+1 {
		if seq > 0 {
			return ErrSequenceMismatch
		}
		seq = fs.state.LastSeq + 1
	}

	// Write msg record.
	n, err := fs.writeMsgRecord(seq, ts, subj, hdr, msg)
	if err != nil {
		return err
	}

	// Adjust top level tracking of per subject msg counts.
	if len(subj) > 0 {
		index := fs.lmb.index
		if info, ok := fs.psim[subj]; ok {
			info.total++
			if index > info.lblk {
				info.lblk = index
			}
		} else {
			fs.psim[subj] = &psi{total: 1, fblk: index, lblk: index}
		}
	}

	// Adjust first if needed.
	now := time.Unix(0, ts).UTC()
	if fs.state.Msgs == 0 {
		fs.state.FirstSeq = seq
		fs.state.FirstTime = now
	}

	fs.state.Msgs++
	fs.state.Bytes += n
	fs.state.LastSeq = seq
	fs.state.LastTime = now

	// Enforce per message limits.
	// We snapshotted psmc before our actual write, so >= comparison needed.
	if psmax && psmc >= mmp {
		// We may have done this above.
		if fseq == 0 {
			fseq, _ = fs.firstSeqForSubj(subj)
		}
		if ok, _ := fs.removeMsgViaLimits(fseq); ok {
			// Make sure we are below the limit.
			if psmc--; psmc >= mmp {
				// Keep dropping the oldest message for this subject until under limit.
				for info, ok := fs.psim[subj]; ok && info.total > mmp; info, ok = fs.psim[subj] {
					if seq, _ := fs.firstSeqForSubj(subj); seq > 0 {
						if ok, _ := fs.removeMsgViaLimits(seq); !ok {
							break
						}
					} else {
						break
					}
				}
			}
		} else if mb := fs.selectMsgBlock(fseq); mb != nil {
			// If we are here we could not remove fseq from above, so rebuild.
			var ld *LostStreamData
			if ld, _, _ = mb.rebuildState(); ld != nil {
				fs.rebuildStateLocked(ld)
			}
		}
	}

	// Limits checks and enforcement.
	// If they do any deletions they will update the
	// byte count on their own, so no need to compensate.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Check if we have and need the age expiration timer running.
	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
		fs.startAgeChk()
	}

	return nil
}
// StoreRawMsg stores a raw message with expected sequence number and timestamp.
func (fs *fileStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) error {
	fs.mu.Lock()
	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
	cb := fs.scb
	// Check if first message timestamp requires expiry
	// sooner than initial replica expiry timer set to MaxAge when initializing.
	if ts > 0 && !fs.receivedAny && fs.cfg.MaxAge != 0 {
		fs.receivedAny = true
		// Don't block here by calling expireMsgs directly.
		// Instead, set a short timeout.
		fs.resetAgeChk(int64(50 * time.Millisecond))
	}
	fs.mu.Unlock()

	// Invoke any storage-update callback outside the lock.
	if err != nil || cb == nil {
		return err
	}
	cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
	return nil
}
// StoreMsg stores a message, assigning it the next sequence number and the
// current time. We hold the main filestore lock for any write operation.
func (fs *fileStore) StoreMsg(subj string, hdr, msg []byte) (uint64, int64, error) {
	fs.mu.Lock()
	seq, ts := fs.state.LastSeq+1, time.Now().UnixNano()
	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
	cb := fs.scb
	fs.mu.Unlock()

	if err != nil {
		return 0, 0, err
	}
	// Report the storage update outside the lock.
	if cb != nil {
		cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
	}
	return seq, ts, nil
}
// skipMsg will update this message block for a skipped message.
// If we do not have any messages, just update the metadata, otherwise
// we will place an empty record marking the sequence as used. The
// sequence will be marked erased.
// fs lock should be held.
func (mb *msgBlock) skipMsg(seq uint64, now time.Time) {
	if mb == nil {
		return
	}
	var needsRecord bool

	nowts := now.UnixNano()

	mb.mu.Lock()
	// If we are empty can just do meta.
	if mb.msgs == 0 {
		mb.last.seq = seq
		mb.last.ts = nowts
		mb.first.seq = seq + 1
		mb.first.ts = nowts
	} else {
		needsRecord = true
		// Track the skipped sequence as deleted.
		mb.dmap.Insert(seq)
	}
	mb.mu.Unlock()

	if needsRecord {
		// Write an empty record with the erase bit set. Done after
		// releasing the mb lock; writeMsgRecord manages its own locking.
		mb.writeMsgRecord(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, nowts, true)
	} else {
		mb.kickFlusher()
	}
}
// SkipMsg will use the next sequence number but not store anything.
func (fs *fileStore) SkipMsg() uint64 {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Grab time and advance last seq.
	now := time.Now().UTC()
	seq := fs.state.LastSeq + 1
	fs.state.LastSeq, fs.state.LastTime = seq, now

	// With no messages first tracks the skipped seq; if the skipped seq is
	// the current first, move first past it.
	if fs.state.Msgs == 0 {
		fs.state.FirstSeq, fs.state.FirstTime = seq, now
	}
	if seq == fs.state.FirstSeq {
		fs.state.FirstSeq, fs.state.FirstTime = seq+1, now
	}

	fs.lmb.skipMsg(seq, now)
	return seq
}
// rebuildFirst rebuilds state from the first message block, removing it if
// it turns out to be empty, then re-selects the store's first sequence.
// Lock should be held.
func (fs *fileStore) rebuildFirst() {
	if len(fs.blks) == 0 {
		return
	}
	first := fs.blks[0]
	if first == nil {
		return
	}

	ld, _, _ := first.rebuildState()

	first.mu.RLock()
	empty := first.msgs == 0
	first.mu.RUnlock()

	if empty {
		first.mu.Lock()
		fs.removeMsgBlock(first)
		first.mu.Unlock()
	}

	fs.selectNextFirst()
	fs.rebuildStateLocked(ld)
}
// Optimized helper function to return first sequence.
// subj will always be publish subject here, meaning non-wildcard.
// We assume a fast check that this subj even exists already happened.
// Scans blocks from the subject's tracked first block to its last,
// repairing the tracked first-block index if it was stale.
// Lock should be held.
func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) {
	if len(fs.blks) == 0 {
		return 0, nil
	}

	// See if we can optimize where we start.
	start, stop := fs.blks[0].index, fs.lmb.index
	if info, ok := fs.psim[subj]; ok {
		start, stop = info.fblk, info.lblk
	}

	for i := start; i <= stop; i++ {
		mb := fs.bim[i]
		if mb == nil {
			continue
		}
		mb.mu.Lock()
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			mb.mu.Unlock()
			return 0, err
		}
		if ss := mb.fss[subj]; ss != nil {
			// Adjust first if it was not where we thought it should be.
			if i != start {
				if info, ok := fs.psim[subj]; ok {
					info.fblk = i
				}
			}
			if ss.firstNeedsUpdate {
				// Lazily recompute the first sequence for this subject.
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			mb.mu.Unlock()
			return ss.First, nil
		}
		mb.mu.Unlock()
	}
	return 0, nil
}
// Will check the msg limit and drop the first message(s) as needed until
// we are back under the configured maximum.
// Lock should be held.
func (fs *fileStore) enforceMsgLimit() {
	if fs.cfg.MaxMsgs <= 0 || fs.state.Msgs <= uint64(fs.cfg.MaxMsgs) {
		return
	}
	for fs.state.Msgs > uint64(fs.cfg.MaxMsgs) {
		removed, err := fs.deleteFirstMsg()
		if err != nil || !removed {
			// Could not remove cleanly, fall back to a rebuild.
			fs.rebuildFirst()
			return
		}
	}
}
// Will check the bytes limit and drop the first message(s) as needed until
// we are back under the configured maximum.
// Lock should be held.
func (fs *fileStore) enforceBytesLimit() {
	if fs.cfg.MaxBytes <= 0 || fs.state.Bytes <= uint64(fs.cfg.MaxBytes) {
		return
	}
	for fs.state.Bytes > uint64(fs.cfg.MaxBytes) {
		removed, err := fs.deleteFirstMsg()
		if err != nil || !removed {
			// Could not remove cleanly, fall back to a rebuild.
			fs.rebuildFirst()
			return
		}
	}
}
// Will make sure we have limits honored for max msgs per subject on recovery or config update.
// We will make sure to go through all msg blocks etc. but in practice this
// will most likely only be the last one, so can take a more conservative approach.
// Lock should be held.
func (fs *fileStore) enforceMsgPerSubjectLimit() {
	maxMsgsPer := uint64(fs.cfg.MaxMsgsPer)

	// We want to suppress callbacks from remove during this process
	// since these should have already been deleted and accounted for.
	cb := fs.scb
	fs.scb = nil
	defer func() { fs.scb = cb }()

	var numMsgs uint64

	// collect all that are not correct.
	needAttention := make(map[string]*psi)
	for subj, psi := range fs.psim {
		numMsgs += psi.total
		if psi.total > maxMsgsPer {
			needAttention[subj] = psi
		}
	}

	// We had an issue with a use case where psim (and hence fss) were correct but idx was not and was not properly being caught.
	// So do a quick sanity check here. If we detect a skew do a rebuild then re-check.
	if numMsgs != fs.state.Msgs {
		fs.warn("Detected skew in subject-based total (%d) vs raw total (%d), rebuilding", numMsgs, fs.state.Msgs)
		// Clear any global subject state.
		fs.psim = make(map[string]*psi)
		for _, mb := range fs.blks {
			ld, _, err := mb.rebuildState()
			if err != nil && ld != nil {
				fs.addLostData(ld)
			}
			fs.populateGlobalPerSubjectInfo(mb)
		}
		// Rebuild fs state too.
		fs.rebuildStateLocked(nil)
		// Need to redo blocks that need attention.
		needAttention = make(map[string]*psi)
		for subj, psi := range fs.psim {
			if psi.total > maxMsgsPer {
				needAttention[subj] = psi
			}
		}
	}

	// Collect all the msgBlks we alter.
	blks := make(map[*msgBlock]struct{})

	// For re-use below.
	var sm StoreMsg

	// Walk all subjects that need attention here.
	// For each we remove oldest messages until the subject is under limit.
	for subj, info := range needAttention {
		total, start, stop := info.total, info.fblk, info.lblk

		for i := start; i <= stop; i++ {
			mb := fs.bim[i]
			if mb == nil {
				continue
			}
			// Grab the ss entry for this subject in case sparse.
			mb.mu.Lock()
			mb.ensurePerSubjectInfoLoaded()
			ss := mb.fss[subj]
			if ss != nil && ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			mb.mu.Unlock()
			if ss == nil {
				continue
			}
			for seq := ss.First; seq <= ss.Last && total > maxMsgsPer; {
				m, _, err := mb.firstMatching(subj, false, seq, &sm)
				if err == nil {
					seq = m.seq + 1
					if removed, _ := fs.removeMsgViaLimits(m.seq); removed {
						total--
						blks[mb] = struct{}{}
					}
				} else {
					// On error just do single increment.
					seq++
				}
			}
		}
	}

	// Expire the cache if we can.
	for mb := range blks {
		mb.mu.Lock()
		if mb.msgs > 0 {
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()
	}
}
// Removes the message at the store's current first sequence via the limits path.
// Lock should be held.
func (fs *fileStore) deleteFirstMsg() (bool, error) {
	return fs.removeMsgViaLimits(fs.state.FirstSeq)
}
// If we remove via limits that can always be recovered on a restart we
// do not force the system to update the index file.
// Lock should be held.
func (fs *fileStore) removeMsgViaLimits(seq uint64) (bool, error) {
	// secure=false, viaLimits=true, needFSLock=false.
	return fs.removeMsg(seq, false, true, false)
}
// RemoveMsg will remove the message from this store.
// Returns true if the message was removed.
func (fs *fileStore) RemoveMsg(seq uint64) (bool, error) {
	// secure=false, viaLimits=false, needFSLock=true.
	return fs.removeMsg(seq, false, false, true)
}
// EraseMsg removes the message and also overwrites its record on disk.
// Returns true if the message was removed.
func (fs *fileStore) EraseMsg(seq uint64) (bool, error) {
	// secure=true requests the on-disk record be overwritten.
	return fs.removeMsg(seq, true, false, true)
}
// Convenience function to remove per subject tracking at the filestore level.
// Decrements the subject's total, deleting the entry when it reaches zero.
// Lock should be held.
func (fs *fileStore) removePerSubject(subj string) {
	if len(subj) == 0 {
		return
	}
	// We do not update sense of fblk here but will do so when we resolve during lookup.
	info, ok := fs.psim[subj]
	if !ok {
		return
	}
	if info.total--; info.total == 0 {
		delete(fs.psim, subj)
	}
}
// Remove a message, optionally rewriting the mb file.
// secure requests the record be overwritten on disk, viaLimits marks a
// limit-driven removal (no tombstone written), and needFSLock indicates
// whether we must acquire the filestore lock ourselves.
func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (bool, error) {
	if seq == 0 {
		return false, ErrStoreMsgNotFound
	}

	// Conditional acquire/release of the fs lock based on needFSLock.
	fsLock := func() {
		if needFSLock {
			fs.mu.Lock()
		}
	}
	fsUnlock := func() {
		if needFSLock {
			fs.mu.Unlock()
		}
	}

	fsLock()

	if fs.closed {
		fsUnlock()
		return false, ErrStoreClosed
	}
	if !viaLimits && fs.sips > 0 {
		fsUnlock()
		return false, ErrStoreSnapshotInProgress
	}
	// If in encrypted mode negate secure rewrite here.
	if secure && fs.prf != nil {
		secure = false
	}

	if fs.state.Msgs == 0 {
		var err = ErrStoreEOF
		if seq <= fs.state.LastSeq {
			err = ErrStoreMsgNotFound
		}
		fsUnlock()
		return false, err
	}

	mb := fs.selectMsgBlock(seq)
	if mb == nil {
		var err = ErrStoreEOF
		if seq <= fs.state.LastSeq {
			err = ErrStoreMsgNotFound
		}
		fsUnlock()
		return false, err
	}

	mb.mu.Lock()

	// See if we are closed or the sequence number is still relevant.
	if mb.closed || seq < mb.first.seq {
		mb.mu.Unlock()
		fsUnlock()
		return false, nil
	}

	// Now check dmap if it is there.
	// Already-deleted sequences are not an error; report not removed.
	if mb.dmap.Exists(seq) {
		mb.mu.Unlock()
		fsUnlock()
		return false, nil
	}

	// We used to not have to load in the messages except with callbacks or the filtered subject state (which is now always on).
	// Now just load regardless.
	// TODO(dlc) - Figure out a way not to have to load it in, we need subject tracking outside main data block.
	if mb.cacheNotLoaded() {
		// We do not want to block possible activity within another msg block.
		// We have to unlock both locks and acquire the mb lock in the loadMsgs() call to avoid a deadlock if another
		// go routine was trying to get fs then this mb lock at the same time. E.g. another call to remove for same block.
		mb.mu.Unlock()
		fsUnlock()
		if err := mb.loadMsgs(); err != nil {
			return false, err
		}
		fsLock()
		// We need to check if things changed out from underneath us.
		if fs.closed {
			fsUnlock()
			return false, ErrStoreClosed
		}
		mb.mu.Lock()
		if mb.closed || seq < mb.first.seq {
			mb.mu.Unlock()
			fsUnlock()
			return false, nil
		}
		// cacheLookup below will do dmap check so no need to repeat here.
	}

	var smv StoreMsg
	sm, err := mb.cacheLookup(seq, &smv)
	if err != nil {
		mb.mu.Unlock()
		fsUnlock()
		// Mimic err behavior from above check to dmap. No error returned if already removed.
		if err == errDeletedMsg {
			err = nil
		}
		return false, err
	}
	// Grab size
	msz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)

	// Set cache timestamp for last remove.
	mb.lrts = time.Now().UnixNano()

	// Global stats. Guarded so accounting never underflows.
	if fs.state.Msgs > 0 {
		fs.state.Msgs--
	}
	if msz < fs.state.Bytes {
		fs.state.Bytes -= msz
	} else {
		fs.state.Bytes = 0
	}

	// Now local mb updates.
	if mb.msgs > 0 {
		mb.msgs--
	}
	if msz < mb.bytes {
		mb.bytes -= msz
	} else {
		mb.bytes = 0
	}

	// Mark as dirty for stream state.
	fs.dirty++

	// If we are tracking subjects here make sure we update that accounting.
	mb.ensurePerSubjectInfoLoaded()

	// If we are tracking multiple subjects here make sure we update that accounting.
	mb.removeSeqPerSubject(sm.subj, seq)
	fs.removePerSubject(sm.subj)

	if secure {
		// Grab record info.
		ri, rl, _, _ := mb.slotInfo(int(seq - mb.cache.fseq))
		mb.eraseMsg(seq, int(ri), int(rl))
	}

	fifo := seq == mb.first.seq
	isLastBlock := mb == fs.lmb
	isEmpty := mb.msgs == 0

	if fifo {
		// Removing the block's first message: advance first in place.
		mb.selectNextFirst()
		if !isEmpty {
			// Can update this one in place.
			if seq == fs.state.FirstSeq {
				fs.state.FirstSeq = mb.first.seq // new one.
				if mb.first.ts == 0 {
					fs.state.FirstTime = time.Time{}
				} else {
					fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
				}
			}
		}
	} else if !isEmpty {
		if mb.dmap.IsEmpty() {
			// Mark initial base for delete set.
			mb.dmap.SetInitialMin(mb.first.seq)
		}
		// Out of order delete.
		mb.dmap.Insert(seq)
		// Check if <25% utilization and minimum size met.
		if mb.rbytes > compactMinimum && !isLastBlock {
			// Remove the interior delete records
			rbytes := mb.rbytes - uint64(mb.dmap.Size()*emptyRecordLen)
			if rbytes>>2 > mb.bytes {
				mb.compact()
				fs.kickFlushStateLoop()
			}
		}
	}

	if secure {
		if ld, _ := mb.flushPendingMsgsLocked(); ld != nil {
			// We have the mb lock here, this needs the mb locks so do in its own go routine.
			go fs.rebuildState(ld)
		}
	}

	// If empty remove this block and check if we need to update first sequence.
	// We will write a tombstone at the end.
	var firstSeqNeedsUpdate bool
	if isEmpty {
		// This writes tombstone iff mb == lmb, so no need to do below.
		fs.removeMsgBlock(mb)
		firstSeqNeedsUpdate = seq == fs.state.FirstSeq
	}
	mb.mu.Unlock()

	// If we emptied the current message block and the seq was state.FirstSeq
	// then we need to jump message blocks. We will also write the index so
	// we don't lose track of the first sequence.
	if firstSeqNeedsUpdate {
		fs.selectNextFirst()
	}

	// Check if we need to write a deleted record tombstone.
	// This is for user initiated removes or to hold the first seq
	// when the last block is empty.

	// If not via limits and not empty and last (empty writes tombstone above if last) write tombstone.
	if !viaLimits && !(isEmpty && isLastBlock) {
		if lmb := fs.lmb; sm != nil && lmb != nil {
			lmb.writeTombstone(sm.seq, sm.ts)
		}
	}

	if cb := fs.scb; cb != nil {
		// If we have a callback registered we need to release lock regardless since cb might need it to lookup msg, etc.
		fs.mu.Unlock()
		// Storage updates.
		var subj string
		if sm != nil {
			subj = sm.subj
		}
		delta := int64(msz)
		cb(-1, -delta, seq, subj)

		// Restore the lock state the caller expects.
		if !needFSLock {
			fs.mu.Lock()
		}
	} else if needFSLock {
		// We acquired it so release it.
		fs.mu.Unlock()
	}

	return true, nil
}
// This will compact and rewrite this block. This should only be called when we know we want to rewrite this block.
// This should not be called on the lmb since we will prune tail deleted messages which could cause issues with
// writing new messages. We will silently bail on any issues with the underlying block and let someone else detect.
// Write lock needs to be held.
func (mb *msgBlock) compact() {
	wasLoaded := mb.cacheAlreadyLoaded()
	if !wasLoaded {
		if err := mb.loadMsgsWithLock(); err != nil {
			return
		}
	}

	buf := mb.cache.buf
	nbuf := make([]byte, 0, len(buf))

	var le = binary.LittleEndian
	var firstSet bool

	// A sequence is deleted if erased (ebit), below the block's first,
	// or present in the delete map.
	isDeleted := func(seq uint64) bool {
		if seq == 0 || seq&ebit != 0 || seq < mb.first.seq {
			return true
		}
		return mb.dmap.Exists(seq)
	}

	// Walk the raw records, copying only live ones into nbuf.
	for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
		if index+msgHdrSize > lbuf {
			return
		}
		hdr := buf[index : index+msgHdrSize]
		rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])
		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize
		// Do some quick sanity checks here.
		if dlen < 0 || int(slen) > dlen || dlen > int(rl) || rl > rlBadThresh || index+rl > lbuf {
			return
		}
		// Only need to process non-deleted messages.
		seq := le.Uint64(hdr[4:])

		if !isDeleted(seq) {
			// Check for tombstones.
			if seq&tbit != 0 {
				// If we are last mb we should consider to keep these unless the tombstone reflects a seq in this mb.
				if mb == mb.fs.lmb && seq < mb.first.seq {
					nbuf = append(nbuf, buf[index:index+rl]...)
				}
			} else {
				// Normal message here.
				nbuf = append(nbuf, buf[index:index+rl]...)
				if !firstSet {
					firstSet = true
					mb.first.seq = seq
				}
			}
		}
		// Always set last as long as not a tombstone.
		if seq&tbit == 0 {
			mb.last.seq = seq &^ ebit
		}
		// Advance to next record.
		index += rl
	}

	// Check for encryption.
	if mb.bek != nil && len(nbuf) > 0 {
		// Recreate to reset counter.
		rbek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return
		}
		rbek.XORKeyStream(nbuf, nbuf)
	}

	// Close FDs first.
	mb.closeFDsLocked()

	// We will write to a new file and mv/rename it in case of failure.
	mfn := filepath.Join(filepath.Join(mb.fs.fcfg.StoreDir, msgDir), fmt.Sprintf(newScan, mb.index))
	if err := os.WriteFile(mfn, nbuf, defaultFilePerms); err != nil {
		os.Remove(mfn)
		return
	}
	if err := os.Rename(mfn, mb.mfn); err != nil {
		os.Remove(mfn)
		return
	}

	// Remove index file and wipe delete map, then rebuild.
	mb.dmap.Empty()
	mb.rebuildStateLocked()

	// If we entered with the msgs loaded make sure to reload them.
	if wasLoaded {
		mb.loadMsgsWithLock()
	}
}
// Grab info from a slot: record index, record length and whether the hash
// was already checked for this entry.
// Lock should be held.
func (mb *msgBlock) slotInfo(slot int) (uint32, uint32, bool, error) {
	if mb.cache == nil || slot >= len(mb.cache.idx) {
		return 0, 0, false, errPartialCache
	}

	bi := mb.cache.idx[slot]

	// If this is a deleted slot return here.
	if bi == dbit {
		return 0, 0, false, errDeletedMsg
	}

	// The high bit marks the hash-checked state; the rest is the offset.
	ri := bi &^ hbit
	hashChecked := bi&hbit != 0

	// Determine record length from the next slot's offset, or the cached
	// last record length when this is the final slot.
	var rl uint32
	if slot+1 < len(mb.cache.idx) {
		rl = (mb.cache.idx[slot+1] &^ hbit) - ri
	} else {
		rl = mb.cache.lrl
	}
	if rl < msgHdrSize {
		return 0, 0, false, errBadMsg
	}
	return ri, rl, hashChecked, nil
}
// isClosed reports whether the filestore has been closed.
func (fs *fileStore) isClosed() bool {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	return fs.closed
}
// Will spin up our background flush loop for this block if not already running.
func (mb *msgBlock) spinUpFlushLoop() {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	// Are we already running or closed?
	if mb.flusher || mb.closed {
		return
	}
	mb.flusher = true
	mb.fch, mb.qch = make(chan struct{}, 1), make(chan struct{})
	go mb.flushLoop(mb.fch, mb.qch)
}
// Raw low level kicker for flush loops.
// Performs a non-blocking signal send; safe on a nil channel.
func kickFlusher(fch chan struct{}) {
	if fch == nil {
		return
	}
	select {
	case fch <- struct{}{}:
	default:
	}
}
// Kick flusher for this message block.
func (mb *msgBlock) kickFlusher() {
	mb.mu.RLock()
	fch := mb.fch
	mb.mu.RUnlock()
	kickFlusher(fch)
}
// setInFlusher marks this block's flush loop as running.
func (mb *msgBlock) setInFlusher() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.flusher = true
}
// clearInFlusher marks this block's flush loop as stopped.
func (mb *msgBlock) clearInFlusher() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.flusher = false
}
// flushLoop watches for messages, index info, or recently closed msg block updates.
// Runs as its own goroutine; fch signals pending writes, qch signals quit.
func (mb *msgBlock) flushLoop(fch, qch chan struct{}) {
	mb.setInFlusher()
	defer mb.clearInFlusher()

	for {
		select {
		case <-fch:
			// If we have pending messages process them first.
			if waiting := mb.pendingWriteSize(); waiting != 0 {
				ts := 1 * time.Millisecond
				var waited time.Duration

				// Coalesce small writes: sleep with exponential backoff
				// until we reach coalesceMinimum, exceed maxFlushWait, or
				// no new data is arriving.
				for waiting < coalesceMinimum {
					time.Sleep(ts)
					select {
					case <-qch:
						return
					default:
					}
					newWaiting := mb.pendingWriteSize()
					if waited = waited + ts; waited > maxFlushWait || newWaiting <= waiting {
						break
					}
					waiting = newWaiting
					ts *= 2
				}
				mb.flushPendingMsgs()

				// Check if we are no longer the last message block. If we are
				// not we can close FDs and exit.
				mb.fs.mu.RLock()
				notLast := mb != mb.fs.lmb
				mb.fs.mu.RUnlock()
				if notLast {
					if err := mb.closeFDs(); err == nil {
						return
					}
				}
			}
		case <-qch:
			return
		}
	}
}
// Overwrites the record for seq (at record index ri, record length rl) with
// random data, marking the sequence erased via ebit. Updates both the
// in-memory cache and the file on disk.
// Lock should be held.
func (mb *msgBlock) eraseMsg(seq uint64, ri, rl int) error {
	var le = binary.LittleEndian
	var hdr [msgHdrSize]byte

	// Header: total length, seq with erase bit, zeroed ts and subject length.
	le.PutUint32(hdr[0:], uint32(rl))
	le.PutUint64(hdr[4:], seq|ebit)
	le.PutUint64(hdr[12:], 0)
	le.PutUint16(hdr[20:], 0)

	// Randomize record
	data := make([]byte, rl-emptyRecordLen)
	rand.Read(data)

	// Now write to underlying buffer.
	var b bytes.Buffer
	b.Write(hdr[:])
	b.Write(data)

	// Calculate hash.
	mb.hh.Reset()
	mb.hh.Write(hdr[4:20])
	mb.hh.Write(data)
	checksum := mb.hh.Sum(nil)
	// Write to msg record.
	b.Write(checksum)

	// Update both cache and disk.
	nbytes := b.Bytes()

	// Cache: only if the record falls inside the cached window.
	if ri >= mb.cache.off {
		li := ri - mb.cache.off
		buf := mb.cache.buf[li : li+rl]
		copy(buf, nbytes)
	}

	// Disk: only if this record has already been flushed to the file.
	if mb.cache.off+mb.cache.wp > ri {
		mfd, err := os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
		if err != nil {
			return err
		}
		defer mfd.Close()
		if _, err = mfd.WriteAt(nbytes, int64(ri)); err == nil {
			mfd.Sync()
		}
		if err != nil {
			return err
		}
	}
	return nil
}
// Truncate this message block to the storedMsg, making sm the last message
// and purging everything after it. Returns the number of messages and bytes
// removed.
//
// Fixes vs previous version: every error return inside the compressed-block
// branch now releases mb.mu (it was previously held forever, deadlocking any
// later use of the block), and the stored last-checksum is taken from the
// trailing checksumSize bytes of the truncated block (matching the
// uncompressed branch, which reads at eof-8) instead of its first bytes.
func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) {
	// Make sure we are loaded to process messages etc.
	if err := mb.loadMsgs(); err != nil {
		return 0, 0, err
	}

	// Calculate new eof using slot info from our new last sm.
	ri, rl, _, err := mb.slotInfo(int(sm.seq - mb.cache.fseq))
	if err != nil {
		return 0, 0, err
	}
	// Calculate new eof.
	eof := int64(ri + rl)

	// For return accounting of what we removed.
	var purged, bytes uint64

	mb.mu.Lock()
	checkDmap := mb.dmap.Size() > 0
	var smv StoreMsg

	// Walk sequences backwards from last down to (but not including) sm.seq,
	// removing accounting for each message being truncated away.
	for seq := mb.last.seq; seq > sm.seq; seq-- {
		if checkDmap {
			if mb.dmap.Exists(seq) {
				// Delete and skip to next.
				mb.dmap.Delete(seq)
				checkDmap = !mb.dmap.IsEmpty()
				continue
			}
		}
		// We should have a valid msg to calculate removal stats.
		if m, err := mb.cacheLookup(seq, &smv); err == nil {
			if mb.msgs > 0 {
				rl := fileStoreMsgSize(m.subj, m.hdr, m.msg)
				mb.msgs--
				if rl > mb.bytes {
					rl = mb.bytes
				}
				mb.bytes -= rl
				mb.rbytes -= rl
				// For return accounting.
				purged++
				bytes += uint64(rl)
			}
		}
	}

	// If the block is compressed then we have to load it into memory
	// and decompress it, truncate it and then write it back out.
	// Otherwise, truncate the file itself and close the descriptor.
	if mb.cmp != NoCompression {
		buf, err := mb.loadBlock(nil)
		if err != nil {
			mb.mu.Unlock()
			return 0, 0, fmt.Errorf("failed to load block from disk: %w", err)
		}
		if mb.bek != nil && len(buf) > 0 {
			// Recreate the cipher to reset the stream counter before decrypting.
			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
			if err != nil {
				mb.mu.Unlock()
				return 0, 0, err
			}
			mb.bek = bek
			mb.bek.XORKeyStream(buf, buf)
		}
		buf, err = mb.decompressIfNeeded(buf)
		if err != nil {
			mb.mu.Unlock()
			return 0, 0, fmt.Errorf("failed to decompress block: %w", err)
		}
		buf = buf[:eof]
		// Update our checksum from the trailing bytes of the truncated block,
		// mirroring the uncompressed branch below which reads at eof-8.
		copy(mb.lchk[0:], buf[len(buf)-checksumSize:])
		buf, err = mb.cmp.Compress(buf)
		if err != nil {
			mb.mu.Unlock()
			return 0, 0, fmt.Errorf("failed to recompress block: %w", err)
		}
		meta := &CompressionInfo{
			Algorithm:    mb.cmp,
			OriginalSize: uint64(eof),
		}
		buf = append(meta.MarshalMetadata(), buf...)
		if mb.bek != nil && len(buf) > 0 {
			// Re-encrypt the rewritten block with a fresh stream counter.
			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
			if err != nil {
				mb.mu.Unlock()
				return 0, 0, err
			}
			mb.bek = bek
			mb.bek.XORKeyStream(buf, buf)
		}
		n, err := mb.writeAt(buf, 0)
		if err != nil {
			mb.mu.Unlock()
			return 0, 0, fmt.Errorf("failed to rewrite compressed block: %w", err)
		}
		if n != len(buf) {
			mb.mu.Unlock()
			return 0, 0, fmt.Errorf("short write (%d != %d)", n, len(buf))
		}
		mb.mfd.Truncate(int64(len(buf)))
		mb.mfd.Sync()
	} else if mb.mfd != nil {
		mb.mfd.Truncate(eof)
		mb.mfd.Sync()
		// Update our checksum from the last 8 bytes of the truncated file.
		var lchk [8]byte
		mb.mfd.ReadAt(lchk[:], eof-8)
		copy(mb.lchk[0:], lchk[:])
	} else {
		mb.mu.Unlock()
		return 0, 0, fmt.Errorf("failed to truncate msg block %d, file not open", mb.index)
	}

	// Update our last msg.
	mb.last.seq = sm.seq
	mb.last.ts = sm.ts

	// Clear our cache.
	mb.clearCacheAndOffset()

	// Redo per subject info for this block.
	mb.resetPerSubjectInfo()

	mb.mu.Unlock()

	// Load msgs again.
	mb.loadMsgs()

	return purged, bytes, nil
}
// isEmpty reports whether this block holds no messages, i.e. first has
// moved past last.
// Lock should be held.
func (mb *msgBlock) isEmpty() bool {
	return mb.last.seq < mb.first.seq
}
// Advance this block's first sequence past any deleted entries to the next
// live message and refresh first.ts. May temporarily release and re-acquire
// the mb lock when the message is not in the cache.
// Lock should be held.
func (mb *msgBlock) selectNextFirst() {
	var seq uint64
	for seq = mb.first.seq + 1; seq <= mb.last.seq; seq++ {
		if mb.dmap.Exists(seq) {
			// We will move past this so we can delete the entry.
			mb.dmap.Delete(seq)
		} else {
			break
		}
	}

	// Set new first sequence.
	mb.first.seq = seq

	// Check if we are empty..
	if mb.isEmpty() {
		mb.first.ts = 0
		return
	}

	// Need to get the timestamp.
	// We will try the cache direct and fallback if needed.
	var smv StoreMsg
	sm, _ := mb.cacheLookup(seq, &smv)
	if sm == nil {
		// Slow path, need to unlock.
		// NOTE: fetchMsg acquires the mb lock itself, so we must drop it here.
		mb.mu.Unlock()
		sm, _, _ = mb.fetchMsg(seq, &smv)
		mb.mu.Lock()
	}
	if sm != nil {
		mb.first.ts = sm.ts
	} else {
		mb.first.ts = 0
	}
}
// Select the next FirstSeq for the store from the first message block.
// Lock should be held.
func (fs *fileStore) selectNextFirst() {
	if len(fs.blks) == 0 {
		// Could not find anything, so treat like purge.
		fs.state.FirstSeq = fs.state.LastSeq + 1
		fs.state.FirstTime = time.Time{}
		return
	}

	mb := fs.blks[0]
	mb.mu.RLock()
	fs.state.FirstSeq = mb.first.seq
	fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
	mb.mu.RUnlock()
}
// resetCacheExpireTimer (re)arms the cache expiration timer. A zero td
// means use the block's default expiry (mb.cexp).
// Lock should be held.
func (mb *msgBlock) resetCacheExpireTimer(td time.Duration) {
	if td == 0 {
		td = mb.cexp
	}
	if mb.ctmr != nil {
		mb.ctmr.Reset(td)
		return
	}
	mb.ctmr = time.AfterFunc(td, mb.expireCache)
}
// Used when we load in a message block.
// Arms the cache expiration timer with the default expiry (mb.cexp).
// Lock should be held.
func (mb *msgBlock) startCacheExpireTimer() {
	mb.resetCacheExpireTimer(0)
}
// Used when we load in a message block.
// Resets the linear scan tracker and cache offsets, then clears the cache.
// Lock should be held.
func (mb *msgBlock) clearCacheAndOffset() {
	// Reset linear scan tracker.
	mb.llseq = 0
	if c := mb.cache; c != nil {
		c.off, c.wp = 0, 0
	}
	mb.clearCache()
}
// clearCache releases this block's cache buffer and index, stopping the
// expire timer when no per-subject state remains either.
// Lock should be held.
func (mb *msgBlock) clearCache() {
	if mb.ctmr != nil && mb.fss == nil {
		mb.ctmr.Stop()
		mb.ctmr = nil
	}
	if mb.cache == nil {
		return
	}

	buf := mb.cache.buf
	if mb.cache.off == 0 {
		// No offset in play, drop the whole cache.
		mb.cache = nil
	} else {
		// Keep the cache shell for its offset, clear msgs and index.
		mb.cache.buf, mb.cache.idx, mb.cache.wp = nil, nil, 0
	}
	recycleMsgBlockBuf(buf)
}
// expireCache is the timer callback that possibly expires a message
// block cache. It simply takes the lock and delegates.
func (mb *msgBlock) expireCache() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.expireCacheLocked()
}
// tryForceExpireCache is the locking wrapper around
// tryForceExpireCacheLocked.
func (mb *msgBlock) tryForceExpireCache() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.tryForceExpireCacheLocked()
}
// We will attempt to force expire this cache by temporarily clearing the
// last load time so expireCacheLocked sees no recent load activity.
// Lock should be held.
func (mb *msgBlock) tryForceExpireCacheLocked() {
	saved := mb.llts
	mb.llts = 0
	mb.expireCacheLocked()
	mb.llts = saved
}
// This is for expiration of the write cache, which will be partial with fip.
// So we want to bypass the Pools here.
// Lock should be held.
// Returns a zero-length slice backed by the old cache buffer if it could be
// reclaimed for reuse by the caller, otherwise nil.
func (mb *msgBlock) tryExpireWriteCache() []byte {
	if mb.cache == nil {
		return nil
	}
	// Capture state we restore after the expire attempt.
	lwts, buf, llts, nra := mb.lwts, mb.cache.buf, mb.llts, mb.cache.nra
	// Clear last write time and mark no-recycle so expireCacheLocked can
	// consider the buffer expirable without returning it to the pool.
	mb.lwts, mb.cache.nra = 0, true
	mb.expireCacheLocked()
	mb.lwts = lwts
	if mb.cache != nil {
		mb.cache.nra = nra
	}
	// We could check for a certain time since last load, but to be safe just reuse if no loads at all.
	if llts == 0 && (mb.cache == nil || mb.cache.buf == nil) {
		// Clear last write time since we now are about to move on to a new lmb.
		mb.lwts = 0
		return buf[:0]
	}
	return nil
}
// Lock should be held.
// Expires the cache buffer (and fss) when there is no pending data to flush
// and no recent read/write activity; otherwise re-arms the expire timer for
// the remaining interval.
func (mb *msgBlock) expireCacheLocked() {
	// Nothing cached at all, just make sure the timer is stopped.
	if mb.cache == nil && mb.fss == nil {
		if mb.ctmr != nil {
			mb.ctmr.Stop()
			mb.ctmr = nil
		}
		return
	}
	// Can't expire if we still have pending.
	if mb.cache != nil && len(mb.cache.buf)-int(mb.cache.wp) > 0 {
		mb.resetCacheExpireTimer(mb.cexp)
		return
	}
	// Grab timestamp to compare.
	tns := time.Now().UnixNano()
	// For the core buffer of messages, we care about reads and writes, but not removes.
	bufts := mb.llts
	if mb.lwts > bufts {
		bufts = mb.lwts
	}
	// Check for activity on the cache that would prevent us from expiring.
	if tns-bufts <= int64(mb.cexp) {
		// Re-arm for the time remaining until the activity ages out.
		mb.resetCacheExpireTimer(mb.cexp - time.Duration(tns-bufts))
		return
	}
	// If we are here we will at least expire the core msg buffer.
	// We need to capture offset in case we do a write next before a full load.
	if mb.cache != nil {
		mb.cache.off += len(mb.cache.buf)
		if !mb.cache.nra {
			recycleMsgBlockBuf(mb.cache.buf)
		}
		mb.cache.buf = nil
		mb.cache.wp = 0
	}
	// Check if we can clear out our fss and idx unless under force expire.
	// We used to hold onto the idx longer but removes need buf now so no point.
	mb.fss = nil
	mb.clearCache()
}
// startAgeChk arms the age-check timer used to expire messages past
// MaxAge. No-op when MaxAge is unset or a timer is already running.
func (fs *fileStore) startAgeChk() {
	if fs.ageChk != nil || fs.cfg.MaxAge == 0 {
		return
	}
	fs.ageChk = time.AfterFunc(fs.cfg.MaxAge, fs.expireMsgs)
}
// resetAgeChk re-arms the age-check timer. A positive delta smaller than
// MaxAge makes the timer fire sooner (next message due to expire).
// Lock should be held.
func (fs *fileStore) resetAgeChk(delta int64) {
	if fs.cfg.MaxAge == 0 {
		return
	}
	// Default to a full MaxAge interval.
	fireIn := fs.cfg.MaxAge
	if d := time.Duration(delta); delta > 0 && d < fireIn {
		fireIn = d
	}
	if fs.ageChk == nil {
		fs.ageChk = time.AfterFunc(fireIn, fs.expireMsgs)
		return
	}
	fs.ageChk.Reset(fireIn)
}
// cancelAgeChk stops and clears the age-check timer if one is running.
// Lock should be held.
func (fs *fileStore) cancelAgeChk() {
	if fs.ageChk == nil {
		return
	}
	fs.ageChk.Stop()
	fs.ageChk = nil
}
// Will expire msgs that are too old. Timer callback for fs.ageChk.
func (fs *fileStore) expireMsgs() {
	// We need to delete one by one here and can not optimize for the time being.
	// Reason is that we need more information to adjust ack pending in consumers.
	var smv StoreMsg
	var sm *StoreMsg
	fs.mu.RLock()
	maxAge := int64(fs.cfg.MaxAge)
	minAge := time.Now().UnixNano() - maxAge
	fs.mu.RUnlock()
	// Walk from the first message, removing while it is older than the cutoff.
	for sm, _ = fs.msgForSeq(0, &smv); sm != nil && sm.ts <= minAge; sm, _ = fs.msgForSeq(0, &smv) {
		fs.mu.Lock()
		fs.removeMsgViaLimits(sm.seq)
		fs.mu.Unlock()
		// Recalculate in case we are expiring a bunch.
		minAge = time.Now().UnixNano() - maxAge
	}
	fs.mu.Lock()
	defer fs.mu.Unlock()
	// Only cancel if no message left, not on potential lookup error that would result in sm == nil.
	if fs.state.Msgs == 0 {
		fs.cancelAgeChk()
	} else {
		if sm == nil {
			// Lookup failed; retry after a full MaxAge interval.
			fs.resetAgeChk(0)
		} else {
			// Fire again when the oldest remaining message is due to expire.
			fs.resetAgeChk(sm.ts - minAge)
		}
	}
}
// checkAndFlushAllBlocks flushes any pending writes across all blocks.
// Lock should be held.
func (fs *fileStore) checkAndFlushAllBlocks() {
	for _, mb := range fs.blks {
		if mb.pendingWriteSize() <= 0 {
			continue
		}
		// Since fs lock is held need to pull this apart in case we need to rebuild state.
		mb.mu.Lock()
		ld, _ := mb.flushPendingMsgsLocked()
		mb.mu.Unlock()
		if ld != nil {
			fs.rebuildStateLocked(ld)
		}
	}
}
// This will check all the checksums on messages and report back any sequence numbers with errors.
func (fs *fileStore) checkMsgs() *LostStreamData {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	fs.checkAndFlushAllBlocks()
	// Reset global per-subject state; repopulated per block below.
	fs.psim = make(map[string]*psi)
	for _, mb := range fs.blks {
		// Make sure encryption loaded if needed for the block.
		fs.loadEncryptionForMsgBlock(mb)
		// FIXME(dlc) - check tombstones here too?
		ld, _, err := mb.rebuildState()
		if err != nil && ld != nil {
			// Rebuild fs state too.
			mb.fs.rebuildStateLocked(ld)
		}
		fs.populateGlobalPerSubjectInfo(mb)
	}
	return fs.ld
}
// enableForWriting opens the block's file descriptor for writing if not
// already open, spinning up the flusher loop unless flush-in-place (fip)
// is requested.
// Lock should be held.
func (mb *msgBlock) enableForWriting(fip bool) error {
	if mb == nil {
		return errNoMsgBlk
	}
	// Already open?
	if mb.mfd != nil {
		return nil
	}
	fd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
	if err != nil {
		return fmt.Errorf("error opening msg block file [%q]: %v", mb.mfn, err)
	}
	mb.mfd = fd
	// Spin up our flusher loop if needed.
	if !fip {
		mb.spinUpFlushLoop()
	}
	return nil
}
// writeTombstone places a delete tombstone for seq: an empty record whose
// sequence carries the tombstone bit and which has no subject.
func (mb *msgBlock) writeTombstone(seq uint64, ts int64) error {
	return mb.writeMsgRecord(emptyRecordLen, seq|tbit, _EMPTY_, nil, nil, ts, true)
}
// Will write the message record to the underlying message block.
// filestore lock will be held.
// Appends the encoded record to the write-through cache, updates per-subject
// state and accounting, then either flushes inline or kicks the flusher.
func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush bool) error {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	// Enable for writing if our mfd is not open.
	if mb.mfd == nil {
		if err := mb.enableForWriting(flush); err != nil {
			return err
		}
	}
	// Make sure we have a cache setup.
	if mb.cache == nil {
		mb.setupWriteCache(nil)
	}
	// Check if we are tracking per subject for our simple state.
	// Do this before changing the cache that would trigger a flush pending msgs call
	// if we needed to regenerate the per subject info.
	// Note that tombstones have no subject so will not trigger here.
	if len(subj) > 0 && !mb.noTrack {
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			return err
		}
		if ss := mb.fss[subj]; ss != nil {
			ss.Msgs++
			ss.Last = seq
		} else {
			mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
		}
	}
	// Indexing: record position is current buffer length plus base offset.
	index := len(mb.cache.buf) + int(mb.cache.off)
	// Formats
	// Format with no header
	// total_len(4) sequence(8) timestamp(8) subj_len(2) subj msg hash(8)
	// With headers, high bit on total length will be set.
	// total_len(4) sequence(8) timestamp(8) subj_len(2) subj hdr_len(4) hdr msg hash(8)
	// First write header, etc.
	var le = binary.LittleEndian
	var hdr [msgHdrSize]byte
	l := uint32(rl)
	hasHeaders := len(mhdr) > 0
	if hasHeaders {
		l |= hbit
	}
	le.PutUint32(hdr[0:], l)
	le.PutUint64(hdr[4:], seq)
	le.PutUint64(hdr[12:], uint64(ts))
	le.PutUint16(hdr[20:], uint16(len(subj)))
	// Now write to underlying buffer.
	mb.cache.buf = append(mb.cache.buf, hdr[:]...)
	mb.cache.buf = append(mb.cache.buf, subj...)
	if hasHeaders {
		var hlen [4]byte
		le.PutUint32(hlen[0:], uint32(len(mhdr)))
		mb.cache.buf = append(mb.cache.buf, hlen[:]...)
		mb.cache.buf = append(mb.cache.buf, mhdr...)
	}
	mb.cache.buf = append(mb.cache.buf, msg...)
	// Calculate hash over seq/ts/subj-len, subject, headers and payload.
	mb.hh.Reset()
	mb.hh.Write(hdr[4:20])
	mb.hh.Write([]byte(subj))
	if hasHeaders {
		mb.hh.Write(mhdr)
	}
	mb.hh.Write(msg)
	checksum := mb.hh.Sum(nil)
	// Grab last checksum
	copy(mb.lchk[0:], checksum)
	// Update write through cache.
	// Write to msg record.
	mb.cache.buf = append(mb.cache.buf, checksum...)
	mb.cache.lrl = uint32(rl)
	// Set cache timestamp for last store.
	mb.lwts = ts
	// Only update index and do accounting if not a delete tombstone.
	if seq&tbit == 0 {
		// Accounting, do this before stripping ebit, it is ebit aware.
		mb.updateAccounting(seq, ts, rl)
		// Strip ebit if set.
		seq = seq &^ ebit
		if mb.cache.fseq == 0 {
			mb.cache.fseq = seq
		}
		// Write index
		mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit)
	}
	fch, werr := mb.fch, mb.werr
	// If we should be flushing, or had a write error, do so here.
	if flush || werr != nil {
		ld, err := mb.flushPendingMsgsLocked()
		if ld != nil && mb.fs != nil {
			// We have the mb lock here, this needs the mb locks so do in its own go routine.
			go mb.fs.rebuildState(ld)
		}
		if err != nil {
			return err
		}
	} else {
		// Kick the flusher here.
		kickFlusher(fch)
	}
	return nil
}
// How many bytes pending to be written for this message block.
// Locking wrapper around pendingWriteSizeLocked.
func (mb *msgBlock) pendingWriteSize() int {
	if mb == nil {
		return 0
	}
	mb.mu.RLock()
	pending := mb.pendingWriteSizeLocked()
	mb.mu.RUnlock()
	return pending
}
// How many bytes pending to be written for this message block.
// Lock should be held.
func (mb *msgBlock) pendingWriteSizeLocked() int {
	if mb == nil {
		return 0
	}
	// Nothing pending when closed, with no open fd, or with no cache.
	if mb.closed || mb.mfd == nil || mb.cache == nil {
		return 0
	}
	return len(mb.cache.buf) - int(mb.cache.wp)
}
// closeFDs tries to close our FDs if we can (no pending data).
func (mb *msgBlock) closeFDs() error {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.closeFDsLocked()
}
// closeFDsLocked closes the block's FDs, refusing while data is still
// waiting to be flushed. Lock should be held.
func (mb *msgBlock) closeFDsLocked() error {
	if buf, _ := mb.bytesPending(); len(buf) > 0 {
		return errPendingData
	}
	mb.closeFDsLockedNoCheck()
	return nil
}
// closeFDsLockedNoCheck unconditionally closes and clears the block's
// file descriptor. Lock should be held.
func (mb *msgBlock) closeFDsLockedNoCheck() {
	if mb.mfd == nil {
		return
	}
	mb.mfd.Close()
	mb.mfd = nil
}
// bytesPending returns the buffer to be used for writing to the underlying file.
// This marks we are in flush and will return nil if asked again until cleared.
// Lock should be held.
func (mb *msgBlock) bytesPending() ([]byte, error) {
	if mb == nil || mb.mfd == nil {
		return nil, errNoPending
	}
	c := mb.cache
	if c == nil {
		return nil, errNoCache
	}
	// Anything past the write pointer is pending.
	if len(c.buf) <= c.wp {
		return nil, errNoPending
	}
	if buf := c.buf[c.wp:]; len(buf) > 0 {
		return buf, nil
	}
	return nil, errNoPending
}
// blkSize returns the current raw block size including deleted msgs etc.
func (mb *msgBlock) blkSize() uint64 {
	mb.mu.RLock()
	defer mb.mu.RUnlock()
	return mb.rbytes
}
// Update accounting on a write msg.
// Lock should be held.
// Tracks first/last seq and ts plus raw and live byte counts; erased
// messages (ebit set) count toward raw bytes only.
func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) {
	isDeleted := seq&ebit != 0
	if isDeleted {
		seq = seq &^ ebit
	}
	// Capture first seq/ts if not set yet.
	if (mb.first.seq == 0 || mb.first.ts == 0) && seq >= mb.first.seq {
		mb.first.seq = seq
		mb.first.ts = ts
	}
	// Need atomics here for selectMsgBlock speed.
	atomic.StoreUint64(&mb.last.seq, seq)
	mb.last.ts = ts
	mb.rbytes += rl
	if !isDeleted {
		mb.bytes += rl
		mb.msgs++
	}
}
// Lock should be held.
// Writes a message record into the current last message block, rolling to a
// new block first when this record would exceed the configured block size.
// Returns the record length stored.
func (fs *fileStore) writeMsgRecord(seq uint64, ts int64, subj string, hdr, msg []byte) (uint64, error) {
	var err error
	// Get size for this message.
	rl := fileStoreMsgSize(subj, hdr, msg)
	if rl&hbit != 0 {
		// Record length would collide with the headers bit.
		return 0, ErrMsgTooLarge
	}
	// Grab our current last message block.
	mb := fs.lmb
	// Mark as dirty for stream state.
	fs.dirty++
	// Roll to a new block if none yet, or if this non-empty block would overflow.
	if mb == nil || mb.msgs > 0 && mb.blkSize()+rl > fs.fcfg.BlockSize {
		if mb != nil && fs.fcfg.Compression != NoCompression {
			// We've now reached the end of this message block, if we want
			// to compress blocks then now's the time to do it.
			go mb.recompressOnDiskIfNeeded()
		}
		if mb, err = fs.newMsgBlockForWrite(); err != nil {
			return 0, err
		}
	}
	// Ask msg block to store in write through cache.
	err = mb.writeMsgRecord(rl, seq, subj, hdr, msg, ts, fs.fip)
	return rl, err
}
// recompressOnDiskIfNeeded rewrites this block's file on disk using the
// currently configured compression algorithm: it reads the whole block,
// decrypts/decompresses as needed, recompresses (or uncompresses),
// re-encrypts, writes to a temp file and atomically renames it into place.
// Returns an error on any I/O, crypto or codec failure.
func (mb *msgBlock) recompressOnDiskIfNeeded() error {
	// Wait for disk I/O slots to become available. This prevents us from
	// running away with system resources.
	<-dios
	defer func() {
		dios <- struct{}{}
	}()
	alg := mb.fs.fcfg.Compression
	mb.mu.Lock()
	defer mb.mu.Unlock()
	origFN := mb.mfn                    // The original message block on disk.
	tmpFN := mb.mfn + compressTmpSuffix // The compressed block will be written here.
	// Open up the file block and read in the entire contents into memory.
	// One of two things will happen:
	// 1. The block will be compressed already and have a valid metadata
	//    header, in which case we do nothing.
	// 2. The block will be uncompressed, in which case we will compress it
	//    and then write it back out to disk, reencrypting if necessary.
	origBuf, err := os.ReadFile(origFN)
	if err != nil {
		return fmt.Errorf("failed to read original block from disk: %w", err)
	}
	// If the block is encrypted then we will need to decrypt it before
	// doing anything. We always encrypt after compressing because then the
	// compression can be as efficient as possible on the raw data, whereas
	// the encrypted ciphertext will not compress anywhere near as well.
	// The block encryption also covers the optional compression metadata.
	if mb.bek != nil && len(origBuf) > 0 {
		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return err
		}
		mb.bek = bek
		mb.bek.XORKeyStream(origBuf, origBuf)
	}
	meta := &CompressionInfo{}
	if _, err := meta.UnmarshalMetadata(origBuf); err != nil {
		// An error is only returned here if there's a problem with parsing
		// the metadata. If the file has no metadata at all, no error is
		// returned and the algorithm defaults to no compression.
		return fmt.Errorf("failed to read existing metadata header: %w", err)
	}
	if meta.Algorithm == alg {
		// The block is already compressed with the chosen algorithm so there
		// is nothing else to do. This is not a common case, it is here only
		// to ensure we don't do unnecessary work in case something asked us
		// to recompress an already compressed block with the same algorithm.
		return nil
	} else if meta.Algorithm != NoCompression {
		// The block is compressed using some other algorithm, so we need
		// to decompress the block using the existing algorithm before we can
		// rewrite it. Previously this checked the *target* algorithm, which
		// skipped decompression when converting a compressed block back to
		// NoCompression and left compressed bytes on disk with no header.
		if origBuf, err = meta.Algorithm.Decompress(origBuf); err != nil {
			return fmt.Errorf("failed to decompress original block: %w", err)
		}
	}
	// Rather than modifying the existing block on disk (which is a dangerous
	// operation if something goes wrong), create a new temporary file. We will
	// write out the new block here and then swap the files around afterwards
	// once everything else has succeeded correctly.
	tmpFD, err := os.OpenFile(tmpFN, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, defaultFilePerms)
	if err != nil {
		return fmt.Errorf("failed to create temporary file: %w", err)
	}
	// The original buffer at this point is uncompressed, so we will now compress
	// it if needed. Note that if the selected algorithm is NoCompression, the
	// Compress function will just return the input buffer unmodified.
	cmpBuf, err := alg.Compress(origBuf)
	if err != nil {
		return fmt.Errorf("failed to compress block: %w", err)
	}
	// We only need to write out the metadata header if compression is enabled.
	// If we're trying to uncompress the file on disk at this point, don't bother
	// writing metadata.
	if alg != NoCompression {
		meta := &CompressionInfo{
			Algorithm:    alg,
			OriginalSize: uint64(len(origBuf)),
		}
		cmpBuf = append(meta.MarshalMetadata(), cmpBuf...)
	}
	// Re-encrypt the block if necessary.
	if mb.bek != nil && len(cmpBuf) > 0 {
		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return err
		}
		mb.bek = bek
		mb.bek.XORKeyStream(cmpBuf, cmpBuf)
	}
	// Write the new block data (which might be compressed or encrypted) to the
	// temporary file.
	errorCleanup := func(err error) error {
		tmpFD.Close()
		os.Remove(tmpFN)
		return err
	}
	if n, err := tmpFD.Write(cmpBuf); err != nil {
		return errorCleanup(fmt.Errorf("failed to write to temporary file: %w", err))
	} else if n != len(cmpBuf) {
		return errorCleanup(fmt.Errorf("short write to temporary file (%d != %d)", n, len(cmpBuf)))
	}
	if err := tmpFD.Sync(); err != nil {
		return errorCleanup(fmt.Errorf("failed to sync temporary file: %w", err))
	}
	if err := tmpFD.Close(); err != nil {
		return errorCleanup(fmt.Errorf("failed to close temporary file: %w", err))
	}
	// Now replace the original file with the newly updated temp file.
	if err := os.Rename(tmpFN, origFN); err != nil {
		return fmt.Errorf("failed to move temporary file into place: %w", err)
	}
	// Since the message block might be retained in memory, make sure the
	// compression algorithm is up-to-date, since this will be needed when
	// compacting or truncating.
	mb.cmp = alg
	return nil
}
// decompressIfNeeded returns the block contents with any compression
// removed, using the metadata header (if present) to pick the algorithm.
func (mb *msgBlock) decompressIfNeeded(buf []byte) ([]byte, error) {
	var meta CompressionInfo
	n, err := meta.UnmarshalMetadata(buf)
	if err != nil {
		// There was a problem parsing the metadata header of the block.
		// If there's no metadata header, an error isn't returned here,
		// we will instead just use default values of no compression.
		return nil, err
	}
	if n == 0 {
		// There were no metadata bytes, so we assume the block is not
		// compressed and return it as-is.
		return buf, nil
	}
	// Metadata was present so it's quite likely the block contents
	// are compressed. If by any chance the metadata claims that the
	// block is uncompressed, then the input slice is just returned
	// unmodified.
	return meta.Algorithm.Decompress(buf[n:])
}
// Sync msg and index files as needed. This is called from a timer.
// Note: per the optimization commit, msgBlock locks are NOT held during the
// actual fsync; we only capture the filename under lock and sync after.
func (fs *fileStore) syncBlocks() {
	fs.mu.RLock()
	if fs.closed {
		fs.mu.RUnlock()
		return
	}
	// Snapshot the block list so we can iterate without the fs lock.
	blks := append([]*msgBlock(nil), fs.blks...)
	fs.mu.RUnlock()
	for _, mb := range blks {
		// Do actual sync. Hold lock for consistency.
		mb.mu.Lock()
		if mb.closed {
			mb.mu.Unlock()
			continue
		}
		// See if we can close FDs due to being idle.
		if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle {
			mb.dirtyCloseWithRemove(false)
		}
		// Check if we need to sync. We will not hold lock during actual sync.
		var fn string
		if mb.needSync {
			// Flush anything that may be pending.
			if mb.pendingWriteSizeLocked() > 0 {
				mb.flushPendingMsgsLocked()
			}
			fn = mb.mfn
			mb.needSync = false
		}
		mb.mu.Unlock()
		// Check if we need to sync.
		// This is done not holding any locks.
		if fn != _EMPTY_ {
			if fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms); fd != nil {
				fd.Sync()
				fd.Close()
			}
		}
	}
	fs.mu.Lock()
	// Re-arm the sync timer.
	fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks)
	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	syncAlways := fs.fcfg.SyncAlways
	fs.mu.Unlock()
	// Sync the stream state file too unless every write already syncs.
	if !syncAlways {
		if fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms); fd != nil {
			fd.Sync()
			fd.Close()
		}
	}
}
// selectMsgBlock selects the message block where this message should be
// found, or nil if not in the set.
// Read lock should be held.
func (fs *fileStore) selectMsgBlock(seq uint64) *msgBlock {
	// Delegate to the index-returning variant and drop the index.
	_, mb := fs.selectMsgBlockWithIndex(seq)
	return mb
}
// Lock should be held.
// Returns the index and block that should contain seq, or (-1, nil) when seq
// is outside the store's range. Uses a linear scan for small block counts and
// a binary search otherwise; last.seq is read atomically so callers can race
// with writers (see updateAccounting).
func (fs *fileStore) selectMsgBlockWithIndex(seq uint64) (int, *msgBlock) {
	// Check for out of range.
	if seq < fs.state.FirstSeq || seq > fs.state.LastSeq {
		return -1, nil
	}
	const linearThresh = 32
	nb := len(fs.blks) - 1
	if nb < linearThresh {
		// Few blocks: first block whose last.seq covers seq wins.
		for i, mb := range fs.blks {
			if seq <= atomic.LoadUint64(&mb.last.seq) {
				return i, mb
			}
		}
		return -1, nil
	}
	// Do traditional binary search here since we know the blocks are sorted by sequence first and last.
	for low, high, mid := 0, nb, nb/2; low <= high; mid = (low + high) / 2 {
		mb := fs.blks[mid]
		// Right now these atomic loads do not factor in, so fine to leave. Was considering
		// uplifting these to fs scope to avoid atomic load but not needed.
		first, last := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
		if seq > last {
			low = mid + 1
		} else if seq < first {
			// A message block's first sequence can change here meaning we could find a gap.
			// We want to behave like above, which if inclusive (we check at start) should
			// always return an index and a valid mb.
			// If we have a gap then our seq would be > fs.blks[mid-1].last.seq
			if mid == 0 || seq > atomic.LoadUint64(&fs.blks[mid-1].last.seq) {
				return mid, mb
			}
			high = mid - 1
		} else {
			return mid, mb
		}
	}
	return -1, nil
}
// selectMsgBlockForStart selects the first message block whose last
// timestamp is at or after minTime, or nil if none qualifies.
func (fs *fileStore) selectMsgBlockForStart(minTime time.Time) *msgBlock {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	t := minTime.UnixNano()
	for _, mb := range fs.blks {
		mb.mu.RLock()
		lastTS := mb.last.ts
		mb.mu.RUnlock()
		if t <= lastTS {
			return mb
		}
	}
	return nil
}
// Index a raw msg buffer.
// Lock should be held.
// Walks the record headers in buf, building the cache index, filling the
// delete map for holes/erased messages, and (unless noTrack) rebuilding the
// per-subject fss state. Returns errCorruptState if a record fails sanity
// checks.
func (mb *msgBlock) indexCacheBuf(buf []byte) error {
	var le = binary.LittleEndian
	var fseq uint64
	var idx []uint32
	var index uint32
	if mb.cache == nil {
		// Approximation, may adjust below.
		fseq = mb.first.seq
		idx = make([]uint32, 0, mb.msgs)
		mb.cache = &cache{}
	} else {
		// Appending to an existing partial cache.
		fseq = mb.cache.fseq
		idx = mb.cache.idx
		if len(idx) == 0 {
			idx = make([]uint32, 0, mb.msgs)
		}
		index = uint32(len(mb.cache.buf))
		buf = append(mb.cache.buf, buf...)
	}
	// Create FSS if we should track.
	if !mb.noTrack {
		mb.fss = make(map[string]*SimpleState)
	}
	lbuf := uint32(len(buf))
	for index < lbuf {
		if index+msgHdrSize > lbuf {
			return errCorruptState
		}
		hdr := buf[index : index+msgHdrSize]
		rl, seq, slen := le.Uint32(hdr[0:]), le.Uint64(hdr[4:]), int(le.Uint16(hdr[20:]))
		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize
		// Do some quick sanity checks here.
		if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
			// This means something is off.
			// TODO(dlc) - Add into bad list?
			return errCorruptState
		}
		// Check for tombstones which we can skip in terms of indexing.
		if seq&tbit != 0 {
			index += rl
			continue
		}
		// Clear any erase bits.
		erased := seq&ebit != 0
		seq = seq &^ ebit
		// We defer checksum checks to individual msg cache lookups to amortorize costs and
		// not introduce latency for first message from a newly loaded block.
		if seq >= mb.first.seq {
			// Track that we do not have holes.
			if slot := int(seq - mb.first.seq); slot != len(idx) {
				// If we have a hole fill it.
				for dseq := mb.first.seq + uint64(len(idx)); dseq < seq; dseq++ {
					idx = append(idx, dbit)
					mb.dmap.Insert(dseq)
				}
			}
			// Add to our index.
			idx = append(idx, index)
			mb.cache.lrl = uint32(rl)
			// Adjust if we guessed wrong.
			if seq != 0 && seq < fseq {
				fseq = seq
			}
			// Make sure our dmap has this entry if it was erased.
			if erased {
				mb.dmap.Insert(seq)
			}
			// Handle FSS inline here.
			if slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) {
				bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)]
				if ss := mb.fss[string(bsubj)]; ss != nil {
					ss.Msgs++
					ss.Last = seq
				} else {
					subj := mb.subjString(bsubj)
					mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
				}
			}
		}
		index += rl
	}
	// Install the rebuilt cache state.
	mb.cache.buf = buf
	mb.cache.idx = idx
	mb.cache.fseq = fseq
	mb.cache.wp += int(lbuf)
	return nil
}
// flushPendingMsgs writes out any messages for this message block.
func (mb *msgBlock) flushPendingMsgs() error {
	mb.mu.Lock()
	lost, err := mb.flushPendingMsgsLocked()
	fs := mb.fs
	mb.mu.Unlock()
	// A non-nil lost result signals us that we need to rebuild filestore state.
	if lost != nil && fs != nil {
		fs.rebuildState(lost)
	}
	return err
}
// writeAt writes actual data at the given offset.
// mb.mfd should not be nil.
// Lock should held.
func (mb *msgBlock) writeAt(buf []byte, woff int64) (int, error) {
	// Used to mock write failures (one-shot, resets on trip).
	if mb.mockWriteErr {
		mb.mockWriteErr = false
		return 0, errors.New("mock write error")
	}
	return mb.mfd.WriteAt(buf, woff)
}
// flushPendingMsgsLocked writes out any messages for this message block.
// Lock should be held.
// On a write failure the block is rebuilt and the lost data (if any) is
// returned so the caller can rebuild filestore state.
func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) {
	// Signals us that we need to rebuild filestore state.
	var fsLostData *LostStreamData
	if mb.cache == nil || mb.mfd == nil {
		return nil, nil
	}
	buf, err := mb.bytesPending()
	// If we got an error back return here.
	if err != nil {
		// No pending data to be written is not an error.
		if err == errNoPending || err == errNoCache {
			err = nil
		}
		return nil, err
	}
	woff := int64(mb.cache.off + mb.cache.wp)
	lob := len(buf)
	// TODO(dlc) - Normally we would not hold the lock across I/O so we can improve performance.
	// We will hold to stabilize the code base, as we have had a few anomalies with partial cache errors
	// under heavy load.
	// Check if we need to encrypt.
	if mb.bek != nil && lob > 0 {
		// Need to leave original alone.
		var dst []byte
		if lob <= defaultLargeBlockSize {
			dst = getMsgBlockBuf(lob)[:lob]
		} else {
			dst = make([]byte, lob)
		}
		mb.bek.XORKeyStream(dst, buf)
		buf = dst
	}
	// Append new data to the message block file.
	// Loop handles partial writes by advancing into buf.
	for lbb := lob; lbb > 0; lbb = len(buf) {
		n, err := mb.writeAt(buf, woff)
		if err != nil {
			// Write failed: close/remove dirty state and rebuild this block.
			mb.dirtyCloseWithRemove(false)
			ld, _, _ := mb.rebuildStateLocked()
			mb.werr = err
			return ld, err
		}
		// Update our write offset.
		woff += int64(n)
		// Partial write.
		if n != lbb {
			buf = buf[n:]
		} else {
			// Done.
			break
		}
	}
	// Clear any error.
	mb.werr = nil
	// Cache may be gone.
	if mb.cache == nil || mb.mfd == nil {
		return fsLostData, mb.werr
	}
	// Check if we are in sync always mode.
	if mb.syncAlways {
		mb.mfd.Sync()
	} else {
		mb.needSync = true
	}
	// Check for additional writes while we were writing to the disk.
	moreBytes := len(mb.cache.buf) - mb.cache.wp - lob
	// Decide what we want to do with the buffer in hand. If we have load interest
	// we will hold onto the whole thing, otherwise empty the buffer, possibly reusing it.
	if ts := time.Now().UnixNano(); ts < mb.llts || (ts-mb.llts) <= int64(mb.cexp) {
		// Recent load activity: keep the buffer, just advance the write pointer.
		mb.cache.wp += lob
	} else {
		// No load interest: recycle or reuse the buffer.
		if cap(mb.cache.buf) <= maxBufReuse {
			buf = mb.cache.buf[:0]
		} else {
			recycleMsgBlockBuf(mb.cache.buf)
			buf = nil
		}
		if moreBytes > 0 {
			// Carry over bytes written while we were flushing.
			nbuf := mb.cache.buf[len(mb.cache.buf)-moreBytes:]
			if moreBytes > (len(mb.cache.buf)/4*3) && cap(nbuf) <= maxBufReuse {
				buf = nbuf
			} else {
				buf = append(buf, nbuf...)
			}
		}
		// Update our cache offset.
		mb.cache.off = int(woff)
		// Reset write pointer.
		mb.cache.wp = 0
		// Place buffer back in the cache structure.
		mb.cache.buf = buf
		// Mark fseq to 0
		mb.cache.fseq = 0
	}
	return fsLostData, mb.werr
}
// clearLoading clears the in-progress load flag set by loadMsgsWithLock.
// Lock should be held.
func (mb *msgBlock) clearLoading() {
	mb.loading = false
}
// loadMsgs will load msgs from disk.
// We hold the lock for the whole duration by design.
func (mb *msgBlock) loadMsgs() error {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.loadMsgsWithLock()
}
// cacheAlreadyLoaded reports whether the full block is resident in cache.
// Lock should be held.
func (mb *msgBlock) cacheAlreadyLoaded() bool {
	c := mb.cache
	if c == nil || c.off != 0 || c.fseq == 0 || len(c.buf) == 0 {
		return false
	}
	// Fully loaded when live msgs, deleted seqs, and any leading gap
	// together account for every index slot.
	numEntries := mb.msgs + uint64(mb.dmap.Size()) + (mb.first.seq - c.fseq)
	return numEntries == uint64(len(c.idx))
}
// cacheNotLoaded is the negation of cacheAlreadyLoaded.
// Lock should be held.
func (mb *msgBlock) cacheNotLoaded() bool {
	return !mb.cacheAlreadyLoaded()
}
// loadBlock reads the entire block file from disk into buf (allocating or
// pooling a buffer as needed) and records the raw byte count on success.
// Lock should be held and all conditionals satisfied prior.
func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) {
	f, err := os.Open(mb.mfn)
	if err != nil {
		if os.IsNotExist(err) {
			err = errNoBlkData
		}
		return nil, err
	}
	defer f.Close()

	// Determine the file size, guarding against int overflow on 32-bit.
	var sz int
	if info, err := f.Stat(); err == nil {
		sz64 := info.Size()
		if int64(int(sz64)) != sz64 {
			return nil, errMsgBlkTooBig
		}
		sz = int(sz64)
	}
	if buf == nil {
		buf = getMsgBlockBuf(sz)
		if sz > cap(buf) {
			// We know we will make a new one so just recycle for now.
			recycleMsgBlockBuf(buf)
			buf = nil
		}
	}
	if sz > cap(buf) {
		buf = make([]byte, sz)
	} else {
		buf = buf[:sz]
	}
	n, err := io.ReadFull(f, buf)
	if err == nil {
		// On success capture raw bytes size.
		mb.rbytes = uint64(n)
	}
	return buf[:n], err
}
// Lock should be held.
// Loads the full block contents into the cache: flushes any pending writes,
// reads and decrypts/decompresses the file, and indexes it. Retries via the
// checkCache label (bounded at 8 attempts) since flushing or rebuilding can
// change cache state.
func (mb *msgBlock) loadMsgsWithLock() error {
	// Check for encryption, we do not load keys on startup anymore so might need to load them here.
	if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) {
		if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil {
			return err
		}
	}
	// Check to see if we are loading already.
	if mb.loading {
		return nil
	}
	// Set loading status.
	mb.loading = true
	defer mb.clearLoading()
	var nchecks int
checkCache:
	nchecks++
	if nchecks > 8 {
		// Too many retries, something is wrong with this block.
		return errCorruptState
	}
	// Check to see if we have a full cache.
	if mb.cacheAlreadyLoaded() {
		return nil
	}
	// Mark load activity for cache expiry decisions.
	mb.llts = time.Now().UnixNano()
	// FIXME(dlc) - We could be smarter here.
	if buf, _ := mb.bytesPending(); len(buf) > 0 {
		ld, err := mb.flushPendingMsgsLocked()
		if ld != nil && mb.fs != nil {
			// We do not know if fs is locked or not at this point.
			// This should be an exceptional condition so do so in Go routine.
			go mb.fs.rebuildState(ld)
		}
		if err != nil {
			return err
		}
		goto checkCache
	}
	// Load in the whole block.
	// We want to hold the mb lock here to avoid any changes to state.
	buf, err := mb.loadBlock(nil)
	if err != nil {
		if err == errNoBlkData {
			// File is missing, attempt a state rebuild.
			if ld, _, err := mb.rebuildStateLocked(); err != nil && ld != nil {
				// Rebuild fs state too.
				go mb.fs.rebuildState(ld)
			}
		}
		return err
	}
	// Reset the cache since we just read everything in.
	// Make sure this is cleared in case we had a partial when we started.
	mb.clearCacheAndOffset()
	// Check if we need to decrypt.
	if mb.bek != nil && len(buf) > 0 {
		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return err
		}
		mb.bek = bek
		mb.bek.XORKeyStream(buf, buf)
	}
	// Check for compression.
	if buf, err = mb.decompressIfNeeded(buf); err != nil {
		return err
	}
	if err := mb.indexCacheBuf(buf); err != nil {
		if err == errCorruptState {
			var ld *LostStreamData
			if ld, _, err = mb.rebuildStateLocked(); ld != nil {
				// We do not know if fs is locked or not at this point.
				// This should be an exceptional condition so do so in Go routine.
				go mb.fs.rebuildState(ld)
			}
		}
		if err != nil {
			return err
		}
		goto checkCache
	}
	if len(buf) > 0 {
		// Count the cache load and arm the expire timer.
		mb.cloads++
		mb.startCacheExpireTimer()
	}
	return nil
}
// fetchMsg fetches a message from this block, possibly reading in and
// caching the messages first. The returned bool indicates the caller may
// try to expire the cache (sequential read just hit the block's last msg).
// We assume the block was selected and is correct, so we do not do range checks.
func (mb *msgBlock) fetchMsg(seq uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return nil, false, err
		}
	}
	fsm, err := mb.cacheLookup(seq, sm)
	if err != nil {
		return nil, false, err
	}
	expireOk := seq == mb.last.seq && mb.llseq == seq
	return fsm, expireOk, nil
}
// Sentinel errors used throughout the filestore.
var (
	errNoCache      = errors.New("no message cache")
	errBadMsg       = errors.New("malformed or corrupt message")
	errDeletedMsg   = errors.New("deleted message")
	errPartialCache = errors.New("partial cache")
	errNoPending    = errors.New("message block does not have pending data")
	errNotReadable  = errors.New("storage directory not readable")
	errCorruptState = errors.New("corrupt state file")
	errPriorState   = errors.New("prior state file")
	errPendingData  = errors.New("pending data still present")
	errNoEncryption = errors.New("encryption not enabled")
	errBadKeySize   = errors.New("encryption bad key size")
	errNoMsgBlk     = errors.New("no message block")
	errMsgBlkTooBig = errors.New("message block size exceeded int capacity")
	errUnknownCipher = errors.New("unknown cipher")
	errNoMainKey    = errors.New("encrypted store encountered with no main key")
	errNoBlkData    = errors.New("message block data missing")
)
// Bit flags packed into record lengths, sequences and index slots.
const (
	// Used for marking messages that have had their checksums checked.
	// Used to signal a message record with headers.
	hbit = 1 << 31
	// Used for marking erased messages sequences.
	ebit = 1 << 63
	// Used for marking tombstone sequences.
	tbit = 1 << 62
	// Used to mark a bad index as deleted.
	dbit = 1 << 30
)
// Will do a lookup from cache.
// Lock should be held.
// Returns the decoded message for seq, or one of errDeletedMsg, errNoCache,
// errPartialCache, ErrStoreMsgNotFound. Updates load-activity and linear
// scan tracking, and caches the "hash checked" bit in the index slot.
func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	if seq < mb.first.seq || seq > mb.last.seq {
		return nil, ErrStoreMsgNotFound
	}
	// If we have a delete map check it.
	if mb.dmap.Exists(seq) {
		mb.llts = time.Now().UnixNano()
		return nil, errDeletedMsg
	}
	// Detect no cache loaded.
	if mb.cache == nil || mb.cache.fseq == 0 || len(mb.cache.idx) == 0 || len(mb.cache.buf) == 0 {
		return nil, errNoCache
	}
	// Check partial cache status.
	if seq < mb.cache.fseq {
		return nil, errPartialCache
	}
	bi, _, hashChecked, err := mb.slotInfo(int(seq - mb.cache.fseq))
	if err != nil {
		return nil, err
	}
	// Update cache activity.
	mb.llts = time.Now().UnixNano()
	// The llseq signals us when we can expire a cache at the end of a linear scan.
	// We want to only update when we know the last reads (multiple consumers) are sequential.
	if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 {
		mb.llseq = seq
	}
	li := int(bi) - mb.cache.off
	if li >= len(mb.cache.buf) {
		return nil, errPartialCache
	}
	buf := mb.cache.buf[li:]
	// We use the high bit to denote we have already checked the checksum.
	var hh hash.Hash64
	if !hashChecked {
		hh = mb.hh // This will force the hash check in msgFromBuf.
	}
	// Parse from the raw buffer.
	fsm, err := mb.msgFromBuf(buf, sm, hh)
	if err != nil || fsm == nil {
		return nil, err
	}
	// Deleted messages that are decoded return a 0 for sequence.
	if fsm.seq == 0 {
		return nil, errDeletedMsg
	}
	if seq != fsm.seq {
		// Cache is inconsistent with the index; drop it.
		recycleMsgBlockBuf(mb.cache.buf)
		mb.cache.buf = nil
		return nil, fmt.Errorf("sequence numbers for cache load did not match, %d vs %d", seq, fsm.seq)
	}
	// Clear the check bit here after we know all is good.
	if !hashChecked {
		mb.cache.idx[seq-mb.cache.fseq] = (bi | hbit)
	}
	return fsm, nil
}
// Used when we are checking if discarding a message due to max msgs per subject will give us
// enough room for a max bytes condition.
// Lock should be already held.
// Returns 0 when the sequence is 0 or the message can not be found.
func (fs *fileStore) sizeForSeq(seq uint64) int {
	if seq == 0 {
		return 0
	}
	mb := fs.selectMsgBlock(seq)
	if mb == nil {
		return 0
	}
	var smv StoreMsg
	sm, _, _ := mb.fetchMsg(seq, &smv)
	if sm == nil {
		return 0
	}
	return int(fileStoreMsgSize(sm.subj, sm.hdr, sm.msg))
}
// Will return message for the given sequence number.
// A seq of 0 means return the first available message.
func (fs *fileStore) msgForSeq(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	// TODO(dlc) - Since Store, Remove, Skip all hold the write lock on fs this will
	// be stalled. Need another lock if want to happen in parallel.
	fs.mu.RLock()
	if fs.closed {
		fs.mu.RUnlock()
		return nil, ErrStoreClosed
	}
	// Indicates we want first msg.
	if seq == 0 {
		seq = fs.state.FirstSeq
	}
	// Make sure to snapshot here.
	// We need these values after dropping the lock, so capture them now.
	mb, lmb, lseq := fs.selectMsgBlock(seq), fs.lmb, fs.state.LastSeq
	fs.mu.RUnlock()

	if mb == nil {
		// Distinguish "past the end" from "interior gap".
		var err = ErrStoreEOF
		if seq <= lseq {
			err = ErrStoreMsgNotFound
		}
		return nil, err
	}

	// fetchMsg takes the block lock itself.
	fsm, expireOk, err := mb.fetchMsg(seq, sm)
	if err != nil {
		return nil, err
	}

	// We detected a linear scan and access to the last message.
	// If we are not the last message block we can try to expire the cache.
	if mb != lmb && expireOk {
		mb.tryForceExpireCache()
	}

	return fsm, nil
}
// Internal function to return msg parts from a raw buffer.
// Lock should be held.
// Record layout: rl(4, high bit = has headers) | seq(8, high bit = erased) |
// ts(8) | slen(2) | subject | [hlen(4) | headers] | msg | hash(8).
// When hh is non-nil the record checksum is verified and errBadMsg returned
// on mismatch. Erased records decode with sm.seq == 0.
func (mb *msgBlock) msgFromBuf(buf []byte, sm *StoreMsg, hh hash.Hash64) (*StoreMsg, error) {
	if len(buf) < emptyRecordLen {
		return nil, errBadMsg
	}
	var le = binary.LittleEndian

	hdr := buf[:msgHdrSize]
	rl := le.Uint32(hdr[0:])
	hasHeaders := rl&hbit != 0
	rl &^= hbit // clear header bit
	dlen := int(rl) - msgHdrSize
	slen := int(le.Uint16(hdr[20:]))
	// Simple sanity check.
	if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || int(rl) > len(buf) {
		return nil, errBadMsg
	}
	data := buf[msgHdrSize : msgHdrSize+dlen]
	// Do checksum tests here if requested.
	if hh != nil {
		hh.Reset()
		// Hash covers seq+ts+slen fields and the payload, skipping the
		// 4 byte header-length prefix when headers are present.
		hh.Write(hdr[4:20])
		hh.Write(data[:slen])
		if hasHeaders {
			hh.Write(data[slen+4 : dlen-recordHashSize])
		} else {
			hh.Write(data[slen : dlen-recordHashSize])
		}
		if !bytes.Equal(hh.Sum(nil), data[len(data)-8:]) {
			return nil, errBadMsg
		}
	}
	seq := le.Uint64(hdr[4:])
	if seq&ebit != 0 {
		// Erased message, signal with zero sequence.
		seq = 0
	}
	ts := int64(le.Uint64(hdr[12:]))

	// Create a StoreMsg if needed.
	if sm == nil {
		sm = new(StoreMsg)
	} else {
		sm.clear()
	}

	// To recycle the large blocks we can never pass back a reference, so need to copy for the upper
	// layers and for us to be safe to expire, and recycle, the large msgBlocks.
	end := dlen - 8

	if hasHeaders {
		hl := le.Uint32(data[slen:])
		bi := slen + 4
		li := bi + int(hl)
		sm.buf = append(sm.buf, data[bi:end]...)
		li, end = li-bi, end-bi
		// hdr and msg are sub-slices of the single copied buffer.
		sm.hdr = sm.buf[0:li:li]
		sm.msg = sm.buf[li:end]
	} else {
		sm.buf = append(sm.buf, data[slen:end]...)
		sm.msg = sm.buf[0 : end-slen]
	}
	sm.seq, sm.ts = seq, ts
	// Treat subject a bit different to not reference underlying buf.
	if slen > 0 {
		sm.subj = mb.subjString(data[:slen])
	}

	return sm, nil
}
// Used to intern strings for subjects.
// Based on idea from https://github.com/josharian/intern/blob/master/intern.go
var subjPool = sync.Pool{
	New: func() any {
		return make(map[string]string)
	},
}

// Get an interned string from a byte slice.
// The map lookup with string(b) does not allocate; only a cache miss pays
// for the byte-to-string conversion.
func subjFromBytes(b []byte) string {
	cache := subjPool.Get().(map[string]string)
	defer subjPool.Put(cache)

	if interned, found := cache[string(b)]; found {
		return interned
	}
	interned := string(b)
	cache[interned] = interned
	return interned
}
// Given the `key` byte slice, this function will return the subject
// as an interned string of `key` or a configured subject as to minimize memory allocations.
// Lock should be held.
func (fs *fileStore) subjString(skey []byte) string {
	if fs == nil || len(skey) == 0 {
		return _EMPTY_
	}
	// Prefer a configured subject when it matches; the string cast in the
	// comparison does not allocate.
	for _, subj := range fs.cfg.Subjects {
		if string(skey) == subj {
			return subj
		}
	}
	return subjFromBytes(skey)
}
// Given the `key` byte slice, this function will return the subject
// as an interned string of `key` or a configured subject as to minimize memory allocations.
// Lock should be held.
// Thin delegate to the parent fileStore's interning logic.
func (mb *msgBlock) subjString(skey []byte) string {
	return mb.fs.subjString(skey)
}
// LoadMsg will lookup the message by sequence number and return it if found.
// A seq of 0 returns the first available message (see msgForSeq).
func (fs *fileStore) LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	return fs.msgForSeq(seq, sm)
}
// loadLast will load the last message for a subject. Subject should be non empty and not ">".
// Walks blocks newest to oldest; for literal subjects the per-subject index
// map (psim) narrows the block range up front.
func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.closed || fs.lmb == nil {
		return nil, ErrStoreClosed
	}

	if len(fs.blks) == 0 {
		return nil, ErrStoreMsgNotFound
	}

	start, stop := fs.lmb.index, fs.blks[0].index
	wc := subjectHasWildcard(subj)
	// If literal subject check for presence.
	if !wc {
		if info := fs.psim[subj]; info == nil {
			return nil, ErrStoreMsgNotFound
		} else {
			// Restrict the walk to blocks known to hold this subject.
			start, stop = info.lblk, info.fblk
		}
	}

	// Walk blocks backwards.
	for i := start; i >= stop; i-- {
		mb := fs.bim[i]
		if mb == nil {
			// Block indexes can have gaps after removals.
			continue
		}
		mb.mu.Lock()
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			mb.mu.Unlock()
			return nil, err
		}
		var l uint64
		// Optimize if subject is not a wildcard.
		if !wc {
			if ss := mb.fss[subj]; ss != nil {
				l = ss.Last
			}
		}
		if l == 0 {
			// Fall back to a filtered scan for wildcards (or missing fss entry).
			_, _, l = mb.filteredPendingLocked(subj, wc, mb.first.seq)
		}
		if l > 0 {
			if mb.cacheNotLoaded() {
				if err := mb.loadMsgsWithLock(); err != nil {
					mb.mu.Unlock()
					return nil, err
				}
			}
			lsm, err = mb.cacheLookup(l, sm)
		}
		mb.mu.Unlock()
		if l > 0 {
			break
		}
	}
	return lsm, err
}
// LoadLastMsg will return the last message we have that matches a given subject.
// The subject can be a wildcard.
func (fs *fileStore) LoadLastMsg(subject string, smv *StoreMsg) (sm *StoreMsg, err error) {
	// Empty or full wildcard means the very last message in the stream.
	wantAll := subject == _EMPTY_ || subject == fwcs
	if wantAll {
		sm, err = fs.msgForSeq(fs.lastSeq(), smv)
	} else {
		sm, err = fs.loadLast(subject, smv)
	}
	// Normalize any failure (other than a closed store) to not-found.
	if sm == nil || (err != nil && err != ErrStoreClosed) {
		err = ErrStoreMsgNotFound
	}
	return sm, err
}
// LoadNextMsg will return the next message matching the filter at or after
// the start sequence. wc indicates whether filter contains wildcards.
// On success returns the message and its sequence; on ErrStoreEOF the
// returned sequence is the stream's last sequence.
func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, uint64, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.closed {
		return nil, 0, ErrStoreClosed
	}
	// Clamp to the first available sequence.
	if start < fs.state.FirstSeq {
		start = fs.state.FirstSeq
	}

	if bi, _ := fs.selectMsgBlockWithIndex(start); bi >= 0 {
		// Walk blocks forward from the one containing start.
		for i := bi; i < len(fs.blks); i++ {
			mb := fs.blks[i]
			if sm, expireOk, err := mb.firstMatching(filter, wc, start, sm); err == nil {
				// Linear scan hint: expire interior block caches when safe.
				if expireOk && mb != fs.lmb {
					mb.tryForceExpireCache()
				}
				return sm, sm.seq, nil
			} else if err != ErrStoreMsgNotFound {
				return nil, 0, err
			}
		}
	}
	return nil, fs.state.LastSeq, ErrStoreEOF
}
// Type returns the type of the underlying store.
func (fs *fileStore) Type() StorageType {
	return FileStorage
}
// Returns number of subjects in this store.
// Lock should be held.
func (fs *fileStore) numSubjects() int {
	return len(fs.psim)
}
// FastState will fill in state with only the following.
// Msgs, Bytes, First and Last Sequence and Time and NumDeleted.
// Unlike State(), NumDeleted is derived arithmetically from the sequence
// span, so no per-block walk is required.
func (fs *fileStore) FastState(state *StreamState) {
	fs.mu.RLock()
	state.Msgs = fs.state.Msgs
	state.Bytes = fs.state.Bytes
	state.FirstSeq = fs.state.FirstSeq
	state.FirstTime = fs.state.FirstTime
	state.LastSeq = fs.state.LastSeq
	state.LastTime = fs.state.LastTime
	if state.LastSeq > state.FirstSeq {
		state.NumDeleted = int((state.LastSeq - state.FirstSeq + 1) - state.Msgs)
		// Guard against transient negative values during updates.
		if state.NumDeleted < 0 {
			state.NumDeleted = 0
		}
	}
	state.Consumers = len(fs.cfs)
	state.NumSubjects = fs.numSubjects()
	fs.mu.RUnlock()
}
// State returns the current state of the stream.
// This materializes the full Deleted list by walking inter-block gaps and
// each block's delete map, so it is more expensive than FastState.
func (fs *fileStore) State() StreamState {
	fs.mu.RLock()
	state := fs.state
	state.Consumers = len(fs.cfs)
	state.NumSubjects = fs.numSubjects()
	state.Deleted = nil // make sure.

	if numDeleted := int((state.LastSeq - state.FirstSeq + 1) - state.Msgs); numDeleted > 0 {
		state.Deleted = make([]uint64, 0, numDeleted)
		cur := fs.state.FirstSeq

		for _, mb := range fs.blks {
			mb.mu.Lock()
			fseq := mb.first.seq
			// Account for messages missing from the head.
			if fseq > cur {
				for seq := cur; seq < fseq; seq++ {
					state.Deleted = append(state.Deleted, seq)
				}
			}
			cur = mb.last.seq + 1 // Expected next first.
			// Collect interior deletes, pruning stale entries below first.
			mb.dmap.Range(func(seq uint64) bool {
				if seq < fseq {
					mb.dmap.Delete(seq)
				} else {
					state.Deleted = append(state.Deleted, seq)
				}
				return true
			})
			mb.mu.Unlock()
		}
	}
	fs.mu.RUnlock()

	// NOTE(review): lostData() is invoked after releasing the read lock —
	// presumably it takes its own locking; verify.
	state.Lost = fs.lostData()

	// Can not be guaranteed to be sorted.
	if len(state.Deleted) > 0 {
		sort.Slice(state.Deleted, func(i, j int) bool {
			return state.Deleted[i] < state.Deleted[j]
		})
		state.NumDeleted = len(state.Deleted)
	}
	return state
}
// Utilization reports the raw bytes on disk (total) versus the bytes
// accounted to live messages (reported) across all message blocks.
func (fs *fileStore) Utilization() (total, reported uint64, err error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	for _, mb := range fs.blks {
		mb.mu.RLock()
		total, reported = total+mb.rbytes, reported+mb.bytes
		mb.mu.RUnlock()
	}
	return total, reported, nil
}
// fileStoreMsgSize returns the on-disk record size for a message:
// record length (4) + seq (8) + ts (8) + subject length (2) + subject +
// optional [header length (4) + headers] + msg + hash (8).
func fileStoreMsgSize(subj string, hdr, msg []byte) uint64 {
	// Fixed overhead: rl(4) + seq(8) + ts(8) + slen(2) = 22, plus trailing hash(8).
	size := uint64(22 + len(subj) + len(msg) + 8)
	if len(hdr) > 0 {
		// Headers add a 4 byte length prefix plus the header bytes themselves.
		size += uint64(4 + len(hdr))
	}
	return size
}
// fileStoreMsgSizeEstimate returns a worst-case record size for a message
// with a subject of slen bytes and up to maxPayload bytes of headers+payload.
func fileStoreMsgSizeEstimate(slen, maxPayload int) uint64 {
	return uint64(emptyRecordLen + slen + 4 + maxPayload)
}
// Determine time since last write or remove of a message.
// Read lock should be held.
func (mb *msgBlock) sinceLastWriteActivity() time.Duration {
	if mb.closed {
		return 0
	}
	// Most recent of last-write and last-remove timestamps.
	last := mb.lrts
	if mb.lwts > last {
		last = mb.lwts
	}
	return time.Since(time.Unix(0, last).UTC())
}
// checkNewHeader validates a state file header: magic byte followed by a
// supported version. Returns errCorruptState otherwise.
func checkNewHeader(hdr []byte) error {
	// A nil slice has len 0, so the length check covers it.
	if len(hdr) >= 2 && hdr[0] == magic && (hdr[1] == version || hdr[1] == newVersion) {
		return nil
	}
	return errCorruptState
}
// readIndexInfo will read in the index information for the message block.
// Restores msgs/bytes counters, first/last markers, checksum and the delete
// map from the legacy per-block index file, decrypting first when needed.
// Removes the index file and errors on any inconsistency so callers rebuild.
func (mb *msgBlock) readIndexInfo() error {
	ifn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, mb.index))
	buf, err := os.ReadFile(ifn)
	if err != nil {
		return err
	}

	// Set if first time.
	if mb.liwsz == 0 {
		mb.liwsz = int64(len(buf))
	}

	// Decrypt if needed.
	if mb.aek != nil {
		buf, err = mb.aek.Open(buf[:0], mb.nonce, buf, nil)
		if err != nil {
			return err
		}
	}

	if err := checkNewHeader(buf); err != nil {
		defer os.Remove(ifn)
		return fmt.Errorf("bad index file")
	}

	bi := hdrLen

	// Helpers, will set i to -1 on error.
	readSeq := func() uint64 {
		if bi < 0 {
			return 0
		}
		seq, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		// Strip the erase bit from stored sequences.
		return seq &^ ebit
	}
	readCount := readSeq
	readTimeStamp := func() int64 {
		if bi < 0 {
			return 0
		}
		ts, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return ts
	}
	mb.msgs = readCount()
	mb.bytes = readCount()
	mb.first.seq = readSeq()
	mb.first.ts = readTimeStamp()
	mb.last.seq = readSeq()
	mb.last.ts = readTimeStamp()
	dmapLen := readCount()

	// Check if this is a short write index file.
	if bi < 0 || bi+checksumSize > len(buf) {
		os.Remove(ifn)
		return fmt.Errorf("short index file")
	}

	// Check for consistency if accounting. If something is off bail and we will rebuild.
	if mb.msgs != (mb.last.seq-mb.first.seq+1)-dmapLen {
		os.Remove(ifn)
		return fmt.Errorf("accounting inconsistent")
	}

	// Checksum
	copy(mb.lchk[0:], buf[bi:bi+checksumSize])
	bi += checksumSize

	// Now check for presence of a delete map
	if dmapLen > 0 {
		// New version is encoded avl seqset.
		if buf[1] == newVersion {
			dmap, _, err := avl.Decode(buf[bi:])
			if err != nil {
				return fmt.Errorf("could not decode avl dmap: %v", err)
			}
			mb.dmap = *dmap
		} else {
			// This is the old version.
			// Old format stored deletes as deltas from first.seq.
			for i := 0; i < int(dmapLen); i++ {
				seq := readSeq()
				if seq == 0 {
					break
				}
				mb.dmap.Insert(seq + mb.first.seq)
			}
		}
	}

	return nil
}
// Will return total number of cache loads.
func (fs *fileStore) cacheLoads() uint64 {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	var total uint64
	for _, mb := range fs.blks {
		total += mb.cloads
	}
	return total
}
// Will return total number of cached bytes.
func (fs *fileStore) cacheSize() uint64 {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	var total uint64
	for _, mb := range fs.blks {
		mb.mu.RLock()
		if c := mb.cache; c != nil {
			total += uint64(len(c.buf))
		}
		mb.mu.RUnlock()
	}
	return total
}
// Will return total number of dmapEntries for all msg blocks.
func (fs *fileStore) dmapEntries() int {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	var n int
	for _, mb := range fs.blks {
		n += mb.dmap.Size()
	}
	return n
}
// Fixed helper for iterating.
// subjectsEqual reports whether two literal subjects match exactly.
func subjectsEqual(a, b string) bool {
	return a == b
}

// subjectsAll matches any pair of subjects; used for the full wildcard case.
func subjectsAll(a, b string) bool {
	return true
}
// compareFn selects the subject comparison function appropriate for the
// given filter: match-all for empty/">", subset matching for wildcards,
// and exact equality otherwise.
func compareFn(subject string) func(string, string) bool {
	switch {
	case subject == _EMPTY_ || subject == fwcs:
		return subjectsAll
	case subjectHasWildcard(subject):
		return subjectIsSubsetMatch
	default:
		return subjectsEqual
	}
}
// PurgeEx will remove messages based on subject filters, sequence and number of messages to keep.
// Will return the number of purged messages.
// subject selects eligible messages (empty or ">" means all), sequence > 1
// is an exclusive upper bound, and keep retains that many newest matches.
func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint64, err error) {
	// Fast paths that degenerate to a full purge or a compact.
	if subject == _EMPTY_ || subject == fwcs {
		if keep == 0 && (sequence == 0 || sequence == 1) {
			return fs.Purge()
		}
		if sequence > 1 {
			return fs.Compact(sequence)
		}
	}

	eq, wc := compareFn(subject), subjectHasWildcard(subject)
	var firstSeqNeedsUpdate bool
	var bytes uint64

	// If we have a "keep" designation need to get full filtered state so we know how many to purge.
	var maxp uint64
	if keep > 0 {
		ss := fs.FilteredState(1, subject)
		if keep >= ss.Msgs {
			return 0, nil
		}
		maxp = ss.Msgs - keep
	}

	var smv StoreMsg

	fs.mu.Lock()
	// We may remove blocks as we purge, so don't range directly on fs.blks
	// otherwise we may jump over some (see https://github.com/nats-io/nats-server/issues/3528)
	for i := 0; i < len(fs.blks); i++ {
		mb := fs.blks[i]
		mb.mu.Lock()

		// Skip blocks whose per-subject info can not be loaded.
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			mb.mu.Unlock()
			continue
		}
		t, f, l := mb.filteredPendingLocked(subject, wc, mb.first.seq)
		if t == 0 {
			// No matches in this block.
			mb.mu.Unlock()
			continue
		}

		var shouldExpire bool
		if mb.cacheNotLoaded() {
			mb.loadMsgsWithLock()
			shouldExpire = true
		}
		// Honor the exclusive sequence upper bound.
		if sequence > 1 && sequence <= l {
			l = sequence - 1
		}

		for seq := f; seq <= l; seq++ {
			if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && eq(sm.subj, subject) {
				rl := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
				// Do fast in place remove.
				// Stats
				if mb.msgs > 0 {
					// Msgs
					fs.state.Msgs--
					mb.msgs--
					// Bytes, make sure to not go negative.
					if rl > fs.state.Bytes {
						rl = fs.state.Bytes
					}
					if rl > mb.bytes {
						rl = mb.bytes
					}
					fs.state.Bytes -= rl
					mb.bytes -= rl
					// Totals
					purged++
					bytes += rl
				}
				// FSS updates.
				mb.removeSeqPerSubject(sm.subj, seq)
				fs.removePerSubject(sm.subj)

				// Check for first message.
				if seq == mb.first.seq {
					mb.selectNextFirst()
					if mb.isEmpty() {
						fs.removeMsgBlock(mb)
						// Stay on this index since the slice shifted down.
						i--
						// keep flag set, if set previously
						firstSeqNeedsUpdate = firstSeqNeedsUpdate || seq == fs.state.FirstSeq
					} else if seq == fs.state.FirstSeq {
						fs.state.FirstSeq = mb.first.seq // new one.
						fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
					}
				} else {
					// Out of order delete.
					mb.dmap.Insert(seq)
				}

				if maxp > 0 && purged >= maxp {
					break
				}
			}
		}

		// Expire if we were responsible for loading.
		if shouldExpire {
			// Expire this cache before moving on.
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()

		// Check if we should break out of top level too.
		if maxp > 0 && purged >= maxp {
			break
		}
	}
	if firstSeqNeedsUpdate {
		fs.selectNextFirst()
	}

	fs.dirty++
	cb := fs.scb
	fs.mu.Unlock()

	fs.kickFlushStateLoop()

	if cb != nil {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return purged, nil
}
// Purge will remove all messages from this store.
// Will return the number of purged messages.
func (fs *fileStore) Purge() (uint64, error) {
	return fs.purge(0)
}
// purge removes all messages, optionally restarting the stream at fseq.
// The msgs directory is renamed aside and removed out of band, a fresh
// write block is created, and a tombstone records the starting sequence.
func (fs *fileStore) purge(fseq uint64) (uint64, error) {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return 0, ErrStoreClosed
	}

	purged := fs.state.Msgs
	rbytes := int64(fs.state.Bytes)

	fs.state.FirstSeq = fs.state.LastSeq + 1
	fs.state.FirstTime = time.Time{}

	fs.state.Bytes = 0
	fs.state.Msgs = 0

	for _, mb := range fs.blks {
		mb.dirtyClose()
	}

	fs.blks = nil
	fs.lmb = nil
	fs.bim = make(map[uint32]*msgBlock)
	// Clear any per subject tracking.
	fs.psim = make(map[string]*psi)
	// Mark dirty
	fs.dirty++

	// Move the msgs directory out of the way, will delete out of band.
	// FIXME(dlc) - These can error and we need to change api above to propagate?
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	// If purge directory still exists then we need to wait
	// in place and remove since rename would fail.
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}
	os.Rename(mdir, pdir)
	go os.RemoveAll(pdir)
	// Create new one.
	os.MkdirAll(mdir, defaultDirPerms)

	// Make sure we have a lmb to write to.
	if _, err := fs.newMsgBlockForWrite(); err != nil {
		fs.mu.Unlock()
		return purged, err
	}

	// Check if we need to set the first seq to a new number.
	if fseq > fs.state.FirstSeq {
		fs.state.FirstSeq = fseq
		fs.state.LastSeq = fseq - 1
	}

	lmb := fs.lmb
	lmb.first.seq = fs.state.FirstSeq
	lmb.last.seq = fs.state.LastSeq
	lmb.last.ts = fs.state.LastTime.UnixNano()

	if fs.lmb.last.seq > 1 {
		// Leave a tombstone so we can remember our starting sequence in case
		// full state becomes corrupted.
		lmb.writeTombstone(fs.lmb.last.seq, fs.lmb.last.ts)
	}

	cb := fs.scb
	fs.mu.Unlock()

	if cb != nil {
		cb(-int64(purged), -rbytes, 0, _EMPTY_)
	}

	return purged, nil
}
// Compact will remove all messages from this store up to
// but not including the seq parameter.
// Will return the number of purged messages.
// Whole blocks below the target are dropped; the block containing seq has
// its head trimmed in place and may be rewritten to reclaim disk space.
func (fs *fileStore) Compact(seq uint64) (uint64, error) {
	if seq == 0 {
		return fs.purge(seq)
	}

	var purged, bytes uint64

	fs.mu.Lock()
	// Same as purge all.
	if lseq := fs.state.LastSeq; seq > lseq {
		fs.mu.Unlock()
		return fs.purge(seq)
	}
	// We have to delete interior messages.
	smb := fs.selectMsgBlock(seq)
	if smb == nil {
		fs.mu.Unlock()
		return 0, nil
	}

	// All msgblocks up to this one can be thrown away.
	var deleted int
	for _, mb := range fs.blks {
		if mb == smb {
			break
		}
		mb.mu.Lock()
		purged += mb.msgs
		bytes += mb.bytes
		// Make sure we do subject cleanup as well.
		mb.ensurePerSubjectInfoLoaded()
		for subj := range mb.fss {
			fs.removePerSubject(subj)
		}
		// Now close.
		mb.dirtyCloseWithRemove(true)
		mb.mu.Unlock()
		deleted++
	}

	var smv StoreMsg
	var err error
	var isEmpty bool

	smb.mu.Lock()
	if smb.first.seq == seq {
		// Nothing to trim in the selected block.
		goto SKIP
	}

	// Make sure we have the messages loaded.
	if smb.cacheNotLoaded() {
		if err = smb.loadMsgsWithLock(); err != nil {
			goto SKIP
		}
	}
	for mseq := smb.first.seq; mseq < seq; mseq++ {
		sm, err := smb.cacheLookup(mseq, &smv)
		if err == errDeletedMsg {
			// Update dmap.
			// Remove the sequence we are walking (mseq), not the compact
			// target, so interior delete entries are actually cleaned up.
			if !smb.dmap.IsEmpty() {
				smb.dmap.Delete(mseq)
			}
		} else if sm != nil {
			sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
			if smb.msgs > 0 {
				smb.msgs--
				// Bytes, make sure to not go negative.
				if sz > smb.bytes {
					sz = smb.bytes
				}
				smb.bytes -= sz
				bytes += sz
				purged++
			}
			// Update fss
			smb.removeSeqPerSubject(sm.subj, mseq)
			fs.removePerSubject(sm.subj)
		}
	}

	// Check if empty after processing, could happen if tail of messages are all deleted.
	isEmpty = smb.msgs == 0
	if isEmpty {
		smb.dirtyCloseWithRemove(true)
		// Update fs first here as well.
		fs.state.FirstSeq = smb.last.seq + 1
		fs.state.FirstTime = time.Time{}
		deleted++
	} else {
		// Make sure to sync changes.
		smb.needSync = true
		// Update fs first seq and time.
		smb.first.seq = seq - 1 // Just for start condition for selectNextFirst.
		smb.selectNextFirst()
		fs.state.FirstSeq = smb.first.seq
		fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC()

		// Check if we should reclaim the head space from this block.
		// This will be optimistic only, so don't continue if we encounter any errors here.
		if smb.rbytes > compactMinimum && smb.bytes*2 < smb.rbytes {
			var moff uint32
			moff, _, _, err = smb.slotInfo(int(smb.first.seq - smb.cache.fseq))
			if err != nil || moff >= uint32(len(smb.cache.buf)) {
				goto SKIP
			}
			buf := smb.cache.buf[moff:]
			// Don't reuse, copy to new recycled buf.
			nbuf := getMsgBlockBuf(len(buf))
			nbuf = append(nbuf, buf...)
			smb.closeFDsLockedNoCheck()
			// Check for encryption.
			if smb.bek != nil && len(nbuf) > 0 {
				// Recreate to reset counter.
				bek, err := genBlockEncryptionKey(smb.fs.fcfg.Cipher, smb.seed, smb.nonce)
				if err != nil {
					goto SKIP
				}
				// For future writes make sure to set smb.bek to keep counter correct.
				smb.bek = bek
				smb.bek.XORKeyStream(nbuf, nbuf)
			}
			// Recompress if necessary (smb.cmp contains the algorithm used when
			// the block was loaded from disk, or defaults to NoCompression if not)
			if nbuf, err = smb.cmp.Compress(nbuf); err != nil {
				goto SKIP
			}
			if err = os.WriteFile(smb.mfn, nbuf, defaultFilePerms); err != nil {
				goto SKIP
			}
			// Make sure to remove fss state.
			smb.fss = nil
			smb.clearCacheAndOffset()
			smb.rbytes = uint64(len(nbuf))
		}
	}

SKIP:
	smb.mu.Unlock()

	if deleted > 0 {
		// Update block map.
		if fs.bim != nil {
			for _, mb := range fs.blks[:deleted] {
				delete(fs.bim, mb.index)
			}
		}
		// Update blks slice.
		fs.blks = copyMsgBlocks(fs.blks[deleted:])
		if lb := len(fs.blks); lb == 0 {
			fs.lmb = nil
		} else {
			fs.lmb = fs.blks[lb-1]
		}
	}

	// Update top level accounting.
	if purged > fs.state.Msgs {
		purged = fs.state.Msgs
	}
	fs.state.Msgs -= purged
	if bytes > fs.state.Bytes {
		bytes = fs.state.Bytes
	}
	fs.state.Bytes -= bytes

	fs.dirty++
	fs.kickFlushStateLoop()

	cb := fs.scb
	fs.mu.Unlock()

	if cb != nil && purged > 0 {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return purged, err
}
// Will completely reset our store.
// Removes all blocks from disk and zeroes stream state; fails when a
// snapshot is in progress.
func (fs *fileStore) reset() error {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return ErrStoreClosed
	}
	if fs.sips > 0 {
		fs.mu.Unlock()
		return ErrStoreSnapshotInProgress
	}

	var purged, bytes uint64
	cb := fs.scb

	for _, mb := range fs.blks {
		mb.mu.Lock()
		purged += mb.msgs
		bytes += mb.bytes
		mb.dirtyCloseWithRemove(true)
		mb.mu.Unlock()
	}

	// Reset
	fs.state.FirstSeq = 0
	fs.state.FirstTime = time.Time{}
	fs.state.LastSeq = 0
	fs.state.LastTime = time.Now().UTC()
	// Update msgs and bytes.
	fs.state.Msgs = 0
	fs.state.Bytes = 0

	// Reset blocks.
	fs.blks, fs.lmb = nil, nil

	// Reset subject mappings.
	fs.psim = make(map[string]*psi)
	fs.bim = make(map[uint32]*msgBlock)

	// If we purged anything, make sure we kick flush state loop.
	if purged > 0 {
		fs.dirty++
		fs.kickFlushStateLoop()
	}

	fs.mu.Unlock()

	if cb != nil {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return nil
}
// Truncate will truncate a stream store up to seq. Sequence needs to be valid.
// A seq of 0 resets the store entirely. All blocks after the one containing
// seq are removed and that block becomes the new last (write) block.
func (fs *fileStore) Truncate(seq uint64) error {
	// Check for request to reset.
	if seq == 0 {
		return fs.reset()
	}

	fs.mu.Lock()

	if fs.closed {
		fs.mu.Unlock()
		return ErrStoreClosed
	}
	if fs.sips > 0 {
		fs.mu.Unlock()
		return ErrStoreSnapshotInProgress
	}

	nlmb := fs.selectMsgBlock(seq)
	if nlmb == nil {
		fs.mu.Unlock()
		return ErrInvalidSequence
	}
	lsm, _, _ := nlmb.fetchMsg(seq, nil)
	if lsm == nil {
		fs.mu.Unlock()
		return ErrInvalidSequence
	}

	// Set lmb to nlmb and make sure writeable.
	fs.lmb = nlmb
	if err := nlmb.enableForWriting(fs.fip); err != nil {
		// Do not return holding the store lock; previously this error path
		// leaked fs.mu and would deadlock all subsequent store operations.
		fs.mu.Unlock()
		return err
	}

	var purged, bytes uint64

	// Truncate our new last message block.
	nmsgs, nbytes, err := nlmb.truncate(lsm)
	if err != nil {
		fs.mu.Unlock()
		return fmt.Errorf("nlmb.truncate: %w", err)
	}
	// Account for the truncated msgs and bytes.
	purged += nmsgs
	bytes += nbytes

	// Remove any left over msg blocks.
	getLastMsgBlock := func() *msgBlock { return fs.blks[len(fs.blks)-1] }
	for mb := getLastMsgBlock(); mb != nlmb; mb = getLastMsgBlock() {
		mb.mu.Lock()
		purged += mb.msgs
		bytes += mb.bytes
		fs.removeMsgBlock(mb)
		mb.mu.Unlock()
	}

	// Reset last.
	fs.state.LastSeq = lsm.seq
	fs.state.LastTime = time.Unix(0, lsm.ts).UTC()
	// Update msgs and bytes.
	if purged > fs.state.Msgs {
		purged = fs.state.Msgs
	}
	fs.state.Msgs -= purged
	if bytes > fs.state.Bytes {
		bytes = fs.state.Bytes
	}
	fs.state.Bytes -= bytes

	// Reset our subject lookup info.
	fs.resetGlobalPerSubjectInfo()

	fs.dirty++
	fs.kickFlushStateLoop()

	cb := fs.scb
	fs.mu.Unlock()

	if cb != nil {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return nil
}
// lastSeq returns the stream's last sequence under the read lock.
func (fs *fileStore) lastSeq() uint64 {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	return fs.state.LastSeq
}
// Returns number of msg blks.
func (fs *fileStore) numMsgBlocks() int {
	fs.mu.RLock()
	n := len(fs.blks)
	fs.mu.RUnlock()
	return n
}
// Will add a new msgBlock.
// Lock should be held.
// Appends to blks, makes it the last (write) block and indexes it by block index.
func (fs *fileStore) addMsgBlock(mb *msgBlock) {
	fs.blks = append(fs.blks, mb)
	fs.lmb = mb
	fs.bim[mb.index] = mb
}
// Remove from our list of blks.
// Both locks should be held.
func (fs *fileStore) removeMsgBlockFromList(mb *msgBlock) {
	// Remove from list.
	for i, omb := range fs.blks {
		if mb == omb {
			fs.dirty++
			// Splice out, then copy so readers holding the old slice
			// are not affected by the in-place append shuffle.
			blks := append(fs.blks[:i], fs.blks[i+1:]...)
			fs.blks = copyMsgBlocks(blks)
			if fs.bim != nil {
				delete(fs.bim, mb.index)
			}
			break
		}
	}
}
// Removes the msgBlock
// Both locks should be held.
// If mb is the last block, its lock is temporarily released so a fresh
// write block can be created and a tombstone written.
func (fs *fileStore) removeMsgBlock(mb *msgBlock) {
	mb.dirtyCloseWithRemove(true)
	fs.removeMsgBlockFromList(mb)
	// Check for us being last message block
	if mb == fs.lmb {
		last := mb.last
		// Creating a new message write block requires that the lmb lock is not held.
		mb.mu.Unlock()
		// Write the tombstone to remember since this was last block.
		if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil {
			lmb.writeTombstone(last.seq, last.ts)
		}
		// Reacquire to satisfy the caller's lock expectations.
		mb.mu.Lock()
	}
}
// Called by purge to simply get rid of the cache and close our fds.
// Lock should not be held.
func (mb *msgBlock) dirtyClose() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.dirtyCloseWithRemove(false)
}
// Should be called with lock held.
// Tears down timers, caches, loops and fds without syncing; when remove is
// true the message and key files are deleted from disk as well.
func (mb *msgBlock) dirtyCloseWithRemove(remove bool) {
	if mb == nil {
		return
	}
	// Stop cache expiration timer.
	if mb.ctmr != nil {
		mb.ctmr.Stop()
		mb.ctmr = nil
	}
	// Clear any tracking by subject.
	mb.fss = nil
	// Close cache
	mb.clearCacheAndOffset()
	// Quit our loops.
	if mb.qch != nil {
		close(mb.qch)
		mb.qch = nil
	}
	if mb.mfd != nil {
		mb.mfd.Close()
		mb.mfd = nil
	}
	if remove {
		if mb.mfn != _EMPTY_ {
			os.Remove(mb.mfn)
			mb.mfn = _EMPTY_
		}
		if mb.kfn != _EMPTY_ {
			os.Remove(mb.kfn)
		}
		// Since we are removing a block kick the state flusher.
		mb.fs.kickFlushStateLoop()
	}
}
// Remove a seq from the fss and select new first.
// Lock should be held.
func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) {
	mb.ensurePerSubjectInfoLoaded()
	ss := mb.fss[subj]
	if ss == nil {
		return
	}

	if ss.Msgs == 1 {
		// Last message for this subject in the block.
		delete(mb.fss, subj)
		return
	}

	ss.Msgs--

	// Only one left.
	if ss.Msgs == 1 {
		// The survivor is whichever endpoint was not just removed.
		if seq == ss.Last {
			ss.Last = ss.First
		} else {
			ss.First = ss.Last
		}
		ss.firstNeedsUpdate = false
		return
	}

	// We can lazily calculate the first sequence when needed.
	ss.firstNeedsUpdate = seq == ss.First || ss.firstNeedsUpdate
}
// Will recalculate the first sequence for this subject in this block.
// Will avoid slower path message lookups and scan the cache directly instead.
// startSeq is the previous (now removed) first; scanning resumes just after it.
func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *SimpleState) {
	// Need to make sure messages are loaded.
	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return
		}
	}

	// Mark first as updated.
	ss.firstNeedsUpdate = false
	startSeq++

	startSlot := int(startSeq - mb.cache.fseq)
	if startSlot >= len(mb.cache.idx) {
		// Nothing beyond startSeq; collapse to the last entry.
		ss.First = ss.Last
		return
	} else if startSlot < 0 {
		startSlot = 0
	}

	var le = binary.LittleEndian
	for slot := startSlot; slot < len(mb.cache.idx); slot++ {
		// Strip the checksum-checked bit to get the raw buffer offset.
		li := int(mb.cache.idx[slot]&^hbit) - mb.cache.off
		if li >= len(mb.cache.buf) {
			ss.First = ss.Last
			return
		}
		buf := mb.cache.buf[li:]
		hdr := buf[:msgHdrSize]
		slen := int(le.Uint16(hdr[20:]))
		if subj == string(buf[msgHdrSize:msgHdrSize+slen]) {
			seq := le.Uint64(hdr[4:])
			// Skip out-of-range and erased records.
			if seq < mb.first.seq || seq&ebit != 0 {
				continue
			}
			// Skip interior deletes.
			if mb.dmap.Exists(seq) {
				continue
			}
			ss.First = seq
			return
		}
	}
}
// Lock should be held.
// Rebuilds the global per-subject index map (psim) from every block.
func (fs *fileStore) resetGlobalPerSubjectInfo() {
	// Clear any global subject state.
	fs.psim = make(map[string]*psi)
	for _, mb := range fs.blks {
		fs.populateGlobalPerSubjectInfo(mb)
	}
}
// Lock should be held.
// Drops and regenerates this block's per-subject state (fss).
func (mb *msgBlock) resetPerSubjectInfo() error {
	mb.fss = nil
	return mb.generatePerSubjectInfo()
}
// generatePerSubjectInfo will generate the per subject info via the raw msg block.
// Lock should be held.
func (mb *msgBlock) generatePerSubjectInfo() error {
	// Check if this mb is empty. This can happen when its the last one and we are holding onto it for seq and timestamp info.
	if mb.msgs == 0 {
		return nil
	}

	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return err
		}
		// indexCacheBuf can produce fss now, so if non-nil we are good.
		if mb.fss != nil {
			return nil
		}
	}

	// Create new one regardless.
	mb.fss = make(map[string]*SimpleState)

	var smv StoreMsg
	fseq, lseq := mb.first.seq, mb.last.seq
	for seq := fseq; seq <= lseq; seq++ {
		sm, err := mb.cacheLookup(seq, &smv)
		if err != nil {
			// Since we are walking by sequence we can ignore some errors that are benign to rebuilding our state.
			if err == ErrStoreMsgNotFound || err == errDeletedMsg {
				continue
			}
			if err == errNoCache {
				return nil
			}
			return err
		}
		if sm != nil && len(sm.subj) > 0 {
			if ss := mb.fss[sm.subj]; ss != nil {
				ss.Msgs++
				ss.Last = seq
			} else {
				mb.fss[sm.subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
			}
		}
	}

	if len(mb.fss) > 0 {
		// Make sure we run the cache expire timer.
		mb.llts = time.Now().UnixNano()
		mb.startCacheExpireTimer()
	}
	return nil
}
// Helper to make sure fss loaded if we are tracking.
// Lock should be held
func (mb *msgBlock) ensurePerSubjectInfoLoaded() error {
	switch {
	case mb.fss != nil, mb.noTrack:
		// Already loaded, or tracking disabled.
		return nil
	case mb.msgs == 0:
		// Empty block; nothing to scan, install an empty map.
		mb.fss = make(map[string]*SimpleState)
		return nil
	}
	return mb.generatePerSubjectInfo()
}
// Called on recovery to populate the global psim state.
// Lock should be held.
// Merges this block's fss totals and block index bounds into fs.psim.
func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
		return
	}

	// Now populate psim.
	for subj, ss := range mb.fss {
		if len(subj) > 0 {
			if info, ok := fs.psim[subj]; ok {
				info.total += ss.Msgs
				// Extend the last-block bound; first-block stays put since
				// blocks are processed in order on recovery.
				if mb.index > info.lblk {
					info.lblk = mb.index
				}
			} else {
				fs.psim[subj] = &psi{total: ss.Msgs, fblk: mb.index, lblk: mb.index}
			}
		}
	}
}
// Close the message block.
// When sync is true the underlying file is fsynced before closing.
// Idempotent: subsequent calls after the first are no-ops.
func (mb *msgBlock) close(sync bool) {
	if mb == nil {
		return
	}
	mb.mu.Lock()
	defer mb.mu.Unlock()

	if mb.closed {
		return
	}

	// Stop cache expiration timer.
	if mb.ctmr != nil {
		mb.ctmr.Stop()
		mb.ctmr = nil
	}

	mb.fss = nil

	// Close cache
	mb.clearCacheAndOffset()
	// Quit our loops.
	if mb.qch != nil {
		close(mb.qch)
		mb.qch = nil
	}
	if mb.mfd != nil {
		if sync {
			mb.mfd.Sync()
		}
		mb.mfd.Close()
	}
	mb.mfd = nil
	// Mark as closed.
	mb.closed = true
}
// closeAllMsgBlocks closes every message block, optionally syncing each to disk.
func (fs *fileStore) closeAllMsgBlocks(sync bool) {
	for _, mb := range fs.blks {
		mb.close(sync)
	}
}
// Delete removes the entire store from disk, stopping it first.
// Retries removal for up to one second to cope with transient failures
// (e.g. files still held open).
func (fs *fileStore) Delete() error {
	if fs.isClosed() {
		// Always attempt to remove since we could have been closed beforehand.
		os.RemoveAll(fs.fcfg.StoreDir)
		// Since we did remove, if we did have anything remaining make sure to
		// call into any storage updates that had been registered.
		fs.mu.Lock()
		cb, msgs, bytes := fs.scb, int64(fs.state.Msgs), int64(fs.state.Bytes)
		// Guard against double accounting if called twice.
		fs.state.Msgs, fs.state.Bytes = 0, 0
		fs.mu.Unlock()
		if msgs > 0 && cb != nil {
			cb(-msgs, -bytes, 0, _EMPTY_)
		}
		return ErrStoreClosed
	}

	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	// If purge directory still exists then we need to wait
	// in place and remove since rename would fail.
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}

	// Do Purge() since if we have lots of blocks uses a mv/rename.
	fs.Purge()

	if err := fs.Stop(); err != nil {
		return err
	}

	err := os.RemoveAll(fs.fcfg.StoreDir)
	if err == nil {
		return nil
	}
	// Retry with backoff; removal can fail transiently right after Stop.
	ttl := time.Now().Add(time.Second)
	for time.Now().Before(ttl) {
		time.Sleep(10 * time.Millisecond)
		if err = os.RemoveAll(fs.fcfg.StoreDir); err == nil {
			return nil
		}
	}
	return err
}
// Stop and clear the background sync timer if it is armed.
// Lock should be held.
func (fs *fileStore) cancelSyncTimer() {
	if tmr := fs.syncTmr; tmr != nil {
		tmr.Stop()
		fs.syncTmr = nil
	}
}
// Magic and version bytes written at the head of the full stream state file
// (see writeFullState and recovery).
const (
	fullStateMagic   = uint8(11)
	fullStateVersion = uint8(1)
)
// This go routine runs and receives kicks to write out our full stream state index.
// This will get kicked when we create a new block or when we delete a block in general.
// This is also called during Stop().
func (fs *fileStore) flushStreamStateLoop(fch, qch, done chan struct{}) {
	for {
		select {
		case <-qch:
			// Signal Stop() that we have exited.
			close(done)
			return
		case <-fch:
			fs.writeFullState()
		}
	}
}
// Kick the flusher.
// Non-blocking signal to flushStreamStateLoop that a full state write is wanted.
func (fs *fileStore) kickFlushStateLoop() {
	kickFlusher(fs.fch)
}
// Helper since unixnano of zero time undefined.
func timestampNormalized(t time.Time) int64 {
if t.IsZero() {
return 0
}
return t.UnixNano()
}
// This will write the full binary state for the stream.
// This plus everything new since last hash will be the total recovered state.
// This state dump will have the following.
// 1. Stream summary - Msgs, Bytes, First and Last (Sequence and Timestamp)
// 2. PSIM - Per Subject Index Map - Tracks first and last blocks with subjects present.
// 3. MBs - Index, Bytes, First and Last Sequence and Timestamps, and the deleted map (avl.seqset).
// 4. Last block index and hash of record inclusive to this stream state.
func (fs *fileStore) writeFullState() error {
	fs.mu.Lock()
	// Nothing to do if closed or no state changes since the last write.
	if fs.closed || fs.dirty == 0 {
		fs.mu.Unlock()
		return nil
	}

	// Stack scratch buffer; binary.Append* spills to the heap only if outgrown.
	var _buf [32 * 1024]byte
	_buf[0], _buf[1] = fullStateMagic, fullStateVersion
	buf := _buf[:hdrLen]

	// 1. Stream summary.
	buf = binary.AppendUvarint(buf, fs.state.Msgs)
	buf = binary.AppendUvarint(buf, fs.state.Bytes)
	buf = binary.AppendUvarint(buf, fs.state.FirstSeq)
	buf = binary.AppendVarint(buf, timestampNormalized(fs.state.FirstTime))
	buf = binary.AppendUvarint(buf, fs.state.LastSeq)
	buf = binary.AppendVarint(buf, timestampNormalized(fs.state.LastTime))

	// 2. Do per subject information map if applicable.
	numSubjects := len(fs.psim)
	buf = binary.AppendUvarint(buf, uint64(numSubjects))
	if numSubjects > 0 {
		for subj, psi := range fs.psim {
			buf = binary.AppendUvarint(buf, uint64(len(subj)))
			buf = append(buf, subj...)
			buf = binary.AppendUvarint(buf, psi.total)
			buf = binary.AppendUvarint(buf, uint64(psi.fblk))
			// lblk only written when total > 1; decoder can infer lblk == fblk otherwise.
			if psi.total > 1 {
				buf = binary.AppendUvarint(buf, uint64(psi.lblk))
			}
		}
	}

	// 3. Now walk all blocks and write out first and last and optional dmap encoding.
	var lbi uint32
	var lchk [8]byte
	nb := len(fs.blks)
	buf = binary.AppendUvarint(buf, uint64(nb))

	// Use basetime to save some space.
	baseTime := timestampNormalized(fs.state.FirstTime)

	for _, mb := range fs.blks {
		mb.mu.RLock()
		buf = binary.AppendUvarint(buf, uint64(mb.index))
		buf = binary.AppendUvarint(buf, mb.bytes)
		buf = binary.AppendUvarint(buf, mb.first.seq)
		buf = binary.AppendVarint(buf, mb.first.ts-baseTime)
		buf = binary.AppendUvarint(buf, mb.last.seq)
		buf = binary.AppendVarint(buf, mb.last.ts-baseTime)
		numDeleted := mb.dmap.Size()
		buf = binary.AppendUvarint(buf, uint64(numDeleted))
		if numDeleted > 0 {
			var scratch [8 * 1024]byte
			dmap, _ := mb.dmap.Encode(scratch[:0])
			buf = append(buf, dmap...)
		}
		// If this is the last one grab the last checksum and the block index, e.g. 22.blk, 22 is the block index.
		// We use this to quickly open this file on recovery.
		if mb == fs.lmb {
			lbi = mb.index
			mb.ensureLastChecksumLoaded()
			copy(lchk[0:], mb.lchk[:])
		}
		mb.mu.RUnlock()
	}

	// 4. Place block index and hash onto the end.
	buf = binary.AppendUvarint(buf, uint64(lbi))
	buf = append(buf, lchk[:]...)

	// Encrypt if needed.
	if fs.prf != nil {
		if err := fs.setupAEK(); err != nil {
			fs.mu.Unlock()
			return err
		}
		nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(buf)+fs.aek.Overhead())
		rand.Read(nonce)
		buf = fs.aek.Seal(nonce, nonce, buf, nil)
	}

	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)

	// Append a trailing checksum over everything so recovery can detect corruption.
	fs.hh.Reset()
	fs.hh.Write(buf)
	buf = fs.hh.Sum(buf)

	// Snapshot prior dirty count.
	priorDirty := fs.dirty

	// Release lock.
	fs.mu.Unlock()

	// Write to a tmp file and rename.
	const tmpPre = streamStreamStateFile + tsep
	f, err := os.CreateTemp(filepath.Join(fs.fcfg.StoreDir, msgDir), tmpPre)
	if err != nil {
		return err
	}
	tmpName := f.Name()
	defer os.Remove(tmpName)
	if _, err = f.Write(buf); err == nil && fs.fcfg.SyncAlways {
		f.Sync()
	}
	f.Close()
	if err != nil {
		return err
	}

	// Rename into position under our lock, clear prior dirty pending on success.
	// Only subtract the snapshotted count so updates that raced in stay dirty.
	fs.mu.Lock()
	if !fs.closed {
		if err := os.Rename(tmpName, fn); err != nil {
			fs.mu.Unlock()
			return err
		}
		fs.dirty -= priorDirty
	}
	fs.mu.Unlock()

	return nil
}
// Stop the current filestore.
// Flushes and closes all message blocks, stops timers, shuts down the state
// flusher loop, writes a final full state if dirty, stops all consumer stores,
// and reports released bytes to the storage callback.
func (fs *fileStore) Stop() error {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return ErrStoreClosed
	}
	fs.checkAndFlushAllBlocks()
	fs.closeAllMsgBlocks(false)
	fs.cancelSyncTimer()
	fs.cancelAgeChk()
	// Release the state flusher loop.
	close(fs.qch)
	// Wait for the state flush loop to exit.
	fsld := fs.fsld
	fs.mu.Unlock()
	<-fsld
	// Write full state if needed. If not dirty this is a no-op.
	// Done before marking closed so writeFullState does not short-circuit.
	fs.writeFullState()
	fs.mu.Lock()
	// Mark as closed.
	fs.closed = true
	fs.lmb = nil
	// We should update the upper usage layer on a stop.
	cb, bytes := fs.scb, int64(fs.state.Bytes)
	// Copy consumers out so each Stop() runs without holding our lock.
	var _cfs [256]ConsumerStore
	cfs := append(_cfs[:0], fs.cfs...)
	fs.cfs = nil
	fs.mu.Unlock()
	for _, o := range cfs {
		o.Stop()
	}
	if bytes > 0 && cb != nil {
		cb(0, -bytes, 0, _EMPTY_)
	}
	return nil
}
// errFile is the name of the tar entry used by streamSnapshot to report
// errors to the consumer of the snapshot stream.
const errFile = "errors.txt"
// Stream our snapshot through S2 compression and tar.
// Runs in its own goroutine (see Snapshot). Closes w when done and always
// decrements the snapshots-in-progress counter. Errors are surfaced to the
// reader of the tar stream via an errors.txt entry rather than returned.
func (fs *fileStore) streamSnapshot(w io.WriteCloser, state *StreamState, includeConsumers bool) {
	defer w.Close()

	enc := s2.NewWriter(w)
	defer enc.Close()

	tw := tar.NewWriter(enc)
	defer tw.Close()

	defer func() {
		fs.mu.Lock()
		fs.sips--
		fs.mu.Unlock()
	}()

	modTime := time.Now().UTC()

	// writeFile appends one entry to the tar archive.
	writeFile := func(name string, buf []byte) error {
		hdr := &tar.Header{
			Name:    name,
			Mode:    0600,
			ModTime: modTime,
			Uname:   "nats",
			Gname:   "nats",
			Size:    int64(len(buf)),
			Format:  tar.FormatPAX,
		}
		if err := tw.WriteHeader(hdr); err != nil {
			return err
		}
		if _, err := tw.Write(buf); err != nil {
			return err
		}
		return nil
	}

	// writeErr records a failure as an errors.txt entry in the archive.
	writeErr := func(err string) {
		writeFile(errFile, []byte(err))
	}

	fs.mu.Lock()
	blks := fs.blks
	// Grab our general meta data.
	// We do this now instead of pulling from files since they could be encrypted.
	meta, err := json.Marshal(fs.cfg)
	if err != nil {
		fs.mu.Unlock()
		writeErr(fmt.Sprintf("Could not gather stream meta file: %v", err))
		return
	}
	hh := fs.hh
	hh.Reset()
	hh.Write(meta)
	sum := []byte(hex.EncodeToString(fs.hh.Sum(nil)))
	fs.mu.Unlock()

	// Meta first.
	if writeFile(JetStreamMetaFile, meta) != nil {
		return
	}
	if writeFile(JetStreamMetaFileSum, sum) != nil {
		return
	}

	// Can't use join path here, tar only recognizes relative paths with forward slashes.
	msgPre := msgDir + "/"
	var bbuf []byte

	// Full stream state file, decrypted (and re-checksummed) when encryption is on.
	const minLen = 32
	sfn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	if buf, err := os.ReadFile(sfn); err == nil && len(buf) >= minLen {
		if fs.aek != nil {
			ns := fs.aek.NonceSize()
			buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:len(buf)-highwayhash.Size64], nil)
			if err == nil {
				// Redo hash checksum at end on plaintext.
				fs.mu.Lock()
				hh.Reset()
				hh.Write(buf)
				buf = fs.hh.Sum(buf)
				fs.mu.Unlock()
			}
		}
		if err == nil && writeFile(msgPre+streamStreamStateFile, buf) != nil {
			return
		}
	}

	// Now do messages themselves.
	for _, mb := range blks {
		if mb.pendingWriteSize() > 0 {
			mb.flushPendingMsgs()
		}
		mb.mu.Lock()
		// We could stream but don't want to hold the lock and prevent changes, so just read in and
		// release the lock for now.
		bbuf, err = mb.loadBlock(bbuf)
		if err != nil {
			mb.mu.Unlock()
			writeErr(fmt.Sprintf("Could not read message block [%d]: %v", mb.index, err))
			return
		}
		// Check for encryption.
		if mb.bek != nil && len(bbuf) > 0 {
			rbek, err := genBlockEncryptionKey(fs.fcfg.Cipher, mb.seed, mb.nonce)
			if err != nil {
				mb.mu.Unlock()
				writeErr(fmt.Sprintf("Could not create encryption key for message block [%d]: %v", mb.index, err))
				return
			}
			rbek.XORKeyStream(bbuf, bbuf)
		}
		// Check for compression.
		if bbuf, err = mb.decompressIfNeeded(bbuf); err != nil {
			mb.mu.Unlock()
			writeErr(fmt.Sprintf("Could not decompress message block [%d]: %v", mb.index, err))
			return
		}
		mb.mu.Unlock()

		// Do this one unlocked.
		if writeFile(msgPre+fmt.Sprintf(blkScan, mb.index), bbuf) != nil {
			return
		}
	}

	// Bail if no consumers requested.
	if !includeConsumers {
		return
	}

	// Do consumers' state last.
	fs.mu.RLock()
	cfs := fs.cfs
	fs.mu.RUnlock()

	for _, cs := range cfs {
		// Only file-backed consumer stores are included.
		o, ok := cs.(*consumerFileStore)
		if !ok {
			continue
		}
		o.mu.Lock()
		// Grab our general meta data.
		// We do this now instead of pulling from files since they could be encrypted.
		meta, err := json.Marshal(o.cfg)
		if err != nil {
			o.mu.Unlock()
			writeErr(fmt.Sprintf("Could not gather consumer meta file for %q: %v", o.name, err))
			return
		}
		o.hh.Reset()
		o.hh.Write(meta)
		sum := []byte(hex.EncodeToString(o.hh.Sum(nil)))

		// We can have the running state directly encoded now.
		state, err := o.encodeState()
		if err != nil {
			o.mu.Unlock()
			writeErr(fmt.Sprintf("Could not encode consumer state for %q: %v", o.name, err))
			return
		}
		odirPre := filepath.Join(consumerDir, o.name)
		o.mu.Unlock()

		// Write all the consumer files.
		if writeFile(filepath.Join(odirPre, JetStreamMetaFile), meta) != nil {
			return
		}
		if writeFile(filepath.Join(odirPre, JetStreamMetaFileSum), sum) != nil {
			return
		}
		writeFile(filepath.Join(odirPre, consumerState), state)
	}
}
// Create a snapshot of this stream and its consumer's state along with messages.
// Only one snapshot may be in progress at a time (tracked via fs.sips).
// Returns the read side of a pipe; the write side is fed by a goroutine
// running streamSnapshot, which also decrements sips when done.
func (fs *fileStore) Snapshot(deadline time.Duration, checkMsgs, includeConsumers bool) (*SnapshotResult, error) {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return nil, ErrStoreClosed
	}
	// Only allow one at a time.
	if fs.sips > 0 {
		fs.mu.Unlock()
		return nil, ErrStoreSnapshotInProgress
	}
	// Mark us as snapshotting
	fs.sips += 1
	fs.mu.Unlock()

	if checkMsgs {
		ld := fs.checkMsgs()
		if ld != nil && len(ld.Msgs) > 0 {
			// Release our snapshot-in-progress marker. Only streamSnapshot
			// decrements sips and it never runs on this path; without this
			// all future snapshots would fail with ErrStoreSnapshotInProgress.
			fs.mu.Lock()
			fs.sips--
			fs.mu.Unlock()
			return nil, fmt.Errorf("snapshot check detected %d bad messages", len(ld.Msgs))
		}
	}

	pr, pw := net.Pipe()

	// Set a write deadline here to protect ourselves.
	if deadline > 0 {
		pw.SetWriteDeadline(time.Now().Add(deadline))
	}

	// We can add to our stream while snapshotting but not "user" delete anything.
	var state StreamState
	fs.FastState(&state)

	// Stream in separate Go routine.
	go fs.streamSnapshot(pw, &state, includeConsumers)

	return &SnapshotResult{pr, state}, nil
}
// Helper to return a copy of the config under the read lock.
func (fs *fileStore) fileStoreConfig() FileStoreConfig {
	fs.mu.RLock()
	cfg := fs.fcfg
	fs.mu.RUnlock()
	return cfg
}
// Read lock all existing message blocks.
// Lock held on entry.
func (fs *fileStore) readLockAllMsgBlocks() {
	for _, blk := range fs.blks {
		blk.mu.RLock()
	}
}
// Read unlock all existing message blocks.
// Lock held on entry.
func (fs *fileStore) readUnlockAllMsgBlocks() {
	for _, blk := range fs.blks {
		blk.mu.RUnlock()
	}
}
// Binary encoded state snapshot, >= v2.10 server.
// Layout: magic/version header, then Msgs, Bytes, FirstSeq, LastSeq, Failed
// and NumDeleted as uvarints, optionally followed by encoded delete blocks
// (run-length ranges or avl sequence sets).
func (fs *fileStore) EncodedStreamState(failed uint64) ([]byte, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	// Calculate deleted.
	var numDeleted int64
	if fs.state.LastSeq > fs.state.FirstSeq {
		numDeleted = int64(fs.state.LastSeq-fs.state.FirstSeq+1) - int64(fs.state.Msgs)
		if numDeleted < 0 {
			numDeleted = 0
		}
	}

	// Encoded is Msgs, Bytes, FirstSeq, LastSeq, Failed, NumDeleted and optional DeletedBlocks
	var buf [1024]byte
	buf[0], buf[1] = streamStateMagic, streamStateVersion
	n := hdrLen
	n += binary.PutUvarint(buf[n:], fs.state.Msgs)
	n += binary.PutUvarint(buf[n:], fs.state.Bytes)
	n += binary.PutUvarint(buf[n:], fs.state.FirstSeq)
	n += binary.PutUvarint(buf[n:], fs.state.LastSeq)
	n += binary.PutUvarint(buf[n:], failed)
	n += binary.PutUvarint(buf[n:], uint64(numDeleted))
	b := buf[0:n]

	if numDeleted > 0 {
		var scratch [4 * 1024]byte

		// Hold block read locks for a consistent view of the delete maps.
		fs.readLockAllMsgBlocks()
		defer fs.readUnlockAllMsgBlocks()

		for _, db := range fs.deleteBlocks() {
			switch db := db.(type) {
			case *DeleteRange:
				// Compact run-length encoding for a contiguous gap.
				first, _, num := db.State()
				scratch[0] = runLengthMagic
				i := 1
				i += binary.PutUvarint(scratch[i:], first)
				i += binary.PutUvarint(scratch[i:], num)
				b = append(b, scratch[0:i]...)
			case *avl.SequenceSet:
				buf, err := db.Encode(scratch[:0])
				if err != nil {
					return nil, err
				}
				b = append(b, buf...)
			default:
				return nil, errors.New("no impl")
			}
		}
	}

	return b, nil
}
// We used to be more sophisticated to save memory, but speed is more important.
// All blocks should be at least read locked.
func (fs *fileStore) deleteBlocks() DeleteBlocks {
	var (
		dbs      DeleteBlocks
		prevLast uint64
	)
	for _, mb := range fs.blks {
		// A gap between consecutive blocks represents a run of deleted sequences.
		if prevLast != 0 && mb.first.seq != prevLast+1 {
			dbs = append(dbs, &DeleteRange{First: prevLast + 1, Num: mb.first.seq - prevLast - 1})
		}
		// Interior deletes are tracked per block in its dmap.
		if mb.dmap.Size() > 0 {
			dbs = append(dbs, &mb.dmap)
		}
		prevLast = mb.last.seq
	}
	return dbs
}
// SyncDeleted will make sure this stream has same deleted state as dbs.
// Blocks that already match positionally are skipped; sequences from the
// remaining blocks are removed via removeMsg.
func (fs *fileStore) SyncDeleted(dbs DeleteBlocks) {
	if len(dbs) == 0 {
		return
	}
	fs.mu.Lock()
	defer fs.mu.Unlock()
	var needsCheck DeleteBlocks
	// Block read locks are required by deleteBlocks/State below.
	fs.readLockAllMsgBlocks()
	mdbs := fs.deleteBlocks()
	for i, db := range dbs {
		// If the block is same as what we have we can skip.
		if i < len(mdbs) {
			first, last, num := db.State()
			eFirst, eLast, eNum := mdbs[i].State()
			if first == eFirst && last == eLast && num == eNum {
				continue
			}
		}
		// Need to insert these.
		needsCheck = append(needsCheck, db)
	}
	fs.readUnlockAllMsgBlocks()
	for _, db := range needsCheck {
		db.Range(func(dseq uint64) bool {
			fs.removeMsg(dseq, false, true, false)
			return true
		})
	}
}
////////////////////////////////////////////////////////////////////////////////
// Consumers
////////////////////////////////////////////////////////////////////////////////

// consumerFileStore is the file-backed ConsumerStore implementation.
type consumerFileStore struct {
	mu      sync.Mutex
	fs      *fileStore        // Parent stream file store.
	cfg     *FileConsumerInfo // Consumer config plus metadata.
	prf     keyGen            // Key generator; non-nil means encryption at rest.
	aek     cipher.AEAD       // AEAD cipher for state and meta files.
	name    string            // Consumer name.
	odir    string            // Consumer directory under the stream store dir.
	ifn     string            // Path to the consumer state file.
	hh      hash.Hash64       // Hash used for meta checksums.
	state   ConsumerState     // In-memory running state.
	fch     chan struct{}     // Kick channel for the flush loop.
	qch     chan struct{}     // Quit channel for the flush loop.
	flusher bool              // Flush loop is running.
	writing bool              // A state write is in flight.
	dirty   bool              // State has changes not yet persisted.
	closed  bool
}
// ConsumerStore creates (or recovers) the store for a named consumer.
// Handles optional encryption of meta/state, writes initial meta if missing,
// starts the flush loop and loads any prior state from disk. A consumer may
// force memory storage even on a file-backed stream.
func (fs *fileStore) ConsumerStore(name string, cfg *ConsumerConfig) (ConsumerStore, error) {
	if fs == nil {
		return nil, fmt.Errorf("filestore is nil")
	}
	if fs.isClosed() {
		return nil, ErrStoreClosed
	}
	if cfg == nil || name == _EMPTY_ {
		return nil, fmt.Errorf("bad consumer config")
	}

	// We now allow overrides from a stream being a filestore type and forcing a consumer to be memory store.
	if cfg.MemoryStorage {
		// Create directly here.
		o := &consumerMemStore{ms: fs, cfg: *cfg}
		fs.AddConsumer(o)
		return o, nil
	}

	odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, name)
	if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create consumer directory - %v", err)
	}
	csi := &FileConsumerInfo{Name: name, Created: time.Now().UTC(), ConsumerConfig: *cfg}
	o := &consumerFileStore{
		fs:   fs,
		cfg:  csi,
		prf:  fs.prf,
		name: name,
		odir: odir,
		ifn:  filepath.Join(odir, consumerState),
	}
	// Per-consumer keyed hash for meta checksums.
	key := sha256.Sum256([]byte(fs.cfg.Name + "/" + name))
	hh, err := highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}
	o.hh = hh

	// Check for encryption.
	if o.prf != nil {
		if ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey)); err == nil {
			if len(ekey) < minBlkKeySize {
				return nil, errBadKeySize
			}
			// Recover key encryption key.
			rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
			if err != nil {
				return nil, err
			}
			sc := fs.fcfg.Cipher
			kek, err := genEncryptionKey(sc, rb)
			if err != nil {
				return nil, err
			}
			ns := kek.NonceSize()
			nonce := ekey[:ns]
			seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
			if err != nil {
				// We may be here on a cipher conversion, so attempt to convert.
				if err = o.convertCipher(); err != nil {
					return nil, err
				}
			} else {
				o.aek, err = genEncryptionKey(sc, seed)
			}
			if err != nil {
				return nil, err
			}
		}
	}

	// Track if we are creating the directory so that we can clean up if we encounter an error.
	var didCreate bool

	// Write our meta data iff does not exist.
	meta := filepath.Join(odir, JetStreamMetaFile)
	if _, err := os.Stat(meta); err != nil && os.IsNotExist(err) {
		didCreate = true
		csi.Created = time.Now().UTC()
		if err := o.writeConsumerMeta(); err != nil {
			os.RemoveAll(odir)
			return nil, err
		}
	}

	// If we expect to be encrypted check that what we are restoring is not plaintext.
	// This can happen on snapshot restores or conversions.
	if o.prf != nil {
		keyFile := filepath.Join(odir, JetStreamMetaFileKey)
		if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
			if err := o.writeConsumerMeta(); err != nil {
				if didCreate {
					os.RemoveAll(odir)
				}
				return nil, err
			}
			// Redo the state file as well here if we have one and we can tell it was plaintext.
			if buf, err := os.ReadFile(o.ifn); err == nil {
				if _, err := decodeConsumerState(buf); err == nil {
					if err := os.WriteFile(o.ifn, o.encryptState(buf), defaultFilePerms); err != nil {
						if didCreate {
							os.RemoveAll(odir)
						}
						return nil, err
					}
				}
			}
		}
	}

	// Create channels to control our flush go routine.
	o.fch = make(chan struct{}, 1)
	o.qch = make(chan struct{})
	go o.flushLoop(o.fch, o.qch)

	// Make sure to load in our state from disk if needed.
	o.loadState()

	// Assign to filestore.
	fs.AddConsumer(o)

	return o, nil
}
// convertCipher recovers our state using the "other" cipher (AES <-> ChaCha)
// and rewrites meta and state with the currently configured one. Called when
// opening the key file with the configured cipher fails, which indicates a
// cipher conversion is in progress.
func (o *consumerFileStore) convertCipher() error {
	fs := o.fs
	odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, o.name)

	ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey))
	if err != nil {
		return err
	}
	if len(ekey) < minBlkKeySize {
		return errBadKeySize
	}
	// Recover key encryption key.
	rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
	if err != nil {
		return err
	}

	// Do these in reverse since converting.
	sc := fs.fcfg.Cipher
	osc := AES
	if sc == AES {
		osc = ChaCha
	}
	kek, err := genEncryptionKey(osc, rb)
	if err != nil {
		return err
	}
	ns := kek.NonceSize()
	nonce := ekey[:ns]
	seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
	if err != nil {
		return err
	}
	aek, err := genEncryptionKey(osc, seed)
	if err != nil {
		return err
	}
	// Now read in and decode our state using the old cipher.
	buf, err := os.ReadFile(o.ifn)
	if err != nil {
		return err
	}
	buf, err = aek.Open(nil, buf[:ns], buf[ns:], nil)
	if err != nil {
		return err
	}

	// Since we are here we recovered our old state.
	// Now write our meta, which will generate the new keys with the new cipher.
	if err := o.writeConsumerMeta(); err != nil {
		return err
	}

	// Now write out or state with the new cipher.
	return o.writeState(buf)
}
// Kick flusher for this consumer. Non-blocking; marks state dirty regardless
// of whether the signal was delivered.
// Lock should be held.
func (o *consumerFileStore) kickFlusher() {
	if ch := o.fch; ch != nil {
		select {
		case ch <- struct{}{}:
		default:
		}
	}
	o.dirty = true
}
// Set in flusher status
func (o *consumerFileStore) setInFlusher() {
	o.mu.Lock()
	defer o.mu.Unlock()
	o.flusher = true
}
// Clear in flusher status
func (o *consumerFileStore) clearInFlusher() {
	o.mu.Lock()
	defer o.mu.Unlock()
	o.flusher = false
}
// Report in flusher status
func (o *consumerFileStore) inFlusher() bool {
	o.mu.Lock()
	running := o.flusher
	o.mu.Unlock()
	return running
}
// flushLoop watches for consumer updates and the quit channel.
// Writes are throttled to roughly one per 100ms per consumer; a kick arriving
// sooner is delayed using a reusable timer.
func (o *consumerFileStore) flushLoop(fch, qch chan struct{}) {
	o.setInFlusher()
	defer o.clearInFlusher()

	// Maintain approximately 10 updates per second per consumer under load.
	const minTime = 100 * time.Millisecond
	var lastWrite time.Time
	var dt *time.Timer

	// setDelayTimer arms (or re-arms) dt, draining a stale fire first.
	setDelayTimer := func(addWait time.Duration) {
		if dt == nil {
			dt = time.NewTimer(addWait)
			return
		}
		if !dt.Stop() {
			select {
			case <-dt.C:
			default:
			}
		}
		dt.Reset(addWait)
	}

	for {
		select {
		case <-fch:
			// Throttle: wait out the remainder of the minimum interval.
			if ts := time.Since(lastWrite); ts < minTime {
				setDelayTimer(minTime - ts)
				select {
				case <-dt.C:
				case <-qch:
					return
				}
			}
			o.mu.Lock()
			if o.closed {
				o.mu.Unlock()
				return
			}
			buf, err := o.encodeState()
			o.mu.Unlock()
			if err != nil {
				return
			}
			// TODO(dlc) - if we error should start failing upwards.
			if err := o.writeState(buf); err == nil {
				lastWrite = time.Now()
			}
		case <-qch:
			return
		}
	}
}
// SetStarting sets our starting stream sequence.
func (o *consumerFileStore) SetStarting(sseq uint64) error {
	o.mu.Lock()
	o.state.Delivered.Stream = sseq
	buf, err := o.encodeState()
	o.mu.Unlock()
	if err != nil {
		return err
	}
	// Persist outside the lock; writeState acquires it itself.
	return o.writeState(buf)
}
// HasState returns if this store has a recorded state.
// Presence of the state file on disk is the signal.
func (o *consumerFileStore) HasState() bool {
	o.mu.Lock()
	defer o.mu.Unlock()
	_, err := os.Stat(o.ifn)
	return err == nil
}
// UpdateDelivered is called whenever a new message has been delivered.
// dseq/sseq are the consumer and stream sequences, dc the delivery count and
// ts the delivery timestamp. Updates pending/redelivered tracking and kicks
// the flusher; state reaches disk asynchronously.
func (o *consumerFileStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	// Redeliveries never make sense for AckNone.
	if dc != 1 && o.cfg.AckPolicy == AckNone {
		return ErrNoAckPolicy
	}

	// On restarts the old leader may get a replay from the raft logs that are old.
	if dseq <= o.state.AckFloor.Consumer {
		return nil
	}

	// See if we expect an ack for this.
	if o.cfg.AckPolicy != AckNone {
		// Need to create pending records here.
		if o.state.Pending == nil {
			o.state.Pending = make(map[uint64]*Pending)
		}
		var p *Pending
		// Check for an update to a message already delivered.
		if sseq <= o.state.Delivered.Stream {
			if p = o.state.Pending[sseq]; p != nil {
				p.Sequence, p.Timestamp = dseq, ts
			}
		} else {
			// Add to pending.
			o.state.Pending[sseq] = &Pending{dseq, ts}
		}
		// Update delivered as needed.
		if dseq > o.state.Delivered.Consumer {
			o.state.Delivered.Consumer = dseq
		}
		if sseq > o.state.Delivered.Stream {
			o.state.Delivered.Stream = sseq
		}

		if dc > 1 {
			// Past max deliveries it can never be acked, so drop from pending.
			if maxdc := uint64(o.cfg.MaxDeliver); maxdc > 0 && dc > maxdc {
				// Make sure to remove from pending.
				delete(o.state.Pending, sseq)
			}
			if o.state.Redelivered == nil {
				o.state.Redelivered = make(map[uint64]uint64)
			}
			// Only update if greater then what we already have.
			if o.state.Redelivered[sseq] < dc-1 {
				o.state.Redelivered[sseq] = dc - 1
			}
		}
	} else {
		// For AckNone just update delivered and ackfloor at the same time.
		if dseq > o.state.Delivered.Consumer {
			o.state.Delivered.Consumer = dseq
			o.state.AckFloor.Consumer = dseq
		}
		if sseq > o.state.Delivered.Stream {
			o.state.Delivered.Stream = sseq
			o.state.AckFloor.Stream = sseq
		}
	}

	// Make sure we flush to disk.
	o.kickFlusher()

	return nil
}
// UpdateAcks is called whenever a consumer with explicit ack or ack all acks a message.
// Removes the message from pending/redelivered, advances the ack floors, and
// kicks the flusher.
func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	if o.cfg.AckPolicy == AckNone {
		return ErrNoAckPolicy
	}

	// On restarts the old leader may get a replay from the raft logs that are old.
	if dseq <= o.state.AckFloor.Consumer {
		return nil
	}

	if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil {
		return ErrStoreMsgNotFound
	}

	// Check for AckAll here.
	if o.cfg.AckPolicy == AckAll {
		sgap := sseq - o.state.AckFloor.Stream
		o.state.AckFloor.Consumer = dseq
		o.state.AckFloor.Stream = sseq
		// Everything between the old floor and sseq is implicitly acked.
		for seq := sseq; seq > sseq-sgap; seq-- {
			delete(o.state.Pending, seq)
			if len(o.state.Redelivered) > 0 {
				delete(o.state.Redelivered, seq)
			}
		}
		o.kickFlusher()
		return nil
	}

	// AckExplicit
	// First delete from our pending state.
	if p, ok := o.state.Pending[sseq]; ok {
		delete(o.state.Pending, sseq)
		dseq = p.Sequence // Use the original.
	}
	if len(o.state.Pending) == 0 {
		// Nothing outstanding; floors catch up to delivered.
		o.state.AckFloor.Consumer = o.state.Delivered.Consumer
		o.state.AckFloor.Stream = o.state.Delivered.Stream
	} else if dseq == o.state.AckFloor.Consumer+1 {
		// Contiguous ack; advance the floor up to just before the next still-pending entry.
		o.state.AckFloor.Consumer = dseq
		o.state.AckFloor.Stream = sseq

		if o.state.Delivered.Consumer > dseq {
			for ss := sseq + 1; ss <= o.state.Delivered.Stream; ss++ {
				if p, ok := o.state.Pending[ss]; ok {
					if p.Sequence > 0 {
						o.state.AckFloor.Consumer = p.Sequence - 1
						o.state.AckFloor.Stream = ss - 1
					}
					break
				}
			}
		}
	}
	// We do these regardless.
	delete(o.state.Redelivered, sseq)

	o.kickFlusher()
	return nil
}
// seqsHdrSize is the maximum encoded size of the consumer state header:
// six varint-encoded sequence fields plus the magic/version header.
const seqsHdrSize = 6*binary.MaxVarintLen64 + hdrLen
// EncodedState returns our binary encoded consumer state, version 2.
func (o *consumerFileStore) EncodedState() ([]byte, error) {
	o.mu.Lock()
	buf, err := o.encodeState()
	o.mu.Unlock()
	return buf, err
}
// encodeState encodes the consumer state, loading it from disk first if needed.
// Lock should be held.
func (o *consumerFileStore) encodeState() ([]byte, error) {
	// Use the accessor so state is read in if not yet loaded; do not
	// reference o.state directly. No copy is needed for encoding.
	st, err := o.stateWithCopyLocked(false)
	if err != nil {
		return nil, err
	}
	return encodeConsumerState(st), nil
}
// UpdateConfig replaces the consumer config and persists the meta file.
// Mostly unchecked here; upper layers are assumed to have done sanity checking.
func (o *consumerFileStore) UpdateConfig(cfg *ConsumerConfig) error {
	o.mu.Lock()
	defer o.mu.Unlock()
	o.cfg.ConsumerConfig = *cfg
	return o.writeConsumerMeta()
}
// Update replaces our state wholesale with the provided snapshot after some
// sanity checks. Outdated updates (older than current delivered/ack floor)
// are silently ignored. Maps are deep copied so the caller keeps ownership.
func (o *consumerFileStore) Update(state *ConsumerState) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	// Check to see if this is an outdated update.
	if state.Delivered.Consumer < o.state.Delivered.Consumer || state.AckFloor.Stream < o.state.AckFloor.Stream {
		return nil
	}

	// Sanity checks.
	if state.AckFloor.Consumer > state.Delivered.Consumer {
		return fmt.Errorf("bad ack floor for consumer")
	}
	if state.AckFloor.Stream > state.Delivered.Stream {
		return fmt.Errorf("bad ack floor for stream")
	}

	// Copy to our state.
	var pending map[uint64]*Pending
	var redelivered map[uint64]uint64
	if len(state.Pending) > 0 {
		pending = make(map[uint64]*Pending, len(state.Pending))
		for seq, p := range state.Pending {
			pending[seq] = &Pending{p.Sequence, p.Timestamp}
			// Every pending entry must lie in (AckFloor.Stream, Delivered.Stream].
			if seq <= state.AckFloor.Stream || seq > state.Delivered.Stream {
				return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq)
			}
		}
	}
	if len(state.Redelivered) > 0 {
		redelivered = make(map[uint64]uint64, len(state.Redelivered))
		for seq, dc := range state.Redelivered {
			redelivered[seq] = dc
		}
	}

	// Replace our state.
	o.state.Delivered = state.Delivered
	o.state.AckFloor = state.AckFloor
	o.state.Pending = pending
	o.state.Redelivered = redelivered

	o.kickFlusher()

	return nil
}
// Will encrypt the state with our asset key. Will be a no-op if encryption not enabled.
// Lock should be held.
func (o *consumerFileStore) encryptState(buf []byte) []byte {
	aek := o.aek
	if aek == nil {
		return buf
	}
	// TODO(dlc) - Optimize on space usage a bit?
	// Nonce is prepended; capacity covers nonce + ciphertext + auth tag.
	nonce := make([]byte, aek.NonceSize(), aek.NonceSize()+len(buf)+aek.Overhead())
	rand.Read(nonce)
	return aek.Seal(nonce, nonce, buf, nil)
}
// Used to limit number of disk IO calls in flight since they could all be blocking an OS thread.
// https://github.com/nats-io/nats-server/issues/2742
// A token is taken (<-dios) before blocking IO and returned (dios <- struct{}{}) after.
var dios chan struct{}

// Used to setup our simplistic counting semaphore using buffered channels.
// golang.org's semaphore seemed a bit heavy.
func init() {
	// Limit ourselves to a max of 4 blocking IO calls.
	const nIO = 4
	dios = make(chan struct{}, nIO)
	// Fill it up to start.
	for i := 0; i < nIO; i++ {
		dios <- struct{}{}
	}
}
// writeState persists an encoded state buffer to the state file.
// No-op if a write is already in flight or buf is empty. Encrypts when
// configured. Uses the dios semaphore to bound blocking IO calls.
func (o *consumerFileStore) writeState(buf []byte) error {
	// Check if we have the index file open.
	o.mu.Lock()
	if o.writing || len(buf) == 0 {
		o.mu.Unlock()
		return nil
	}

	// Check on encryption.
	if o.aek != nil {
		buf = o.encryptState(buf)
	}

	o.writing = true
	o.dirty = false
	ifn := o.ifn
	o.mu.Unlock()

	// Lock not held here but we do limit number of outstanding calls that could block OS threads.
	<-dios
	err := os.WriteFile(ifn, buf, defaultFilePerms)
	dios <- struct{}{}

	o.mu.Lock()
	if err != nil {
		// Stay dirty so a later kick retries the write.
		o.dirty = true
	}
	o.writing = false
	o.mu.Unlock()

	return err
}
// Will update the config. Only used when recovering ephemerals.
func (o *consumerFileStore) updateConfig(cfg ConsumerConfig) error {
	o.mu.Lock()
	defer o.mu.Unlock()
	o.cfg = &FileConsumerInfo{ConsumerConfig: cfg}
	return o.writeConsumerMeta()
}
// Write out the consumer meta data, i.e. state.
// Lock should be held.
// Generates encryption keys on first use when a prf is configured, encrypts
// the meta when enabled, and writes both the meta file and its checksum file.
func (cfs *consumerFileStore) writeConsumerMeta() error {
	meta := filepath.Join(cfs.odir, JetStreamMetaFile)
	// Only unexpected stat errors abort; not-exist is fine, we will create it.
	if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) {
		return err
	}

	if cfs.prf != nil && cfs.aek == nil {
		fs := cfs.fs
		key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name + tsep + cfs.name)
		if err != nil {
			return err
		}
		cfs.aek = key
		keyFile := filepath.Join(cfs.odir, JetStreamMetaFileKey)
		if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
			return err
		}
		if err := os.WriteFile(keyFile, encrypted, defaultFilePerms); err != nil {
			return err
		}
	}

	b, err := json.Marshal(cfs.cfg)
	if err != nil {
		return err
	}

	// Encrypt if needed.
	if cfs.aek != nil {
		nonce := make([]byte, cfs.aek.NonceSize(), cfs.aek.NonceSize()+len(b)+cfs.aek.Overhead())
		rand.Read(nonce)
		b = cfs.aek.Seal(nonce, nonce, b, nil)
	}

	if err := os.WriteFile(meta, b, defaultFilePerms); err != nil {
		return err
	}
	// Checksum is computed over the bytes as written (possibly encrypted).
	cfs.hh.Reset()
	cfs.hh.Write(b)
	checksum := hex.EncodeToString(cfs.hh.Sum(nil))
	sum := filepath.Join(cfs.odir, JetStreamMetaFileSum)
	if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil {
		return err
	}
	return nil
}
// checkConsumerHeader validates the magic/version header of an encoded
// consumer state buffer and returns the version (1 or 2).
// Returns errCorruptState for a short buffer or bad magic byte, and an
// error for an unsupported version.
func checkConsumerHeader(hdr []byte) (uint8, error) {
	// len() of a nil slice is 0, so no separate nil check is needed.
	if len(hdr) < 2 || hdr[0] != magic {
		return 0, errCorruptState
	}
	switch version := hdr[1]; version {
	case 1, 2:
		return version, nil
	default:
		return 0, fmt.Errorf("unsupported version: %d", version)
	}
}
// copyPending returns a deep copy of the pending map.
// Lock should be held.
func (o *consumerFileStore) copyPending() map[uint64]*Pending {
	cp := make(map[uint64]*Pending, len(o.state.Pending))
	for seq, p := range o.state.Pending {
		cp[seq] = &Pending{p.Sequence, p.Timestamp}
	}
	return cp
}
// copyRedelivered returns a copy of the redelivered counts map.
// Lock should be held.
func (o *consumerFileStore) copyRedelivered() map[uint64]uint64 {
	cp := make(map[uint64]uint64, len(o.state.Redelivered))
	for seq, dc := range o.state.Redelivered {
		cp[seq] = dc
	}
	return cp
}
// Type returns the type of the underlying store.
func (o *consumerFileStore) Type() StorageType { return FileStorage }
// State retrieves the state from the state file.
// This is not expected to be called in high performance code, only on startup.
// Pending and Redelivered maps are deep copied so callers may mutate them.
func (o *consumerFileStore) State() (*ConsumerState, error) {
	return o.stateWithCopy(true)
}
// BorrowState returns the state without copying pending or redelivered,
// so it should only be done under the consumer owner's lock.
func (o *consumerFileStore) BorrowState() (*ConsumerState, error) {
	return o.stateWithCopy(false)
}
// stateWithCopy returns consumer state, deep copying the maps when doCopy is set.
func (o *consumerFileStore) stateWithCopy(doCopy bool) (*ConsumerState, error) {
	o.mu.Lock()
	state, err := o.stateWithCopyLocked(doCopy)
	o.mu.Unlock()
	return state, err
}
// stateWithCopyLocked returns the consumer state, reading it in from disk on
// first access. When doCopy is set the Pending/Redelivered maps are deep
// copied; otherwise the internal maps are shared with the caller.
// Lock should be held.
func (o *consumerFileStore) stateWithCopyLocked(doCopy bool) (*ConsumerState, error) {
	if o.closed {
		return nil, ErrStoreClosed
	}

	state := &ConsumerState{}

	// See if we have a running state or if we need to read in from disk.
	if o.state.Delivered.Consumer != 0 || o.state.Delivered.Stream != 0 {
		state.Delivered = o.state.Delivered
		state.AckFloor = o.state.AckFloor
		if len(o.state.Pending) > 0 {
			if doCopy {
				state.Pending = o.copyPending()
			} else {
				state.Pending = o.state.Pending
			}
		}
		if len(o.state.Redelivered) > 0 {
			if doCopy {
				state.Redelivered = o.copyRedelivered()
			} else {
				state.Redelivered = o.state.Redelivered
			}
		}
		return state, nil
	}

	// Read the state in here from disk..
	buf, err := os.ReadFile(o.ifn)
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}

	// No file yet means empty state.
	if len(buf) == 0 {
		return state, nil
	}

	// Check on encryption.
	if o.aek != nil {
		ns := o.aek.NonceSize()
		buf, err = o.aek.Open(nil, buf[:ns], buf[ns:], nil)
		if err != nil {
			return nil, err
		}
	}

	state, err = decodeConsumerState(buf)
	if err != nil {
		return nil, err
	}

	// Copy this state into our own.
	o.state.Delivered = state.Delivered
	o.state.AckFloor = state.AckFloor
	if len(state.Pending) > 0 {
		if doCopy {
			o.state.Pending = make(map[uint64]*Pending, len(state.Pending))
			for seq, p := range state.Pending {
				o.state.Pending[seq] = &Pending{p.Sequence, p.Timestamp}
			}
		} else {
			o.state.Pending = state.Pending
		}
	}
	if len(state.Redelivered) > 0 {
		if doCopy {
			o.state.Redelivered = make(map[uint64]uint64, len(state.Redelivered))
			for seq, dc := range state.Redelivered {
				o.state.Redelivered[seq] = dc
			}
		} else {
			o.state.Redelivered = state.Redelivered
		}
	}

	return state, nil
}
// loadState reads prior state in from disk if the state file exists.
// Lock should be held. Called at startup.
func (o *consumerFileStore) loadState() {
	if _, err := os.Stat(o.ifn); err != nil {
		return
	}
	// This will load our state in from disk.
	o.stateWithCopyLocked(false)
}
// Decode consumer state.
// decodeConsumerState parses a binary consumer state snapshot back into a
// ConsumerState. It understands both the v1 and v2 layouts: sequences are
// delta-encoded against the ack floor and timestamps against a base
// timestamp, so values are adjusted back to absolutes here. Returns
// errCorruptState on any short or malformed buffer.
func decodeConsumerState(buf []byte) (*ConsumerState, error) {
	version, err := checkConsumerHeader(buf)
	if err != nil {
		return nil, err
	}
	// bi tracks our read offset into buf; it is set to -1 on any decode error.
	bi := hdrLen
	// Helpers, will set bi to -1 on error.
	readSeq := func() uint64 {
		if bi < 0 {
			return 0
		}
		seq, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		return seq
	}
	readTimeStamp := func() int64 {
		if bi < 0 {
			return 0
		}
		ts, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return ts
	}
	// Just for clarity below.
	readLen := readSeq
	readCount := readSeq
	state := &ConsumerState{}
	state.AckFloor.Consumer = readSeq()
	state.AckFloor.Stream = readSeq()
	state.Delivered.Consumer = readSeq()
	state.Delivered.Stream = readSeq()
	if bi == -1 {
		return nil, errCorruptState
	}
	if version == 1 {
		// Adjust back. Version 1 also stored delivered as next to be delivered,
		// so adjust that back down here.
		if state.AckFloor.Consumer > 1 {
			state.Delivered.Consumer += state.AckFloor.Consumer - 1
		}
		if state.AckFloor.Stream > 1 {
			state.Delivered.Stream += state.AckFloor.Stream - 1
		}
	}
	// We have additional stuff.
	if numPending := readLen(); numPending > 0 {
		// mints is the base timestamp the per-entry deltas are encoded against.
		mints := readTimeStamp()
		state.Pending = make(map[uint64]*Pending, numPending)
		for i := 0; i < int(numPending); i++ {
			sseq := readSeq()
			var dseq uint64
			// v2 additionally stores the delivery sequence per pending entry.
			if version == 2 {
				dseq = readSeq()
			}
			ts := readTimeStamp()
			// Check the state machine for corruption, not the value which could be -1.
			if bi == -1 {
				return nil, errCorruptState
			}
			// Adjust seq back.
			sseq += state.AckFloor.Stream
			// Stream sequences are 1-based; 0 can only come from corruption.
			if sseq == 0 {
				return nil, errCorruptState
			}
			if version == 2 {
				dseq += state.AckFloor.Consumer
			}
			// Adjust the timestamp back.
			// v1 encoded ts-mints, v2 encodes mints-ts; both in seconds.
			if version == 1 {
				ts = (ts + mints) * int64(time.Second)
			} else {
				ts = (mints - ts) * int64(time.Second)
			}
			// Store in pending.
			state.Pending[sseq] = &Pending{dseq, ts}
		}
	}
	// We have redelivered entries here.
	if numRedelivered := readLen(); numRedelivered > 0 {
		state.Redelivered = make(map[uint64]uint64, numRedelivered)
		for i := 0; i < int(numRedelivered); i++ {
			if seq, n := readSeq(), readCount(); seq > 0 && n > 0 {
				// Adjust seq back.
				seq += state.AckFloor.Stream
				state.Redelivered[seq] = n
			}
		}
	}
	return state, nil
}
// Stop the processing of the consumers's state.
// Stop signals the background flush loop to exit, writes out any dirty
// state (encrypted when a key is configured), marks the store closed and
// deregisters it from the parent filestore. Safe to call more than once.
func (o *consumerFileStore) Stop() error {
	o.mu.Lock()
	if o.closed {
		o.mu.Unlock()
		return nil
	}
	// Closing qch tells the flush loop (if running) to stop.
	if o.qch != nil {
		close(o.qch)
		o.qch = nil
	}
	var err error
	var buf []byte
	if o.dirty {
		// Make sure to write this out..
		if buf, err = o.encodeState(); err == nil && len(buf) > 0 {
			// Seal the state when encryption is enabled.
			if o.aek != nil {
				buf = o.encryptState(buf)
			}
		}
	}
	o.odir = _EMPTY_
	o.closed = true
	// Capture what we need, then drop the lock before disk I/O and the
	// callback into the parent filestore.
	ifn, fs := o.ifn, o.fs
	o.mu.Unlock()
	fs.RemoveConsumer(o)
	if len(buf) > 0 {
		// Let any in-flight flush finish before we write the state file.
		o.waitOnFlusher()
		// dios is a token semaphore bounding concurrent disk I/O.
		<-dios
		err = os.WriteFile(ifn, buf, defaultFilePerms)
		dios <- struct{}{}
	}
	return err
}
// waitOnFlusher waits until the consumer is no longer in the background
// flusher, giving up after roughly 100ms.
func (o *consumerFileStore) waitOnFlusher() {
	const (
		maxWait = 100 * time.Millisecond
		backoff = 10 * time.Millisecond
	)
	deadline := time.Now().Add(maxWait)
	for o.inFlusher() {
		if !time.Now().Before(deadline) {
			return
		}
		time.Sleep(backoff)
	}
}
// Delete removes the consumer's state and directory from disk.
func (o *consumerFileStore) Delete() error {
	// The parent stream still exists, so clean up after ourselves.
	return o.delete(false)
}
// StreamDelete shuts down the consumer store as part of a stream
// deletion; the stream removal takes the directories with it.
func (o *consumerFileStore) StreamDelete() error {
	return o.delete(true)
}
// delete closes the consumer store and, unless the parent stream is
// itself being deleted, removes its directory from disk and deregisters
// it from the filestore. No-op once already closed.
func (o *consumerFileStore) delete(streamDeleted bool) error {
	o.mu.Lock()
	if o.closed {
		o.mu.Unlock()
		return nil
	}
	// Stop any background flush loop.
	if o.qch != nil {
		close(o.qch)
		o.qch = nil
	}
	dir, fs := o.odir, o.fs
	o.odir = _EMPTY_
	o.closed = true
	o.mu.Unlock()

	var err error
	if !streamDeleted {
		// The stream is still around, so take out our directory and
		// deregister ourselves from the parent filestore.
		if dir != _EMPTY_ {
			<-dios
			err = os.RemoveAll(dir)
			dios <- struct{}{}
		}
		fs.RemoveConsumer(o)
	}
	return err
}
// AddConsumer registers a consumer store with this filestore.
func (fs *fileStore) AddConsumer(o ConsumerStore) error {
	fs.mu.Lock()
	fs.cfs = append(fs.cfs, o)
	fs.mu.Unlock()
	return nil
}
// RemoveConsumer deregisters a consumer store from this filestore.
// Unknown consumers are ignored.
func (fs *fileStore) RemoveConsumer(o ConsumerStore) error {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	idx := -1
	for i, cs := range fs.cfs {
		if cs == o {
			idx = i
			break
		}
	}
	if idx >= 0 {
		fs.cfs = append(fs.cfs[:idx], fs.cfs[idx+1:]...)
	}
	return nil
}
////////////////////////////////////////////////////////////////////////////////
// Templates
////////////////////////////////////////////////////////////////////////////////
// templateFileStore handles on-disk persistence of stream templates.
type templateFileStore struct {
	dir string      // Base directory for template metadata (storeDir/tmplsDir).
	hh  hash.Hash64 // 64-bit hash used to checksum template metadata files.
}
// newTemplateFileStore creates a template store rooted under storeDir.
// Returns nil if the checksum hasher cannot be constructed.
func newTemplateFileStore(storeDir string) *templateFileStore {
	key := sha256.Sum256([]byte("templates"))
	hh, err := highwayhash.New64(key[:])
	if err != nil {
		return nil
	}
	return &templateFileStore{
		dir: filepath.Join(storeDir, tmplsDir),
		hh:  hh,
	}
}
// Store persists a stream template's metadata and a checksum file under
// the template's directory. If the metadata file already exists this is
// a no-op (returns nil).
func (ts *templateFileStore) Store(t *streamTemplate) error {
	dir := filepath.Join(ts.dir, t.Name)
	if err := os.MkdirAll(dir, defaultDirPerms); err != nil {
		return fmt.Errorf("could not create templates storage directory for %q- %v", t.Name, err)
	}
	meta := filepath.Join(dir, JetStreamMetaFile)
	// Bail if the meta file is already present (err == nil), or if the
	// stat failed for any reason other than the file not existing.
	if _, err := os.Stat(meta); err == nil || !os.IsNotExist(err) {
		return err
	}
	t.mu.Lock()
	b, err := json.Marshal(t)
	t.mu.Unlock()
	if err != nil {
		return err
	}
	if err := os.WriteFile(meta, b, defaultFilePerms); err != nil {
		return err
	}
	// FIXME(dlc) - Do checksum
	ts.hh.Reset()
	ts.hh.Write(b)
	sum := filepath.Join(dir, JetStreamMetaFileSum)
	checksum := hex.EncodeToString(ts.hh.Sum(nil))
	if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil {
		return err
	}
	return nil
}
// Delete removes all on-disk state for the given stream template.
func (ts *templateFileStore) Delete(t *streamTemplate) error {
	dir := filepath.Join(ts.dir, t.Name)
	return os.RemoveAll(dir)
}
////////////////////////////////////////////////////////////////////////////////
// Compression
////////////////////////////////////////////////////////////////////////////////
// CompressionInfo describes how a message block is compressed on disk.
type CompressionInfo struct {
	Algorithm    StoreCompression // Compression algorithm applied to the block body.
	OriginalSize uint64           // Pre-compression size recorded in the block metadata.
}
// MarshalMetadata encodes the compression info as a compact binary
// header: the magic bytes "cmp", the algorithm byte, then the original
// size as a uvarint.
func (c *CompressionInfo) MarshalMetadata() []byte {
	// 3 magic bytes + 1 algorithm byte + up to 10 bytes of uvarint uint64.
	b := make([]byte, 14)
	copy(b, "cmp")
	b[3] = byte(c.Algorithm)
	n := binary.PutUvarint(b[4:], c.OriginalSize)
	return b[:4+n]
}
// UnmarshalMetadata decodes compression info written by MarshalMetadata,
// returning the number of bytes consumed. A buffer without a valid
// "cmp" header yields (0, nil) and leaves c at NoCompression.
func (c *CompressionInfo) UnmarshalMetadata(b []byte) (int, error) {
	c.Algorithm = NoCompression
	c.OriginalSize = 0
	// Need the 3 magic bytes + algorithm byte + at least 1 uvarint byte.
	if len(b) < 5 {
		return 0, nil
	}
	if string(b[:3]) != "cmp" {
		return 0, nil
	}
	c.Algorithm = StoreCompression(b[3])
	size, n := binary.Uvarint(b[4:])
	if n <= 0 {
		return 0, fmt.Errorf("metadata incomplete")
	}
	c.OriginalSize = size
	return 4 + n, nil
}
// Compress compresses the block body with the given algorithm while
// carrying the trailing checksum over to the output uncompressed.
func (alg StoreCompression) Compress(buf []byte) ([]byte, error) {
	if len(buf) < checksumSize {
		return nil, fmt.Errorf("uncompressed buffer is too short")
	}
	bodyLen := int64(len(buf) - checksumSize)
	body, checksum := buf[:bodyLen], buf[bodyLen:]

	var output bytes.Buffer
	var writer io.WriteCloser
	switch alg {
	case NoCompression:
		return buf, nil
	case S2Compression:
		writer = s2.NewWriter(&output)
	default:
		return nil, fmt.Errorf("compression algorithm not known")
	}

	// Only the body is compressed; the checksum is preserved verbatim.
	n, err := io.CopyN(writer, bytes.NewReader(body), bodyLen)
	if err != nil {
		return nil, fmt.Errorf("error writing to compression writer: %w", err)
	}
	if n != bodyLen {
		return nil, fmt.Errorf("short write on body (%d != %d)", n, bodyLen)
	}
	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("error closing compression writer: %w", err)
	}
	// Re-attach the checksum untouched at the end of the block.
	if n, err := output.Write(checksum); err != nil {
		return nil, fmt.Errorf("error writing checksum: %w", err)
	} else if n != checksumSize {
		return nil, fmt.Errorf("short write on checksum (%d != %d)", n, checksumSize)
	}
	return output.Bytes(), nil
}
// Decompress expands a block compressed by Compress. The trailing
// checksum was stored uncompressed, so it is re-appended to the
// decompressed body as-is.
func (alg StoreCompression) Decompress(buf []byte) ([]byte, error) {
	if len(buf) < checksumSize {
		return nil, fmt.Errorf("compressed buffer is too short")
	}
	bodyLen := int64(len(buf) - checksumSize)

	var reader io.ReadCloser
	switch alg {
	case NoCompression:
		return buf, nil
	case S2Compression:
		reader = io.NopCloser(s2.NewReader(bytes.NewReader(buf[:bodyLen])))
	default:
		return nil, fmt.Errorf("compression algorithm not known")
	}

	output, err := io.ReadAll(reader)
	if err != nil {
		return nil, fmt.Errorf("error reading compression reader: %w", err)
	}
	// Carry the checksum over from the end of the block verbatim.
	output = append(output, buf[bodyLen:]...)
	return output, reader.Close()
}