Files
nats-server/server/filestore.go
2023-09-04 11:05:13 -07:00

8531 lines
207 KiB
Go

// Copyright 2019-2023 The NATS Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package server
import (
"archive/tar"
"bytes"
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"crypto/sha256"
"encoding/binary"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"hash"
"io"
"math"
"net"
"os"
"path/filepath"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/klauspost/compress/s2"
"github.com/minio/highwayhash"
"github.com/nats-io/nats-server/v2/server/avl"
"golang.org/x/crypto/chacha20"
"golang.org/x/crypto/chacha20poly1305"
)
// FileStoreConfig holds the tunables for a file backed store.
// newFileStoreWithCreated fills in defaults for BlockSize, CacheExpire and
// SyncInterval when they are left at their zero value.
type FileStoreConfig struct {
// Where the parent directory for all storage will be located.
StoreDir string
// BlockSize is the file block size. This also represents the maximum overhead size.
// When zero it is derived via dynBlkSize; values above maxBlockSize are rejected.
BlockSize uint64
// CacheExpire is how long with no activity until we expire the cache.
CacheExpire time.Duration
// SyncInterval is how often we sync to disk in the background.
SyncInterval time.Duration
// SyncAlways is when the stream should sync all data writes.
SyncAlways bool
// AsyncFlush allows async flush to batch write operations.
// When false (the default) the store flushes in place.
AsyncFlush bool
// Cipher is the cipher to use when encrypting.
Cipher StoreCipher
// Compression is the algorithm to use when compressing.
Compression StoreCompression
}
// FileStreamInfo pairs a stream's configuration with its creation time so
// that the created time is remembered. This is the value that
// writeStreamMeta marshals to JSON for the stream meta file.
type FileStreamInfo struct {
Created time.Time
StreamConfig
}
// StoreCipher identifies the cipher used by the file store when encrypting.
type StoreCipher int
const (
// ChaCha is the zero value; reported as "ChaCha20-Poly1305" by String.
ChaCha StoreCipher = iota
// AES is reported as "AES-GCM" by String.
AES
// NoCipher is reported as "None" by String.
NoCipher
)
// String returns the human readable name of the cipher, or
// "Unknown StoreCipher" for a value that is not one of the declared constants.
func (cipher StoreCipher) String() string {
	if cipher == ChaCha {
		return "ChaCha20-Poly1305"
	}
	if cipher == AES {
		return "AES-GCM"
	}
	if cipher == NoCipher {
		return "None"
	}
	return "Unknown StoreCipher"
}
// StoreCompression identifies the compression algorithm used by the file store.
type StoreCompression uint8
const (
// NoCompression is the zero value; it is encoded as "none" in JSON.
NoCompression StoreCompression = iota
// S2Compression is encoded as "s2" in JSON.
S2Compression
)
// String returns the display name of the compression algorithm, or
// "Unknown StoreCompression" for an undeclared value.
func (alg StoreCompression) String() string {
	if alg == NoCompression {
		return "None"
	}
	if alg == S2Compression {
		return "S2"
	}
	return "Unknown StoreCompression"
}
// MarshalJSON encodes the algorithm as the JSON string "none" or "s2".
// Any other value yields an error.
func (alg StoreCompression) MarshalJSON() ([]byte, error) {
	if alg == S2Compression {
		return json.Marshal("s2")
	}
	if alg == NoCompression {
		return json.Marshal("none")
	}
	return nil, fmt.Errorf("unknown compression algorithm")
}
// UnmarshalJSON decodes the JSON strings "s2" and "none" into the matching
// algorithm. Input that is not a JSON string, or any other string, is an
// error and leaves the receiver untouched.
func (alg *StoreCompression) UnmarshalJSON(b []byte) error {
	var name string
	if err := json.Unmarshal(b, &name); err != nil {
		return err
	}
	if name == "s2" {
		*alg = S2Compression
		return nil
	}
	if name == "none" {
		*alg = NoCompression
		return nil
	}
	return fmt.Errorf("unknown compression algorithm")
}
// FileConsumerInfo is used for creating consumer stores. It carries the
// consumer's name and creation time alongside the embedded ConsumerConfig.
type FileConsumerInfo struct {
Created time.Time
Name string
ConsumerConfig
}
// Default file and directory permissions.
const (
// rwxr-x---: owner full access, group read/traverse, no access for others.
defaultDirPerms = os.FileMode(0750)
// rw-r-----: owner read/write, group read, no access for others.
defaultFilePerms = os.FileMode(0640)
)
// psi is the value type of fileStore.psim, which is keyed by string.
// NOTE(review): presumably "per subject info", with total a message count and
// fblk/lblk the first and last block index holding the subject - confirm
// against populateGlobalPerSubjectInfo.
type psi struct {
total uint64
fblk uint32
lblk uint32
}
// fileStore is the file backed stream store created by newFileStoreWithCreated.
// Many helpers in this file are documented as "Lock should be held", which
// refers to mu.
type fileStore struct {
// Set via registerServer.
srv *Server
mu sync.RWMutex
// Aggregate stream state (message/byte counts, first and last sequence and time).
state StreamState
// Tombstone sequences collected while recovering message blocks. They are
// applied and then cleared in newFileStoreWithCreated.
tombs []uint64
// Lost data accumulated through addLostData; nil when nothing is recorded.
ld *LostStreamData
scb StorageUpdateHandler
// Age check timer; stopped and cleared by UpdateConfig when MaxAge becomes 0.
ageChk *time.Timer
// Fires syncBlocks after fcfg.SyncInterval.
syncTmr *time.Timer
cfg FileStreamInfo
fcfg FileStoreConfig
// Key generators. A nil prf means the store is not encrypted. convertCipher
// also tries oldprf when it is set.
prf keyGen
oldprf keyGen
// Asset encryption key, used to seal the stream meta file when non-nil.
aek cipher.AEAD
lmb *msgBlock
// All message blocks.
blks []*msgBlock
// NOTE(review): presumably block index -> block - confirm.
bim map[uint32]*msgBlock
psim map[string]*psi
// Hash used for the meta file checksum; keyed by sha256 of the stream name.
hh hash.Hash64
// qch, fch and fsld are handed to flushStreamStateLoop.
qch chan struct{}
fch chan struct{}
fsld chan struct{}
cfs []ConsumerStore
sips int
// Incremented when state was recovered and needs to be written out again.
dirty int
closed bool
// Flush in place; set to the inverse of fcfg.AsyncFlush.
fip bool
receivedAny bool
}
// Represents a message store block and its data.
// Instances are created by fileStore.initMsgBlock. Helpers documented as
// "Lock should be held" on *msgBlock refer to mu.
type msgBlock struct {
// Here for 32bit systems and atomic.
first msgId
last msgId
mu sync.RWMutex
// Owning store.
fs *fileStore
// Per block encryption state: aek is derived from seed, bek from seed and nonce.
aek cipher.AEAD
bek cipher.Stream
seed []byte
nonce []byte
// Path of the block file (msgs/<index>.blk under the store directory).
mfn string
// Open handle on the block file, nil when not open.
mfd *os.File
cmp StoreCompression // Effective compression at the time of loading the block
liwsz int64
index uint32
bytes uint64 // User visible bytes count.
rbytes uint64 // Total bytes (raw) including deleted. Used for rolling to new blk.
msgs uint64 // User visible message count.
// Per subject state, rebuilt from the block contents when needed.
fss map[string]*SimpleState
kfn string
lwts int64
llts int64
lrts int64
llseq uint64
// Hash used to verify and produce record checksums for this block.
hh hash.Hash64
cache *cache
cloads uint64
// Cache expiration, copied from FileStoreConfig.CacheExpire.
cexp time.Duration
ctmr *time.Timer
werr error
// Set of deleted sequences within this block.
dmap avl.SequenceSet
fch chan struct{}
qch chan struct{}
// Last checksum seen or written for this block.
lchk [8]byte
loading bool
flusher bool
// True when the store does not track subjects (see fileStore.noTrackSubjects).
noTrack bool
needSync bool
// Copied from FileStoreConfig.SyncAlways.
syncAlways bool
closed bool
// Used to mock write failures.
mockWriteErr bool
}
// Write through caching layer that is also used on loading messages.
// A msgBlock holds at most one of these in its cache field; convertCipher and
// convertToEncrypted discard it by setting that field to nil after indexing.
// NOTE(review): the individual fields are not documented at their definition;
// buf/idx look like the raw block bytes and a per message offset index - confirm
// against indexCacheBuf before relying on that.
type cache struct {
buf []byte
off int
wp int
idx []uint32
lrl uint32
fseq uint64
nra bool
}
// msgId identifies a message by sequence and timestamp. It is used for the
// first and last message of a msgBlock. ts is in nanoseconds since the Unix
// epoch (it is converted with time.Unix(0, ts)).
type msgId struct {
seq uint64
ts int64
}
// File store wide constants: on-disk names and scan patterns, timing
// defaults and block size limits.
const (
// Magic is used to identify the file store files.
magic = uint8(22)
// Version
version = uint8(1)
// New IndexInfo Version
newVersion = uint8(2)
// hdrLen
hdrLen = 2
// This is where we keep the streams.
streamsDir = "streams"
// This is where we keep the message store blocks.
msgDir = "msgs"
// This is where we temporarily move the messages dir.
purgeDir = "__msgs__"
// used to scan blk file names.
blkScan = "%d.blk"
// used for compacted blocks that are staged.
newScan = "%d.new"
// used to scan index file names.
indexScan = "%d.idx"
// to look for orphans
indexScanAll = "*.idx"
// to look for orphans
fssScanAll = "*.fss"
// used to store our block encryption key.
keyScan = "%d.key"
// to look for orphans
keyScanAll = "*.key"
// This is where we keep state on consumers.
consumerDir = "obs"
// Index file for a consumer.
consumerState = "o.dat"
// The suffix that will be given to a new temporary block during compression.
compressTmpSuffix = ".tmp"
// This is where we keep state on templates.
tmplsDir = "templates"
// Maximum size of a write buffer we may consider for re-use.
maxBufReuse = 2 * 1024 * 1024
// default cache buffer expiration
defaultCacheBufferExpiration = 5 * time.Second
// default sync interval
defaultSyncInterval = 2 * time.Minute
// default idle timeout to close FDs.
closeFDsIdle = 30 * time.Second
// coalesceMinimum
coalesceMinimum = 16 * 1024
// maxFlushWait is maximum we will wait to gather messages to flush.
maxFlushWait = 8 * time.Millisecond
// Metafiles for streams and consumers.
JetStreamMetaFile = "meta.inf"
JetStreamMetaFileSum = "meta.sum"
JetStreamMetaFileKey = "meta.key"
// This is the full snapshotted state for the stream.
streamStreamStateFile = "index.db"
// Minimum sizes, in bytes, of an encrypted key file (nonce + sealed seed).
minMetaKeySize = 64
minBlkKeySize = 64
// Default stream block size.
defaultLargeBlockSize = 8 * 1024 * 1024 // 8MB
// Default for workqueue or interest based.
defaultMediumBlockSize = 4 * 1024 * 1024 // 4MB
// For smaller reuse buffers. Usually being generated during contention on the lead write buffer.
// E.g. mirrors/sources etc.
defaultSmallBlockSize = 1 * 1024 * 1024 // 1MB
// Maximum size for the encrypted head block.
maximumEncryptedBlockSize = 2 * 1024 * 1024 // 2MB
// Default for KV based
defaultKVBlockSize = defaultMediumBlockSize
// max block size for now.
maxBlockSize = defaultLargeBlockSize
// Compact minimum threshold.
compactMinimum = 2 * 1024 * 1024 // 2MB
// FileStoreMinBlkSize is minimum size we will do for a blk size.
// Note this is decimal: 32,000 bytes (32 kB), not 32 KiB (32,768).
FileStoreMinBlkSize = 32 * 1000 // 32,000 bytes
// FileStoreMaxBlkSize is maximum size we will do for a blk size.
FileStoreMaxBlkSize = maxBlockSize
// Check for bad record length value due to corrupt data.
rlBadThresh = 32 * 1024 * 1024
// Checksum size for hash for msg records.
recordHashSize = 8
)
// newFileStore creates a file store for cfg, stamping it with the current
// UTC time as creation time and without any encryption key generators.
func newFileStore(fcfg FileStoreConfig, cfg StreamConfig) (*fileStore, error) {
	created := time.Now().UTC()
	return newFileStoreWithCreated(fcfg, cfg, created, nil, nil)
}
// newFileStoreWithCreated creates (or recovers) the file store rooted at
// fcfg.StoreDir for the stream described by cfg.
//
// created is recorded as the stream creation time. prf is the key generator
// used for encryption at rest; a nil prf means the store is not encrypted.
// oldprf is kept on the store for key conversion.
//
// The function validates the config, applies defaults, verifies the store
// directory is writable, recovers state (falling back to a scan of the
// message blocks when the full state cannot be recovered), enforces limits,
// writes the stream meta file if needed and finally starts the background
// sync timer and state flush loop.
func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created time.Time, prf, oldprf keyGen) (*fileStore, error) {
	if cfg.Name == _EMPTY_ {
		return nil, fmt.Errorf("name required")
	}
	if cfg.Storage != FileStorage {
		return nil, fmt.Errorf("fileStore requires file storage type in config")
	}
	// Default values.
	if fcfg.BlockSize == 0 {
		fcfg.BlockSize = dynBlkSize(cfg.Retention, cfg.MaxBytes, prf != nil)
	}
	if fcfg.BlockSize > maxBlockSize {
		return nil, fmt.Errorf("filestore max block size is %s", friendlyBytes(maxBlockSize))
	}
	if fcfg.CacheExpire == 0 {
		fcfg.CacheExpire = defaultCacheBufferExpiration
	}
	if fcfg.SyncInterval == 0 {
		fcfg.SyncInterval = defaultSyncInterval
	}

	// Check the directory
	if stat, err := os.Stat(fcfg.StoreDir); os.IsNotExist(err) {
		if err := os.MkdirAll(fcfg.StoreDir, defaultDirPerms); err != nil {
			return nil, fmt.Errorf("could not create storage directory - %v", err)
		}
	} else if stat == nil || !stat.IsDir() {
		return nil, fmt.Errorf("storage directory is not a directory")
	}
	// Probe that we can actually write into the directory.
	tmpfile, err := os.CreateTemp(fcfg.StoreDir, "_test_")
	if err != nil {
		return nil, fmt.Errorf("storage directory is not writable")
	}
	tmpfile.Close()
	<-dios
	os.Remove(tmpfile.Name())
	dios <- struct{}{}

	fs := &fileStore{
		fcfg:   fcfg,
		psim:   make(map[string]*psi),
		bim:    make(map[uint32]*msgBlock),
		cfg:    FileStreamInfo{Created: created, StreamConfig: cfg},
		prf:    prf,
		oldprf: oldprf,
		qch:    make(chan struct{}),
		fch:    make(chan struct{}, 1),
		fsld:   make(chan struct{}),
	}

	// Set flush in place to AsyncFlush which by default is false.
	fs.fip = !fcfg.AsyncFlush

	// Check if this is a new setup.
	mdir := filepath.Join(fcfg.StoreDir, msgDir)
	odir := filepath.Join(fcfg.StoreDir, consumerDir)
	if err := os.MkdirAll(mdir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create message storage directory - %v", err)
	}
	if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create consumer storage directory - %v", err)
	}

	// Create highway hash for message blocks. Use sha256 of the stream name as key.
	key := sha256.Sum256([]byte(cfg.Name))
	fs.hh, err = highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}

	keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
	// Make sure we do not have an encrypted store underneath of us but no main key.
	if fs.prf == nil {
		if _, err := os.Stat(keyFile); err == nil {
			return nil, errNoMainKey
		}
	}

	// Attempt to recover our state.
	err = fs.recoverFullState()
	if err != nil {
		// Hold onto state
		prior := fs.state
		// Reset anything that could have been set from above.
		fs.state = StreamState{}
		fs.psim = make(map[string]*psi)
		fs.bim = make(map[uint32]*msgBlock)
		fs.blks = nil
		fs.tombs = nil

		// Recover our message state the old way
		if err := fs.recoverMsgs(); err != nil {
			return nil, err
		}

		// Check if the prior state remembered a last sequence past what we can see now.
		if fs.ld != nil && prior.LastSeq > fs.state.LastSeq {
			fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime
			if lmb, err := fs.newMsgBlockForWrite(); err == nil {
				lmb.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano())
			} else {
				return nil, err
			}
		}
		// Since we recovered here, make sure to kick ourselves to write out our stream state.
		fs.dirty++
		defer fs.kickFlushStateLoop()
	}

	// Also make sure we get rid of old idx and fss files on return.
	// Do this in separate go routine vs inline and at end of processing.
	// Note os.RemoveAll does not expand glob patterns, so list the directory
	// and match each entry name against the patterns ourselves.
	defer func() {
		go func() {
			entries, err := os.ReadDir(mdir)
			if err != nil {
				// Best effort cleanup only.
				return
			}
			for _, entry := range entries {
				if entry.IsDir() {
					continue
				}
				name := entry.Name()
				// The patterns are well-formed constants, so Match cannot
				// return ErrBadPattern here.
				isIdx, _ := filepath.Match(indexScanAll, name)
				isFss, _ := filepath.Match(fssScanAll, name)
				if isIdx || isFss {
					os.Remove(filepath.Join(mdir, name))
				}
			}
		}()
	}()

	// Lock while do enforcements and removals.
	fs.mu.Lock()

	// Check if we have any left over tombstones to process.
	if len(fs.tombs) > 0 {
		for _, seq := range fs.tombs {
			fs.removeMsg(seq, false, false, false)
			fs.removeFromLostData(seq)
		}
		// Not needed after this phase.
		fs.tombs = nil
	}

	// Limits checks and enforcement.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Do age checks too, make sure to call in place.
	if fs.cfg.MaxAge != 0 {
		fs.expireMsgsOnRecover()
		fs.startAgeChk()
	}

	// If we have max msgs per subject make sure that is also enforced.
	if fs.cfg.MaxMsgsPer > 0 {
		fs.enforceMsgPerSubjectLimit()
	}

	// Grab first sequence for check below while we have lock.
	firstSeq := fs.state.FirstSeq
	fs.mu.Unlock()

	// If the stream has an initial sequence number then make sure we
	// have purged up until that point. We will do this only if the
	// recovered first sequence number is before our configured first
	// sequence. Need to do this locked as by now the age check timer
	// has started.
	if cfg.FirstSeq > 0 && firstSeq <= cfg.FirstSeq {
		if _, err := fs.purge(cfg.FirstSeq); err != nil {
			return nil, err
		}
	}

	// Write our meta data if it does not exist or is zero'd out.
	meta := filepath.Join(fcfg.StoreDir, JetStreamMetaFile)
	fi, err := os.Stat(meta)
	if err != nil && os.IsNotExist(err) || fi != nil && fi.Size() == 0 {
		if err := fs.writeStreamMeta(); err != nil {
			return nil, err
		}
	}

	// If we expect to be encrypted check that what we are restoring is not plaintext.
	// This can happen on snapshot restores or conversions.
	if fs.prf != nil {
		if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
			if err := fs.writeStreamMeta(); err != nil {
				return nil, err
			}
		}
	}

	fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks)

	// Spin up the go routine that will write out our full state stream index.
	go fs.flushStreamStateLoop(fs.fch, fs.qch, fs.fsld)

	return fs, nil
}
// registerServer records the server this store belongs to.
// It takes the store lock itself.
func (fs *fileStore) registerServer(s *Server) {
	fs.mu.Lock()
	fs.srv = s
	fs.mu.Unlock()
}
// lockAllMsgBlocks write-locks every message block, in fs.blks order.
// The fs lock must be held on entry.
func (fs *fileStore) lockAllMsgBlocks() {
	for i := range fs.blks {
		fs.blks[i].mu.Lock()
	}
}
// unlockAllMsgBlocks releases the write lock of every message block, in
// fs.blks order. The fs lock must be held on entry.
func (fs *fileStore) unlockAllMsgBlocks() {
	for i := range fs.blks {
		fs.blks[i].mu.Unlock()
	}
}
// UpdateConfig replaces the stream configuration of the store.
//
// It returns ErrStoreClosed when the store is closed and an error for a
// config without a name or with a storage type other than FileStorage. The
// new config is persisted with writeStreamMeta; if that fails the previous
// config is restored and the error returned. On success the message, byte,
// age and per subject limits are (re-)enforced.
func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error {
	if fs.isClosed() {
		return ErrStoreClosed
	}
	if cfg.Name == _EMPTY_ {
		return fmt.Errorf("name required")
	}
	if cfg.Storage != FileStorage {
		return fmt.Errorf("fileStore requires file storage type in config")
	}

	fs.mu.Lock()
	updated := FileStreamInfo{Created: fs.cfg.Created, StreamConfig: *cfg}
	previous := fs.cfg

	// Message blocks reference fs.cfg.Subjects (in subjString) under the
	// mb's lock, not fs' lock. So every switch of fs.cfg is done while
	// holding all existing message blocks' locks in order to silence the
	// DATA RACE detector.
	install := func(c FileStreamInfo) {
		fs.lockAllMsgBlocks()
		fs.cfg = c
		fs.unlockAllMsgBlocks()
	}

	install(updated)
	if err := fs.writeStreamMeta(); err != nil {
		// Could not persist, roll back to what we had.
		install(previous)
		fs.mu.Unlock()
		return err
	}

	// Limits checks and enforcement.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Do age timers: start one if we now need it, drop it if we no longer do.
	switch {
	case fs.ageChk == nil && fs.cfg.MaxAge != 0:
		fs.startAgeChk()
	case fs.ageChk != nil && fs.cfg.MaxAge == 0:
		fs.ageChk.Stop()
		fs.ageChk = nil
	}

	// Only a lowered per subject limit requires enforcement.
	if limit := fs.cfg.MaxMsgsPer; limit > 0 && limit < previous.MaxMsgsPer {
		fs.enforceMsgPerSubjectLimit()
	}
	fs.mu.Unlock()

	if cfg.MaxAge != 0 {
		fs.expireMsgs()
	}
	return nil
}
// dynBlkSize picks a block size for a store that did not configure one.
//
// Without a MaxBytes limit the result is maximumEncryptedBlockSize for
// encrypted stores, defaultLargeBlockSize for limits based retention and
// defaultMediumBlockSize otherwise.
//
// With a MaxBytes limit, a quarter of the limit (rounded up to a multiple of
// 100) is clamped: at or below FileStoreMinBlkSize yields the minimum, at or
// above FileStoreMaxBlkSize yields the maximum, and anything in between
// yields defaultMediumBlockSize. Encrypted stores are additionally capped at
// maximumEncryptedBlockSize.
func dynBlkSize(retention RetentionPolicy, maxBytes int64, encrypted bool) uint64 {
	if maxBytes <= 0 {
		if encrypted {
			// In the case of encrypted stores, large blocks can result in worsened perf
			// since many writes on disk involve re-encrypting the entire block. For now,
			// we will enforce a cap on the block size when encryption is enabled to avoid
			// this.
			return maximumEncryptedBlockSize
		}
		if retention == LimitsPolicy {
			// TODO(dlc) - Make the blocksize relative to this if set.
			return defaultLargeBlockSize
		}
		// TODO(dlc) - Make the blocksize relative to this if set.
		return defaultMediumBlockSize
	}

	// (25% overhead)
	sz := maxBytes/4 + 1
	// Round up to nearest 100
	if rem := sz % 100; rem != 0 {
		sz += 100 - rem
	}
	switch {
	case sz <= FileStoreMinBlkSize:
		sz = FileStoreMinBlkSize
	case sz >= FileStoreMaxBlkSize:
		sz = FileStoreMaxBlkSize
	default:
		sz = defaultMediumBlockSize
	}
	// Same cap as for the unlimited encrypted case above.
	if encrypted && sz > maximumEncryptedBlockSize {
		sz = maximumEncryptedBlockSize
	}
	return uint64(sz)
}
// genEncryptionKey builds the AEAD for cipher sc from seed.
//
// ChaCha yields XChaCha20-Poly1305, AES yields AES-GCM with a nonce size equal
// to the AES block size. Any other cipher returns errUnknownCipher. An error
// from the underlying constructor (for example an invalid seed length) is
// returned to the caller.
func genEncryptionKey(sc StoreCipher, seed []byte) (ek cipher.AEAD, err error) {
	switch sc {
	case ChaCha:
		return chacha20poly1305.NewX(seed)
	case AES:
		block, berr := aes.NewCipher(seed)
		if berr != nil {
			// Report the actual cipher error; returning the (nil) named
			// result here would hand back a nil AEAD with a nil error.
			return nil, berr
		}
		return cipher.NewGCMWithNonceSize(block, block.BlockSize())
	default:
		return nil, errUnknownCipher
	}
}
// genEncryptionKeys generates a fresh set of keys for the asset identified
// by context, using the store's PRF and configured cipher.
//
// It returns the asset encryption key (aek), the block encryption stream
// (bek), the random plaintext seed both were derived from, and encrypted: the
// nonce followed by the seed sealed with the key encryption key derived from
// the PRF output. errNoEncryption is returned when the store has no PRF.
func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cipher.Stream, seed, encrypted []byte, err error) {
	if fs.prf == nil {
		return nil, nil, nil, nil, errNoEncryption
	}
	// Generate key encryption key.
	rb, err := fs.prf([]byte(context))
	if err != nil {
		return nil, nil, nil, nil, err
	}

	sc := fs.fcfg.Cipher

	kek, err := genEncryptionKey(sc, rb)
	if err != nil {
		return nil, nil, nil, nil, err
	}
	// Generate random asset encryption key seed.
	const seedSize = 32
	seed = make([]byte, seedSize)
	n, err := rand.Read(seed)
	if err != nil {
		return nil, nil, nil, nil, err
	}
	if n != seedSize {
		// Never hand out keys built from a partially random seed.
		return nil, nil, nil, nil, fmt.Errorf("not enough seed bytes read (%d != %d)", n, seedSize)
	}

	aek, err = genEncryptionKey(sc, seed)
	if err != nil {
		return nil, nil, nil, nil, err
	}

	// Generate our nonce. Use same buffer to hold encrypted seed.
	nonce := make([]byte, kek.NonceSize(), kek.NonceSize()+len(seed)+kek.Overhead())
	n, err = rand.Read(nonce)
	if err != nil {
		return nil, nil, nil, nil, err
	}
	if n != len(nonce) {
		// A short read would leave part of the nonce zeroed.
		return nil, nil, nil, nil, fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce))
	}

	bek, err = genBlockEncryptionKey(sc, seed[:], nonce)
	if err != nil {
		return nil, nil, nil, nil, err
	}

	return aek, bek, seed, kek.Seal(nonce, nonce, seed, nil), nil
}
// genBlockEncryptionKey builds the stream cipher used for block data from
// seed and nonce: unauthenticated ChaCha20 for ChaCha, AES in CTR mode for
// AES. Any other cipher returns errUnknownCipher.
func genBlockEncryptionKey(sc StoreCipher, seed, nonce []byte) (cipher.Stream, error) {
	switch sc {
	case ChaCha:
		return chacha20.NewUnauthenticatedCipher(seed, nonce)
	case AES:
		block, err := aes.NewCipher(seed)
		if err != nil {
			return nil, err
		}
		return cipher.NewCTR(block, nonce), nil
	default:
		return nil, errUnknownCipher
	}
}
// recoverAEK loads the stream's asset encryption key from the meta key file
// and stores it in fs.aek. It is a no-op when the store is not encrypted or
// the key is already loaded.
//
// The key file holds a nonce followed by the sealed seed; it is opened with
// the key encryption key derived from the PRF and the stream name. A key file
// shorter than minMetaKeySize yields errBadKeySize.
// Lock should be held.
func (fs *fileStore) recoverAEK() error {
	if fs.prf == nil || fs.aek != nil {
		return nil
	}
	ekey, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey))
	if err != nil {
		return err
	}
	// Guard the slicing below against a truncated or empty key file.
	if len(ekey) < minMetaKeySize {
		return errBadKeySize
	}
	rb, err := fs.prf([]byte(fs.cfg.Name))
	if err != nil {
		return err
	}
	kek, err := genEncryptionKey(fs.fcfg.Cipher, rb)
	if err != nil {
		return err
	}
	ns := kek.NonceSize()
	seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
	if err != nil {
		return err
	}
	aek, err := genEncryptionKey(fs.fcfg.Cipher, seed)
	if err != nil {
		return err
	}
	fs.aek = aek
	return nil
}
// setupAEK generates a new asset encryption key for the stream, writes its
// encrypted form to the meta key file and stores the key in fs.aek. It does
// nothing when the store is not encrypted or a key is already set.
// Lock should be held.
func (fs *fileStore) setupAEK() error {
	if fs.prf == nil || fs.aek != nil {
		return nil
	}
	aek, _, _, sealed, err := fs.genEncryptionKeys(fs.cfg.Name)
	if err != nil {
		return err
	}
	path := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
	// A missing key file is expected; any other stat failure is not.
	_, statErr := os.Stat(path)
	if statErr != nil && !os.IsNotExist(statErr) {
		return statErr
	}
	if werr := os.WriteFile(path, sealed, defaultFilePerms); werr != nil {
		return werr
	}
	// Set our aek.
	fs.aek = aek
	return nil
}
// writeStreamMeta writes the stream meta file and its checksum file.
//
// The stream config is marshaled to JSON and, when the store has an asset
// encryption key, sealed with a fresh random nonce that is prepended to the
// ciphertext. The checksum file holds the hex encoded hash of exactly the
// bytes written to the meta file.
// Lock should be held.
func (fs *fileStore) writeStreamMeta() error {
	if err := fs.setupAEK(); err != nil {
		return err
	}

	meta := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile)
	if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) {
		return err
	}
	b, err := json.Marshal(fs.cfg)
	if err != nil {
		return err
	}
	// Encrypt if needed.
	if fs.aek != nil {
		nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(b)+fs.aek.Overhead())
		n, err := rand.Read(nonce)
		if err != nil {
			return err
		}
		if n != len(nonce) {
			// Never seal with a partially filled nonce.
			return fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce))
		}
		b = fs.aek.Seal(nonce, nonce, b, nil)
	}

	if err := os.WriteFile(meta, b, defaultFilePerms); err != nil {
		return err
	}
	fs.hh.Reset()
	fs.hh.Write(b)
	checksum := hex.EncodeToString(fs.hh.Sum(nil))
	sum := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileSum)
	if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil {
		return err
	}
	return nil
}
// Pools to recycle the blocks to help with memory pressure.
// Buffers are sorted into a pool by capacity in recycleMsgBlockBuf; the
// bounds below follow the default*BlockSize constants (1MB, 4MB, 8MB).
var blkPoolBig sync.Pool // capacity >= 8MB (defaultLargeBlockSize)
var blkPoolMedium sync.Pool // capacity >= 4MB and < 8MB
var blkPoolSmall sync.Pool // capacity >= 1MB and < 4MB
// Get a new msg block based on sz estimate.
func getMsgBlockBuf(sz int) (buf []byte) {
var pb interface{}
if sz <= defaultSmallBlockSize {
pb = blkPoolSmall.Get()
} else if sz <= defaultMediumBlockSize {
pb = blkPoolMedium.Get()
} else {
pb = blkPoolBig.Get()
}
if pb != nil {
buf = *(pb.(*[]byte))
} else {
// Here we need to make a new blk.
// If small leave as is..
if sz > defaultSmallBlockSize && sz <= defaultMediumBlockSize {
sz = defaultMediumBlockSize
} else if sz > defaultMediumBlockSize {
sz = defaultLargeBlockSize
}
buf = make([]byte, sz)
}
return buf[:0]
}
// recycleMsgBlockBuf hands buf back to the pool matching its capacity.
// Nil buffers and buffers with a capacity below defaultSmallBlockSize are
// dropped.
func recycleMsgBlockBuf(buf []byte) {
	if buf == nil || cap(buf) < defaultSmallBlockSize {
		return
	}
	// Make sure to reset before placing back into pool.
	buf = buf[:0]

	// We need to make sure the load code gets a block that can fit the maximum for a size block.
	// E.g. 8, 16 etc. otherwise we thrash and actually make things worse by pulling it out, and putting
	// it right back in and making a new []byte.
	// From above we know its already >= defaultSmallBlockSize
	switch sz := cap(buf); {
	case sz < defaultMediumBlockSize:
		blkPoolSmall.Put(&buf)
	case sz < defaultLargeBlockSize:
		blkPoolMedium.Put(&buf)
	default:
		blkPoolBig.Put(&buf)
	}
}
// Sizes, in bytes, of the fixed parts of a message record.
const (
// Record header. The block scan in rebuildStateLocked reads it little endian
// as: bytes 0-3 record length, 4-11 sequence, 12-19 timestamp, 20-21 subject length.
msgHdrSize = 22
// Size of the checksum that ends a record.
checksumSize = 8
// Length of a record that carries only a header and a checksum.
emptyRecordLen = msgHdrSize + checksumSize
)
// noTrackSubjects reports whether the store has nothing that requires
// subject tracking: no per subject info, no configured subjects, no mirror
// and no sources.
// Lock should be held.
func (fs *fileStore) noTrackSubjects() bool {
	return len(fs.psim) == 0 &&
		len(fs.cfg.Subjects) == 0 &&
		fs.cfg.Mirror == nil &&
		len(fs.cfg.Sources) == 0
}
// initMsgBlock creates the in-memory message block for index. It wires the
// block to the store, copies the relevant store settings, sets the block
// file name and creates the block's checksum hash. Nothing is read from or
// written to disk.
func (fs *fileStore) initMsgBlock(index uint32) *msgBlock {
	mb := &msgBlock{
		fs:         fs,
		index:      index,
		cexp:       fs.fcfg.CacheExpire,
		noTrack:    fs.noTrackSubjects(),
		syncAlways: fs.fcfg.SyncAlways,
	}
	mb.mfn = filepath.Join(fs.fcfg.StoreDir, msgDir, fmt.Sprintf(blkScan, index))

	// The freshly created block never has a hash yet, so always create one.
	hashKey := sha256.Sum256(fs.hashKeyForBlock(index))
	mb.hh, _ = highwayhash.New64(hashKey[:])
	return mb
}
// loadEncryptionForMsgBlock sets up the encryption keys of mb. It is a no-op
// for stores without a PRF.
//
// When the block's key file cannot be read, new keys are generated and the
// block data is converted to encrypted form. Otherwise the seed is recovered
// from the key file; if it cannot be opened with the current cipher a cipher
// conversion is attempted. Finally the block's aek and bek are derived from
// the seed and nonce.
// Lock for fs should be held.
func (fs *fileStore) loadEncryptionForMsgBlock(mb *msgBlock) error {
	if fs.prf == nil {
		return nil
	}

	keyPath := filepath.Join(fs.fcfg.StoreDir, msgDir, fmt.Sprintf(keyScan, mb.index))
	ekey, err := os.ReadFile(keyPath)
	if err != nil {
		// We do not seem to have keys even though we should. Could be a plaintext conversion.
		// Create the keys, then check the data and if it is plaintext convert it.
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			return err
		}
		return mb.convertToEncrypted()
	}

	if len(ekey) < minBlkKeySize {
		return errBadKeySize
	}
	// Recover key encryption key.
	rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
	if err != nil {
		return err
	}

	sc := fs.fcfg.Cipher
	kek, err := genEncryptionKey(sc, rb)
	if err != nil {
		return err
	}
	ns := kek.NonceSize()
	if seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil); err == nil {
		mb.seed, mb.nonce = seed, ekey[:ns]
	} else if err := mb.convertCipher(); err != nil {
		// We may be here on a cipher conversion, but the attempt to convert failed.
		return err
	}

	if mb.aek, err = genEncryptionKey(sc, mb.seed); err != nil {
		return err
	}
	if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil {
		return err
	}
	return nil
}
// ensureLastChecksumLoaded fills mb.lchk from the block file via
// lastChecksum, but only while mb.lchk is still all zeros.
// Lock should be held.
func (mb *msgBlock) ensureLastChecksumLoaded() {
	if mb.lchk == ([8]byte{}) {
		copy(mb.lchk[:], mb.lastChecksum())
	}
}
// recoverMsgBlock recovers the message block with the given index from disk
// and adds it to the store.
//
// The index info (readIndexInfo) is used as the source of truth when the last
// checksum found at the end of the block file matches the one recorded there,
// or when both report an empty block. Otherwise the block state is rebuilt by
// scanning the block file; lost data and tombstones found during the rebuild
// are recorded on the store.
//
// An error is returned only when the block file cannot be opened or stat'd.
// Lock held on entry
func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) {
mb := fs.initMsgBlock(index)
// Open up the message file, but we will try to recover from the index file.
// We will check that the last checksums match.
file, err := os.Open(mb.mfn)
if err != nil {
return nil, err
}
defer file.Close()
if fi, err := file.Stat(); fi != nil {
mb.rbytes = uint64(fi.Size())
} else {
return nil, err
}
// Make sure encryption loaded if needed.
// NOTE(review): the returned error is discarded here. If loading the keys
// fails, mb.bek stays nil and the plaintext path below is taken - confirm
// this is intended.
fs.loadEncryptionForMsgBlock(mb)
// Grab last checksum from main block file.
var lchk [8]byte
if mb.rbytes >= checksumSize {
if mb.bek != nil {
// Encrypted: the whole block has to be loaded and decrypted to get at
// the trailing checksum.
if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize {
mb.bek.XORKeyStream(buf, buf)
copy(lchk[0:], buf[len(buf)-checksumSize:])
}
} else {
file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
}
}
// Close right away; the deferred Close above then runs on an already closed file.
file.Close()
// Read our index file. Use this as source of truth if possible.
if err := mb.readIndexInfo(); err == nil {
// Quick sanity check here.
// Note this only checks that the message blk file is not newer then this file, or is empty and we expect empty.
if (mb.rbytes == 0 && mb.msgs == 0) || bytes.Equal(lchk[:], mb.lchk[:]) {
if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
fs.populateGlobalPerSubjectInfo(mb)
// Try to dump any state we needed on recovery.
mb.tryForceExpireCacheLocked()
}
fs.addMsgBlock(mb)
return mb, nil
}
}
// If we get data loss rebuilding the message block state record that with the fs itself.
// The error from rebuildState is ignored; lost data is reported through ld.
ld, tombs, _ := mb.rebuildState()
if ld != nil {
fs.addLostData(ld)
}
// Collect all tombstones.
if len(tombs) > 0 {
fs.tombs = append(fs.tombs, tombs...)
}
if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
fs.populateGlobalPerSubjectInfo(mb)
// Try to dump any state we needed on recovery.
mb.tryForceExpireCacheLocked()
}
mb.closeFDs()
fs.addMsgBlock(mb)
return mb, nil
}
// lostData returns a copy of the store's lost data record, or nil when none
// is recorded. The copy is shallow: its Msgs slice shares the backing array
// with the store's record.
func (fs *fileStore) lostData() *LostStreamData {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	if ld := fs.ld; ld != nil {
		snapshot := *ld
		return &snapshot
	}
	return nil
}
// addLostData merges ld into the store's lost data record.
//
// When the store has no record yet, ld itself becomes the record. Otherwise
// every sequence of ld that is not already recorded is added, the combined
// list is re-sorted and ld.Bytes is added to the recorded byte count. When
// ld brings no new sequence nothing changes. A nil ld is ignored.
// Lock should be held.
func (fs *fileStore) addLostData(ld *LostStreamData) {
	if ld == nil {
		return
	}
	if fs.ld == nil {
		fs.ld = ld
		return
	}

	// The lookup is a binary search over fs.ld.Msgs, so it must only ever see
	// the sorted list. Collect the new sequences on the side and merge them in
	// once all lookups are done.
	var missing []uint64
	for _, seq := range ld.Msgs {
		if _, found := fs.ld.exists(seq); !found {
			missing = append(missing, seq)
		}
	}
	if len(missing) == 0 {
		return
	}

	fs.ld.Msgs = append(fs.ld.Msgs, missing...)
	msgs := fs.ld.Msgs
	sort.Slice(msgs, func(i, j int) bool { return msgs[i] < msgs[j] })
	fs.ld.Bytes += ld.Bytes
}
// exists reports whether seq is already recorded in our lost data and, when
// it is, its index in ld.Msgs. The index is only meaningful when found is
// true. ld.Msgs must be sorted in ascending order.
func (ld *LostStreamData) exists(seq uint64) (int, bool) {
	// sort.Find expects the comparison of the target against element i:
	// negative when the target sorts before it, positive when after.
	return sort.Find(len(ld.Msgs), func(i int) int {
		switch tseq := ld.Msgs[i]; {
		case seq < tseq:
			return -1
		case seq > tseq:
			return +1
		default:
			return 0
		}
	})
}
// removeFromLostData drops seq from the store's lost data record, if it is
// recorded there. The record itself is cleared once its last sequence is
// removed. The recorded byte count is left as is.
func (fs *fileStore) removeFromLostData(seq uint64) {
	ld := fs.ld
	if ld == nil {
		return
	}
	idx, ok := ld.exists(seq)
	if !ok {
		return
	}
	// Shift the tail down by one and trim the now duplicated last element.
	copy(ld.Msgs[idx:], ld.Msgs[idx+1:])
	ld.Msgs = ld.Msgs[:len(ld.Msgs)-1]
	if len(ld.Msgs) == 0 {
		fs.ld = nil
	}
}
// rebuildState is the locking wrapper around rebuildStateLocked: it records
// ld as lost data and recomputes the store state from the message blocks
// while holding the store lock.
func (fs *fileStore) rebuildState(ld *LostStreamData) {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	fs.rebuildStateLocked(ld)
}
// rebuildStateLocked records ld as lost data and recomputes the store's
// message count, byte count and first/last sequence and time from its
// message blocks. The first sequence is the smallest block first sequence;
// the last sequence and time are taken from the final block in fs.blks.
// Lock should be held.
func (fs *fileStore) rebuildStateLocked(ld *LostStreamData) {
	fs.addLostData(ld)

	st := &fs.state
	st.Msgs, st.Bytes = 0, 0
	st.FirstSeq, st.LastSeq = 0, 0

	for _, mb := range fs.blks {
		// Snapshot what we need from the block under its read lock.
		mb.mu.RLock()
		nmsgs, nbytes := mb.msgs, mb.bytes
		first, last := mb.first, mb.last
		mb.mu.RUnlock()

		st.Msgs += nmsgs
		st.Bytes += nbytes
		if st.FirstSeq == 0 || first.seq < st.FirstSeq {
			st.FirstSeq = first.seq
			st.FirstTime = time.Unix(0, first.ts).UTC()
		}
		st.LastSeq = last.seq
		st.LastTime = time.Unix(0, last.ts).UTC()
	}
}
// convertCipher attempts to convert the cipher used for this message block.
//
// It reads the block's existing key file and tries to open it with every
// combination of the current and old PRF and the configured and alternate
// cipher. With the first combination that works the block is decrypted and
// verified, new keys are generated for the configured cipher and the block
// file is rewritten encrypted with the new keys.
//
// It returns errBadKeySize for a short key file, the error of any step that
// fails after a working combination was found, or an error when no
// combination can open the key file.
func (mb *msgBlock) convertCipher() error {
	fs := mb.fs
	sc := fs.fcfg.Cipher

	// The "other" cipher, i.e. the one we may be converting from.
	var osc StoreCipher
	switch sc {
	case ChaCha:
		osc = AES
	case AES:
		osc = ChaCha
	}

	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)))
	if err != nil {
		return err
	}
	if len(ekey) < minBlkKeySize {
		return errBadKeySize
	}

	type prfWithCipher struct {
		keyGen
		StoreCipher
	}

	var prfs []prfWithCipher
	if fs.prf != nil {
		prfs = append(prfs, prfWithCipher{fs.prf, sc})
		prfs = append(prfs, prfWithCipher{fs.prf, osc})
	}
	if fs.oldprf != nil {
		prfs = append(prfs, prfWithCipher{fs.oldprf, sc})
		prfs = append(prfs, prfWithCipher{fs.oldprf, osc})
	}

	for _, prf := range prfs {
		// Recover key encryption key.
		rb, err := prf.keyGen([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
		if err != nil {
			continue
		}
		kek, err := genEncryptionKey(prf.StoreCipher, rb)
		if err != nil {
			continue
		}
		ns := kek.NonceSize()
		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
		if err != nil {
			continue
		}
		nonce := ekey[:ns]
		bek, err := genBlockEncryptionKey(prf.StoreCipher, seed, nonce)
		if err != nil {
			return err
		}

		// Do not go on with a failed or partial read: the block file is
		// rewritten from this buffer below.
		buf, err := mb.loadBlock(nil)
		if err != nil {
			return err
		}
		bek.XORKeyStream(buf, buf)
		// Make sure we can parse with old cipher and key file.
		if err = mb.indexCacheBuf(buf); err != nil {
			return err
		}
		// Reset the cache since we just read everything in.
		mb.cache = nil

		// Generate new keys. If we error for some reason then we will put
		// the old keyfile back.
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))
			os.WriteFile(keyFile, ekey, defaultFilePerms)
			return err
		}
		mb.bek.XORKeyStream(buf, buf)
		if err := os.WriteFile(mb.mfn, buf, defaultFilePerms); err != nil {
			return err
		}
		return nil
	}

	return fmt.Errorf("unable to recover keys")
}
// convertToEncrypted rewrites a plaintext block file in encrypted form using
// the block's encryption stream. It does nothing when the block has no such
// stream. The block is first parsed as plaintext; a parse failure (which
// likely means the data is already encrypted or corrupt) is returned and the
// file is left untouched.
func (mb *msgBlock) convertToEncrypted() error {
	if mb.bek == nil {
		return nil
	}
	buf, err := mb.loadBlock(nil)
	if err != nil {
		return err
	}
	// Parsing is only a plaintext check; the cache it builds is dropped
	// again whether or not parsing worked.
	err = mb.indexCacheBuf(buf)
	mb.cache = nil
	if err != nil {
		return err
	}
	mb.bek.XORKeyStream(buf, buf)
	return os.WriteFile(mb.mfn, buf, defaultFilePerms)
}
// rebuildState rebuilds the block state from the N.blk file on disk while
// holding the block lock. It returns any lost data and the delete tombstones
// that were encountered, as reported by rebuildStateLocked.
func (mb *msgBlock) rebuildState() (*LostStreamData, []uint64, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	ld, tombs, err := mb.rebuildStateLocked()
	return ld, tombs, err
}
// Rebuild the state of the blk based on what we have on disk in the N.blk file.
// Lock should be held.
func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) {
startLastSeq := mb.last.seq
// Remove the .fss file and clear any cache we have set.
mb.clearCacheAndOffset()
buf, err := mb.loadBlock(nil)
if err != nil || len(buf) == 0 {
var ld *LostStreamData
// No data to rebuild from here.
if mb.msgs > 0 {
// We need to declare lost data here.
ld = &LostStreamData{Msgs: make([]uint64, 0, mb.msgs), Bytes: mb.bytes}
for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
if !mb.dmap.Exists(seq) {
ld.Msgs = append(ld.Msgs, seq)
}
}
// Clear invalid state. We will let this blk be added in here.
mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
mb.dmap.Empty()
mb.first.seq = mb.last.seq + 1
}
return ld, nil, err
}
// Clear state we need to rebuild.
mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
mb.last.seq, mb.last.ts = 0, 0
firstNeedsSet := true
// Check if we need to decrypt.
if mb.bek != nil && len(buf) > 0 {
// Recreate to reset counter.
mb.bek, err = genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
if err != nil {
return nil, nil, err
}
mb.bek.XORKeyStream(buf, buf)
}
// Check for compression.
if buf, err = mb.decompressIfNeeded(buf); err != nil {
return nil, nil, err
}
mb.rbytes = uint64(len(buf))
addToDmap := func(seq uint64) {
if seq == 0 {
return
}
mb.dmap.Insert(seq)
}
var le = binary.LittleEndian
truncate := func(index uint32) {
var fd *os.File
if mb.mfd != nil {
fd = mb.mfd
} else {
fd, err = os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
if err == nil {
defer fd.Close()
}
}
if fd == nil {
return
}
if err := fd.Truncate(int64(index)); err == nil {
// Update our checksum.
if index >= 8 {
var lchk [8]byte
fd.ReadAt(lchk[:], int64(index-8))
copy(mb.lchk[0:], lchk[:])
}
fd.Sync()
}
}
gatherLost := func(lb uint32) *LostStreamData {
var ld LostStreamData
for seq := mb.last.seq + 1; seq <= startLastSeq; seq++ {
ld.Msgs = append(ld.Msgs, seq)
}
ld.Bytes = uint64(lb)
return &ld
}
// For tombstones that we find and collect.
var (
tombstones []uint64
minTombstoneSeq uint64
minTombstoneTs int64
)
for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
if index+msgHdrSize > lbuf {
truncate(index)
return gatherLost(lbuf - index), tombstones, nil
}
hdr := buf[index : index+msgHdrSize]
rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])
hasHeaders := rl&hbit != 0
// Clear any headers bit that could be set.
rl &^= hbit
dlen := int(rl) - msgHdrSize
// Do some quick sanity checks here.
if dlen < 0 || int(slen) > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
truncate(index)
return gatherLost(lbuf - index), tombstones, errBadMsg
}
// Check for checksum failures before additional processing.
data := buf[index+msgHdrSize : index+rl]
if hh := mb.hh; hh != nil {
hh.Reset()
hh.Write(hdr[4:20])
hh.Write(data[:slen])
if hasHeaders {
hh.Write(data[slen+4 : dlen-recordHashSize])
} else {
hh.Write(data[slen : dlen-recordHashSize])
}
checksum := hh.Sum(nil)
if !bytes.Equal(checksum, data[len(data)-recordHashSize:]) {
truncate(index)
return gatherLost(lbuf - index), tombstones, errBadMsg
}
copy(mb.lchk[0:], checksum)
}
// Grab our sequence and timestamp.
seq := le.Uint64(hdr[4:])
ts := int64(le.Uint64(hdr[12:]))
// Check if this is a delete tombstone.
if seq&tbit != 0 {
seq = seq &^ tbit
// Need to process this here and make sure we have accounted for this properly.
tombstones = append(tombstones, seq)
if minTombstoneSeq == 0 || seq < minTombstoneSeq {
minTombstoneSeq, minTombstoneTs = seq, ts
}
index += rl
continue
}
// This is an old erased message, or a new one that we can track.
if seq == 0 || seq&ebit != 0 || seq < mb.first.seq {
seq = seq &^ ebit
if seq >= mb.first.seq {
// Only add to dmap if past recorded first seq and non-zero.
if seq != 0 {
addToDmap(seq)
}
mb.last.seq = seq
mb.last.ts = ts
if mb.msgs == 0 {
mb.first.seq, mb.first.ts = seq+1, 0
}
}
index += rl
continue
}
// This is for when we have index info that adjusts for deleted messages
// at the head. So the first.seq will be already set here. If this is larger
// replace what we have with this seq.
if firstNeedsSet && seq >= mb.first.seq {
firstNeedsSet, mb.first.seq, mb.first.ts = false, seq, ts
}
if !mb.dmap.Exists(seq) {
mb.msgs++
mb.bytes += uint64(rl)
// Rebuild per subject info if needed.
if slen > 0 {
if mb.fss == nil {
mb.fss = make(map[string]*SimpleState)
}
// For the lookup, we cast the byte slice and there won't be any copy
if ss := mb.fss[string(data[:slen])]; ss != nil {
ss.Msgs++
ss.Last = seq
} else {
// This will either use a subject from the config, or make a copy
// so we don't reference the underlying buffer.
subj := mb.subjString(data[:slen])
mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
}
}
}
// Always set last
mb.last.seq = seq
mb.last.ts = ts
// Advance to next record.
index += rl
}
// For empty msg blocks make sure we recover last seq correctly based off of first.
// Or if we seem to have no messages but had a tombstone, which we use to remember
// sequences and timestamps now, use that to properly setup the first and last.
if mb.msgs == 0 {
if mb.first.seq > 0 {
mb.last.seq = mb.first.seq - 1
} else if mb.first.seq == 0 && minTombstoneSeq > 0 {
mb.first.seq, mb.first.ts = minTombstoneSeq+1, 0
if mb.last.seq == 0 {
mb.last.seq, mb.last.ts = minTombstoneSeq, minTombstoneTs
}
}
}
return nil, tombstones, nil
}
// recoverFullState will attempt to recover our last full state and re-process any state changes
// that happened afterwards.
//
// It takes the fs lock, reads the stream state file from the msgs directory, verifies the
// trailing highwayhash, decrypts the payload when encryption keys can be recovered, and then
// decodes the top level state, the per subject info and the per block metadata.
// Blocks at or after the recorded last block index are then re-processed from disk.
//
// Returns errCorruptState (after removing the state file) when the file fails validation or
// decoding, and errPriorState when the recorded last block file no longer exists on disk.
func (fs *fileStore) recoverFullState() (rerr error) {
fs.mu.Lock()
defer fs.mu.Unlock()
// Check for any left over purged messages.
// We receive from dios before touching disk and send back once the state file has been read.
<-dios
pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
if _, err := os.Stat(pdir); err == nil {
os.RemoveAll(pdir)
}
// Grab our stream state file and load it in.
fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
buf, err := os.ReadFile(fn)
dios <- struct{}{}
if err != nil {
return err
}
// Anything shorter than this is treated as corrupt and removed.
const minLen = 32
if len(buf) < minLen {
os.Remove(fn)
return errCorruptState
}
// The highwayhash will be on the end. Check that it still matches.
h := buf[len(buf)-highwayhash.Size64:]
buf = buf[:len(buf)-highwayhash.Size64]
fs.hh.Reset()
fs.hh.Write(buf)
if !bytes.Equal(h, fs.hh.Sum(nil)) {
os.Remove(fn)
return errCorruptState
}
// Decrypt if needed.
if fs.prf != nil {
// We can be setup for encryption but if this is a snapshot restore we will be missing the keyfile
// since snapshots strip encryption.
if err := fs.recoverAEK(); err == nil {
// The nonce is stored in front of the sealed payload.
ns := fs.aek.NonceSize()
buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:], nil)
if err != nil {
// NOTE(review): unlike the other failure paths the state file is not removed here.
return err
}
}
}
if buf[0] != fullStateMagic || buf[1] != fullStateVersion {
os.Remove(fn)
return errCorruptState
}
// bi is our read offset into buf. It is set to -1 once any decode fails, which
// makes all subsequent reads no-ops so the error can be checked once further down.
bi := hdrLen
// readU64 decodes the next unsigned varint at bi, returning 0 on failure.
readU64 := func() uint64 {
if bi < 0 {
return 0
}
v, n := binary.Uvarint(buf[bi:])
if n <= 0 {
bi = -1
return 0
}
bi += n
return v
}
// readI64 decodes the next signed varint at bi. A failed decode returns -1,
// while a read after an earlier failure returns 0.
readI64 := func() int64 {
if bi < 0 {
return 0
}
v, n := binary.Varint(buf[bi:])
if n <= 0 {
bi = -1
return -1
}
bi += n
return v
}
// setTime converts a unix nano timestamp into UTC time, mapping 0 to the zero time.
setTime := func(t *time.Time, ts int64) {
if ts == 0 {
*t = time.Time{}
} else {
*t = time.Unix(0, ts).UTC()
}
}
var state StreamState
state.Msgs = readU64()
state.Bytes = readU64()
state.FirstSeq = readU64()
// The first timestamp doubles as the base that block timestamps are offset from below.
baseTime := readI64()
setTime(&state.FirstTime, baseTime)
state.LastSeq = readU64()
setTime(&state.LastTime, readI64())
// Check for per subject info.
if numSubjects := int(readU64()); numSubjects > 0 {
fs.psim = make(map[string]*psi, numSubjects)
for i := 0; i < numSubjects; i++ {
if lsubj := int(readU64()); lsubj > 0 {
// Make sure the subject is fully contained in the buffer.
if bi+lsubj > len(buf) {
os.Remove(fn)
return errCorruptState
}
subj := fs.subjString(buf[bi : bi+lsubj])
bi += lsubj
psi := &psi{total: readU64(), fblk: uint32(readU64())}
// The last block is only encoded when there is more than one message for the subject.
if psi.total > 1 {
psi.lblk = uint32(readU64())
} else {
psi.lblk = psi.fblk
}
fs.psim[subj] = psi
}
}
}
if numBlocks := readU64(); numBlocks > 0 {
fs.blks = make([]*msgBlock, 0, numBlocks)
for i := 0; i < int(numBlocks); i++ {
index, nbytes, fseq, fts, lseq, lts, numDeleted := uint32(readU64()), readU64(), readU64(), readI64(), readU64(), readI64(), readU64()
// Stop on any decode error, it is reported after the loop.
if bi < 0 {
break
}
mb := fs.initMsgBlock(index)
// Start with the full sequence range as the message count, deleted are subtracted below.
mb.first.seq, mb.last.seq, mb.msgs, mb.bytes = fseq, lseq, lseq-fseq+1, nbytes
// Block timestamps are stored relative to baseTime.
mb.first.ts, mb.last.ts = fts+baseTime, lts+baseTime
if numDeleted > 0 {
dmap, n, err := avl.Decode(buf[bi:])
if err != nil {
os.Remove(fn)
return errCorruptState
}
mb.dmap = *dmap
mb.msgs -= numDeleted
bi += n
}
fs.addMsgBlock(mb)
}
}
// Pull in last block index for the block that had last checksum when we wrote the full state.
blkIndex := uint32(readU64())
var lchk [8]byte
if bi+len(lchk) > len(buf) {
bi = -1
} else {
copy(lchk[0:], buf[bi:bi+len(lchk)])
}
// Check if we had any errors.
if bi < 0 {
os.Remove(fn)
return errCorruptState
}
// Move into place our state, msgBlks and subject info.
fs.state = state
// First let's check the happy path, open the blk file that was the lmb when we created the full state.
// See if we have the last block available.
var matched bool
var mb *msgBlock
if mb = fs.bim[blkIndex]; mb != nil {
if _, err := os.Stat(mb.mfn); err != nil && os.IsNotExist(err) {
// If our saved state is past what we see on disk, fallback and rebuild.
if ld, _, _ := mb.rebuildState(); ld != nil {
fs.addLostData(ld)
}
return errPriorState
}
if matched = bytes.Equal(mb.lastChecksum(), lchk[:]); !matched {
// Remove the last message block since we will re-process below.
fs.removeMsgBlockFromList(mb)
}
}
// We may need to check other blocks. Even if we matched last checksum we will see if there is another block.
// If we did not match we re-process the last block.
start := blkIndex
if matched {
start++
}
// Note that bi in this loop is a block index and shadows the buffer offset used above.
// The loop only exits through a return, normally when the next block file does not exist.
for bi := start; ; bi++ {
nmb, err := fs.recoverMsgBlock(bi)
if err != nil {
if os.IsNotExist(err) {
return nil
}
os.Remove(fn)
return err
}
if nmb != nil {
// Check if we have to account for a partial message block.
if !matched && mb != nil && mb.index == nmb.index {
if err := fs.adjustAccounting(mb, nmb); err != nil {
return err
}
}
// Update top level accounting.
if fs.state.FirstSeq == 0 || nmb.first.seq < fs.state.FirstSeq {
fs.state.FirstSeq = nmb.first.seq
fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC()
}
if nmb.last.seq > fs.state.LastSeq {
fs.state.LastSeq = nmb.last.seq
fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC()
}
fs.state.Msgs += nmb.msgs
fs.state.Bytes += nmb.bytes
}
}
}
// adjustAccounting reconciles the top level totals when the stream state only
// partially covered a message block, e.g. more records were appended to the
// block after the stream state was written.
// mb is the block as described by the stream state, nmb is the same block as
// recovered from disk. Every message the stream state had already counted for
// this block is backed out of fs.state and psim.
// Lock should be held.
func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) error {
	nmb.mu.Lock()
	defer nmb.mu.Unlock()

	// Lookups below go through the cache of the recovered block.
	if nmb.cacheNotLoaded() {
		nmb.loadMsgsWithLock()
	}
	nmb.ensurePerSubjectInfoLoaded()

	// discount backs a single stored message out of the fs totals and psim.
	discount := func(seq uint64) error {
		var smv StoreMsg
		sm, err := nmb.cacheLookup(seq, &smv)
		if err != nil {
			return err
		}
		fs.state.Msgs--
		fs.state.Bytes -= fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
		if fs.psim != nil && len(sm.subj) > 0 {
			fs.removePerSubject(sm.subj)
		}
		return nil
	}

	// Everything in the original block's range that was not already marked
	// deleted was part of the stream state. Deleted ones stay deleted.
	for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
		if !mb.dmap.Exists(seq) {
			if err := discount(seq); err != nil {
				return err
			}
		}
	}

	// Nothing more to do unless the stream state had a higher first sequence
	// than what we recovered from disk.
	if nmb.first.seq >= mb.first.seq {
		return nil
	}
	for seq := nmb.first.seq; seq < mb.first.seq; seq++ {
		if err := discount(seq); err != nil {
			return err
		}
	}
	// The recovered block now starts where the stream state said it did.
	nmb.first = mb.first
	return nil
}
// lastChecksum returns the trailing checksum bytes of the block file on disk.
// It refreshes mb.rbytes from the file size and, for encrypted blocks, loads
// and decrypts the block to get at the plaintext checksum.
// Returns nil if the file can not be opened, is smaller than a checksum, or
// the encryption keys can not be loaded or generated.
func (mb *msgBlock) lastChecksum() []byte {
	fd, err := os.Open(mb.mfn)
	if err != nil {
		return nil
	}
	defer fd.Close()

	if info, _ := fd.Stat(); info != nil {
		mb.rbytes = uint64(info.Size())
	}
	if mb.rbytes < checksumSize {
		return nil
	}

	// We do not load keys on startup anymore so we might need to load them here.
	needKeys := mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil)
	if needKeys && mb.fs.loadEncryptionForMsgBlock(mb) != nil {
		return nil
	}

	var lchk [8]byte

	// Not encrypted, read the tail of the file directly.
	if mb.bek == nil {
		fd.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
		return lchk[:]
	}

	// Encrypted, so we need the whole block to decrypt it.
	buf, _ := mb.loadBlock(nil)
	if len(buf) < checksumSize {
		return lchk[:]
	}
	// Regenerate the block key before decrypting.
	bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
	if err != nil {
		return nil
	}
	mb.bek = bek
	mb.bek.XORKeyStream(buf, buf)
	copy(lchk[0:], buf[len(buf)-checksumSize:])
	return lchk[:]
}
// recoverMsgs rebuilds the store state by scanning the msgs directory and
// recovering every block file found there, in ascending index order.
// It takes the fs lock, accumulates first/last sequence and message/byte totals
// into fs.state, sets fs.lmb (creating a new write block if none were recovered),
// drops empty blocks when lost data was recorded, and removes orphaned key files.
// Returns errNotReadable if the msgs directory can not be opened or listed.
func (fs *fileStore) recoverMsgs() error {
fs.mu.Lock()
defer fs.mu.Unlock()
// Check for any left over purged messages.
// We receive from dios before touching disk and send back on every exit path below.
<-dios
pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
if _, err := os.Stat(pdir); err == nil {
os.RemoveAll(pdir)
}
mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
f, err := os.Open(mdir)
if err != nil {
dios <- struct{}{}
return errNotReadable
}
dirs, err := f.ReadDir(-1)
f.Close()
dios <- struct{}{}
if err != nil {
return errNotReadable
}
// Collect the indices of all entries whose name parses as a block file.
indices := make(sort.IntSlice, 0, len(dirs))
var index int
for _, fi := range dirs {
if n, err := fmt.Sscanf(fi.Name(), blkScan, &index); err == nil && n == 1 {
indices = append(indices, index)
}
}
indices.Sort()
// Recover all of the msg blocks.
// We now guarantee they are coming in order.
for _, index := range indices {
if mb, err := fs.recoverMsgBlock(uint32(index)); err == nil && mb != nil {
// This is a truncate block with possibly no index. If the OS got shutdown
// out from underneath of us this is possible.
if mb.first.seq == 0 {
mb.dirtyCloseWithRemove(true)
fs.removeMsgBlockFromList(mb)
continue
}
if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq {
fs.state.FirstSeq = mb.first.seq
if mb.first.ts == 0 {
fs.state.FirstTime = time.Time{}
} else {
fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
}
}
if mb.last.seq > fs.state.LastSeq {
fs.state.LastSeq = mb.last.seq
fs.state.LastTime = time.Unix(0, mb.last.ts).UTC()
}
fs.state.Msgs += mb.msgs
fs.state.Bytes += mb.bytes
} else {
// NOTE(review): this branch is also taken when recoverMsgBlock returns a nil block
// with a nil error, in which case we return nil here without processing the
// remaining indices or the steps below. Confirm that is intended.
return err
}
}
if len(fs.blks) > 0 {
fs.lmb = fs.blks[len(fs.blks)-1]
} else {
// This assigns the function level err, which is checked after the lost data cleanup below.
_, err = fs.newMsgBlockForWrite()
}
// Check if we encountered any lost data.
if fs.ld != nil {
// Collect first and remove afterwards so we do not remove while ranging over fs.blks.
var emptyBlks []*msgBlock
for _, mb := range fs.blks {
if mb.msgs == 0 && mb.rbytes == 0 {
emptyBlks = append(emptyBlks, mb)
}
}
for _, mb := range emptyBlks {
fs.removeMsgBlock(mb)
}
}
if err != nil {
return err
}
// Check for keyfiles orphans.
if kms, err := filepath.Glob(filepath.Join(mdir, keyScanAll)); err == nil && len(kms) > 0 {
valid := make(map[uint32]bool)
for _, mb := range fs.blks {
valid[mb.index] = true
}
for _, fn := range kms {
var index uint32
// A key file is kept only if its name parses and it belongs to a known block.
shouldRemove := true
if n, err := fmt.Sscanf(filepath.Base(fn), keyScan, &index); err == nil && n == 1 && valid[index] {
shouldRemove = false
}
if shouldRemove {
os.Remove(fn)
}
}
}
return nil
}
// expireMsgsOnRecover will expire msgs that have aged out on restart.
// We will treat this differently in case we have a recovery
// that will expire a lot of messages on startup.
// Whole blocks that are entirely past the age limit are removed without
// touching individual messages, the first block that straddles the limit is
// processed message by message, and processing stops there.
// Should only be called on startup.
func (fs *fileStore) expireMsgsOnRecover() {
	if fs.state.Msgs == 0 {
		return
	}

	// Anything with a timestamp at or before minAge has expired.
	var minAge = time.Now().UnixNano() - int64(fs.cfg.MaxAge)
	var purged, bytes uint64
	var deleted int
	var nts int64

	// If we expire all make sure to write out a tombstone. Need to be done by hand here,
	// usually taken care of by fs.removeMsgBlock() but we do not call that here.
	var last msgId

	// deleteEmptyBlock removes the block from disk and from the global per subject
	// accounting. The blks slice and bim are fixed up afterwards using the deleted count.
	deleteEmptyBlock := func(mb *msgBlock) {
		// If we are the last keep state to remember first/last sequence.
		// Do this part by hand since not deleting one by one.
		if mb == fs.lmb {
			last = mb.last
		}
		// Make sure we do subject cleanup as well.
		// removePerSubject accounts for a single message, so it needs to be
		// called once for every message the block still holds for the subject.
		mb.ensurePerSubjectInfoLoaded()
		for subj, ss := range mb.fss {
			for i := uint64(0); i < ss.Msgs; i++ {
				fs.removePerSubject(subj)
			}
		}
		mb.dirtyCloseWithRemove(true)
		deleted++
	}

	for _, mb := range fs.blks {
		mb.mu.Lock()
		// Nothing in this block, or any after it, has expired yet.
		if minAge < mb.first.ts {
			nts = mb.first.ts
			mb.mu.Unlock()
			break
		}
		// Can we remove whole block here?
		if mb.last.ts <= minAge {
			purged += mb.msgs
			bytes += mb.bytes
			deleteEmptyBlock(mb)
			mb.mu.Unlock()
			continue
		}

		// If we are here we have to process the interior messages of this blk.
		if err := mb.loadMsgsWithLock(); err != nil {
			mb.mu.Unlock()
			break
		}

		var smv StoreMsg
		var needNextFirst bool

		// Walk messages and remove if expired.
		mb.ensurePerSubjectInfoLoaded()
		for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
			sm, err := mb.cacheLookup(seq, &smv)
			// Process interior deleted msgs.
			if err == errDeletedMsg {
				// Update dmap.
				if mb.dmap.Exists(seq) {
					mb.dmap.Delete(seq)
				}
				// Keep this updated just in case since we are removing dmap entries.
				mb.first.seq, needNextFirst = seq, true
				continue
			}
			// Break on other errors.
			if err != nil || sm == nil {
				mb.first.seq, needNextFirst = seq, true
				break
			}

			// No error and sm != nil from here onward.

			// Check for done. This message becomes the new first for the block.
			if minAge < sm.ts {
				mb.first.seq, needNextFirst = sm.seq, false
				mb.first.ts = sm.ts
				nts = sm.ts
				break
			}

			// Delete the message here.
			if mb.msgs > 0 {
				mb.first.seq, needNextFirst = seq, true
				sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
				// Never take more than the block has accounted for.
				if sz > mb.bytes {
					sz = mb.bytes
				}
				mb.bytes -= sz
				bytes += sz
				mb.msgs--
				purged++
			}
			// Update fss, which was loaded before walking the messages.
			mb.removeSeqPerSubject(sm.subj, seq)
			fs.removePerSubject(sm.subj)
		}
		// Make sure we have a proper next first sequence.
		if needNextFirst {
			mb.selectNextFirst()
		}
		// Check if empty after processing, could happen if tail of messages are all deleted.
		if mb.msgs == 0 {
			deleteEmptyBlock(mb)
		}
		mb.mu.Unlock()
		break
	}

	if nts > 0 {
		// Make sure to set age check based on this value.
		fs.resetAgeChk(nts - minAge)
	}

	// Deleted blocks are always a prefix of fs.blks since we stop at the first block we keep.
	if deleted > 0 {
		// Update block map.
		if fs.bim != nil {
			for _, mb := range fs.blks[:deleted] {
				delete(fs.bim, mb.index)
			}
		}
		// Update blks slice.
		fs.blks = copyMsgBlocks(fs.blks[deleted:])
		if lb := len(fs.blks); lb == 0 {
			fs.lmb = nil
		} else {
			fs.lmb = fs.blks[lb-1]
		}
	}

	// Update top level accounting, clamping at zero.
	if purged < fs.state.Msgs {
		fs.state.Msgs -= purged
	} else {
		fs.state.Msgs = 0
	}
	if bytes < fs.state.Bytes {
		fs.state.Bytes -= bytes
	} else {
		fs.state.Bytes = 0
	}

	// Make sure we properly set the fs first sequence and timestamp.
	fs.selectNextFirst()

	// Check if we have no messages and blocks left.
	if fs.lmb == nil && last.seq != 0 {
		if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil {
			lmb.writeTombstone(last.seq, last.ts)
		}
		// Clear any global subject state.
		fs.psim = make(map[string]*psi)
	}
}
// copyMsgBlocks returns a shallow copy of src backed by its own array.
// A nil src yields nil, an empty non-nil src yields an empty non-nil slice.
func copyMsgBlocks(src []*msgBlock) []*msgBlock {
	var dst []*msgBlock
	if src != nil {
		dst = make([]*msgBlock, len(src))
		copy(dst, src)
	}
	return dst
}
// GetSeqFromTime looks for the first sequence number that has
// the message with >= timestamp.
// Returns 0 if the store is closed or no message in the selected block
// qualifies, and last sequence + 1 if no block could be selected for t.
// FIXME(dlc) - inefficient, and dumb really. Make this better.
func (fs *fileStore) GetSeqFromTime(t time.Time) uint64 {
	fs.mu.RLock()
	closed, lastSeq := fs.closed, fs.state.LastSeq
	fs.mu.RUnlock()

	if closed {
		return 0
	}

	mb := fs.selectMsgBlockForStart(t)
	if mb == nil {
		return lastSeq + 1
	}

	// Snapshot the range to walk under the block lock.
	mb.mu.RLock()
	seq, end := mb.first.seq, mb.last.seq
	mb.mu.RUnlock()

	// Linear search, hence the dumb part..
	var smv StoreMsg
	for want := t.UnixNano(); seq <= end; seq++ {
		if sm, _, _ := mb.fetchMsg(seq, &smv); sm != nil && sm.ts >= want {
			return sm.seq
		}
	}
	return 0
}
// Find the first matching message.
// firstMatching returns the first message in this block at or after start whose subject
// matches filter. wc indicates that filter contains wildcards. sm, if non-nil, is used
// to hold the result. The returned bool is set when the match is the last message of the
// block and mb.llseq points at it as well.
// Returns ErrStoreMsgNotFound when nothing matches, or the error from loading the block.
func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
mb.mu.Lock()
defer mb.mu.Unlock()
fseq, isAll, subs := start, filter == _EMPTY_ || filter == fwcs, []string{filter}
// NOTE(review): per subject info is only ensured here when the cache was not loaded.
if mb.cacheNotLoaded() {
if err := mb.loadMsgsWithLock(); err != nil {
return nil, false, err
}
if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
return nil, false, err
}
}
// If we only have 1 subject currently and it matches our filter we can also set isAll.
if !isAll && len(mb.fss) == 1 {
_, isAll = mb.fss[filter]
}
// Skip scan of mb.fss if number of messages in the block are less than
// 1/2 the number of subjects in mb.fss. Or we have a wc and lots of fss entries.
const linearScanMaxFSS = 32
doLinearScan := isAll || 2*int(mb.last.seq-start) < len(mb.fss) || (wc && len(mb.fss) > linearScanMaxFSS)
if !doLinearScan {
// If we have a wildcard match against all tracked subjects we know about.
if wc {
subs = subs[:0]
for subj := range mb.fss {
if subjectIsSubsetMatch(subj, filter) {
subs = append(subs, subj)
}
}
}
// Use the per subject first sequences to find the lowest sequence we need to start from.
// Starting past the end means no candidate was found.
fseq = mb.last.seq + 1
for _, subj := range subs {
ss := mb.fss[subj]
if ss != nil && ss.firstNeedsUpdate {
mb.recalculateFirstForSubj(subj, ss.First, ss)
}
if ss == nil || start > ss.Last || ss.First >= fseq {
continue
}
if ss.First < start {
fseq = start
} else {
fseq = ss.First
}
}
}
if fseq > mb.last.seq {
return nil, false, ErrStoreMsgNotFound
}
if sm == nil {
sm = new(StoreMsg)
}
for seq := fseq; seq <= mb.last.seq; seq++ {
// Remember llseq before the lookup so we can restore it if this message does not match.
llseq := mb.llseq
fsm, err := mb.cacheLookup(seq, sm)
if err != nil {
continue
}
expireOk := seq == mb.last.seq && mb.llseq == seq
if isAll {
return fsm, expireOk, nil
}
if doLinearScan {
if wc && subjectIsSubsetMatch(fsm.subj, filter) {
return fsm, expireOk, nil
} else if !wc && fsm.subj == filter {
return fsm, expireOk, nil
}
} else {
for _, subj := range subs {
if fsm.subj == subj {
return fsm, expireOk, nil
}
}
}
// If we are here we did not match, so put the llseq back.
mb.llseq = llseq
}
return nil, false, ErrStoreMsgNotFound
}
// filteredPending acquires the block lock and then generates the filtered
// pending for this message block via filteredPendingLocked.
func (mb *msgBlock) filteredPending(subj string, wc bool, seq uint64) (total, first, last uint64) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	total, first, last = mb.filteredPendingLocked(subj, wc, seq)
	return total, first, last
}
// filteredPendingLocked traverses a message block and generates the filtered pending,
// i.e. the number of messages matching filter at or after sseq along with the
// first and last matching sequences.
// It works from per subject metadata when possible and only falls back to
// scanning messages when sseq lands inside the range of a matching subject.
// Lock should be held.
func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (total, first, last uint64) {
	isAll := filter == _EMPTY_ || filter == fwcs

	// Fast path, we want everything and the starting sequence was before this block.
	if isAll && sseq <= mb.first.seq {
		return mb.msgs, mb.first.seq, mb.last.seq
	}

	// The metadata pass below needs fss.
	mb.ensurePerSubjectInfoLoaded()

	var tsa, fsa [32]string
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)

	// matches reports whether subj is selected by the filter.
	matches := func(subj string) bool {
		switch {
		case isAll:
			return true
		case !wc:
			return subj == filter
		}
		return isSubsetMatchTokenized(tokenizeSubjectIntoSlice(tsa[:0], subj), fts)
	}

	// 1. See if we match any subs from fss.
	// 2. If sseq is at or before the subject's first we can use meta only.
	// 3. If sseq falls inside the subject's range it is a partial, so stop and do a full scan.
	needScan := false
	for subj, ss := range mb.fss {
		if !matches(subj) {
			continue
		}
		if ss.firstNeedsUpdate {
			mb.recalculateFirstForSubj(subj, ss.First, ss)
		}
		if sseq <= ss.First {
			total += ss.Msgs
			if first == 0 || ss.First < first {
				first = ss.First
			}
			if ss.Last > last {
				last = ss.Last
			}
			continue
		}
		if sseq <= ss.Last {
			needScan = true
			break
		}
	}
	if !needScan {
		return total, first, last
	}

	// We hit a partial, throw away the metadata totals and scan the msgs.
	total, first, last = 0, 0, 0

	// If we load the cache just for this scan we want to expire it again on exit.
	loadedHere := false
	if mb.cacheNotLoaded() {
		mb.loadMsgsWithLock()
		loadedHere = true
	}

	var smv StoreMsg
	for seq := sseq; seq <= mb.last.seq; seq++ {
		sm, _ := mb.cacheLookup(seq, &smv)
		if sm == nil || !matches(sm.subj) {
			continue
		}
		total++
		if first == 0 || seq < first {
			first = seq
		}
		if seq > last {
			last = seq
		}
	}

	if loadedHere {
		mb.tryForceExpireCacheLocked()
	}
	return total, first, last
}
// FilteredState will return the SimpleState associated with the filtered subject and a proposed starting sequence.
// A starting sequence below the first sequence is moved up to it, and one past
// the last sequence yields an empty state.
func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	// Returned state.
	var ss SimpleState

	firstSeq, lastSeq := fs.state.FirstSeq, fs.state.LastSeq
	if sseq < firstSeq {
		sseq = firstSeq
	}
	// If past the end no results.
	if sseq > lastSeq {
		return ss
	}

	// If we want all msgs that match we can shortcircuit.
	// TODO(dlc) - This can be extended for all cases but would
	// need to be careful on total msgs calculations etc.
	if sseq == firstSeq {
		fs.numFilteredPending(subj, &ss)
		return ss
	}

	// Otherwise aggregate block by block.
	// TODO(dlc) - Optimize for 2.10 with avl tree and no atomics per block.
	wc := subjectHasWildcard(subj)
	for _, mb := range fs.blks {
		// Skip blocks that end before our starting sequence.
		if atomic.LoadUint64(&mb.last.seq) < sseq {
			continue
		}
		t, f, l := mb.filteredPending(subj, wc, sseq)
		ss.Msgs += t
		if ss.First == 0 || (f > 0 && f < ss.First) {
			ss.First = f
		}
		if l > ss.Last {
			ss.Last = l
		}
	}
	return ss
}
// numFilteredPending is an optimized way for getting all num pending matching a filter subject.
// Results are written into ss. Msgs is accumulated from the global per subject
// info, while First and Last are resolved from the first and last blocks that
// psim says hold a matching subject.
// Lock should be held.
func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) {
	// For everything we do not need to do anything special to calculate the first, last and total.
	if filter == _EMPTY_ || filter == fwcs {
		ss.First = fs.state.FirstSeq
		ss.Last = fs.state.LastSeq
		ss.Msgs = fs.state.Msgs
		return
	}

	var tsa, fsa [32]string
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)

	// Track the lowest first block and highest last block across all matching subjects.
	start, stop := uint32(math.MaxUint32), uint32(0)
	var matched bool
	for subj, psi := range fs.psim {
		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
		if !isSubsetMatchTokenized(tts, fts) {
			continue
		}
		matched = true
		ss.Msgs += psi.total
		// Keep track of start and stop indexes for this subject.
		if psi.fblk < start {
			start = psi.fblk
		}
		if psi.lblk > stop {
			stop = psi.lblk
		}
	}

	// Nothing matched, so there is no first or last to resolve. This also keeps
	// start+1 below from wrapping around when start is still at its initial max value.
	if !matched {
		return
	}

	wc := subjectHasWildcard(filter)

	// Do start.
	if mb := fs.bim[start]; mb != nil {
		_, f, _ := mb.filteredPending(filter, wc, 0)
		ss.First = f
	}
	if ss.First == 0 {
		// This is a miss. This can happen since psi.fblk is lazy, but should be very rare.
		for i := start + 1; i <= stop; i++ {
			mb := fs.bim[i]
			if mb == nil {
				continue
			}
			if _, f, _ := mb.filteredPending(filter, wc, 0); f > 0 {
				ss.First = f
				break
			}
		}
	}

	// Now last.
	if mb := fs.bim[stop]; mb != nil {
		_, _, l := mb.filteredPending(filter, wc, 0)
		ss.Last = l
	}
}
// SubjectsState returns a map of SimpleState for all matching subjects.
// An empty subject or the full wildcard matches every subject.
// Returns nil if the store has no messages, or if subject is a literal that
// is not present in the global per subject info.
func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.state.Msgs == 0 || len(fs.blks) == 0 {
		return nil
	}

	isAll := subject == _EMPTY_ || subject == fwcs

	// By default walk every block.
	start, stop := fs.blks[0], fs.lmb
	// We can short circuit if not a wildcard using psim for start and stop.
	// An empty subject means all, so it must not be looked up as a literal.
	if !isAll && !subjectHasWildcard(subject) {
		info := fs.psim[subject]
		if info == nil {
			return nil
		}
		// The block indexes in psim are lazy and may refer to a block that is
		// gone. Only narrow the range when the block is still known, otherwise
		// we would never find the start and return nothing.
		if mb := fs.bim[info.fblk]; mb != nil {
			start = mb
		}
		if mb := fs.bim[info.lblk]; mb != nil {
			stop = mb
		}
	}

	// Aggregate fss.
	fss := make(map[string]SimpleState)
	var startFound bool
	for _, mb := range fs.blks {
		// Skip ahead until we reach the starting block.
		if !startFound {
			if mb != start {
				continue
			}
			startFound = true
		}

		mb.mu.Lock()
		// Make sure we have fss loaded.
		mb.ensurePerSubjectInfoLoaded()
		for subj, ss := range mb.fss {
			if !isAll && !subjectIsSubsetMatch(subj, subject) {
				continue
			}
			if ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			oss := fss[subj]
			if oss.First == 0 { // New
				fss[subj] = *ss
			} else {
				// Merge here. Blocks are walked in order so this block has the newer last.
				oss.Last, oss.Msgs = ss.Last, oss.Msgs+ss.Msgs
				fss[subj] = oss
			}
		}
		mb.mu.Unlock()

		if mb == stop {
			break
		}
	}
	return fss
}
// NumPending will return the number of pending messages matching the filter subject starting at sequence.
// Optimized for stream num pending calculations for consumers.
// When lastPerSubject is set each matching subject is counted at most once.
// validThrough is the last sequence of the store at the time of the call, except
// when a block fails to load in the psim based path, where 0, 0 is returned.
func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) (total, validThrough uint64) {
fs.mu.RLock()
defer fs.mu.RUnlock()
// This can always be last for these purposes.
validThrough = fs.state.LastSeq
if fs.state.Msgs == 0 || sseq > fs.state.LastSeq {
return 0, validThrough
}
// Track starting for both block for the sseq and staring block that matches any subject.
var seqStart, subjStart int
// See if we need to figure out starting block per sseq.
if sseq > fs.state.FirstSeq {
seqStart, _ = fs.selectMsgBlockWithIndex(sseq)
}
var tsa, fsa [32]string
fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
isAll := filter == _EMPTY_ || filter == fwcs
wc := subjectHasWildcard(filter)
// See if filter was provided but its the only subject.
if !isAll && !wc && len(fs.psim) == 1 && fs.psim[filter] != nil {
isAll = true
}
// If we are isAll and have no deleted we can do a simpler calculation.
if isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs {
if sseq == 0 {
return fs.state.Msgs, validThrough
}
return fs.state.LastSeq - sseq + 1, validThrough
}
// isMatch reports whether subj is selected by the filter.
isMatch := func(subj string) bool {
if isAll {
return true
}
if !wc {
return subj == filter
}
tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
return isSubsetMatchTokenized(tts, fts)
}
// If we would need to scan more from the beginning, revert back to calculating directly here.
// TODO(dlc) - Redo properly with sublists etc for subject-based filtering.
if lastPerSubject || seqStart >= (len(fs.blks)/2) {
// If we need to track seen for last per subject.
// When lastPerSubject is not set seen stays nil, reads from it below simply return false.
var seen map[string]bool
if lastPerSubject {
seen = make(map[string]bool)
}
for i := seqStart; i < len(fs.blks); i++ {
mb := fs.blks[i]
mb.mu.Lock()
// t is the count for this block, it is reset if we have to fall back to a scan.
var t uint64
if isAll && sseq <= mb.first.seq {
if lastPerSubject {
mb.ensurePerSubjectInfoLoaded()
for subj := range mb.fss {
if !seen[subj] {
total++
seen[subj] = true
}
}
} else {
total += mb.msgs
}
mb.mu.Unlock()
continue
}
// If we are here we need to at least scan the subject fss.
// Make sure we have fss loaded.
mb.ensurePerSubjectInfoLoaded()
var havePartial bool
for subj, ss := range mb.fss {
if !seen[subj] && isMatch(subj) {
if lastPerSubject {
// Can't have a partials with last by subject.
if sseq <= ss.Last {
t++
seen[subj] = true
}
} else {
if ss.firstNeedsUpdate {
mb.recalculateFirstForSubj(subj, ss.First, ss)
}
if sseq <= ss.First {
t += ss.Msgs
} else if sseq <= ss.Last {
// We matched but its a partial.
havePartial = true
break
}
}
}
}
// See if we need to scan msgs here.
if havePartial {
// Clear on partial.
t = 0
// If we load the cache for a linear scan we want to expire that cache upon exit.
var shouldExpire bool
if mb.cacheNotLoaded() {
mb.loadMsgsWithLock()
shouldExpire = true
}
var smv StoreMsg
for seq := sseq; seq <= mb.last.seq; seq++ {
if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && (isAll || isMatch(sm.subj)) {
t++
}
}
// If we loaded this block for this operation go ahead and expire it here.
if shouldExpire {
mb.tryForceExpireCacheLocked()
}
}
mb.mu.Unlock()
total += t
}
return total, validThrough
}
// If we are here its better to calculate totals from psim and adjust downward by scanning less blocks.
// TODO(dlc) - Eventually when sublist uses generics, make this sublist driven instead.
start := uint32(math.MaxUint32)
for subj, psi := range fs.psim {
if isMatch(subj) {
if lastPerSubject {
total++
// Keep track of start index for this subject.
// Use last block in this case.
if psi.lblk < start {
start = psi.lblk
}
} else {
total += psi.total
// Keep track of start index for this subject.
if psi.fblk < start {
start = psi.fblk
}
}
}
}
// See if we were asked for all, if so we are done.
if sseq <= fs.state.FirstSeq {
return total, validThrough
}
// If we are here we need to calculate partials for the first blocks.
subjStart = int(start)
firstSubjBlk := fs.bim[uint32(subjStart)]
var firstSubjBlkFound bool
var smv StoreMsg
// Adjust in case not found.
// Marking it as found means no blocks get skipped in the loop below.
if firstSubjBlk == nil {
firstSubjBlkFound = true
}
// Track how many we need to adjust against the total.
var adjust uint64
for i := 0; i <= seqStart; i++ {
mb := fs.blks[i]
// We can skip blks if we know they are below the first one that has any subject matches.
if !firstSubjBlkFound {
if mb == firstSubjBlk {
firstSubjBlkFound = true
} else {
continue
}
}
// We need to scan this block.
var shouldExpire bool
mb.mu.Lock()
// Check if we should include all of this block in adjusting. If so work with metadata.
if sseq > mb.last.seq {
if isAll && !lastPerSubject {
adjust += mb.msgs
} else {
// We need to adjust for all matches in this block.
// We will scan fss state vs messages themselves.
// Make sure we have fss loaded.
mb.ensurePerSubjectInfoLoaded()
for subj, ss := range mb.fss {
if isMatch(subj) {
if lastPerSubject {
adjust++
} else {
adjust += ss.Msgs
}
}
}
}
} else {
// This is the last block. We need to scan per message here.
if mb.cacheNotLoaded() {
if err := mb.loadMsgsWithLock(); err != nil {
// Unlike the other exits, this one also reports validThrough as 0.
mb.mu.Unlock()
return 0, 0
}
shouldExpire = true
}
// Only sequences strictly below sseq need to be backed out of the total.
var last = mb.last.seq
if sseq < last {
last = sseq
}
for seq := mb.first.seq; seq < last; seq++ {
sm, _ := mb.cacheLookup(seq, &smv)
if sm == nil {
continue
}
// Check if it matches our filter.
if isMatch(sm.subj) && sm.seq < sseq {
adjust++
}
}
}
// If we loaded the block try to force expire.
if shouldExpire {
mb.tryForceExpireCacheLocked()
}
mb.mu.Unlock()
}
// Make final adjustment.
// NOTE(review): unsigned subtraction, this assumes adjust never exceeds total.
total -= adjust
return total, validThrough
}
// SubjectsTotals returns message totals per subject for all subjects matching filter.
// An empty filter or the full wildcard matches every subject.
// Returns nil if there is no per subject info, otherwise a map that may be empty.
func (fs *fileStore) SubjectsTotals(filter string) map[string]uint64 {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if len(fs.psim) == 0 {
		return nil
	}

	fst := make(map[string]uint64)

	// Everything matches, no need to inspect the subjects.
	if filter == _EMPTY_ || filter == fwcs {
		for subj, psi := range fs.psim {
			fst[subj] = psi.total
		}
		return fst
	}

	// A literal filter can only match itself, so a direct lookup is enough.
	if !subjectHasWildcard(filter) {
		if psi := fs.psim[filter]; psi != nil {
			fst[filter] = psi.total
		}
		return fst
	}

	// Wildcard filter, match against all the subjects we know about.
	var tsa, fsa [32]string
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
	for subj, psi := range fs.psim {
		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
		if isSubsetMatchTokenized(tts, fts) {
			fst[subj] = psi.total
		}
	}
	return fst
}
// RegisterStorageUpdates registers a callback for updates to storage changes.
// It will present number of messages and bytes as a signed integer and an
// optional sequence number of the message if a single.
// If the store already holds bytes, the callback is invoked once right away,
// outside of the lock, with the current byte total.
func (fs *fileStore) RegisterStorageUpdates(cb StorageUpdateHandler) {
	fs.mu.Lock()
	fs.scb = cb
	current := fs.state.Bytes
	fs.mu.Unlock()

	if cb == nil || current == 0 {
		return
	}
	cb(0, int64(current), 0, _EMPTY_)
}
// hashKeyForBlock builds the hash key for a specific message block, which is
// the stream name and the block index joined by a dash.
// Lock should be held
func (fs *fileStore) hashKeyForBlock(index uint32) []byte {
	key := fs.cfg.Name + "-" + fmt.Sprint(index)
	return []byte(key)
}
// setupWriteCache installs a write cache backed by buf if the block does not
// already have a cache. The cache offset is seeded with the current size of
// the block file (when it can be stat'ed) so writes line up with data that is
// already on disk. It also stamps the last load time and starts the cache
// expiration timer.
func (mb *msgBlock) setupWriteCache(buf []byte) {
	// Nothing to do if a cache is already in place.
	if mb.cache != nil {
		return
	}
	mb.cache = &cache{buf: buf}

	// Prefer the open descriptor, fall back to the file name.
	var info os.FileInfo
	switch {
	case mb.mfd != nil:
		info, _ = mb.mfd.Stat()
	case mb.mfn != _EMPTY_:
		info, _ = os.Stat(mb.mfn)
	}
	if info != nil {
		mb.cache.off = int(info.Size())
	}

	mb.llts = time.Now().UnixNano()
	mb.startCacheExpireTimer()
}
// newMsgBlockForWrite rolls the store over to a new append msg block.
//
// The new block gets the index following the current last block (or 1 when
// there is none). When fs.fip is set the previous last block has its file
// descriptors closed and its write cache buffer is reused if it could be
// expired. The block file is created on disk, encryption keys are generated
// when fs.prf is set, and the block is registered as the new last block.
//
// Returns the new block, or an error if the hash, the block file or the
// encryption keys could not be created. If the block file cannot be created or
// the encryption keys cannot be generated, the partially constructed block is
// torn down with dirtyCloseWithRemove(true) so no descriptor or file is leaked.
//
// Lock should be held.
func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) {
	index := uint32(1)
	var rbuf []byte

	if lmb := fs.lmb; lmb != nil {
		index = lmb.index + 1
		// Determine if we can reclaim any resources here.
		if fs.fip {
			lmb.mu.Lock()
			lmb.closeFDsLocked()
			if lmb.cache != nil {
				// Reset write timestamp and see if we can expire this cache.
				rbuf = lmb.tryExpireWriteCache()
			}
			lmb.mu.Unlock()
		}
	}

	mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects(), syncAlways: fs.fcfg.SyncAlways}

	// Lock should be held to quiet race detector.
	mb.mu.Lock()
	mb.setupWriteCache(rbuf)
	mb.fss = make(map[string]*SimpleState)
	// Set cache time to creation time to start.
	ts := time.Now().UnixNano()
	mb.llts, mb.lwts = 0, ts
	// Remember our last sequence number.
	// first > last marks the block as empty.
	mb.first.seq = fs.state.LastSeq + 1
	mb.last.seq = fs.state.LastSeq
	mb.mu.Unlock()

	// Now do local hash.
	key := sha256.Sum256(fs.hashKeyForBlock(index))
	hh, err := highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}
	mb.hh = hh

	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, mb.index))
	mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
	if err != nil {
		mb.dirtyCloseWithRemove(true)
		return nil, fmt.Errorf("Error creating msg block file [%q]: %v", mb.mfn, err)
	}
	mb.mfd = mfd

	// Check if encryption is enabled.
	if fs.prf != nil {
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			// The block was never added to fs.blks, so nobody else would
			// ever close the descriptor or clean up the file we just made.
			mb.dirtyCloseWithRemove(true)
			return nil, err
		}
	}

	// If we know we will need this so go ahead and spin up.
	if !fs.fip {
		mb.spinUpFlushLoop()
	}

	// Add to our list of blocks and mark as last.
	fs.addMsgBlock(mb)

	// Stream state has outstanding changes, get them flushed.
	if fs.dirty > 0 {
		fs.kickFlushStateLoop()
	}

	return mb, nil
}
// genEncryptionKeysForBlock generates the encryption keys for mb, records
// them on the block and writes the encrypted key material to the block's key
// file inside the message directory. A nil mb is a no-op.
func (fs *fileStore) genEncryptionKeysForBlock(mb *msgBlock) error {
	if mb == nil {
		return nil
	}

	context := fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)
	aek, bek, seed, encrypted, err := fs.genEncryptionKeys(context)
	if err != nil {
		return err
	}
	// The nonce is the leading NonceSize bytes of the encrypted blob.
	nonce := encrypted[:aek.NonceSize()]
	mb.aek, mb.bek, mb.seed, mb.nonce = aek, bek, seed, nonce

	dir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	kfn := filepath.Join(dir, fmt.Sprintf(keyScan, mb.index))

	// A missing key file is expected, any other stat failure is fatal.
	if _, statErr := os.Stat(kfn); statErr != nil && !os.IsNotExist(statErr) {
		return statErr
	}
	if writeErr := os.WriteFile(kfn, encrypted, defaultFilePerms); writeErr != nil {
		return writeErr
	}

	mb.kfn = kfn
	return nil
}
// storeRawMsg stores a raw message with the expected sequence number and timestamp.
//
// seq must either be fs.state.LastSeq+1 or 0; a zero seq is replaced with the
// next sequence, any other value yields ErrSequenceMismatch. ts is the message
// timestamp in nanoseconds since the Unix epoch.
//
// With the DiscardNew policy the message is rejected up front with
// ErrMaxMsgsPerSubject, ErrMaxMsgs or ErrMaxBytes when the matching limit would
// be exceeded. After the record is written the per subject totals and the
// stream state are updated and the per subject, message count and byte limits
// are enforced by removing older messages.
//
// Returns ErrStoreClosed if the store is closed, or any error from looking up
// the first sequence for the subject or from writing the record.
//
// Lock should be held.
func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) (err error) {
	if fs.closed {
		return ErrStoreClosed
	}
	// Per subject max check needed.
	// mmp is the configured per subject limit, psmc is a snapshot of how many
	// messages this subject holds before the write.
	mmp := uint64(fs.cfg.MaxMsgsPer)
	var psmc uint64
	psmax := mmp > 0 && len(subj) > 0
	if psmax {
		if info, ok := fs.psim[subj]; ok {
			psmc = info.total
		}
	}
	// First sequence for this subject, only resolved when we are at the per subject limit.
	var fseq uint64
	// Check if we are discarding new messages when we reach the limit.
	if fs.cfg.Discard == DiscardNew {
		// asl is set when the subject is at its per subject limit (presumably
		// "at subject limit"). In that case the oldest message for the subject
		// is removed further below, so the stream wide limits are relaxed.
		var asl bool
		if psmax && psmc >= mmp {
			// If we are instructed to discard new per subject, this is an error.
			if fs.cfg.DiscardNewPer {
				return ErrMaxMsgsPerSubject
			}
			if fseq, err = fs.firstSeqForSubj(subj); err != nil {
				return err
			}
			asl = true
		}
		if fs.cfg.MaxMsgs > 0 && fs.state.Msgs >= uint64(fs.cfg.MaxMsgs) && !asl {
			return ErrMaxMsgs
		}
		if fs.cfg.MaxBytes > 0 && fs.state.Bytes+uint64(len(msg)+len(hdr)) >= uint64(fs.cfg.MaxBytes) {
			// Even at the subject limit we reject if the message being replaced
			// is not larger than the new one.
			if !asl || fs.sizeForSeq(fseq) <= len(msg)+len(hdr) {
				return ErrMaxBytes
			}
		}
	}
	// Check sequence.
	if seq != fs.state.LastSeq+1 {
		if seq > 0 {
			return ErrSequenceMismatch
		}
		// A zero seq means the caller wants the next sequence assigned.
		seq = fs.state.LastSeq + 1
	}
	// Write msg record.
	n, err := fs.writeMsgRecord(seq, ts, subj, hdr, msg)
	if err != nil {
		return err
	}
	// Mark dirty here since we added in a new message.
	// We do not kick the flusher, that happens on new msg block for write or Stop().
	fs.dirty++
	// Adjust top level tracking of per subject msg counts.
	if len(subj) > 0 {
		index := fs.lmb.index
		if info, ok := fs.psim[subj]; ok {
			info.total++
			if index > info.lblk {
				info.lblk = index
			}
		} else {
			// First message for this subject, it lives only in the last block.
			fs.psim[subj] = &psi{total: 1, fblk: index, lblk: index}
		}
	}
	// Adjust first if needed.
	now := time.Unix(0, ts).UTC()
	if fs.state.Msgs == 0 {
		fs.state.FirstSeq = seq
		fs.state.FirstTime = now
	}
	fs.state.Msgs++
	fs.state.Bytes += n
	fs.state.LastSeq = seq
	fs.state.LastTime = now
	// Enforce per message limits.
	// We snapshotted psmc before our actual write, so >= comparison needed.
	if psmax && psmc >= mmp {
		// We may have done this above.
		if fseq == 0 {
			fseq, _ = fs.firstSeqForSubj(subj)
		}
		if ok, _ := fs.removeMsgViaLimits(fseq); ok {
			// Make sure we are below the limit.
			// If the subject was already over the limit before this write keep
			// removing its oldest message until the total is back within mmp.
			if psmc--; psmc >= mmp {
				for info, ok := fs.psim[subj]; ok && info.total > mmp; info, ok = fs.psim[subj] {
					if seq, _ := fs.firstSeqForSubj(subj); seq > 0 {
						if ok, _ := fs.removeMsgViaLimits(seq); !ok {
							break
						}
					} else {
						break
					}
				}
			}
		} else if mb := fs.selectMsgBlock(fseq); mb != nil {
			// If we are here we could not remove fseq from above, so rebuild.
			var ld *LostStreamData
			if ld, _, _ = mb.rebuildState(); ld != nil {
				fs.rebuildStateLocked(ld)
			}
		}
	}
	// Limits checks and enforcement.
	// If they do any deletions they will update the
	// byte count on their own, so no need to compensate.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()
	// Check if we have and need the age expiration timer running.
	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
		fs.startAgeChk()
	}
	return nil
}
// StoreRawMsg stores a raw message with expected sequence number and timestamp.
// On success the registered storage update callback, if any, is invoked after
// the fs lock has been released.
func (fs *fileStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) error {
	// Delay used for the first age check instead of expiring inline.
	const firstAgeCheckDelay = time.Millisecond * 50

	fs.mu.Lock()
	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
	notify := fs.scb
	// The very first message we see may need to expire sooner than the
	// initial replica expiry timer, which was set to MaxAge on initialization.
	if firstWithAge := !fs.receivedAny && fs.cfg.MaxAge != 0 && ts > 0; firstWithAge {
		fs.receivedAny = true
		// Do not block here by expiring directly, just arm a short timer.
		fs.resetAgeChk(int64(firstAgeCheckDelay))
	}
	fs.mu.Unlock()

	if err != nil || notify == nil {
		return err
	}
	notify(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
	return nil
}
// StoreMsg stores a message using the next sequence and the current time.
// We hold the main filestore lock for any write operation.
// Returns the assigned sequence and timestamp, or zeros and the error when
// the store failed. On success the storage update callback, if registered, is
// invoked after the lock has been released.
func (fs *fileStore) StoreMsg(subj string, hdr, msg []byte) (uint64, int64, error) {
	fs.mu.Lock()
	seq := fs.state.LastSeq + 1
	ts := time.Now().UnixNano()
	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
	notify := fs.scb
	fs.mu.Unlock()

	if err != nil {
		return 0, 0, err
	}
	if notify != nil {
		notify(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
	}
	return seq, ts, nil
}
// skipMsg updates this message block for a skipped sequence.
// An empty block only has its first/last metadata moved past seq and its
// flusher kicked. A block that holds messages records seq in the delete map
// and gets an empty record written with the erase bit set on the sequence.
// A nil block is a no-op.
// fs lock should be held.
func (mb *msgBlock) skipMsg(seq uint64, now time.Time) {
	if mb == nil {
		return
	}
	nowts := now.UnixNano()

	mb.mu.Lock()
	metaOnly := mb.msgs == 0
	if metaOnly {
		// Nothing stored, so just slide the block past seq.
		mb.last.seq, mb.last.ts = seq, nowts
		mb.first.seq, mb.first.ts = seq+1, nowts
	} else {
		mb.dmap.Insert(seq)
	}
	mb.mu.Unlock()

	if metaOnly {
		mb.kickFlusher()
		return
	}
	mb.writeMsgRecord(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, nowts, true)
}
// SkipMsg consumes the next sequence number without storing a message and
// returns that sequence.
func (fs *fileStore) SkipMsg() uint64 {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	now := time.Now().UTC()
	seq := fs.state.LastSeq + 1

	fs.state.LastSeq = seq
	fs.state.LastTime = now
	// With no messages stored, or when the skipped sequence is the current
	// first, the first sequence moves past the skipped one.
	if fs.state.Msgs == 0 || seq == fs.state.FirstSeq {
		fs.state.FirstSeq = seq + 1
		fs.state.FirstTime = now
	}

	fs.lmb.skipMsg(seq, now)
	return seq
}
// rebuildFirst rebuilds the state of the first message block, drops that
// block if it turns out to hold no messages, then reselects the first sequence
// and rebuilds the filestore state with whatever lost data was reported.
// Lock should be held.
func (fs *fileStore) rebuildFirst() {
	if len(fs.blks) == 0 || fs.blks[0] == nil {
		return
	}
	first := fs.blks[0]

	lost, _, _ := first.rebuildState()

	first.mu.RLock()
	drained := first.msgs == 0
	first.mu.RUnlock()

	if drained {
		fs.removeMsgBlock(first)
	}
	fs.selectNextFirst()
	fs.rebuildStateLocked(lost)
}
// firstSeqForSubj is an optimized helper that returns the first sequence
// stored for subj.
//
// subj will always be a publish subject here, meaning non-wildcard.
// We assume a fast check that this subj even exists already happened.
//
// The search walks block indexes from the subject's recorded first block to
// its last block (or across all blocks when the subject is not tracked in
// fs.psim). When the subject is found in a later block than recorded, the
// recorded first block is corrected.
//
// Returns 0 with a nil error when there are no blocks or no block holds the
// subject, and an error if a block's per subject info could not be loaded.
//
// Lock should be held.
func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) {
	if len(fs.blks) == 0 {
		return 0, nil
	}

	// See if we can optimize where we start.
	start, stop := fs.blks[0].index, fs.lmb.index
	if info, ok := fs.psim[subj]; ok {
		start, stop = info.fblk, info.lblk
	}

	for i := start; i <= stop; i++ {
		mb := fs.bim[i]
		if mb == nil {
			continue
		}
		mb.mu.Lock()
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			mb.mu.Unlock()
			return 0, err
		}
		ss := mb.fss[subj]
		if ss == nil {
			mb.mu.Unlock()
			continue
		}
		// Adjust first if it was not where we thought it should be.
		if i != start {
			if info, ok := fs.psim[subj]; ok {
				info.fblk = i
			}
		}
		// Keep the block lock held while the per subject state is being
		// recalculated and read, since it belongs to the block.
		if ss.firstNeedsUpdate {
			mb.recalculateFirstForSubj(subj, ss.First, ss)
		}
		first := ss.First
		mb.mu.Unlock()
		return first, nil
	}
	return 0, nil
}
// enforceMsgLimit drops messages from the front of the stream until the
// message count is within fs.cfg.MaxMsgs. A non-positive MaxMsgs disables the
// limit. If a removal fails or removes nothing, the first block is rebuilt and
// enforcement stops.
// Lock should be held.
func (fs *fileStore) enforceMsgLimit() {
	if fs.cfg.MaxMsgs <= 0 {
		return
	}
	for fs.state.Msgs > uint64(fs.cfg.MaxMsgs) {
		removed, err := fs.deleteFirstMsg()
		if err != nil || !removed {
			fs.rebuildFirst()
			return
		}
	}
}
// enforceBytesLimit drops messages from the front of the stream until the
// stored byte count is within fs.cfg.MaxBytes. A non-positive MaxBytes
// disables the limit. If a removal fails or removes nothing, the first block
// is rebuilt and enforcement stops.
// Lock should be held.
func (fs *fileStore) enforceBytesLimit() {
	if fs.cfg.MaxBytes <= 0 {
		return
	}
	for fs.state.Bytes > uint64(fs.cfg.MaxBytes) {
		removed, err := fs.deleteFirstMsg()
		if err != nil || !removed {
			fs.rebuildFirst()
			return
		}
	}
}
// enforceMsgPerSubjectLimit makes sure the max msgs per subject limit is
// honored on recovery or config update.
//
// We will make sure to go through all msg blocks etc. but in practice this
// will most likely only be the last one, so can take a more conservative approach.
//
// Storage update callbacks are suppressed while this runs and restored on
// return. If the per subject totals do not add up to the stream message count,
// all blocks and the stream state are rebuilt before limits are applied. Once
// done, the caches of all blocks that had messages removed are force expired.
//
// Lock should be held.
func (fs *fileStore) enforceMsgPerSubjectLimit() {
	maxMsgsPer := uint64(fs.cfg.MaxMsgsPer)

	// We want to suppress callbacks from remove during this process
	// since these should have already been deleted and accounted for.
	cb := fs.scb
	fs.scb = nil
	defer func() { fs.scb = cb }()

	var numMsgs uint64

	// collect all that are not correct.
	needAttention := make(map[string]*psi)
	for subj, info := range fs.psim {
		numMsgs += info.total
		if info.total > maxMsgsPer {
			needAttention[subj] = info
		}
	}

	// We had an issue with a use case where psim (and hence fss) were correct but idx was not and was not properly being caught.
	// So do a quick sanity check here. If we detect a skew do a rebuild then re-check.
	if numMsgs != fs.state.Msgs {
		// Clear any global subject state.
		fs.psim = make(map[string]*psi)
		for _, mb := range fs.blks {
			ld, _, err := mb.rebuildState()
			if err != nil && ld != nil {
				fs.addLostData(ld)
			}
			fs.populateGlobalPerSubjectInfo(mb)
		}
		// Rebuild fs state too.
		fs.rebuildStateLocked(nil)
		// Need to redo blocks that need attention.
		needAttention = make(map[string]*psi)
		for subj, info := range fs.psim {
			if info.total > maxMsgsPer {
				needAttention[subj] = info
			}
		}
	}

	// Collect all the msgBlks we alter.
	blks := make(map[*msgBlock]struct{})

	// For re-use below.
	var sm StoreMsg

	// Walk all subjects that need attention here.
	for subj, info := range needAttention {
		total, start, stop := info.total, info.fblk, info.lblk

		for i := start; i <= stop; i++ {
			mb := fs.bim[i]
			if mb == nil {
				continue
			}
			// Grab the ss entry for this subject in case sparse.
			mb.mu.Lock()
			mb.ensurePerSubjectInfoLoaded()
			ss := mb.fss[subj]
			if ss != nil && ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			mb.mu.Unlock()
			if ss == nil {
				continue
			}
			// Remove this subject's oldest messages in the block until the
			// total is within the limit or the block's range is exhausted.
			for seq := ss.First; seq <= ss.Last && total > maxMsgsPer; {
				m, _, err := mb.firstMatching(subj, false, seq, &sm)
				if err == nil {
					seq = m.seq + 1
					if removed, _ := fs.removeMsgViaLimits(m.seq); removed {
						total--
						blks[mb] = struct{}{}
					}
				} else {
					// On error just do single increment.
					seq++
				}
			}
		}
	}

	// Try to expire the cache of every block we altered. The block lock is
	// not held here, so use the variant that acquires it.
	for mb := range blks {
		mb.tryForceExpireCache()
	}
}
// deleteFirstMsg removes the message at the stream's current first sequence
// using the limits based removal path.
// Lock should be held.
func (fs *fileStore) deleteFirstMsg() (bool, error) {
	first := fs.state.FirstSeq
	return fs.removeMsgViaLimits(first)
}
// removeMsgViaLimits removes seq because of a limit being enforced.
// Removals via limits can always be recovered on a restart, so we do not
// force the system to update the index file.
// Lock should be held.
func (fs *fileStore) removeMsgViaLimits(seq uint64) (bool, error) {
	const (
		secure     = false
		viaLimits  = true
		needFSLock = false
	)
	return fs.removeMsg(seq, secure, viaLimits, needFSLock)
}
// RemoveMsg will remove the message from this store.
// Reports whether a message was removed. The filestore lock is acquired
// internally.
func (fs *fileStore) RemoveMsg(seq uint64) (bool, error) {
	const (
		secure     = false
		viaLimits  = false
		needFSLock = true
	)
	return fs.removeMsg(seq, secure, viaLimits, needFSLock)
}
// EraseMsg removes the message from this store and requests a secure erase
// of its record. Reports whether a message was removed. The filestore lock is
// acquired internally.
func (fs *fileStore) EraseMsg(seq uint64) (bool, error) {
	const (
		secure     = true
		viaLimits  = false
		needFSLock = true
	)
	return fs.removeMsg(seq, secure, viaLimits, needFSLock)
}
// removePerSubject decrements the filestore level message total for subj and
// drops the subject's entry once the total reaches zero. Empty and untracked
// subjects are ignored.
// Lock should be held.
func (fs *fileStore) removePerSubject(subj string) {
	if subj == "" {
		return
	}
	info, tracked := fs.psim[subj]
	if !tracked {
		return
	}
	// We do not update sense of fblk here but will do so when we resolve during lookup.
	if info.total--; info.total == 0 {
		delete(fs.psim, subj)
	}
}
// removeMsg removes the message with sequence seq, optionally rewriting the mb file.
//
// secure requests that the record be overwritten via eraseMsg; it is turned
// off when the store is encrypted (fs.prf != nil).
// viaLimits marks removals driven by limits: these are allowed while a
// snapshot is in progress and only write a tombstone when the removal emptied
// the last block.
// needFSLock tells whether this function must acquire fs.mu itself (true) or
// the caller already holds it (false).
//
// Returns true when a message was removed. Returns false with a nil error
// when the block is closed, seq is below the block's first sequence, or the
// message was already deleted. Errors include ErrStoreMsgNotFound,
// ErrStoreEOF, ErrStoreClosed and ErrStoreSnapshotInProgress.
//
// NOTE: when a storage update callback is registered, fs.mu is released
// around the callback even if the caller holds it, and is re-acquired
// afterwards in that case.
func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (bool, error) {
	if seq == 0 {
		return false, ErrStoreMsgNotFound
	}
	// Lock helpers that are no-ops when the caller already holds fs.mu.
	fsLock := func() {
		if needFSLock {
			fs.mu.Lock()
		}
	}
	fsUnlock := func() {
		if needFSLock {
			fs.mu.Unlock()
		}
	}
	fsLock()
	if fs.closed {
		fsUnlock()
		return false, ErrStoreClosed
	}
	// Only limit based removals are allowed while snapshots are in progress.
	if !viaLimits && fs.sips > 0 {
		fsUnlock()
		return false, ErrStoreSnapshotInProgress
	}
	// If in encrypted mode negate secure rewrite here.
	if secure && fs.prf != nil {
		secure = false
	}
	if fs.state.Msgs == 0 {
		// Sequences we have already handed out are "not found", anything beyond is EOF.
		var err = ErrStoreEOF
		if seq <= fs.state.LastSeq {
			err = ErrStoreMsgNotFound
		}
		fsUnlock()
		return false, err
	}
	mb := fs.selectMsgBlock(seq)
	if mb == nil {
		var err = ErrStoreEOF
		if seq <= fs.state.LastSeq {
			err = ErrStoreMsgNotFound
		}
		fsUnlock()
		return false, err
	}
	mb.mu.Lock()
	// See if we are closed or the sequence number is still relevant.
	if mb.closed || seq < mb.first.seq {
		mb.mu.Unlock()
		fsUnlock()
		return false, nil
	}
	// Now check dmap if it is there.
	if mb.dmap.Exists(seq) {
		mb.mu.Unlock()
		fsUnlock()
		return false, nil
	}
	// We used to not have to load in the messages except with callbacks or the filtered subject state (which is now always on).
	// Now just load regardless.
	// TODO(dlc) - Figure out a way not to have to load it in, we need subject tracking outside main data block.
	if mb.cacheNotLoaded() {
		// We do not want to block possible activity within another msg block.
		// We have to unlock both locks and acquire the mb lock in the loadMsgs() call to avoid a deadlock if another
		// go routine was trying to get fs then this mb lock at the same time. E.g. another call to remove for same block.
		mb.mu.Unlock()
		fsUnlock()
		if err := mb.loadMsgs(); err != nil {
			return false, err
		}
		fsLock()
		// We need to check if things changed out from underneath us.
		if fs.closed {
			fsUnlock()
			return false, ErrStoreClosed
		}
		mb.mu.Lock()
		if mb.closed || seq < mb.first.seq {
			mb.mu.Unlock()
			fsUnlock()
			return false, nil
		}
		// cacheLookup below will do dmap check so no need to repeat here.
	}
	var smv StoreMsg
	sm, err := mb.cacheLookup(seq, &smv)
	if err != nil {
		mb.mu.Unlock()
		fsUnlock()
		// Mimic err behavior from above check to dmap. No error returned if already removed.
		if err == errDeletedMsg {
			err = nil
		}
		return false, err
	}
	// Grab size
	msz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
	// Set cache timestamp for last remove.
	mb.lrts = time.Now().UnixNano()
	// Global stats
	// All counters below are clamped so they can not underflow.
	if fs.state.Msgs > 0 {
		fs.state.Msgs--
	}
	if msz < fs.state.Bytes {
		fs.state.Bytes -= msz
	} else {
		fs.state.Bytes = 0
	}
	// Now local mb updates.
	if mb.msgs > 0 {
		mb.msgs--
	}
	if msz < mb.bytes {
		mb.bytes -= msz
	} else {
		mb.bytes = 0
	}
	// Mark as dirty for stream state.
	fs.dirty++
	// If we are tracking subjects here make sure we update that accounting.
	mb.ensurePerSubjectInfoLoaded()
	// If we are tracking multiple subjects here make sure we update that accounting.
	mb.removeSeqPerSubject(sm.subj, seq)
	fs.removePerSubject(sm.subj)
	if secure {
		// Grab record info.
		// NOTE(review): the error from slotInfo is ignored here; on failure ri and rl are zero.
		ri, rl, _, _ := mb.slotInfo(int(seq - mb.cache.fseq))
		mb.eraseMsg(seq, int(ri), int(rl))
	}
	// fifo is true when we removed the block's first message.
	fifo := seq == mb.first.seq
	isLastBlock := mb == fs.lmb
	isEmpty := mb.msgs == 0
	if fifo {
		mb.selectNextFirst()
		if !isEmpty {
			// Can update this one in place.
			if seq == fs.state.FirstSeq {
				fs.state.FirstSeq = mb.first.seq // new one.
				if mb.first.ts == 0 {
					fs.state.FirstTime = time.Time{}
				} else {
					fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
				}
			}
		}
	} else if !isEmpty {
		if mb.dmap.IsEmpty() {
			// Mark initial base for delete set.
			mb.dmap.SetInitialMin(mb.first.seq)
		}
		// Out of order delete.
		mb.dmap.Insert(seq)
		// Check if <25% utilization and minimum size met.
		if mb.rbytes > compactMinimum && !isLastBlock {
			// Remove the interior delete records
			rbytes := mb.rbytes - uint64(mb.dmap.Size()*emptyRecordLen)
			if rbytes>>2 > mb.bytes {
				mb.compact()
			}
		}
	}
	if secure {
		if ld, _ := mb.flushPendingMsgsLocked(); ld != nil {
			// We have the mb lock here, this needs the mb locks so do in its own go routine.
			go fs.rebuildState(ld)
		}
	}
	// If empty remove this block and check if we need to update first sequence.
	// We will write a tombstone at the end.
	var firstSeqNeedsUpdate bool
	if isEmpty {
		fs.removeMsgBlock(mb)
		firstSeqNeedsUpdate = seq == fs.state.FirstSeq
	}
	mb.mu.Unlock()
	// If we emptied the current message block and the seq was state.FirstSeq
	// then we need to jump message blocks. We will also write the index so
	// we don't lose track of the first sequence.
	if firstSeqNeedsUpdate {
		fs.selectNextFirst()
	}
	// Check if we need to write a deleted record tombstone.
	// This is for user initiated removes or to hold the first seq
	// when the last block is empty.
	if !viaLimits || (isEmpty && isLastBlock) {
		if lmb := fs.lmb; sm != nil && lmb != nil {
			lmb.writeTombstone(sm.seq, sm.ts)
		}
		fs.kickFlushStateLoop()
	}
	if cb := fs.scb; cb != nil {
		// If we have a callback registered we need to release lock regardless since cb might need it to lookup msg, etc.
		fs.mu.Unlock()
		// Storage updates.
		var subj string
		if sm != nil {
			subj = sm.subj
		}
		delta := int64(msz)
		cb(-1, -delta, seq, subj)
		// The caller owns the lock in this case, so hand it back locked.
		if !needFSLock {
			fs.mu.Lock()
		}
	} else if needFSLock {
		// We acquired it so release it.
		fs.mu.Unlock()
	}
	return true, nil
}
// compact will compact and rewrite this block. This should only be called when we know we want to rewrite this block.
// This should not be called on the lmb since we will prune tail deleted messages which could cause issues with
// writing new messages. We will silently bail on any issues with the underlying block and let someone else detect.
//
// Live records are copied into a new buffer. Deleted records that come before
// the first live message are dropped, deleted records after it are replaced
// by empty placeholder records. The result is written to a temporary file
// which is renamed over the block file, then the delete map is emptied and the
// block state is rebuilt.
//
// Write lock needs to be held.
func (mb *msgBlock) compact() {
	wasLoaded := mb.cacheAlreadyLoaded()
	if !wasLoaded {
		if err := mb.loadMsgsWithLock(); err != nil {
			return
		}
	}
	buf := mb.cache.buf
	nbuf := make([]byte, 0, len(buf))
	var le = binary.LittleEndian
	// Tracks whether we have seen the first live message yet.
	var firstSet bool
	// A record counts as deleted if it has no sequence, carries the erase bit,
	// precedes the block's first sequence or is present in the delete map.
	isDeleted := func(seq uint64) bool {
		if seq == 0 || seq&ebit != 0 || seq < mb.first.seq {
			return true
		}
		return mb.dmap.Exists(seq)
	}
	// For skip msgs.
	var smh [msgHdrSize]byte
	for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
		// Not enough data left for a record header, bail.
		if index+msgHdrSize > lbuf {
			return
		}
		hdr := buf[index : index+msgHdrSize]
		// Record length sits at offset 0 and subject length at offset 20 of the header.
		rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])
		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize
		// Do some quick sanity checks here.
		if dlen < 0 || int(slen) > dlen || dlen > int(rl) || rl > rlBadThresh || index+rl > lbuf {
			return
		}
		// Only need to process non-deleted messages.
		seq := le.Uint64(hdr[4:])
		if !isDeleted(seq) {
			// Normal message here.
			nbuf = append(nbuf, buf[index:index+rl]...)
			// Do not set based on tombstone.
			if !firstSet && seq&tbit == 0 {
				firstSet = true
				mb.first.seq = seq
			}
		} else if firstSet {
			// This is an interior delete that we need to make sure we have a placeholder for.
			le.PutUint32(smh[0:], emptyRecordLen)
			le.PutUint64(smh[4:], seq|ebit)
			le.PutUint64(smh[12:], 0)
			le.PutUint16(smh[20:], 0)
			nbuf = append(nbuf, smh[:]...)
			// The placeholder checksum covers header bytes 4 through 19.
			mb.hh.Reset()
			mb.hh.Write(smh[4:20])
			checksum := mb.hh.Sum(nil)
			nbuf = append(nbuf, checksum...)
		}
		// Always set last.
		mb.last.seq = seq &^ ebit
		// Advance to next record.
		index += rl
	}
	// Check for encryption.
	if mb.bek != nil && len(nbuf) > 0 {
		// Recreate to reset counter.
		rbek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return
		}
		rbek.XORKeyStream(nbuf, nbuf)
	}
	// Close FDs first.
	mb.closeFDsLocked()
	// We will write to a new file and mv/rename it in case of failure.
	mfn := filepath.Join(filepath.Join(mb.fs.fcfg.StoreDir, msgDir), fmt.Sprintf(newScan, mb.index))
	if err := os.WriteFile(mfn, nbuf, defaultFilePerms); err != nil {
		os.Remove(mfn)
		return
	}
	if err := os.Rename(mfn, mb.mfn); err != nil {
		os.Remove(mfn)
		return
	}
	// Remove index file and wipe delete map, then rebuild.
	mb.deleteDmap()
	mb.rebuildStateLocked()
	// If we entered with the msgs loaded make sure to reload them.
	if wasLoaded {
		mb.loadMsgsWithLock()
	}
}
// deleteDmap empties out this block's delete map (dmap).
// NOTE(review): callers in this file invoke it with the mb lock held.
func (mb *msgBlock) deleteDmap() {
	mb.dmap.Empty()
}
// slotInfo grabs the record info for a slot of the cache index.
//
// It returns the record's offset, the record length and whether the hash bit
// is set on the index entry. The record length is the distance to the next
// index entry, or the cached last record length for the final slot.
//
// Errors:
//   - errPartialCache if there is no cache or slot is outside of the index,
//     including negative slots.
//   - errDeletedMsg if the slot is marked deleted.
//   - errBadMsg if the calculated record length is smaller than a msg header.
//
// Lock should be held.
func (mb *msgBlock) slotInfo(slot int) (uint32, uint32, bool, error) {
	// Callers compute slot as a sequence delta, which turns negative when
	// the sequence is below the cache's first sequence.
	if slot < 0 || mb.cache == nil || slot >= len(mb.cache.idx) {
		return 0, 0, false, errPartialCache
	}

	bi := mb.cache.idx[slot]
	ri, hashChecked := (bi &^ hbit), (bi&hbit) != 0

	// If this is a deleted slot return here.
	if bi == dbit {
		return 0, 0, false, errDeletedMsg
	}

	// Determine record length
	var rl uint32
	if len(mb.cache.idx) > slot+1 {
		ni := mb.cache.idx[slot+1] &^ hbit
		rl = ni - ri
	} else {
		rl = mb.cache.lrl
	}
	if rl < msgHdrSize {
		return 0, 0, false, errBadMsg
	}
	return uint32(ri), rl, hashChecked, nil
}
// isClosed reports whether the filestore has been closed.
// It takes the read lock itself.
func (fs *fileStore) isClosed() bool {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	return fs.closed
}
// spinUpFlushLoop starts the flush loop go routine for this block unless a
// flusher is already running or the block is closed. It creates the flush and
// quit channels that the loop listens on.
func (mb *msgBlock) spinUpFlushLoop() {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	// Are we already running or closed?
	if mb.flusher || mb.closed {
		return
	}

	fch, qch := make(chan struct{}, 1), make(chan struct{})
	mb.flusher, mb.fch, mb.qch = true, fch, qch
	go mb.flushLoop(fch, qch)
}
// kickFlusher is the raw low level kicker for flush loops.
// It performs a non-blocking send on fch; a nil channel is ignored and the
// signal is dropped if the channel can not accept it right away.
func kickFlusher(fch chan struct{}) {
	if fch == nil {
		return
	}
	select {
	case fch <- struct{}{}:
	default:
	}
}
// kickFlusher signals this message block's flush loop, if it has one.
// The read lock is taken while the flush channel is accessed.
func (mb *msgBlock) kickFlusher() {
	mb.mu.RLock()
	kickFlusher(mb.fch)
	mb.mu.RUnlock()
}
// setInFlusher marks this block as having a running flusher.
func (mb *msgBlock) setInFlusher() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.flusher = true
}
// clearInFlusher marks this block as no longer having a running flusher.
func (mb *msgBlock) clearInFlusher() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.flusher = false
}
// flushLoop watches for messages, index info, or recently closed msg block updates.
//
// It runs until qch is signaled or closed, or until the block is no longer
// the last block and its file descriptors were closed successfully. Each
// signal on fch triggers a flush of pending messages. Small amounts of pending
// data are given a chance to coalesce first by sleeping with a doubling
// backoff.
//
// The block is flagged as having a flusher for the lifetime of the loop.
func (mb *msgBlock) flushLoop(fch, qch chan struct{}) {
	mb.setInFlusher()
	defer mb.clearInFlusher()
	for {
		select {
		case <-fch:
			// If we have pending messages process them first.
			if waiting := mb.pendingWriteSize(); waiting != 0 {
				ts := 1 * time.Millisecond
				var waited time.Duration
				// Wait for more data while below the coalesce minimum.
				for waiting < coalesceMinimum {
					time.Sleep(ts)
					// Non-blocking check for quit so we do not linger on shutdown.
					select {
					case <-qch:
						return
					default:
					}
					newWaiting := mb.pendingWriteSize()
					// Stop waiting once we waited long enough or nothing new arrived.
					// This break exits the coalescing for loop.
					if waited = waited + ts; waited > maxFlushWait || newWaiting <= waiting {
						break
					}
					waiting = newWaiting
					ts *= 2
				}
				mb.flushPendingMsgs()
				// Check if we are no longer the last message block. If we are
				// not we can close FDs and exit.
				mb.fs.mu.RLock()
				notLast := mb != mb.fs.lmb
				mb.fs.mu.RUnlock()
				if notLast {
					if err := mb.closeFDs(); err == nil {
						return
					}
				}
			}
		case <-qch:
			return
		}
	}
}
// eraseMsg overwrites the record for seq with an erased record of the same
// length: a header carrying seq with the erase bit set, random filler data and
// a fresh checksum.
//
// ri is the record's offset and rl its length, as reported by slotInfo. The
// record is overwritten in the cache when it lies at or beyond the cache
// offset, and on disk when it lies within the range tracked by the cache
// offset and write pointer.
//
// Returns an error if rl is too small to hold an empty record, or if the
// block file can not be opened or written.
//
// Lock should be held.
func (mb *msgBlock) eraseMsg(seq uint64, ri, rl int) error {
	// A valid record is never smaller than an empty record. Callers may pass
	// zero values here after a failed slot lookup, and a negative filler
	// length would panic below.
	if rl < emptyRecordLen {
		return fmt.Errorf("invalid record length %d erasing seq %d", rl, seq)
	}

	var le = binary.LittleEndian
	var hdr [msgHdrSize]byte

	le.PutUint32(hdr[0:], uint32(rl))
	le.PutUint64(hdr[4:], seq|ebit)
	le.PutUint64(hdr[12:], 0)
	le.PutUint16(hdr[20:], 0)

	// Randomize record. This is best effort, the record is overwritten
	// regardless of whether the random source delivered.
	data := make([]byte, rl-emptyRecordLen)
	rand.Read(data)

	// Now write to underlying buffer.
	var b bytes.Buffer
	b.Write(hdr[:])
	b.Write(data)

	// Calculate hash.
	mb.hh.Reset()
	mb.hh.Write(hdr[4:20])
	mb.hh.Write(data)
	checksum := mb.hh.Sum(nil)
	// Write to msg record.
	b.Write(checksum)

	// Update both cache and disk.
	nbytes := b.Bytes()

	// Cache
	if ri >= mb.cache.off {
		li := ri - mb.cache.off
		buf := mb.cache.buf[li : li+rl]
		copy(buf, nbytes)
	}

	// Disk
	if mb.cache.off+mb.cache.wp > ri {
		mfd, err := os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
		if err != nil {
			return err
		}
		defer mfd.Close()
		if _, err := mfd.WriteAt(nbytes, int64(ri)); err != nil {
			return err
		}
		mfd.Sync()
	}
	return nil
}
// truncate cuts this message block down so that sm becomes its last message.
//
// Every message with a sequence greater than sm.seq is taken out of the
// block's accounting (entries found in the delete map are simply dropped from
// it) and the block file is shortened to end right after sm's record.
// Compressed blocks are loaded, decrypted if needed, decompressed, cut,
// recompressed, encrypted again if needed and rewritten. Uncompressed blocks
// are truncated in place through the open file descriptor. In both cases the
// last checksum is updated from the trailing bytes of the new last record.
//
// Returns the number of messages and bytes removed. On error both are zero.
//
// The mb lock must not be held by the caller. It is acquired here and is
// released before returning on every path, including errors.
func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) {
	// Make sure we are loaded to process messages etc.
	if err := mb.loadMsgs(); err != nil {
		return 0, 0, err
	}

	// Calculate new eof using slot info from our new last sm.
	ri, rl, _, err := mb.slotInfo(int(sm.seq - mb.cache.fseq))
	if err != nil {
		return 0, 0, err
	}
	// Calculate new eof.
	eof := int64(ri + rl)

	// Everything that needs the block lock runs in here so that the lock is
	// released no matter how we leave.
	purged, removed, err := func() (uint64, uint64, error) {
		mb.mu.Lock()
		defer mb.mu.Unlock()

		var purged, removed uint64

		checkDmap := mb.dmap.Size() > 0
		var smv StoreMsg

		for seq := mb.last.seq; seq > sm.seq; seq-- {
			if checkDmap {
				if mb.dmap.Exists(seq) {
					// Delete and skip to next.
					mb.dmap.Delete(seq)
					checkDmap = !mb.dmap.IsEmpty()
					continue
				}
			}
			// We should have a valid msg to calculate removal stats.
			if m, err := mb.cacheLookup(seq, &smv); err == nil {
				if mb.msgs > 0 {
					sz := fileStoreMsgSize(m.subj, m.hdr, m.msg)
					mb.msgs--
					// Clamp so the byte counters can not underflow.
					if sz > mb.bytes {
						sz = mb.bytes
					}
					mb.bytes -= sz
					mb.rbytes -= sz
					// For return accounting.
					purged++
					removed += uint64(sz)
				}
			}
		}

		// If the block is compressed then we have to load it into memory
		// and decompress it, truncate it and then write it back out.
		// Otherwise, truncate the file itself and close the descriptor.
		switch {
		case mb.cmp != NoCompression:
			buf, err := mb.loadBlock(nil)
			if err != nil {
				return 0, 0, fmt.Errorf("failed to load block from disk: %w", err)
			}
			if mb.bek != nil && len(buf) > 0 {
				bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
				if err != nil {
					return 0, 0, err
				}
				mb.bek = bek
				mb.bek.XORKeyStream(buf, buf)
			}
			buf, err = mb.decompressIfNeeded(buf)
			if err != nil {
				return 0, 0, fmt.Errorf("failed to decompress block: %w", err)
			}
			// The on disk block has to cover the new end of file.
			if eof > int64(len(buf)) {
				return 0, 0, fmt.Errorf("truncate offset %d beyond block size %d", eof, len(buf))
			}
			buf = buf[:eof]
			// Update our checksum from the trailing bytes of the new last
			// record, same as the uncompressed path does.
			copy(mb.lchk[0:], buf[len(buf)-checksumSize:])
			buf, err = mb.cmp.Compress(buf)
			if err != nil {
				return 0, 0, fmt.Errorf("failed to recompress block: %w", err)
			}
			meta := &CompressionInfo{
				Algorithm:    mb.cmp,
				OriginalSize: uint64(eof),
			}
			buf = append(meta.MarshalMetadata(), buf...)
			if mb.bek != nil && len(buf) > 0 {
				// Recreate the key for the rewritten block contents.
				bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
				if err != nil {
					return 0, 0, err
				}
				mb.bek = bek
				mb.bek.XORKeyStream(buf, buf)
			}
			n, err := mb.writeAt(buf, 0)
			if err != nil {
				return 0, 0, fmt.Errorf("failed to rewrite compressed block: %w", err)
			}
			if n != len(buf) {
				return 0, 0, fmt.Errorf("short write (%d != %d)", n, len(buf))
			}
			mb.mfd.Truncate(int64(len(buf)))
			mb.mfd.Sync()

		case mb.mfd != nil:
			mb.mfd.Truncate(eof)
			mb.mfd.Sync()
			// Update our checksum.
			var lchk [8]byte
			mb.mfd.ReadAt(lchk[:], eof-8)
			copy(mb.lchk[0:], lchk[:])

		default:
			return 0, 0, fmt.Errorf("failed to truncate msg block %d, file not open", mb.index)
		}

		// Update our last msg.
		mb.last.seq = sm.seq
		mb.last.ts = sm.ts

		// Clear our cache.
		mb.clearCacheAndOffset()

		// Redo per subject info for this block.
		mb.resetPerSubjectInfo()

		return purged, removed, nil
	}()
	if err != nil {
		return 0, 0, err
	}

	// Load msgs again.
	mb.loadMsgs()

	return purged, removed, nil
}
// isEmpty reports whether this block holds no sequences, which is the case
// when its last sequence is below its first sequence.
// Lock should be held.
func (mb *msgBlock) isEmpty() bool {
	return mb.last.seq < mb.first.seq
}
// selectNextFirst advances this block's first sequence to the next sequence
// that is not in the delete map, dropping the delete map entries it moves
// past, and updates the first timestamp to match. The timestamp is zeroed when
// the block ends up empty or the message can not be looked up.
//
// NOTE: the lock is briefly released and re-acquired when the message has to
// be fetched outside of the cache.
//
// Lock should be held.
func (mb *msgBlock) selectNextFirst() {
	next := mb.first.seq + 1
	// We will move past deleted entries so we can drop them from the dmap.
	for ; next <= mb.last.seq && mb.dmap.Exists(next); next++ {
		mb.dmap.Delete(next)
	}
	// Set new first sequence.
	mb.first.seq = next

	// Nothing left in this block.
	if mb.isEmpty() {
		mb.first.ts = 0
		return
	}

	// Need the timestamp of the new first message.
	// Try the cache directly and fall back to a fetch if needed.
	var smv StoreMsg
	msg, _ := mb.cacheLookup(next, &smv)
	if msg == nil {
		// Slow path, need to unlock.
		mb.mu.Unlock()
		msg, _, _ = mb.fetchMsg(next, &smv)
		mb.mu.Lock()
	}

	if msg == nil {
		mb.first.ts = 0
		return
	}
	mb.first.ts = msg.ts
}
// selectNextFirst selects the next FirstSeq and FirstTime for the stream
// state from the first message block.
//
// With no blocks left this is treated like a purge: FirstSeq becomes
// LastSeq+1 and FirstTime the zero time. A first block without a first
// timestamp (first.ts == 0) also yields the zero time rather than the Unix
// epoch, matching how removeMsg updates FirstTime in place.
//
// Lock should be held.
func (fs *fileStore) selectNextFirst() {
	if len(fs.blks) == 0 {
		// Could not find anything, so treat like purge
		fs.state.FirstSeq = fs.state.LastSeq + 1
		fs.state.FirstTime = time.Time{}
		return
	}

	mb := fs.blks[0]
	mb.mu.RLock()
	firstSeq, firstTs := mb.first.seq, mb.first.ts
	mb.mu.RUnlock()

	fs.state.FirstSeq = firstSeq
	if firstTs == 0 {
		// No first message timestamp is known, do not report the epoch.
		fs.state.FirstTime = time.Time{}
	} else {
		fs.state.FirstTime = time.Unix(0, firstTs).UTC()
	}
}
// resetCacheExpireTimer arms the cache expiration timer to fire after td,
// creating the timer on first use. A td of zero selects the block's
// configured cache expiration.
// Lock should be held.
func (mb *msgBlock) resetCacheExpireTimer(td time.Duration) {
	wait := td
	if wait == 0 {
		wait = mb.cexp
	}
	if mb.ctmr != nil {
		mb.ctmr.Reset(wait)
		return
	}
	mb.ctmr = time.AfterFunc(wait, mb.expireCache)
}
// startCacheExpireTimer starts (or resets) the cache expiration timer.
// Passing 0 asks resetCacheExpireTimer to use its default duration.
// Lock should be held.
func (mb *msgBlock) startCacheExpireTimer() {
	mb.resetCacheExpireTimer(0)
}
// clearCacheAndOffset resets the linear scan tracker and zeroes the cache
// offset and write pointer before clearing the cache.
// Used when we load in a message block.
// Lock should be held.
func (mb *msgBlock) clearCacheAndOffset() {
	// Reset linear scan tracker.
	mb.llseq = 0
	if c := mb.cache; c != nil {
		c.off, c.wp = 0, 0
	}
	mb.clearCache()
}
// clearCache releases this block's cached message buffer for recycling.
// The expiration timer is stopped when no per subject info is loaded. With a
// zero cache offset the cache is dropped altogether, otherwise it is kept with
// its offset but without buffer, index and write pointer.
// Lock should be held.
func (mb *msgBlock) clearCache() {
	if mb.fss == nil && mb.ctmr != nil {
		mb.ctmr.Stop()
		mb.ctmr = nil
	}

	c := mb.cache
	if c == nil {
		return
	}

	old := c.buf
	if c.off != 0 {
		// Keep the offset around, clear msgs and index.
		c.buf, c.idx, c.wp = nil, nil, 0
	} else {
		mb.cache = nil
	}
	recycleMsgBlockBuf(old)
}
// expireCache possibly expires this message block's cache.
// It takes the block lock and delegates to expireCacheLocked.
func (mb *msgBlock) expireCache() {
	mb.mu.Lock()
	mb.expireCacheLocked()
	mb.mu.Unlock()
}
// tryForceExpireCache attempts to force expire this block's cache.
// It takes the block lock and delegates to tryForceExpireCacheLocked.
func (mb *msgBlock) tryForceExpireCache() {
	mb.mu.Lock()
	mb.tryForceExpireCacheLocked()
	mb.mu.Unlock()
}
// tryForceExpireCacheLocked attempts to force expire the cache by clearing
// the last load time for the duration of the expire call and restoring it
// afterwards.
// Lock should be held.
func (mb *msgBlock) tryForceExpireCacheLocked() {
	savedLoad := mb.llts
	mb.llts = 0
	mb.expireCacheLocked()
	mb.llts = savedLoad
}
// tryExpireWriteCache tries to expire the write cache, which will be partial
// with fip, and hands the cache buffer back for reuse when that worked.
// We want to bypass the Pools here, so the cache is flagged as no-recycle
// while the expire runs.
//
// Returns the old buffer reset to zero length when the block was never loaded
// (last load time is zero) and the cache no longer holds a buffer, nil
// otherwise.
//
// Lock should be held.
func (mb *msgBlock) tryExpireWriteCache() []byte {
	if mb.cache == nil {
		return nil
	}

	// Remember what we are about to override.
	prevWrite, prevLoad := mb.lwts, mb.llts
	oldBuf, prevNra := mb.cache.buf, mb.cache.nra

	// Clear the last write time so it does not block expiration.
	mb.lwts, mb.cache.nra = 0, true
	mb.expireCacheLocked()
	mb.lwts = prevWrite
	if mb.cache != nil {
		mb.cache.nra = prevNra
	}

	// We could check for a certain time since last load, but to be safe just reuse if no loads at all.
	stillCached := mb.cache != nil && mb.cache.buf != nil
	if prevLoad != 0 || stillCached {
		return nil
	}
	// Clear last write time since we now are about to move on to a new lmb.
	mb.lwts = 0
	return oldBuf[:0]
}
// expireCacheLocked expires this block's cache and per subject info when
// there is no pending data and neither a load nor a write happened within the
// cache expiration window. Otherwise the expiration timer is re-armed for the
// time that is left.
// Lock should be held.
func (mb *msgBlock) expireCacheLocked() {
	// Nothing is loaded, so just make sure the timer is gone.
	if mb.cache == nil && mb.fss == nil {
		if mb.ctmr != nil {
			mb.ctmr.Stop()
			mb.ctmr = nil
		}
		return
	}

	// Can't expire if we still have pending.
	if c := mb.cache; c != nil && len(c.buf) > int(c.wp) {
		mb.resetCacheExpireTimer(mb.cexp)
		return
	}

	// For the core buffer of messages, we care about reads and writes, but not removes.
	lastActive := mb.lwts
	if mb.llts >= lastActive {
		lastActive = mb.llts
	}

	// Check for activity on the cache that would prevent us from expiring.
	idle := time.Now().UnixNano() - lastActive
	if idle <= int64(mb.cexp) {
		mb.resetCacheExpireTimer(mb.cexp - time.Duration(idle))
		return
	}

	// If we are here we will at least expire the core msg buffer.
	// We need to capture offset in case we do a write next before a full load.
	if c := mb.cache; c != nil {
		c.off += len(c.buf)
		if !c.nra {
			recycleMsgBlockBuf(c.buf)
		}
		c.buf, c.wp = nil, 0
	}

	// We used to hold onto the idx longer but removes need buf now so no
	// point, clear out fss and the rest of the cache as well.
	mb.fss = nil
	mb.clearCache()
}
// startAgeChk arms the age check timer to fire expireMsgs after MaxAge.
// It does nothing if a timer already exists or MaxAge is not configured.
func (fs *fileStore) startAgeChk() {
	if fs.ageChk != nil || fs.cfg.MaxAge == 0 {
		return
	}
	fs.ageChk = time.AfterFunc(fs.cfg.MaxAge, fs.expireMsgs)
}
// resetAgeChk re-arms (or creates) the age check timer. The timer fires after
// delta nanoseconds when delta is positive and shorter than MaxAge, otherwise
// after MaxAge. It does nothing if MaxAge is not configured.
// Lock should be held.
func (fs *fileStore) resetAgeChk(delta int64) {
	maxAge := fs.cfg.MaxAge
	if maxAge == 0 {
		return
	}

	wait := maxAge
	if d := time.Duration(delta); delta > 0 && d < maxAge {
		wait = d
	}

	if fs.ageChk == nil {
		fs.ageChk = time.AfterFunc(wait, fs.expireMsgs)
		return
	}
	fs.ageChk.Reset(wait)
}
// cancelAgeChk stops and discards the age check timer, if there is one.
// Lock should be held.
func (fs *fileStore) cancelAgeChk() {
	if fs.ageChk == nil {
		return
	}
	fs.ageChk.Stop()
	fs.ageChk = nil
}
// expireMsgs removes messages from the front of the store that are older than
// MaxAge, then cancels or re-arms the age check timer.
func (fs *fileStore) expireMsgs() {
	// We need to delete one by one here and can not optimize for the time being.
	// Reason is that we need more information to adjust ack pending in consumers.
	var (
		smv StoreMsg
		sm  *StoreMsg
	)

	fs.mu.RLock()
	maxAge := int64(fs.cfg.MaxAge)
	minAge := time.Now().UnixNano() - maxAge
	fs.mu.RUnlock()

	for {
		// Sequence 0 asks for the current first message.
		sm, _ = fs.msgForSeq(0, &smv)
		if sm == nil || sm.ts > minAge {
			break
		}
		fs.mu.Lock()
		fs.removeMsgViaLimits(sm.seq)
		fs.mu.Unlock()
		// Recalculate in case we are expiring a bunch.
		minAge = time.Now().UnixNano() - maxAge
	}

	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Only cancel if no message is left, not on a potential lookup error that
	// would result in sm == nil.
	if fs.state.Msgs == 0 {
		fs.cancelAgeChk()
		return
	}

	// Fire again when the current first message is due, or after the default
	// interval when we do not know what that message is.
	var delta int64
	if sm != nil {
		delta = sm.ts - minAge
	}
	fs.resetAgeChk(delta)
}
// checkAndFlushAllBlocks flushes every block that has pending writes and
// rebuilds the filestore state for any block that reports lost data.
// Lock should be held.
func (fs *fileStore) checkAndFlushAllBlocks() {
	for _, mb := range fs.blks {
		if mb.pendingWriteSize() <= 0 {
			continue
		}
		// Since the fs lock is held, lock and flush the block separately from
		// the state rebuild in case one is needed.
		mb.mu.Lock()
		lost, _ := mb.flushPendingMsgsLocked()
		mb.mu.Unlock()

		if lost != nil {
			fs.rebuildStateLocked(lost)
		}
	}
}
// checkMsgs flushes all blocks, rebuilds the state of every block and the
// global per-subject info, and returns the filestore's lost stream data
// (fs.ld) afterwards.
func (fs *fileStore) checkMsgs() *LostStreamData {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	fs.checkAndFlushAllBlocks()

	// Start over with an empty global subject state.
	fs.psim = make(map[string]*psi)

	for _, mb := range fs.blks {
		// Make sure encryption is loaded if needed for the block.
		fs.loadEncryptionForMsgBlock(mb)
		// FIXME(dlc) - check tombstones here too?
		ld, _, err := mb.rebuildState()
		if ld != nil && err != nil {
			// Rebuild fs state too.
			mb.fs.rebuildStateLocked(ld)
		}
		fs.populateGlobalPerSubjectInfo(mb)
	}
	return fs.ld
}
// enableForWriting opens (creating if needed) the block file for read/write
// and stores the descriptor in mb.mfd. Unless fip is set, the flush loop is
// spun up as well. It is a no-op if the file is already open.
// Returns errNoMsgBlk for a nil block.
// Lock should be held.
func (mb *msgBlock) enableForWriting(fip bool) error {
	switch {
	case mb == nil:
		return errNoMsgBlk
	case mb.mfd != nil:
		return nil
	}

	fd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
	if err != nil {
		return fmt.Errorf("error opening msg block file [%q]: %v", mb.mfn, err)
	}
	mb.mfd = fd

	if fip {
		return nil
	}
	// Spin up our flusher loop if needed.
	mb.spinUpFlushLoop()
	return nil
}
// writeTombstone writes a delete tombstone for seq: an empty record whose
// sequence carries the tombstone bit. The record is flushed right away.
// Lock should be held.
func (mb *msgBlock) writeTombstone(seq uint64, ts int64) error {
	tseq := seq | tbit
	return mb.writeMsgRecord(emptyRecordLen, tseq, _EMPTY_, nil, nil, ts, true)
}
// Will write the message record to the underlying message block.
// The record is appended to the in-memory write cache (mb.cache.buf). It is only
// written to the file from here when flush is true or a previous write error is
// recorded in mb.werr, otherwise the flusher is kicked.
// rl is the total record length. seq may carry the ebit (erased) or tbit (tombstone) markers.
// Returns any error from enabling the block for writing, loading the per subject
// info, or flushing.
// filestore lock will be held.
func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush bool) error {
mb.mu.Lock()
defer mb.mu.Unlock()
// Enable for writing if our mfd is not open.
// Note that flush is passed as the fip argument here, so the flush loop is
// not spun up when we are flushing inline.
if mb.mfd == nil {
if err := mb.enableForWriting(flush); err != nil {
return err
}
}
// Make sure we have a cache setup.
if mb.cache == nil {
mb.setupWriteCache(nil)
}
// Check if we are tracking per subject for our simple state.
// Do this before changing the cache that would trigger a flush pending msgs call
// if we needed to regenerate the per subject info.
if len(subj) > 0 && !mb.noTrack {
if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
return err
}
if ss := mb.fss[subj]; ss != nil {
ss.Msgs++
ss.Last = seq
} else {
mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
}
}
// Indexing
// This is the position of the record relative to the start of the block,
// since the cache buffer may start at cache.off.
index := len(mb.cache.buf) + int(mb.cache.off)
// Formats
// Format with no header
// total_len(4) sequence(8) timestamp(8) subj_len(2) subj msg hash(8)
// With headers, high bit on total length will be set.
// total_len(4) sequence(8) timestamp(8) subj_len(2) subj hdr_len(4) hdr msg hash(8)
// First write header, etc.
var le = binary.LittleEndian
var hdr [msgHdrSize]byte
l := uint32(rl)
hasHeaders := len(mhdr) > 0
if hasHeaders {
l |= hbit
}
le.PutUint32(hdr[0:], l)
le.PutUint64(hdr[4:], seq)
le.PutUint64(hdr[12:], uint64(ts))
le.PutUint16(hdr[20:], uint16(len(subj)))
// Now write to underlying buffer.
mb.cache.buf = append(mb.cache.buf, hdr[:]...)
mb.cache.buf = append(mb.cache.buf, subj...)
if hasHeaders {
var hlen [4]byte
le.PutUint32(hlen[0:], uint32(len(mhdr)))
mb.cache.buf = append(mb.cache.buf, hlen[:]...)
mb.cache.buf = append(mb.cache.buf, mhdr...)
}
mb.cache.buf = append(mb.cache.buf, msg...)
// Calculate hash.
// The hash covers sequence and timestamp (hdr[4:20]), the subject, the
// message headers if any, and the message. The length fields are not included.
mb.hh.Reset()
mb.hh.Write(hdr[4:20])
mb.hh.Write([]byte(subj))
if hasHeaders {
mb.hh.Write(mhdr)
}
mb.hh.Write(msg)
checksum := mb.hh.Sum(nil)
// Grab last checksum
copy(mb.lchk[0:], checksum)
// Update write through cache.
// Write to msg record.
mb.cache.buf = append(mb.cache.buf, checksum...)
mb.cache.lrl = uint32(rl)
// Set cache timestamp for last store.
mb.lwts = ts
// Only update index and do accounting if not a delete tombstone.
if seq&tbit == 0 {
// Accounting, do this before stripping ebit, it is ebit aware.
mb.updateAccounting(seq, ts, rl)
// Strip ebit if set.
seq = seq &^ ebit
if mb.cache.fseq == 0 {
mb.cache.fseq = seq
}
// Write index
// The hbit is set on the slot, which cacheLookup treats as the checksum
// having been checked already.
mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit)
}
// Snapshot the flusher channel and any previous write error.
fch, werr := mb.fch, mb.werr
// If we should be flushing, or had a write error, do so here.
if flush || werr != nil {
ld, err := mb.flushPendingMsgsLocked()
if ld != nil && mb.fs != nil {
// We have the mb lock here, this needs the mb locks so do in its own go routine.
go mb.fs.rebuildState(ld)
}
if err != nil {
return err
}
} else {
// Kick the flusher here.
kickFlusher(fch)
}
return nil
}
// pendingWriteSize reports the number of bytes in the cache that have not
// been written to the block file yet. A nil block has nothing pending.
// Takes the block read lock.
func (mb *msgBlock) pendingWriteSize() int {
	if mb == nil {
		return 0
	}
	mb.mu.RLock()
	pending := mb.pendingWriteSizeLocked()
	mb.mu.RUnlock()
	return pending
}
// pendingWriteSizeLocked reports the number of bytes in the cache buffer past
// the write pointer. It reports 0 for a nil or closed block, or when the
// block has no open file or no cache.
// Lock should be held.
func (mb *msgBlock) pendingWriteSizeLocked() int {
	if mb == nil || mb.closed || mb.mfd == nil || mb.cache == nil {
		return 0
	}
	return len(mb.cache.buf) - int(mb.cache.wp)
}
// closeFDs takes the block lock and closes the block's file descriptor,
// unless data is still pending, in which case errPendingData is returned.
func (mb *msgBlock) closeFDs() error {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	return mb.closeFDsLocked()
}
// closeFDsLocked closes the block's file descriptor if nothing is waiting to
// be written. Returns errPendingData when there are still pending bytes.
// Lock should be held.
func (mb *msgBlock) closeFDsLocked() error {
	pending, _ := mb.bytesPending()
	if len(pending) > 0 {
		return errPendingData
	}
	mb.closeFDsLockedNoCheck()
	return nil
}
// closeFDsLockedNoCheck closes and clears the block's file descriptor, if it
// is open, without checking for pending data.
// Lock should be held.
func (mb *msgBlock) closeFDsLockedNoCheck() {
	if mb.mfd == nil {
		return
	}
	mb.mfd.Close()
	mb.mfd = nil
}
// bytesPending returns the part of the cache buffer that has not been written
// to the underlying file yet, i.e. everything past the write pointer.
// Returns errNoPending if the block is nil, has no open file, or has nothing
// left to write, and errNoCache if there is no cache.
// Lock should be held.
func (mb *msgBlock) bytesPending() ([]byte, error) {
	switch {
	case mb == nil || mb.mfd == nil:
		return nil, errNoPending
	case mb.cache == nil:
		return nil, errNoCache
	case len(mb.cache.buf) <= mb.cache.wp:
		return nil, errNoPending
	}
	// At this point the buffer extends past the write pointer, so the
	// returned slice is never empty.
	return mb.cache.buf[mb.cache.wp:], nil
}
// blkSize returns the raw byte count tracked for this block (mb.rbytes),
// which includes deleted msgs etc. Takes the block read lock.
func (mb *msgBlock) blkSize() uint64 {
	mb.mu.RLock()
	defer mb.mu.RUnlock()

	return mb.rbytes
}
// updateAccounting updates the block's first/last markers and byte/message
// counters for a record that was just written.
//
// seq may carry the ebit, marking the record as erased. Erased records still
// advance the last sequence and count towards the raw bytes, but not towards
// the live bytes and message count. rl is the record length.
// Lock should be held.
func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) {
	isDeleted := seq&ebit != 0
	if isDeleted {
		seq = seq &^ ebit
	}

	// The first and last sequences are read with atomic loads by
	// selectMsgBlockWithIndex without the block lock being held, so all
	// accesses to them here need to be atomic as well.
	fseq := atomic.LoadUint64(&mb.first.seq)
	if (fseq == 0 || mb.first.ts == 0) && seq >= fseq {
		atomic.StoreUint64(&mb.first.seq, seq)
		mb.first.ts = ts
	}
	// Need atomics here for selectMsgBlock speed.
	atomic.StoreUint64(&mb.last.seq, seq)
	mb.last.ts = ts
	mb.rbytes += rl
	if !isDeleted {
		mb.bytes += rl
		mb.msgs++
	}
}
// writeMsgRecord stores a message in the last message block, rolling over to
// a new block first when there is no block yet or the current one holds
// messages and would grow past the configured block size.
// Returns the record length and any error. A message whose record length
// collides with the header bit is rejected with ErrMsgTooLarge.
// Lock should be held.
func (fs *fileStore) writeMsgRecord(seq uint64, ts int64, subj string, hdr, msg []byte) (uint64, error) {
	// Get size for this message.
	rl := fileStoreMsgSize(subj, hdr, msg)
	if rl&hbit != 0 {
		return 0, ErrMsgTooLarge
	}

	// Mark as dirty for stream state.
	fs.dirty++

	// Start from our current last message block.
	mb := fs.lmb
	needNewBlock := mb == nil || (mb.msgs > 0 && mb.blkSize()+rl > fs.fcfg.BlockSize)
	if needNewBlock {
		if mb != nil && fs.fcfg.Compression != NoCompression {
			// We've now reached the end of this message block, if we want
			// to compress blocks then now's the time to do it.
			go mb.recompressOnDiskIfNeeded()
		}
		nmb, err := fs.newMsgBlockForWrite()
		if err != nil {
			return 0, err
		}
		mb = nmb
	}

	// Ask msg block to store in write through cache.
	err := mb.writeMsgRecord(rl, seq, subj, hdr, msg, ts, fs.fip)
	return rl, err
}
// recompressOnDiskIfNeeded rewrites the block file on disk so that it uses the
// compression algorithm configured on the file store, decrypting before and
// re-encrypting after if the block has an encryption key.
//
// The new contents are written to a temporary file next to the original and
// renamed into place only once everything has succeeded. On any failure after
// the temporary file was created, it is closed and removed so that neither
// a file descriptor nor a stray file is left behind.
//
// Returns nil if the block already uses the configured algorithm or was
// rewritten successfully, otherwise an error describing the failing step.
func (mb *msgBlock) recompressOnDiskIfNeeded() error {
	// Wait for disk I/O slots to become available. This prevents us from
	// running away with system resources.
	<-dios
	defer func() {
		dios <- struct{}{}
	}()

	alg := mb.fs.fcfg.Compression
	mb.mu.Lock()
	defer mb.mu.Unlock()

	origFN := mb.mfn                    // The original message block on disk.
	tmpFN := mb.mfn + compressTmpSuffix // The compressed block will be written here.

	// Open up the file block and read in the entire contents into memory.
	// One of two things will happen:
	// 1. The block will be compressed already and have a valid metadata
	//    header, in which case we do nothing.
	// 2. The block will be uncompressed, in which case we will compress it
	//    and then write it back out to disk, reencrypting if necessary.
	origBuf, err := os.ReadFile(origFN)
	if err != nil {
		return fmt.Errorf("failed to read original block from disk: %w", err)
	}

	// If the block is encrypted then we will need to decrypt it before
	// doing anything. We always encrypt after compressing because then the
	// compression can be as efficient as possible on the raw data, whereas
	// the encrypted ciphertext will not compress anywhere near as well.
	// The block encryption also covers the optional compression metadata.
	if mb.bek != nil && len(origBuf) > 0 {
		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return err
		}
		mb.bek = bek
		mb.bek.XORKeyStream(origBuf, origBuf)
	}

	meta := &CompressionInfo{}
	if _, err := meta.UnmarshalMetadata(origBuf); err != nil {
		// An error is only returned here if there's a problem with parsing
		// the metadata. If the file has no metadata at all, no error is
		// returned and the algorithm defaults to no compression.
		return fmt.Errorf("failed to read existing metadata header: %w", err)
	}
	if meta.Algorithm == alg {
		// The block is already compressed with the chosen algorithm so there
		// is nothing else to do. This is not a common case, it is here only
		// to ensure we don't do unnecessary work in case something asked us
		// to recompress an already compressed block with the same algorithm.
		return nil
	} else if alg != NoCompression {
		// Decompress the block using the existing algorithm before we
		// recompress it with the new one.
		// NOTE(review): this branch keys off the target algorithm rather than
		// meta.Algorithm, and the buffer still includes any metadata header -
		// confirm that is what Decompress expects.
		if origBuf, err = meta.Algorithm.Decompress(origBuf); err != nil {
			return fmt.Errorf("failed to decompress original block: %w", err)
		}
	}

	// Rather than modifying the existing block on disk (which is a dangerous
	// operation if something goes wrong), create a new temporary file. We will
	// write out the new block here and then swap the files around afterwards
	// once everything else has succeeded correctly.
	tmpFD, err := os.OpenFile(tmpFN, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, defaultFilePerms)
	if err != nil {
		return fmt.Errorf("failed to create temporary file: %w", err)
	}

	// From here on every error path must release the temporary file.
	errorCleanup := func(err error) error {
		tmpFD.Close()
		os.Remove(tmpFN)
		return err
	}

	// The original buffer at this point is uncompressed, so we will now compress
	// it if needed. Note that if the selected algorithm is NoCompression, the
	// Compress function will just return the input buffer unmodified.
	cmpBuf, err := alg.Compress(origBuf)
	if err != nil {
		return errorCleanup(fmt.Errorf("failed to compress block: %w", err))
	}

	// We only need to write out the metadata header if compression is enabled.
	// If we're trying to uncompress the file on disk at this point, don't bother
	// writing metadata.
	if alg != NoCompression {
		meta := &CompressionInfo{
			Algorithm:    alg,
			OriginalSize: uint64(len(origBuf)),
		}
		cmpBuf = append(meta.MarshalMetadata(), cmpBuf...)
	}

	// Re-encrypt the block if necessary.
	if mb.bek != nil && len(cmpBuf) > 0 {
		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return errorCleanup(err)
		}
		mb.bek = bek
		mb.bek.XORKeyStream(cmpBuf, cmpBuf)
	}

	// Write the new block data (which might be compressed or encrypted) to the
	// temporary file.
	if n, err := tmpFD.Write(cmpBuf); err != nil {
		return errorCleanup(fmt.Errorf("failed to write to temporary file: %w", err))
	} else if n != len(cmpBuf) {
		return errorCleanup(fmt.Errorf("short write to temporary file (%d != %d)", n, len(cmpBuf)))
	}
	if err := tmpFD.Sync(); err != nil {
		return errorCleanup(fmt.Errorf("failed to sync temporary file: %w", err))
	}
	if err := tmpFD.Close(); err != nil {
		return errorCleanup(fmt.Errorf("failed to close temporary file: %w", err))
	}

	// Now replace the original file with the newly updated temp file.
	if err := os.Rename(tmpFN, origFN); err != nil {
		// The descriptor is already closed, only the file itself is left.
		os.Remove(tmpFN)
		return fmt.Errorf("failed to move temporary file into place: %w", err)
	}

	// Since the message block might be retained in memory, make sure the
	// compression algorithm is up-to-date, since this will be needed when
	// compacting or truncating.
	mb.cmp = alg
	return nil
}
// decompressIfNeeded inspects buf for a compression metadata header and
// returns the decompressed block contents. A buffer without a metadata header
// is returned as-is. An error is returned if the header can not be parsed or
// decompression fails.
func (mb *msgBlock) decompressIfNeeded(buf []byte) ([]byte, error) {
	var info CompressionInfo

	hdrLen, err := info.UnmarshalMetadata(buf)
	if err != nil {
		// There was a problem parsing the metadata header of the block.
		// A missing header is not reported as an error.
		return nil, err
	}
	if hdrLen == 0 {
		// No metadata bytes, so we assume the block is not compressed.
		return buf, nil
	}
	// Metadata was present so the contents are most likely compressed. Should
	// the metadata claim the block is uncompressed, the algorithm is expected
	// to hand back its input unmodified.
	return info.Algorithm.Decompress(buf[hdrLen:])
}
// syncBlocks syncs every message block that is marked as needing it, closes
// file descriptors of blocks that have been idle, re-arms the sync timer and,
// unless SyncAlways is set, syncs the stream state file.
// This is called from a timer and does nothing once the store is closed.
func (fs *fileStore) syncBlocks() {
	fs.mu.RLock()
	if fs.closed {
		fs.mu.RUnlock()
		return
	}
	// Work on a snapshot of the blocks so the fs lock is not held during I/O.
	blks := append([]*msgBlock(nil), fs.blks...)
	fs.mu.RUnlock()

	// syncOne does the actual sync for a single block, holding the block
	// lock for consistency.
	syncOne := func(mb *msgBlock) {
		mb.mu.Lock()
		defer mb.mu.Unlock()

		if mb.closed {
			return
		}
		if mb.needSync {
			// Flush anything that may be pending.
			if mb.pendingWriteSizeLocked() > 0 {
				mb.flushPendingMsgsLocked()
			}
			if mb.mfd != nil {
				mb.mfd.Sync()
			} else {
				// The file is not open, so open it just for the sync.
				fd, err := os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
				if err != nil {
					return
				}
				fd.Sync()
				fd.Close()
			}
			mb.needSync = false
		}
		// See if we can close FDs due to being idle.
		if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle {
			mb.dirtyCloseWithRemove(false)
		}
	}
	for _, mb := range blks {
		syncOne(mb)
	}

	fs.mu.Lock()
	fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks)
	stateFile := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	syncAlways := fs.fcfg.SyncAlways
	fs.mu.Unlock()

	if syncAlways {
		return
	}
	fd, _ := os.OpenFile(stateFile, os.O_RDWR, defaultFilePerms)
	if fd == nil {
		return
	}
	fd.Sync()
	fd.Close()
}
// selectMsgBlock returns the message block that should hold seq, or nil if
// seq is not covered by any block.
// Read lock should be held.
func (fs *fileStore) selectMsgBlock(seq uint64) *msgBlock {
	_, blk := fs.selectMsgBlockWithIndex(seq)
	return blk
}
// selectMsgBlockWithIndex returns the message block that should hold seq
// together with its position in fs.blks. It returns -1 and nil when seq is
// outside of the store's first/last range or no block matches.
// A linear scan is used for a small number of blocks, a binary search otherwise.
func (fs *fileStore) selectMsgBlockWithIndex(seq uint64) (int, *msgBlock) {
	// Check for out of range.
	if seq < fs.state.FirstSeq || seq > fs.state.LastSeq {
		return -1, nil
	}

	const linearThresh = 32
	lastIdx := len(fs.blks) - 1

	if lastIdx < linearThresh {
		// The first block whose last sequence is at or past seq wins.
		for i := range fs.blks {
			if blk := fs.blks[i]; seq <= atomic.LoadUint64(&blk.last.seq) {
				return i, blk
			}
		}
		return -1, nil
	}

	// Do a traditional binary search here since we know the blocks are sorted
	// by sequence, first and last.
	lo, hi := 0, lastIdx
	for lo <= hi {
		mid := (lo + hi) / 2
		blk := fs.blks[mid]
		// Right now these atomic loads do not factor in, so fine to leave.
		fseq, lseq := atomic.LoadUint64(&blk.first.seq), atomic.LoadUint64(&blk.last.seq)
		switch {
		case seq > lseq:
			lo = mid + 1
		case seq < fseq:
			hi = mid - 1
		default:
			return mid, blk
		}
	}
	return -1, nil
}
// selectMsgBlockForStart returns the first message block whose last message
// timestamp is at or after minTime, or nil if there is no such block.
// Takes the fs read lock and each inspected block's read lock.
func (fs *fileStore) selectMsgBlockForStart(minTime time.Time) *msgBlock {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	start := minTime.UnixNano()
	for i := range fs.blks {
		mb := fs.blks[i]

		mb.mu.RLock()
		lastTs := mb.last.ts
		mb.mu.RUnlock()

		if start <= lastTs {
			return mb
		}
	}
	return nil
}
// Index a raw msg buffer.
// Walks the records in buf and builds the slot index (offsets into the buffer)
// and, unless noTrack is set, the per subject state (fss).
// If a cache already exists, buf is appended to the existing cache buffer and
// indexing starts at the end of the previous contents.
// Returns errCorruptState if a record fails the sanity checks. In that case the
// cache buffer and index are left as they were.
// Lock should be held.
func (mb *msgBlock) indexCacheBuf(buf []byte) error {
var le = binary.LittleEndian
var fseq, pseq uint64
var idx []uint32
var index uint32
if mb.cache == nil {
// Approximation, may adjust below.
fseq = mb.first.seq
idx = make([]uint32, 0, mb.msgs)
mb.cache = &cache{}
} else {
fseq = mb.cache.fseq
idx = mb.cache.idx
if len(idx) == 0 {
idx = make([]uint32, 0, mb.msgs)
}
// Continue after what is already in the cache buffer.
index = uint32(len(mb.cache.buf))
buf = append(mb.cache.buf, buf...)
}
// Create FSS if we should track.
// NOTE(review): fss is recreated here even when appending to an existing
// cache, where only the appended records are walked below - confirm intended.
if !mb.noTrack {
mb.fss = make(map[string]*SimpleState)
}
lbuf := uint32(len(buf))
for index < lbuf {
// We need at least a full record header.
if index+msgHdrSize > lbuf {
return errCorruptState
}
hdr := buf[index : index+msgHdrSize]
rl, seq, slen := le.Uint32(hdr[0:]), le.Uint64(hdr[4:]), int(le.Uint16(hdr[20:]))
// Clear any headers bit that could be set.
rl &^= hbit
dlen := int(rl) - msgHdrSize
// Do some quick sanity checks here.
if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
// This means something is off.
// TODO(dlc) - Add into bad list?
return errCorruptState
}
// Check for tombstones which we can skip in terms of indexing.
if seq&tbit != 0 {
index += rl
continue
}
// Clear any erase bits.
erased := seq&ebit != 0
seq = seq &^ ebit
// We defer checksum checks to individual msg cache lookups to amortize costs and
// not introduce latency for first message from a newly loaded block.
if seq >= mb.first.seq {
// Track that we do not have holes.
// Not expected but did see it in the field.
// Missing sequences get a dbit slot and are added to the delete map.
if pseq > 0 && seq != pseq+1 {
for dseq := pseq + 1; dseq < seq; dseq++ {
idx = append(idx, dbit)
mb.dmap.Insert(dseq)
}
}
pseq = seq
// Add to our index.
idx = append(idx, index)
mb.cache.lrl = uint32(rl)
// Adjust if we guessed wrong.
if seq != 0 && seq < fseq {
fseq = seq
}
// Handle FSS inline here.
// Erased and deleted messages do not count towards subject state.
if slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) {
bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)]
if ss := mb.fss[string(bsubj)]; ss != nil {
ss.Msgs++
ss.Last = seq
} else {
subj := mb.subjString(bsubj)
mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
}
}
}
index += rl
}
// Everything indexed here is treated as already written, so the write
// pointer is advanced by the full buffer length.
mb.cache.buf = buf
mb.cache.idx = idx
mb.cache.fseq = fseq
mb.cache.wp += int(lbuf)
return nil
}
// flushPendingMsgs takes the block lock and writes out any pending messages
// for this message block. If the flush reports lost data and the block has a
// filestore, the filestore state is rebuilt after the block lock is released.
// Returns the error from the flush, if any.
func (mb *msgBlock) flushPendingMsgs() error {
	mb.mu.Lock()
	lost, err := mb.flushPendingMsgsLocked()
	fs := mb.fs
	mb.mu.Unlock()

	// Lost data signals us that we need to rebuild filestore state too.
	if lost == nil || fs == nil {
		return err
	}
	fs.rebuildState(lost)
	return err
}
// writeAt writes buf to the block file at offset woff and returns the number
// of bytes written. When mb.mockWriteErr is set, nothing is written, the flag
// is reset and a mock error is returned instead.
// mb.mfd should not be nil.
// Lock should be held.
func (mb *msgBlock) writeAt(buf []byte, woff int64) (int, error) {
	if !mb.mockWriteErr {
		return mb.mfd.WriteAt(buf, woff)
	}
	// Used to mock write failures, reset on trip.
	mb.mockWriteErr = false
	return 0, errors.New("mock write error")
}
// flushPendingMsgsLocked writes out any messages for this message block.
// The pending bytes of the cache buffer are written to the block file at the
// current write offset, encrypting into a separate buffer first if needed.
// On a write error the block is closed, its state is rebuilt and the resulting
// lost data is returned together with the error, which is also kept in mb.werr.
// Having nothing to write is not an error.
// Lock should be held.
func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) {
// Signals us that we need to rebuild filestore state.
// This is never assigned below, so the non error returns always carry nil.
var fsLostData *LostStreamData
if mb.cache == nil || mb.mfd == nil {
return nil, nil
}
buf, err := mb.bytesPending()
// If we got an error back return here.
if err != nil {
// No pending data to be written is not an error.
if err == errNoPending || err == errNoCache {
err = nil
}
return nil, err
}
// File offset to write at, the cache buffer starts at cache.off within the block.
woff := int64(mb.cache.off + mb.cache.wp)
lob := len(buf)
// TODO(dlc) - Normally we would not hold the lock across I/O so we can improve performance.
// We will hold to stabilize the code base, as we have had a few anomalies with partial cache errors
// under heavy load.
// Check if we need to encrypt.
if mb.bek != nil && lob > 0 {
// Need to leave original alone.
var dst []byte
if lob <= defaultLargeBlockSize {
dst = getMsgBlockBuf(lob)[:lob]
} else {
dst = make([]byte, lob)
}
mb.bek.XORKeyStream(dst, buf)
buf = dst
}
// Append new data to the message block file.
// Loop to handle partial writes, continuing with the remainder.
for lbb := lob; lbb > 0; lbb = len(buf) {
n, err := mb.writeAt(buf, woff)
if err != nil {
mb.dirtyCloseWithRemove(false)
ld, _, _ := mb.rebuildStateLocked()
mb.werr = err
return ld, err
}
// Update our write offset.
woff += int64(n)
// Partial write.
if n != lbb {
buf = buf[n:]
} else {
// Done.
break
}
}
// Clear any error.
mb.werr = nil
// Cache may be gone.
if mb.cache == nil || mb.mfd == nil {
return fsLostData, mb.werr
}
// Check if we are in sync always mode.
if mb.syncAlways {
mb.mfd.Sync()
} else {
mb.needSync = true
}
// Check for additional writes while we were writing to the disk.
moreBytes := len(mb.cache.buf) - mb.cache.wp - lob
// Decide what we want to do with the buffer in hand. If we have load interest
// we will hold onto the whole thing, otherwise empty the buffer, possibly reusing it.
// Load interest means the last load was within the cache expiration window.
if ts := time.Now().UnixNano(); ts < mb.llts || (ts-mb.llts) <= int64(mb.cexp) {
mb.cache.wp += lob
} else {
// Keep the backing array around if it is small enough, otherwise recycle it.
if cap(mb.cache.buf) <= maxBufReuse {
buf = mb.cache.buf[:0]
} else {
recycleMsgBlockBuf(mb.cache.buf)
buf = nil
}
if moreBytes > 0 {
// Carry over the bytes that were not part of this flush.
nbuf := mb.cache.buf[len(mb.cache.buf)-moreBytes:]
if moreBytes > (len(mb.cache.buf)/4*3) && cap(nbuf) <= maxBufReuse {
buf = nbuf
} else {
buf = append(buf, nbuf...)
}
}
// Update our cache offset.
mb.cache.off = int(woff)
// Reset write pointer.
mb.cache.wp = 0
// Place buffer back in the cache structure.
mb.cache.buf = buf
// Mark fseq to 0
mb.cache.fseq = 0
}
return fsLostData, mb.werr
}
// clearLoading resets the flag that marks a load as being in progress.
// Lock should be held.
func (mb *msgBlock) clearLoading() {
	mb.loading = false
}
// loadMsgs loads the block's messages from disk into the cache.
// The block lock is held for the whole load by design.
func (mb *msgBlock) loadMsgs() error {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	return mb.loadMsgsWithLock()
}
// cacheAlreadyLoaded reports whether the cache holds the complete block: it
// has to exist, start at offset 0, have a first sequence and a non-empty
// buffer, and its index has to contain exactly as many slots as the block
// accounts for (messages, deleted entries, and sequences between the cache's
// first sequence and the block's first sequence).
// Lock should be held.
func (mb *msgBlock) cacheAlreadyLoaded() bool {
	c := mb.cache
	if c == nil || c.off != 0 || c.fseq == 0 || len(c.buf) == 0 {
		return false
	}
	expected := mb.msgs + uint64(mb.dmap.Size()) + (mb.first.seq - c.fseq)
	return uint64(len(c.idx)) == expected
}
// cacheNotLoaded is the negation of cacheAlreadyLoaded.
// Lock should be held.
func (mb *msgBlock) cacheNotLoaded() bool {
	loaded := mb.cacheAlreadyLoaded()
	return !loaded
}
// loadBlock reads the raw contents of the block file into buf and returns the
// bytes read. When buf is nil, or too small for the file, a buffer is taken
// from the pool or allocated. On a successful read mb.rbytes is updated.
// Returns errNoBlkData if the file does not exist and errMsgBlkTooBig if the
// file size does not fit into an int.
// Lock should be held and all conditionals satisfied prior.
func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) {
	f, err := os.Open(mb.mfn)
	if err != nil {
		if os.IsNotExist(err) {
			err = errNoBlkData
		}
		return nil, err
	}
	defer f.Close()

	// Work out how much we have to read. If the stat fails the size stays 0.
	var sz int
	if info, err := f.Stat(); err == nil {
		sz64 := info.Size()
		if int64(int(sz64)) != sz64 {
			return nil, errMsgBlkTooBig
		}
		sz = int(sz64)
	}

	if buf == nil {
		buf = getMsgBlockBuf(sz)
		if sz > cap(buf) {
			// We know we will make a new one so just recycle for now.
			recycleMsgBlockBuf(buf)
			buf = nil
		}
	}

	if cap(buf) >= sz {
		buf = buf[:sz]
	} else {
		buf = make([]byte, sz)
	}

	n, err := io.ReadFull(f, buf)
	if err == nil {
		// On success capture raw bytes size.
		mb.rbytes = uint64(n)
	}
	return buf[:n], err
}
// loadMsgsWithLock loads the whole block from disk into the cache and indexes it.
// Any pending writes are flushed first. The contents are decrypted and
// decompressed as needed before indexing. If indexing detects corruption the
// block state is rebuilt and the load is retried, up to a fixed number of checks.
// Returns nil if the cache is already fully loaded or a load is already in progress.
// Lock should be held.
func (mb *msgBlock) loadMsgsWithLock() error {
// Check for encryption, we do not load keys on startup anymore so might need to load them here.
if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) {
if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil {
return err
}
}
// Check to see if we are loading already.
if mb.loading {
return nil
}
// Set loading status.
mb.loading = true
defer mb.clearLoading()
// Bound the number of times we come back to the label below.
var nchecks int
checkCache:
nchecks++
if nchecks > 8 {
return errCorruptState
}
// Check to see if we have a full cache.
if mb.cacheAlreadyLoaded() {
return nil
}
// Record load activity.
mb.llts = time.Now().UnixNano()
// FIXME(dlc) - We could be smarter here.
// Flush anything pending first and then check the cache again.
if buf, _ := mb.bytesPending(); len(buf) > 0 {
ld, err := mb.flushPendingMsgsLocked()
if ld != nil && mb.fs != nil {
// We do not know if fs is locked or not at this point.
// This should be an exceptional condition so do so in Go routine.
go mb.fs.rebuildState(ld)
}
if err != nil {
return err
}
goto checkCache
}
// Load in the whole block.
// We want to hold the mb lock here to avoid any changes to state.
buf, err := mb.loadBlock(nil)
if err != nil {
if err == errNoBlkData {
// The err declared here shadows the outer one, so the load error is
// what gets returned below.
if ld, _, err := mb.rebuildStateLocked(); err != nil && ld != nil {
// Rebuild fs state too.
go mb.fs.rebuildState(ld)
}
}
return err
}
// Reset the cache since we just read everything in.
// Make sure this is cleared in case we had a partial when we started.
mb.clearCacheAndOffset()
// Check if we need to decrypt.
if mb.bek != nil && len(buf) > 0 {
// A fresh block encryption key is generated before decrypting.
bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
if err != nil {
return err
}
mb.bek = bek
mb.bek.XORKeyStream(buf, buf)
}
// Check for compression.
if buf, err = mb.decompressIfNeeded(buf); err != nil {
return err
}
if err := mb.indexCacheBuf(buf); err != nil {
if err == errCorruptState {
var ld *LostStreamData
// Note this assigns to err, so a successful rebuild clears it.
if ld, _, err = mb.rebuildStateLocked(); ld != nil {
// We do not know if fs is locked or not at this point.
// This should be an exceptional condition so do so in Go routine.
go mb.fs.rebuildState(ld)
}
}
if err != nil {
return err
}
// The rebuild succeeded, so try again from the top.
goto checkCache
}
// Only count the load and start the expiration timer if we loaded data.
if len(buf) > 0 {
mb.cloads++
mb.startCacheExpireTimer()
}
return nil
}
// fetchMsg returns the message for seq from this block, loading the block
// into the cache first if that has not happened yet.
// The returned bool reports whether seq is the block's last sequence and also
// the last sequence read in order, meaning the cache is a candidate for expiry.
// We assume the block was selected and is correct, so no range checks are done.
func (mb *msgBlock) fetchMsg(seq uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return nil, false, err
		}
	}

	msg, err := mb.cacheLookup(seq, sm)
	if err != nil {
		return nil, false, err
	}

	atEndOfScan := mb.last.seq == seq && mb.llseq == seq
	return msg, atEndOfScan, nil
}
// Sentinel errors used by the file store. They are compared by identity.
var (
// Returned when a cache is required but the block has none loaded.
errNoCache = errors.New("no message cache")
// Returned when a record fails its sanity or checksum checks.
errBadMsg = errors.New("malformed or corrupt message")
// Returned when the requested message has been deleted or erased.
errDeletedMsg = errors.New("deleted message")
// Returned when the cache does not cover the requested message.
errPartialCache = errors.New("partial cache")
// Returned when there is nothing waiting to be written.
errNoPending = errors.New("message block does not have pending data")
errNotReadable = errors.New("storage directory not readable")
errCorruptState = errors.New("corrupt state file")
errPriorState = errors.New("prior state file")
// Returned when file descriptors can not be closed due to unwritten data.
errPendingData = errors.New("pending data still present")
errNoEncryption = errors.New("encryption not enabled")
errBadKeySize = errors.New("encryption bad key size")
// Returned when an operation is attempted on a nil message block.
errNoMsgBlk = errors.New("no message block")
// Returned when a block file size does not fit into an int.
errMsgBlkTooBig = errors.New("message block size exceeded int capacity")
errUnknownCipher = errors.New("unknown cipher")
errNoMainKey = errors.New("encrypted store encountered with no main key")
// Returned when the block file does not exist on disk.
errNoBlkData = errors.New("message block data missing")
)
// Marker bits that are folded into record lengths, sequences and index slots.
const (
// Used on a cache index slot to mark that the message has had its checksum checked.
// Also used on a record's total length to signal a message record with headers.
hbit = 1 << 31
// Used for marking erased messages sequences.
ebit = 1 << 63
// Used for marking tombstone sequences.
tbit = 1 << 62
// Used to mark a bad index as deleted.
// Placed into the cache index for sequences that are missing from the block.
dbit = 1 << 30
)
// Will do a lookup from cache.
// Returns the message for seq, parsed into sm if provided.
// Returns ErrStoreMsgNotFound if seq is outside of the block, errDeletedMsg for
// deleted or erased messages, errNoCache if no cache is loaded, and
// errPartialCache if the cache does not cover seq.
// The checksum is verified the first time a message is looked up.
// Lock should be held.
func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
if seq < mb.first.seq || seq > mb.last.seq {
return nil, ErrStoreMsgNotFound
}
// If we have a delete map check it.
// A hit on a deleted message still counts as load activity.
if mb.dmap.Exists(seq) {
mb.llts = time.Now().UnixNano()
return nil, errDeletedMsg
}
// Detect no cache loaded.
if mb.cache == nil || mb.cache.fseq == 0 || len(mb.cache.idx) == 0 || len(mb.cache.buf) == 0 {
return nil, errNoCache
}
// Check partial cache status.
if seq < mb.cache.fseq {
return nil, errPartialCache
}
// The index slot for a sequence is its distance from the cache's first sequence.
bi, _, hashChecked, err := mb.slotInfo(int(seq - mb.cache.fseq))
if err != nil {
return nil, err
}
// Update cache activity.
mb.llts = time.Now().UnixNano()
// The llseq signals us when we can expire a cache at the end of a linear scan.
// We want to only update when we know the last reads (multiple consumers) are sequential.
if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 {
mb.llseq = seq
}
// Translate the block offset into an offset into the cache buffer.
li := int(bi) - mb.cache.off
if li >= len(mb.cache.buf) {
return nil, errPartialCache
}
buf := mb.cache.buf[li:]
// We use the high bit to denote we have already checked the checksum.
var hh hash.Hash64
if !hashChecked {
hh = mb.hh // This will force the hash check in msgFromBuf.
}
// Parse from the raw buffer.
fsm, err := mb.msgFromBuf(buf, sm, hh)
if err != nil || fsm == nil {
return nil, err
}
// Deleted messages that are decoded return a 0 for sequence.
if fsm.seq == 0 {
return nil, errDeletedMsg
}
// The index pointed us at a different message, so drop the cache buffer.
if seq != fsm.seq {
recycleMsgBlockBuf(mb.cache.buf)
mb.cache.buf = nil
return nil, fmt.Errorf("sequence numbers for cache load did not match, %d vs %d", seq, fsm.seq)
}
// Set the checked bit here after we know all is good.
if !hashChecked {
mb.cache.idx[seq-mb.cache.fseq] = (bi | hbit)
}
return fsm, nil
}
// sizeForSeq returns the stored size of the message at seq, or 0 if seq is 0
// or the message can not be found or fetched.
// Used when checking if discarding a message due to max msgs per subject will
// give us enough room for a max bytes condition.
// Lock should be already held.
func (fs *fileStore) sizeForSeq(seq uint64) int {
	if seq == 0 {
		return 0
	}
	mb := fs.selectMsgBlock(seq)
	if mb == nil {
		return 0
	}
	var smv StoreMsg
	sm, _, _ := mb.fetchMsg(seq, &smv)
	if sm == nil {
		return 0
	}
	return int(fileStoreMsgSize(sm.subj, sm.hdr, sm.msg))
}
// msgForSeq returns the message for the given sequence number, using sm for
// the result if provided. A seq of 0 asks for the first message in the store.
// Returns ErrStoreClosed if the store is closed, ErrStoreMsgNotFound if no
// block holds a sequence at or below the last sequence, and ErrStoreEOF if
// seq is past the last sequence.
func (fs *fileStore) msgForSeq(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	// TODO(dlc) - Since Store, Remove, Skip all hold the write lock on fs this will
	// be stalled. Need another lock if want to happen in parallel.
	fs.mu.RLock()
	if fs.closed {
		fs.mu.RUnlock()
		return nil, ErrStoreClosed
	}
	// Indicates we want first msg.
	if seq == 0 {
		seq = fs.state.FirstSeq
	}
	// Make sure to snapshot what we need while holding the lock.
	mb := fs.selectMsgBlock(seq)
	lmb, lseq := fs.lmb, fs.state.LastSeq
	fs.mu.RUnlock()

	if mb == nil {
		if seq <= lseq {
			return nil, ErrStoreMsgNotFound
		}
		return nil, ErrStoreEOF
	}

	fsm, expireOk, err := mb.fetchMsg(seq, sm)
	if err != nil {
		return nil, err
	}

	// We detected a linear scan and access to the last message.
	// If this is not the last message block we can try to expire the cache.
	if expireOk && mb != lmb {
		mb.tryForceExpireCache()
	}
	return fsm, nil
}
// msgFromBuf parses the message record at the start of buf into sm, allocating
// a new StoreMsg when sm is nil. The header and message bytes are copied so
// the result does not reference buf.
//
// If hh is not nil the record checksum is verified with it.
// Erased records are returned with a sequence of 0.
// Returns errBadMsg if the record is too short, its length fields are
// inconsistent (including a header length that does not fit into the record),
// or the checksum does not match.
// Lock should be held.
func (mb *msgBlock) msgFromBuf(buf []byte, sm *StoreMsg, hh hash.Hash64) (*StoreMsg, error) {
	if len(buf) < emptyRecordLen {
		return nil, errBadMsg
	}
	var le = binary.LittleEndian

	hdr := buf[:msgHdrSize]
	rl := le.Uint32(hdr[0:])
	hasHeaders := rl&hbit != 0
	rl &^= hbit // clear header bit
	dlen := int(rl) - msgHdrSize
	slen := int(le.Uint16(hdr[20:]))
	// Simple sanity check.
	if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || int(rl) > len(buf) {
		return nil, errBadMsg
	}
	data := buf[msgHdrSize : msgHdrSize+dlen]

	// End of the payload, which is where the trailing hash starts.
	end := dlen - 8

	// With headers there is a 4 byte header length after the subject. Make
	// sure both the length field and the headers it describes fit in front of
	// the hash, otherwise a corrupt record would panic in the slicing below.
	if hasHeaders {
		bi := slen + 4
		if bi > end || bi > dlen-recordHashSize {
			return nil, errBadMsg
		}
		if hl := le.Uint32(data[slen:]); uint64(hl) > uint64(end-bi) {
			return nil, errBadMsg
		}
	}

	// Do checksum tests here if requested.
	if hh != nil {
		hh.Reset()
		hh.Write(hdr[4:20])
		hh.Write(data[:slen])
		if hasHeaders {
			// The header length field itself is not part of the hash.
			hh.Write(data[slen+4 : dlen-recordHashSize])
		} else {
			hh.Write(data[slen : dlen-recordHashSize])
		}
		if !bytes.Equal(hh.Sum(nil), data[len(data)-8:]) {
			return nil, errBadMsg
		}
	}
	seq := le.Uint64(hdr[4:])
	if seq&ebit != 0 {
		seq = 0
	}
	ts := int64(le.Uint64(hdr[12:]))

	// Create a StoreMsg if needed.
	if sm == nil {
		sm = new(StoreMsg)
	} else {
		sm.clear()
	}
	// To recycle the large blocks we can never pass back a reference, so need to copy for the upper
	// layers and for us to be safe to expire, and recycle, the large msgBlocks.
	if hasHeaders {
		hl := le.Uint32(data[slen:])
		bi := slen + 4
		li := bi + int(hl)
		sm.buf = append(sm.buf, data[bi:end]...)
		// Rebase the offsets so they are relative to sm.buf.
		li, end = li-bi, end-bi
		sm.hdr = sm.buf[0:li:li]
		sm.msg = sm.buf[li:end]
	} else {
		sm.buf = append(sm.buf, data[slen:end]...)
		sm.msg = sm.buf[0 : end-slen]
	}
	sm.seq, sm.ts = seq, ts
	// Treat subject a bit different to not reference underlying buf.
	if slen > 0 {
		sm.subj = mb.subjString(data[:slen])
	}

	return sm, nil
}
// subjPool holds maps used to intern subject strings.
// Based on idea from https://github.com/josharian/intern/blob/master/intern.go
var subjPool = sync.Pool{
	New: func() any { return map[string]string{} },
}

// subjFromBytes returns an interned string for the given byte slice.
// The returned string never references b's memory.
func subjFromBytes(b []byte) string {
	interned := subjPool.Get().(map[string]string)
	defer subjPool.Put(interned)

	// The conversion inside the map index does not allocate.
	if s, found := interned[string(b)]; found {
		return s
	}
	s := string(b)
	interned[s] = s
	return s
}
// subjString returns the subject for the `skey` byte slice as a string.
// When skey matches one of the configured subjects that configured string is
// returned, otherwise an interned copy is returned, as to minimize memory allocations.
// A nil receiver or empty skey yields the empty string.
// Lock should be held.
func (fs *fileStore) subjString(skey []byte) string {
	if fs == nil || len(skey) == 0 {
		return _EMPTY_
	}
	for _, configured := range fs.cfg.Subjects {
		// The cast for the comparison does not make a copy.
		if string(skey) == configured {
			return configured
		}
	}
	return subjFromBytes(skey)
}
// subjString returns the subject for the `skey` byte slice as a string by
// delegating to the owning fileStore, which returns either a configured
// subject or an interned string as to minimize memory allocations.
// Lock should be held.
func (mb *msgBlock) subjString(skey []byte) string {
	fs := mb.fs
	return fs.subjString(skey)
}
// LoadMsg looks up the message stored at seq and returns it if found.
// It is a thin wrapper over msgForSeq; sm is handed through to it.
func (fs *fileStore) LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	msg, err := fs.msgForSeq(seq, sm)
	return msg, err
}
// loadLast will load the last message for a subject. Subject should be non empty and not ">".
// The subject may contain wildcards. sm is handed to cacheLookup for the result.
// Returns ErrStoreClosed if the store is closed or has no last block, and
// ErrStoreMsgNotFound if there are no blocks or a literal subject has no psim entry.
// If no block yields a matching sequence both return values are nil.
// Holds the fs read lock throughout and each block's lock while it is inspected.
func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
if fs.closed || fs.lmb == nil {
return nil, ErrStoreClosed
}
if len(fs.blks) == 0 {
return nil, ErrStoreMsgNotFound
}
// Default to walking from the last block index down to the first block index.
start, stop := fs.lmb.index, fs.blks[0].index
wc := subjectHasWildcard(subj)
// If literal subject check for presence.
if !wc {
if info := fs.psim[subj]; info == nil {
return nil, ErrStoreMsgNotFound
} else {
// Narrow the walk to the block range recorded for this subject.
start, stop = info.lblk, info.fblk
}
}
// Walk blocks backwards.
for i := start; i >= stop; i-- {
mb := fs.bim[i]
// Block indexes may have gaps, skip any that are not present.
if mb == nil {
continue
}
mb.mu.Lock()
if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
mb.mu.Unlock()
return nil, err
}
// l is the last matching sequence in this block, 0 if none was found.
var l uint64
// Optimize if subject is not a wildcard.
if !wc {
if ss := mb.fss[subj]; ss != nil {
l = ss.Last
}
}
if l == 0 {
_, _, l = mb.filteredPendingLocked(subj, wc, mb.first.seq)
}
if l > 0 {
if mb.cacheNotLoaded() {
if err := mb.loadMsgsWithLock(); err != nil {
mb.mu.Unlock()
return nil, err
}
}
lsm, err = mb.cacheLookup(l, sm)
}
mb.mu.Unlock()
// Stop at the first block, walking backwards, that reported a match.
if l > 0 {
break
}
}
return lsm, err
}
// LoadLastMsg will return the last message we have that matches a given subject.
// The subject can be a wildcard. An empty subject or the full wildcard selects
// the message at the store's last sequence.
// When no message is returned, or on any error other than ErrStoreClosed, the
// error reported is ErrStoreMsgNotFound.
func (fs *fileStore) LoadLastMsg(subject string, smv *StoreMsg) (sm *StoreMsg, err error) {
	matchAll := subject == _EMPTY_ || subject == fwcs
	if matchAll {
		sm, err = fs.msgForSeq(fs.lastSeq(), smv)
	} else {
		sm, err = fs.loadLast(subject, smv)
	}
	passThrough := err == nil || err == ErrStoreClosed
	if sm == nil || !passThrough {
		err = ErrStoreMsgNotFound
	}
	return sm, err
}
// LoadNextMsg returns the first message at or after start that matches filter,
// along with its sequence. wc indicates whether filter contains wildcards.
// A start below the first sequence is raised to the first sequence.
// Returns ErrStoreClosed when the store is closed and, when nothing matches,
// the store's last sequence together with ErrStoreEOF.
func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, uint64, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.closed {
		return nil, 0, ErrStoreClosed
	}
	if fs.state.FirstSeq > start {
		start = fs.state.FirstSeq
	}

	bi, _ := fs.selectMsgBlockWithIndex(start)
	if bi < 0 {
		return nil, fs.state.LastSeq, ErrStoreEOF
	}
	for i := bi; i < len(fs.blks); i++ {
		mb := fs.blks[i]
		found, expireOk, err := mb.firstMatching(filter, wc, start, sm)
		switch {
		case err == nil:
			// Only blocks other than the last one are candidates for cache expiration.
			if expireOk && mb != fs.lmb {
				mb.tryForceExpireCache()
			}
			return found, found.seq, nil
		case err != ErrStoreMsgNotFound:
			return nil, 0, err
		}
		// Not found in this block, move on to the next one.
	}
	return nil, fs.state.LastSeq, ErrStoreEOF
}
// Type reports the storage type of this store, which is always FileStorage.
func (fs *fileStore) Type() StorageType {
	var st StorageType = FileStorage
	return st
}
// numSubjects returns the number of subjects tracked in the per subject index map.
// Lock should be held.
func (fs *fileStore) numSubjects() int {
	n := len(fs.psim)
	return n
}
// FastState will fill in state with only the following.
// Msgs, Bytes, First and Last Sequence and Time and NumDeleted.
// Consumers and NumSubjects are filled in as well. NumDeleted is only written
// when the last sequence is greater than the first, and is never negative.
func (fs *fileStore) FastState(state *StreamState) {
	fs.mu.RLock()
	cur := &fs.state
	state.Msgs, state.Bytes = cur.Msgs, cur.Bytes
	state.FirstSeq, state.FirstTime = cur.FirstSeq, cur.FirstTime
	state.LastSeq, state.LastTime = cur.LastSeq, cur.LastTime
	if state.LastSeq > state.FirstSeq {
		// Whatever is missing from the full sequence span counts as deleted.
		state.NumDeleted = int((state.LastSeq - state.FirstSeq + 1) - state.Msgs)
		if state.NumDeleted < 0 {
			state.NumDeleted = 0
		}
	}
	state.Consumers, state.NumSubjects = len(fs.cfs), fs.numSubjects()
	fs.mu.RUnlock()
}
// State returns the current state of the stream.
// In addition to the summary fields this builds the full, sorted list of deleted
// sequences by walking every block, and fills in Lost from lostData.
func (fs *fileStore) State() StreamState {
fs.mu.RLock()
// Start from a copy of the tracked state.
state := fs.state
state.Consumers = len(fs.cfs)
state.NumSubjects = fs.numSubjects()
state.Deleted = nil // make sure.
// Anything missing from the full sequence span counts as deleted.
if numDeleted := int((state.LastSeq - state.FirstSeq + 1) - state.Msgs); numDeleted > 0 {
state.Deleted = make([]uint64, 0, numDeleted)
// cur tracks the sequence we expect the next block to start at.
cur := fs.state.FirstSeq
for _, mb := range fs.blks {
mb.mu.Lock()
fseq := mb.first.seq
// Account for messages missing from the head.
if fseq > cur {
for seq := cur; seq < fseq; seq++ {
state.Deleted = append(state.Deleted, seq)
}
}
cur = mb.last.seq + 1 // Expected next first.
mb.dmap.Range(func(seq uint64) bool {
// Entries below the block's first sequence are removed from the dmap rather than reported.
if seq < fseq {
mb.dmap.Delete(seq)
} else {
state.Deleted = append(state.Deleted, seq)
}
return true
})
mb.mu.Unlock()
}
}
fs.mu.RUnlock()
// Called after the fs lock has been released.
state.Lost = fs.lostData()
// Can not be guaranteed to be sorted.
if len(state.Deleted) > 0 {
sort.Slice(state.Deleted, func(i, j int) bool {
return state.Deleted[i] < state.Deleted[j]
})
state.NumDeleted = len(state.Deleted)
}
return state
}
// Utilization sums, across all message blocks, the raw bytes (total) and the
// accounted message bytes (reported). The returned error is always nil.
func (fs *fileStore) Utilization() (total, reported uint64, err error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	for i := range fs.blks {
		blk := fs.blks[i]
		blk.mu.RLock()
		total, reported = total+blk.rbytes, reported+blk.bytes
		blk.mu.RUnlock()
	}
	return total, reported, nil
}
// fileStoreMsgSize returns the size of the stored record for a message with the
// given subject, headers and payload.
func fileStoreMsgSize(subj string, hdr, msg []byte) uint64 {
	// length of the message record (4bytes) + seq(8) + ts(8) + subj_len(2) + subj + msg + hash(8)
	sz := 22 + len(subj) + len(msg) + 8
	if len(hdr) > 0 {
		// Records with headers also carry hdr_len(4) + hdr.
		sz += 4 + len(hdr)
	}
	return uint64(sz)
}
// fileStoreMsgSizeEstimate estimates a stored record size from a subject length
// and a maximum payload: an empty record plus the subject, 4 more bytes and the payload.
func fileStoreMsgSizeEstimate(slen, maxPayload int) uint64 {
	est := emptyRecordLen + slen + 4 + maxPayload
	return uint64(est)
}
// sinceLastWriteActivity reports the time elapsed since the most recent write
// or remove of a message in this block. A closed block reports 0.
// Read lock should be held.
func (mb *msgBlock) sinceLastWriteActivity() time.Duration {
	if mb.closed {
		return 0
	}
	// Use whichever of the write and remove timestamps is the most recent.
	latest := mb.lrts
	if mb.lwts >= latest {
		latest = mb.lwts
	}
	return time.Since(time.Unix(0, latest).UTC())
}
func checkNewHeader(hdr []byte) error {
if hdr == nil || len(hdr) < 2 || hdr[0] != magic ||
(hdr[1] != version && hdr[1] != newVersion) {
return errCorruptState
}
return nil
}
// readIndexInfo will read in the index information for the message block.
// It loads the block's index file, decrypts it when a key is set, validates the
// header, and then fills in msgs, bytes, first, last, the last checksum and the delete map.
// The index file is removed when the header is bad, the file is short, or the
// accounting is inconsistent, and an error is returned in each of those cases.
func (mb *msgBlock) readIndexInfo() error {
ifn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, mb.index))
buf, err := os.ReadFile(ifn)
if err != nil {
return err
}
// Set if first time.
if mb.liwsz == 0 {
mb.liwsz = int64(len(buf))
}
// Decrypt if needed.
if mb.aek != nil {
// Decrypts in place, reusing buf's storage for the plaintext.
buf, err = mb.aek.Open(buf[:0], mb.nonce, buf, nil)
if err != nil {
return err
}
}
if err := checkNewHeader(buf); err != nil {
defer os.Remove(ifn)
return fmt.Errorf("bad index file")
}
// bi is the read cursor into buf, starting just past the header.
bi := hdrLen
// Helpers, will set bi to -1 on error.
readSeq := func() uint64 {
if bi < 0 {
return 0
}
seq, n := binary.Uvarint(buf[bi:])
if n <= 0 {
bi = -1
return 0
}
bi += n
// Strip the ebit marker from the decoded value.
return seq &^ ebit
}
// Counts use the same encoding as sequences.
readCount := readSeq
readTimeStamp := func() int64 {
if bi < 0 {
return 0
}
ts, n := binary.Varint(buf[bi:])
if n <= 0 {
bi = -1
return -1
}
bi += n
return ts
}
// Fields are read in the order they appear in the index file.
mb.msgs = readCount()
mb.bytes = readCount()
mb.first.seq = readSeq()
mb.first.ts = readTimeStamp()
mb.last.seq = readSeq()
mb.last.ts = readTimeStamp()
dmapLen := readCount()
// Check if this is a short write index file.
if bi < 0 || bi+checksumSize > len(buf) {
os.Remove(ifn)
return fmt.Errorf("short index file")
}
// Check for consistency if accounting. If something is off bail and we will rebuild.
if mb.msgs != (mb.last.seq-mb.first.seq+1)-dmapLen {
os.Remove(ifn)
return fmt.Errorf("accounting inconsistent")
}
// Checksum
copy(mb.lchk[0:], buf[bi:bi+checksumSize])
bi += checksumSize
// Now check for presence of a delete map
if dmapLen > 0 {
// New version is encoded avl seqset.
if buf[1] == newVersion {
dmap, _, err := avl.Decode(buf[bi:])
if err != nil {
return fmt.Errorf("could not decode avl dmap: %v", err)
}
mb.dmap = *dmap
} else {
// This is the old version.
// Entries are stored as offsets from the block's first sequence.
for i := 0; i < int(dmapLen); i++ {
seq := readSeq()
if seq == 0 {
break
}
mb.dmap.Insert(seq + mb.first.seq)
}
}
}
return nil
}
// cacheLoads returns the total number of cache loads summed over all msg blocks.
func (fs *fileStore) cacheLoads() uint64 {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	var loads uint64
	for i := range fs.blks {
		loads += fs.blks[i].cloads
	}
	return loads
}
// cacheSize returns the total number of cached bytes summed over all msg blocks.
// Blocks without a cache contribute nothing.
func (fs *fileStore) cacheSize() uint64 {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	var cached uint64
	for i := range fs.blks {
		blk := fs.blks[i]
		blk.mu.RLock()
		if c := blk.cache; c != nil {
			cached += uint64(len(c.buf))
		}
		blk.mu.RUnlock()
	}
	return cached
}
// dmapEntries returns the total number of delete map entries across all msg blocks.
func (fs *fileStore) dmapEntries() int {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	entries := 0
	for i := range fs.blks {
		entries += fs.blks[i].dmap.Size()
	}
	return entries
}
// subjectsEqual is a fixed comparison helper used when iterating.
// It reports whether the two subjects are the same literal string.
func subjectsEqual(a, b string) bool {
	same := a == b
	return same
}
// subjectsAll is a comparison helper that matches every pair of subjects.
func subjectsAll(_, _ string) bool {
	return true
}
// compareFn selects the subject comparison function to use for a filter subject:
// match everything for an empty subject or the full wildcard, subset matching
// when the subject contains wildcards, and literal equality otherwise.
func compareFn(subject string) func(string, string) bool {
	switch {
	case subject == _EMPTY_, subject == fwcs:
		return subjectsAll
	case subjectHasWildcard(subject):
		return subjectIsSubsetMatch
	default:
		return subjectsEqual
	}
}
// PurgeEx will remove messages based on subject filters, sequence and number of messages to keep.
// Will return the number of purged messages.
//
// With an empty subject or the full wildcard this delegates to Purge when keep is 0 and
// sequence is 0 or 1, and to Compact when sequence is greater than 1.
// Otherwise matching messages are removed block by block. A sequence greater than 1
// limits removal to messages below it, and a keep greater than 0 limits the
// number purged so that keep matching messages remain.
// The storage update callback, if set, is invoked after the lock is released.
func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint64, err error) {
if subject == _EMPTY_ || subject == fwcs {
if keep == 0 && (sequence == 0 || sequence == 1) {
return fs.Purge()
}
if sequence > 1 {
return fs.Compact(sequence)
}
}
eq, wc := compareFn(subject), subjectHasWildcard(subject)
var firstSeqNeedsUpdate bool
// Total bytes purged. Note this local shadows the bytes package within this function.
var bytes uint64
// If we have a "keep" designation need to get full filtered state so we know how many to purge.
// maxp of 0 means there is no limit on the number purged.
var maxp uint64
if keep > 0 {
ss := fs.FilteredState(1, subject)
if keep >= ss.Msgs {
return 0, nil
}
maxp = ss.Msgs - keep
}
var smv StoreMsg
fs.mu.Lock()
// We may remove blocks as we purge, so don't range directly on fs.blks
// otherwise we may jump over some (see https://github.com/nats-io/nats-server/issues/3528)
for i := 0; i < len(fs.blks); i++ {
mb := fs.blks[i]
mb.mu.Lock()
// Blocks whose per subject info can not be loaded are skipped.
if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
mb.mu.Unlock()
continue
}
// t is the number of matches in this block, f and l the first and last matching sequences.
t, f, l := mb.filteredPendingLocked(subject, wc, mb.first.seq)
if t == 0 {
mb.mu.Unlock()
continue
}
var shouldExpire bool
if mb.cacheNotLoaded() {
// NOTE(review): the load error is ignored here; cacheLookup below is relied on to return no message in that case - confirm.
mb.loadMsgsWithLock()
shouldExpire = true
}
// Do not go past the requested sequence, which is exclusive.
if sequence > 1 && sequence <= l {
l = sequence - 1
}
for seq := f; seq <= l; seq++ {
if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && eq(sm.subj, subject) {
rl := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
// Do fast in place remove.
// Stats
if mb.msgs > 0 {
// Msgs
fs.state.Msgs--
mb.msgs--
// Bytes, make sure to not go negative.
if rl > fs.state.Bytes {
rl = fs.state.Bytes
}
if rl > mb.bytes {
rl = mb.bytes
}
fs.state.Bytes -= rl
mb.bytes -= rl
// Totals
purged++
bytes += rl
}
// FSS updates.
mb.removeSeqPerSubject(sm.subj, seq)
fs.removePerSubject(sm.subj)
// Check for first message.
if seq == mb.first.seq {
mb.selectNextFirst()
if mb.isEmpty() {
fs.removeMsgBlock(mb)
// The block was removed from fs.blks, so step the index back to not skip the next one.
i--
// keep flag set, if set previously
firstSeqNeedsUpdate = firstSeqNeedsUpdate || seq == fs.state.FirstSeq
} else if seq == fs.state.FirstSeq {
fs.state.FirstSeq = mb.first.seq // new one.
fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
}
} else {
// Out of order delete.
mb.dmap.Insert(seq)
}
if maxp > 0 && purged >= maxp {
break
}
}
}
// Expire if we were responsible for loading.
if shouldExpire {
// Expire this cache before moving on.
mb.tryForceExpireCacheLocked()
}
mb.mu.Unlock()
// Check if we should break out of top level too.
if maxp > 0 && purged >= maxp {
break
}
}
if firstSeqNeedsUpdate {
fs.selectNextFirst()
}
fs.dirty++
cb := fs.scb
fs.mu.Unlock()
fs.kickFlushStateLoop()
if cb != nil {
cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
}
return purged, nil
}
// Purge will remove all messages from this store.
// Will return the number of purged messages.
// It is equivalent to purge with no requested first sequence.
func (fs *fileStore) Purge() (uint64, error) {
	const noFirstSeq = 0
	return fs.purge(noFirstSeq)
}
// purge removes all messages from the store and returns the number purged.
// All blocks are closed, the msgs directory is moved aside and deleted in the
// background, and a fresh last block is created for writing.
// If fseq is greater than the resulting first sequence, the first sequence is set
// to fseq and the last sequence to fseq-1.
// Returns ErrStoreClosed if the store is closed. The storage update callback,
// if set, is invoked after the lock is released.
func (fs *fileStore) purge(fseq uint64) (uint64, error) {
fs.mu.Lock()
if fs.closed {
fs.mu.Unlock()
return 0, ErrStoreClosed
}
// Capture what we are about to remove for the return value and the callback.
purged := fs.state.Msgs
rbytes := int64(fs.state.Bytes)
// The store is now empty, with first positioned just past last.
fs.state.FirstSeq = fs.state.LastSeq + 1
fs.state.FirstTime = time.Time{}
fs.state.Bytes = 0
fs.state.Msgs = 0
for _, mb := range fs.blks {
mb.dirtyClose()
}
fs.blks = nil
fs.lmb = nil
fs.bim = make(map[uint32]*msgBlock)
// Clear any per subject tracking.
fs.psim = make(map[string]*psi)
// Mark dirty
fs.dirty++
// Move the msgs directory out of the way, will delete out of band.
// FIXME(dlc) - These can error and we need to change api above to propagate?
mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
// If purge directory still exists then we need to wait
// in place and remove since rename would fail.
if _, err := os.Stat(pdir); err == nil {
os.RemoveAll(pdir)
}
os.Rename(mdir, pdir)
go os.RemoveAll(pdir)
// Create new one.
os.MkdirAll(mdir, defaultDirPerms)
// Make sure we have a lmb to write to.
if _, err := fs.newMsgBlockForWrite(); err != nil {
fs.mu.Unlock()
return purged, err
}
// Check if we need to set the first seq to a new number.
if fseq > fs.state.FirstSeq {
fs.state.FirstSeq = fseq
fs.state.LastSeq = fseq - 1
}
// Seed the new last block with the store's sequence and time state.
lmb := fs.lmb
lmb.first.seq = fs.state.FirstSeq
lmb.last.seq = fs.state.LastSeq
lmb.last.ts = fs.state.LastTime.UnixNano()
if fs.lmb.last.seq > 1 {
// Leave a tombstone so we can remember our starting sequence in case
// full state becomes corrupted.
lmb.writeTombstone(fs.lmb.last.seq, fs.lmb.last.ts)
}
cb := fs.scb
fs.mu.Unlock()
if cb != nil {
cb(-int64(purged), -rbytes, 0, _EMPTY_)
}
return purged, nil
}
// Compact will remove all messages from this store up to
// but not including the seq parameter.
// Will return the number of purged messages.
//
// A seq of 0, or a seq past the last sequence, is handled as a full purge.
// Blocks entirely below seq are removed outright. The block containing seq has
// its leading messages removed and, when enough of it is dead space, its file
// is rewritten to reclaim the head. That rewrite is optimistic: an error there
// stops the rewrite but the accounting done so far is still applied.
// The storage update callback, if set, is invoked after the lock is released.
func (fs *fileStore) Compact(seq uint64) (uint64, error) {
	if seq == 0 {
		return fs.purge(seq)
	}

	// Note that bytes shadows the bytes package within this function.
	var purged, bytes uint64

	fs.mu.Lock()
	// Same as purge all.
	if lseq := fs.state.LastSeq; seq > lseq {
		fs.mu.Unlock()
		return fs.purge(seq)
	}
	// We have to delete interior messages.
	smb := fs.selectMsgBlock(seq)
	if smb == nil {
		fs.mu.Unlock()
		return 0, nil
	}

	// All msgblocks up to this one can be thrown away.
	var deleted int
	for _, mb := range fs.blks {
		if mb == smb {
			break
		}
		mb.mu.Lock()
		purged += mb.msgs
		bytes += mb.bytes
		// Make sure we do subject cleanup as well.
		mb.ensurePerSubjectInfoLoaded()
		for subj := range mb.fss {
			fs.removePerSubject(subj)
		}
		// Now close.
		mb.dirtyCloseWithRemove(true)
		mb.mu.Unlock()
		deleted++
	}

	var smv StoreMsg
	var err error
	var isEmpty bool

	smb.mu.Lock()
	// Nothing to remove from this block if seq is already its first sequence.
	if smb.first.seq == seq {
		goto SKIP
	}

	// Make sure we have the messages loaded.
	if smb.cacheNotLoaded() {
		if err = smb.loadMsgsWithLock(); err != nil {
			goto SKIP
		}
	}
	for mseq := smb.first.seq; mseq < seq; mseq++ {
		sm, err := smb.cacheLookup(mseq, &smv)
		if err == errDeletedMsg {
			// Update dmap. The entry to drop is the sequence we are walking,
			// which is now below the new first sequence, not the compaction target.
			if !smb.dmap.IsEmpty() {
				smb.dmap.Delete(mseq)
			}
		} else if sm != nil {
			sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
			if smb.msgs > 0 {
				smb.msgs--
				// Make sure to not go negative.
				if sz > smb.bytes {
					sz = smb.bytes
				}
				smb.bytes -= sz
				bytes += sz
				purged++
			}
			// Update fss
			smb.removeSeqPerSubject(sm.subj, mseq)
			fs.removePerSubject(sm.subj)
		}
	}

	// Check if empty after processing, could happen if tail of messages are all deleted.
	isEmpty = smb.msgs == 0
	if isEmpty {
		smb.dirtyCloseWithRemove(true)
		// Update fs first here as well.
		fs.state.FirstSeq = smb.last.seq + 1
		fs.state.FirstTime = time.Time{}
		deleted++
	} else {
		// Make sure to sync changes.
		smb.needSync = true
		// Update fs first seq and time.
		smb.first.seq = seq - 1 // Just for start condition for selectNextFirst.
		smb.selectNextFirst()

		fs.state.FirstSeq = smb.first.seq
		fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC()

		// Check if we should reclaim the head space from this block.
		// This will be optimistic only, so don't continue if we encounter any errors here.
		if smb.rbytes > compactMinimum && smb.bytes*2 < smb.rbytes {
			var moff uint32
			moff, _, _, err = smb.slotInfo(int(smb.first.seq - smb.cache.fseq))
			if err != nil || moff >= uint32(len(smb.cache.buf)) {
				goto SKIP
			}
			buf := smb.cache.buf[moff:]
			// Don't reuse, copy to new recycled buf.
			nbuf := getMsgBlockBuf(len(buf))
			nbuf = append(nbuf, buf...)
			smb.closeFDsLockedNoCheck()
			// Check for encryption.
			if smb.bek != nil && len(nbuf) > 0 {
				// Recreate to reset counter.
				bek, err := genBlockEncryptionKey(smb.fs.fcfg.Cipher, smb.seed, smb.nonce)
				if err != nil {
					goto SKIP
				}
				// For future writes make sure to set smb.bek to keep counter correct.
				smb.bek = bek
				smb.bek.XORKeyStream(nbuf, nbuf)
			}
			// Recompress if necessary (smb.cmp contains the algorithm used when
			// the block was loaded from disk, or defaults to NoCompression if not)
			if nbuf, err = smb.cmp.Compress(nbuf); err != nil {
				goto SKIP
			}
			if err = os.WriteFile(smb.mfn, nbuf, defaultFilePerms); err != nil {
				goto SKIP
			}
			// Make sure to remove fss state.
			smb.fss = nil
			smb.clearCacheAndOffset()
			smb.rbytes = uint64(len(nbuf))
		}
	}

SKIP:
	smb.mu.Unlock()

	if deleted > 0 {
		// Update block map.
		if fs.bim != nil {
			for _, mb := range fs.blks[:deleted] {
				delete(fs.bim, mb.index)
			}
		}
		// Update blks slice.
		fs.blks = copyMsgBlocks(fs.blks[deleted:])
		if lb := len(fs.blks); lb == 0 {
			fs.lmb = nil
		} else {
			fs.lmb = fs.blks[lb-1]
		}
	}

	// Update top level accounting.
	if purged > fs.state.Msgs {
		purged = fs.state.Msgs
	}
	fs.state.Msgs -= purged

	if bytes > fs.state.Bytes {
		bytes = fs.state.Bytes
	}
	fs.state.Bytes -= bytes

	fs.dirty++
	fs.kickFlushStateLoop()

	cb := fs.scb
	fs.mu.Unlock()

	if cb != nil && purged > 0 {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return purged, err
}
// reset will completely reset our store: every block is closed and removed and
// all sequence, message, byte, block and subject state is cleared.
// Returns ErrStoreClosed if the store is closed and ErrStoreSnapshotInProgress
// if a snapshot is in progress. The storage update callback, if set, is invoked
// after the lock is released.
func (fs *fileStore) reset() error {
	fs.mu.Lock()
	switch {
	case fs.closed:
		fs.mu.Unlock()
		return ErrStoreClosed
	case fs.sips > 0:
		fs.mu.Unlock()
		return ErrStoreSnapshotInProgress
	}

	cb := fs.scb
	var nmsgs, nbytes uint64
	for i := range fs.blks {
		blk := fs.blks[i]
		blk.mu.Lock()
		nmsgs, nbytes = nmsgs+blk.msgs, nbytes+blk.bytes
		blk.dirtyCloseWithRemove(true)
		blk.mu.Unlock()
	}

	// Reset sequences and times.
	fs.state.FirstSeq, fs.state.FirstTime = 0, time.Time{}
	fs.state.LastSeq, fs.state.LastTime = 0, time.Now().UTC()
	// Update msgs and bytes.
	fs.state.Msgs, fs.state.Bytes = 0, 0

	// Reset blocks.
	fs.blks, fs.lmb = nil, nil
	// Reset subject mappings and the block index map.
	fs.psim = make(map[string]*psi)
	fs.bim = make(map[uint32]*msgBlock)

	fs.mu.Unlock()

	if cb != nil {
		cb(-int64(nmsgs), -int64(nbytes), 0, _EMPTY_)
	}
	return nil
}
// Truncate will truncate a stream store up to seq. Sequence needs to be valid.
//
// A seq of 0 resets the store. Otherwise the block holding seq becomes the last
// block, is truncated after that message, and all later blocks are removed.
// Returns ErrStoreClosed if the store is closed, ErrStoreSnapshotInProgress if a
// snapshot is in progress, and ErrInvalidSequence if seq does not resolve to a
// stored message. Errors from making the block writeable or truncating it are
// returned as well. The storage update callback, if set, is invoked after the
// lock is released.
func (fs *fileStore) Truncate(seq uint64) error {
	// Check for request to reset.
	if seq == 0 {
		return fs.reset()
	}

	fs.mu.Lock()

	if fs.closed {
		fs.mu.Unlock()
		return ErrStoreClosed
	}
	if fs.sips > 0 {
		fs.mu.Unlock()
		return ErrStoreSnapshotInProgress
	}

	nlmb := fs.selectMsgBlock(seq)
	if nlmb == nil {
		fs.mu.Unlock()
		return ErrInvalidSequence
	}
	lsm, _, _ := nlmb.fetchMsg(seq, nil)
	if lsm == nil {
		fs.mu.Unlock()
		return ErrInvalidSequence
	}

	// Set lmb to nlmb and make sure writeable.
	fs.lmb = nlmb
	if err := nlmb.enableForWriting(fs.fip); err != nil {
		// Release the store lock on this path too, otherwise every later
		// operation on the store would block forever.
		fs.mu.Unlock()
		return err
	}

	// Note that bytes shadows the bytes package within this function.
	var purged, bytes uint64

	// Truncate our new last message block.
	nmsgs, nbytes, err := nlmb.truncate(lsm)
	if err != nil {
		fs.mu.Unlock()
		return fmt.Errorf("nlmb.truncate: %w", err)
	}
	// Account for the truncated msgs and bytes.
	purged += nmsgs
	bytes += nbytes

	// Remove any left over msg blocks.
	getLastMsgBlock := func() *msgBlock { return fs.blks[len(fs.blks)-1] }
	for mb := getLastMsgBlock(); mb != nlmb; mb = getLastMsgBlock() {
		mb.mu.Lock()
		purged += mb.msgs
		bytes += mb.bytes
		fs.removeMsgBlock(mb)
		mb.mu.Unlock()
	}

	// Reset last.
	fs.state.LastSeq = lsm.seq
	fs.state.LastTime = time.Unix(0, lsm.ts).UTC()
	// Update msgs and bytes, making sure to not go negative.
	if purged > fs.state.Msgs {
		purged = fs.state.Msgs
	}
	fs.state.Msgs -= purged
	if bytes > fs.state.Bytes {
		bytes = fs.state.Bytes
	}
	fs.state.Bytes -= bytes

	// Reset our subject lookup info.
	fs.resetGlobalPerSubjectInfo()

	fs.dirty++
	fs.kickFlushStateLoop()

	cb := fs.scb
	fs.mu.Unlock()

	if cb != nil {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return nil
}
// lastSeq returns the store's last sequence, read under the read lock.
func (fs *fileStore) lastSeq() uint64 {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	return fs.state.LastSeq
}
// numMsgBlocks returns the number of msg blks, read under the read lock.
func (fs *fileStore) numMsgBlocks() int {
	fs.mu.RLock()
	n := len(fs.blks)
	fs.mu.RUnlock()
	return n
}
// addMsgBlock appends mb to the list of blocks, makes it the last block and
// registers it in the block index map.
// Lock should be held.
func (fs *fileStore) addMsgBlock(mb *msgBlock) {
	fs.blks, fs.lmb = append(fs.blks, mb), mb
	fs.bim[mb.index] = mb
}
// removeMsgBlockFromList removes mb from our list of blks and from the block
// index map. It does nothing if mb is not in the list.
// Both locks should be held.
func (fs *fileStore) removeMsgBlockFromList(mb *msgBlock) {
	// Locate the block first.
	idx := -1
	for i := range fs.blks {
		if fs.blks[i] == mb {
			idx = i
			break
		}
	}
	if idx < 0 {
		return
	}
	// Splice it out and store a fresh copy of the remaining blocks.
	remaining := append(fs.blks[:idx], fs.blks[idx+1:]...)
	fs.blks = copyMsgBlocks(remaining)
	if fs.bim != nil {
		delete(fs.bim, mb.index)
	}
}
// removeMsgBlock closes and removes the msgBlock and takes it off our list.
// If it was the last message block, a new write block is created and a tombstone
// carrying the removed block's last sequence and timestamp is written to it.
// Both locks should be held. Note that the mb lock is released and re-acquired
// while the new write block is created.
func (fs *fileStore) removeMsgBlock(mb *msgBlock) {
	mb.dirtyCloseWithRemove(true)
	fs.removeMsgBlockFromList(mb)
	// Nothing more to do unless we were the last message block.
	if fs.lmb != mb {
		return
	}
	last := mb.last
	// Creating a new message write block requires that the lmb lock is not held.
	mb.mu.Unlock()
	// Write the tombstone to remember since this was last block.
	nlmb, _ := fs.newMsgBlockForWrite()
	if nlmb != nil {
		nlmb.writeTombstone(last.seq, last.ts)
	}
	mb.mu.Lock()
}
// dirtyClose is called by purge to simply get rid of the cache and close our fds.
// The block's files are left in place.
// Lock should not be held.
func (mb *msgBlock) dirtyClose() {
	const keepFiles = false
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.dirtyCloseWithRemove(keepFiles)
}
// dirtyCloseWithRemove stops the cache expiration timer, drops subject tracking
// and the cache, signals the block's loops to quit and closes the message file.
// When remove is true the message and key files are deleted as well and the
// state flusher is kicked. A nil receiver is a no-op.
// Should be called with lock held.
func (mb *msgBlock) dirtyCloseWithRemove(remove bool) {
	if mb == nil {
		return
	}
	// Stop cache expiration timer.
	if tmr := mb.ctmr; tmr != nil {
		tmr.Stop()
		mb.ctmr = nil
	}
	// Clear any tracking by subject.
	mb.fss = nil
	// Close cache
	mb.clearCacheAndOffset()
	// Quit our loops.
	if mb.qch != nil {
		close(mb.qch)
		mb.qch = nil
	}
	if fd := mb.mfd; fd != nil {
		fd.Close()
		mb.mfd = nil
	}
	if !remove {
		return
	}
	if mb.mfn != _EMPTY_ {
		os.Remove(mb.mfn)
		mb.mfn = _EMPTY_
	}
	if mb.kfn != _EMPTY_ {
		os.Remove(mb.kfn)
	}
	// Since we are removing a block kick the state flusher.
	mb.fs.kickFlushStateLoop()
}
// removeSeqPerSubject accounts for the removal of seq from the fss entry for subj.
// The entry is deleted when it held a single message. When exactly one message
// remains, first and last are collapsed onto it. Otherwise the first sequence is
// flagged for lazy recalculation if seq was the first.
// Lock should be held.
func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) {
	mb.ensurePerSubjectInfoLoaded()
	ss := mb.fss[subj]
	if ss == nil {
		return
	}
	switch ss.Msgs {
	case 1:
		// That was the only one.
		delete(mb.fss, subj)
	case 2:
		// Only one left.
		ss.Msgs = 1
		if seq == ss.Last {
			ss.Last = ss.First
		} else {
			ss.First = ss.Last
		}
		ss.firstNeedsUpdate = false
	default:
		ss.Msgs--
		// We can lazily calculate the first sequence when needed.
		ss.firstNeedsUpdate = ss.firstNeedsUpdate || seq == ss.First
	}
}
// Will recalculate the first sequence for this subject in this block.
// Will avoid slower path message lookups and scan the cache directly instead.
// The scan starts at the slot after startSeq and sets ss.First to the first
// record for subj that is not erased, not below the block's first sequence and
// not in the delete map. If the scan runs off the cache, ss.First is set to ss.Last.
// NOTE(review): callers presumably hold the block lock since loadMsgsWithLock is used - confirm.
func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *SimpleState) {
// Need to make sure messages are loaded.
if mb.cacheNotLoaded() {
if err := mb.loadMsgsWithLock(); err != nil {
return
}
}
// Mark first as updated.
ss.firstNeedsUpdate = false
// Begin with the sequence after startSeq.
startSeq++
startSlot := int(startSeq - mb.cache.fseq)
if startSlot >= len(mb.cache.idx) {
ss.First = ss.Last
return
} else if startSlot < 0 {
startSlot = 0
}
var le = binary.LittleEndian
for slot := startSlot; slot < len(mb.cache.idx); slot++ {
// Offset of this record within the cache buffer, with the hbit flag masked off.
li := int(mb.cache.idx[slot]&^hbit) - mb.cache.off
if li >= len(mb.cache.buf) {
ss.First = ss.Last
return
}
buf := mb.cache.buf[li:]
hdr := buf[:msgHdrSize]
slen := int(le.Uint16(hdr[20:]))
// The comparison against the converted bytes does not need a copy of the subject.
if subj == string(buf[msgHdrSize:msgHdrSize+slen]) {
seq := le.Uint64(hdr[4:])
// Skip records below our first sequence or flagged with the ebit.
if seq < mb.first.seq || seq&ebit != 0 {
continue
}
if mb.dmap.Exists(seq) {
continue
}
ss.First = seq
return
}
}
}
// resetGlobalPerSubjectInfo discards the global per subject index map and
// rebuilds it from every message block.
// Lock should be held.
func (fs *fileStore) resetGlobalPerSubjectInfo() {
	// Clear any global subject state.
	fs.psim = make(map[string]*psi)
	for i := range fs.blks {
		fs.populateGlobalPerSubjectInfo(fs.blks[i])
	}
}
// resetPerSubjectInfo drops the block's per subject state and regenerates it.
// Lock should be held.
func (mb *msgBlock) resetPerSubjectInfo() error {
	mb.fss = nil
	err := mb.generatePerSubjectInfo()
	return err
}
// generatePerSubjectInfo will generate the per subject info via the raw msg block.
// It walks every sequence from first to last and records, per subject, the
// message count and the first and last sequences.
// Returns nil without doing anything for an empty block, and nil early if the
// cache is not available during the walk.
// Lock should be held.
func (mb *msgBlock) generatePerSubjectInfo() error {
// Check if this mb is empty. This can happen when its the last one and we are holding onto it for seq and timestamp info.
if mb.msgs == 0 {
return nil
}
if mb.cacheNotLoaded() {
if err := mb.loadMsgsWithLock(); err != nil {
return err
}
// indexCacheBuf can produce fss now, so if non-nil we are good.
if mb.fss != nil {
return nil
}
}
// Create new one regardless.
mb.fss = make(map[string]*SimpleState)
var smv StoreMsg
fseq, lseq := mb.first.seq, mb.last.seq
for seq := fseq; seq <= lseq; seq++ {
sm, err := mb.cacheLookup(seq, &smv)
if err != nil {
// Since we are walking by sequence we can ignore some errors that are benign to rebuilding our state.
if err == ErrStoreMsgNotFound || err == errDeletedMsg {
continue
}
// Without a cache there is nothing more to walk, so stop without reporting an error.
if err == errNoCache {
return nil
}
return err
}
// Messages without a subject are not tracked.
if sm != nil && len(sm.subj) > 0 {
if ss := mb.fss[sm.subj]; ss != nil {
ss.Msgs++
ss.Last = seq
} else {
mb.fss[sm.subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
}
}
}
if len(mb.fss) > 0 {
// Make sure we run the cache expire timer.
mb.llts = time.Now().UnixNano()
mb.startCacheExpireTimer()
}
return nil
}
// ensurePerSubjectInfoLoaded is a helper to make sure fss is loaded if we are tracking.
// It is a no-op when fss is already present or tracking is disabled, installs an
// empty map for an empty block, and otherwise generates the info from the block.
// Lock should be held
func (mb *msgBlock) ensurePerSubjectInfoLoaded() error {
	switch {
	case mb.fss != nil, mb.noTrack:
		return nil
	case mb.msgs == 0:
		mb.fss = make(map[string]*SimpleState)
		return nil
	}
	return mb.generatePerSubjectInfo()
}
// populateGlobalPerSubjectInfo is called on recovery to populate the global psim
// state from a single block. For each non-empty subject in the block the total
// is increased and the last block index raised if needed, or a new entry is
// created with this block as both first and last.
// If the block's per subject info can not be loaded nothing is changed.
// Lock should be held.
func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	if mb.ensurePerSubjectInfoLoaded() != nil {
		return
	}
	// Now populate psim.
	for subj, ss := range mb.fss {
		if len(subj) == 0 {
			continue
		}
		info, ok := fs.psim[subj]
		if !ok {
			fs.psim[subj] = &psi{total: ss.Msgs, fblk: mb.index, lblk: mb.index}
			continue
		}
		info.total += ss.Msgs
		if mb.index > info.lblk {
			info.lblk = mb.index
		}
	}
}
// close closes the message block: the cache expiration timer is stopped, subject
// tracking and the cache are dropped, the block's loops are told to quit and the
// message file is closed, after a sync when requested.
// A nil receiver or an already closed block is a no-op.
func (mb *msgBlock) close(sync bool) {
	if mb == nil {
		return
	}
	mb.mu.Lock()
	defer mb.mu.Unlock()

	if mb.closed {
		return
	}
	// Stop cache expiration timer.
	if tmr := mb.ctmr; tmr != nil {
		tmr.Stop()
		mb.ctmr = nil
	}
	mb.fss = nil
	// Close cache
	mb.clearCacheAndOffset()
	// Quit our loops.
	if mb.qch != nil {
		close(mb.qch)
		mb.qch = nil
	}
	if fd := mb.mfd; fd != nil {
		if sync {
			fd.Sync()
		}
		fd.Close()
	}
	// Drop the file handle and mark as closed.
	mb.mfd, mb.closed = nil, true
}
// closeAllMsgBlocks closes every message block, syncing each one first when sync is true.
func (fs *fileStore) closeAllMsgBlocks(sync bool) {
	for i := range fs.blks {
		fs.blks[i].close(sync)
	}
}
// Delete purges and stops the store and removes its storage directory.
// If the store was already closed the directory is still removed, any remaining
// messages and bytes are reported to the storage update callback, and
// ErrStoreClosed is returned.
// Removal of the directory is retried for up to a second before the last error is returned.
func (fs *fileStore) Delete() error {
if fs.isClosed() {
// Always attempt to remove since we could have been closed beforehand.
os.RemoveAll(fs.fcfg.StoreDir)
// Since we did remove, if we did have anything remaining make sure to
// call into any storage updates that had been registered.
fs.mu.Lock()
cb, msgs, bytes := fs.scb, int64(fs.state.Msgs), int64(fs.state.Bytes)
// Guard against double accounting if called twice.
fs.state.Msgs, fs.state.Bytes = 0, 0
fs.mu.Unlock()
if msgs > 0 && cb != nil {
cb(-msgs, -bytes, 0, _EMPTY_)
}
return ErrStoreClosed
}
pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
// If purge directory still exists then we need to wait
// in place and remove since rename would fail.
if _, err := os.Stat(pdir); err == nil {
os.RemoveAll(pdir)
}
// Do Purge() since if we have lots of blocks uses a mv/rename.
// The result of the purge is not checked here.
fs.Purge()
if err := fs.Stop(); err != nil {
return err
}
err := os.RemoveAll(fs.fcfg.StoreDir)
if err == nil {
return nil
}
// The first attempt failed, keep retrying every 10ms for up to a second.
ttl := time.Now().Add(time.Second)
for time.Now().Before(ttl) {
time.Sleep(10 * time.Millisecond)
if err = os.RemoveAll(fs.fcfg.StoreDir); err == nil {
return nil
}
}
return err
}
// cancelSyncTimer stops and clears the sync timer if one is set.
// Lock should be held.
func (fs *fileStore) cancelSyncTimer() {
	tmr := fs.syncTmr
	if tmr == nil {
		return
	}
	tmr.Stop()
	fs.syncTmr = nil
}
// Header bytes for the full stream state written by writeFullState.
const (
// fullStateMagic is written as the first byte of the full state.
fullStateMagic = uint8(11)
// fullStateVersion is written as the second byte of the full state.
fullStateVersion = uint8(1)
)
// flushStreamStateLoop is the go routine that receives kicks to write out our
// full stream state index.
// This will get kicked when we create a new block or when we delete a block in general.
// This is also called during Stop().
// A receive on fch writes the full state. A receive on qch closes done and exits the loop.
func (fs *fileStore) flushStreamStateLoop(fch, qch, done chan struct{}) {
	for {
		select {
		case <-qch:
			close(done)
			return
		case <-fch:
			fs.writeFullState()
		}
	}
}
// kickFlushStateLoop kicks the flusher via the store's flush channel.
func (fs *fileStore) kickFlushStateLoop() {
	fch := fs.fch
	kickFlusher(fch)
}
// timestampNormalized returns t as nanoseconds since the Unix epoch, mapping the
// zero time to 0 since unixnano of zero time is undefined.
func timestampNormalized(t time.Time) int64 {
	var ns int64
	if !t.IsZero() {
		ns = t.UnixNano()
	}
	return ns
}
// This will write the full binary state for the stream.
// This plus everything new since last hash will be the total recovered state.
// This state dump will have the following.
// 1. Stream summary - Msgs, Bytes, First and Last (Sequence and Timestamp)
// 2. PSIM - Per Subject Index Map - Tracks first and last blocks with subjects present.
// 3. MBs - Index, Bytes, First and Last Sequence and Timestamps, and the deleted map (avl.seqset).
// 4. Last block index and hash of record inclusive to this stream state.
//
// It is a no-op returning nil when the store is closed or nothing is dirty.
// The state is encoded while holding fs.mu, the lock is released for the
// temp-file write, and it is re-acquired for the final rename.
func (fs *fileStore) writeFullState() error {
fs.mu.Lock()
if fs.closed || fs.dirty == 0 {
fs.mu.Unlock()
return nil
}
// Start encoding into a fixed 32KiB array; appends past that will grow the slice.
var _buf [32 * 1024]byte
_buf[0], _buf[1] = fullStateMagic, fullStateVersion
buf := _buf[:hdrLen]
// 1. Stream summary.
buf = binary.AppendUvarint(buf, fs.state.Msgs)
buf = binary.AppendUvarint(buf, fs.state.Bytes)
buf = binary.AppendUvarint(buf, fs.state.FirstSeq)
buf = binary.AppendVarint(buf, timestampNormalized(fs.state.FirstTime))
buf = binary.AppendUvarint(buf, fs.state.LastSeq)
buf = binary.AppendVarint(buf, timestampNormalized(fs.state.LastTime))
// Do per subject information map if applicable.
numSubjects := len(fs.psim)
buf = binary.AppendUvarint(buf, uint64(numSubjects))
if numSubjects > 0 {
for subj, psi := range fs.psim {
buf = binary.AppendUvarint(buf, uint64(len(subj)))
buf = append(buf, subj...)
buf = binary.AppendUvarint(buf, psi.total)
buf = binary.AppendUvarint(buf, uint64(psi.fblk))
// The last block is only written when the subject has more than one message.
if psi.total > 1 {
buf = binary.AppendUvarint(buf, uint64(psi.lblk))
}
}
}
// Now walk all blocks and write out first and last and optional dmap encoding.
var lbi uint32
var lchk [8]byte
nb := len(fs.blks)
buf = binary.AppendUvarint(buf, uint64(nb))
// Use basetime to save some space.
// Block timestamps are written as deltas from the stream's first timestamp.
baseTime := timestampNormalized(fs.state.FirstTime)
for _, mb := range fs.blks {
mb.mu.RLock()
buf = binary.AppendUvarint(buf, uint64(mb.index))
buf = binary.AppendUvarint(buf, mb.bytes)
buf = binary.AppendUvarint(buf, mb.first.seq)
buf = binary.AppendVarint(buf, mb.first.ts-baseTime)
buf = binary.AppendUvarint(buf, mb.last.seq)
buf = binary.AppendVarint(buf, mb.last.ts-baseTime)
numDeleted := mb.dmap.Size()
buf = binary.AppendUvarint(buf, uint64(numDeleted))
if numDeleted > 0 {
var scratch [8 * 1024]byte
// NOTE(review): the error from Encode is discarded here - confirm it cannot fail for a non-empty set.
dmap, _ := mb.dmap.Encode(scratch[:0])
buf = append(buf, dmap...)
}
// If this is the last one grab the last checksum and the block index, e.g. 22.blk, 22 is the block index.
// We use this to quickly open this file on recovery.
if mb == fs.lmb {
lbi = mb.index
// NOTE(review): called while only holding the block's read lock - confirm this helper does not mutate mb.
mb.ensureLastChecksumLoaded()
copy(lchk[0:], mb.lchk[:])
}
mb.mu.RUnlock()
}
// Place block index and hash onto the end.
buf = binary.AppendUvarint(buf, uint64(lbi))
buf = append(buf, lchk[:]...)
// Encrypt if needed.
if fs.prf != nil {
if err := fs.setupAEK(); err != nil {
fs.mu.Unlock()
return err
}
// The nonce is placed in front of the sealed data; capacity is reserved for the full output.
nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(buf)+fs.aek.Overhead())
// NOTE(review): the result of rand.Read is not checked.
rand.Read(nonce)
buf = fs.aek.Seal(nonce, nonce, buf, nil)
}
fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
// Append the hash of everything written so far (the ciphertext when encrypted).
fs.hh.Reset()
fs.hh.Write(buf)
buf = fs.hh.Sum(buf)
// Snapshot prior dirty count.
// Only this amount is subtracted later, so changes made while unlocked stay counted.
priorDirty := fs.dirty
// Release lock.
fs.mu.Unlock()
// Write to a tmp file and rename.
const tmpPre = streamStreamStateFile + tsep
f, err := os.CreateTemp(filepath.Join(fs.fcfg.StoreDir, msgDir), tmpPre)
if err != nil {
return err
}
tmpName := f.Name()
// Cleans up the temp file on every exit path; after a successful rename the name no longer exists.
defer os.Remove(tmpName)
// NOTE(review): errors from Sync and Close are not checked; only the Write error is returned.
if _, err = f.Write(buf); err == nil && fs.fcfg.SyncAlways {
f.Sync()
}
f.Close()
if err != nil {
return err
}
// Rename into position under our lock, clear prior dirty pending on success.
// If the store was closed in the meantime the rename is skipped and nil is returned.
fs.mu.Lock()
if !fs.closed {
if err := os.Rename(tmpName, fn); err != nil {
fs.mu.Unlock()
return err
}
fs.dirty -= priorDirty
}
fs.mu.Unlock()
return nil
}
// Stop the current filestore.
// Returns ErrStoreClosed if the store is already marked closed, otherwise nil.
// Blocks are flushed and closed, timers cancelled, the state flusher loop is
// shut down and a final full state is written before the store is marked closed.
// Afterwards every registered consumer store is stopped and the storage update
// callback, if set, is told that all bytes were removed.
func (fs *fileStore) Stop() error {
fs.mu.Lock()
if fs.closed {
fs.mu.Unlock()
return ErrStoreClosed
}
fs.checkAndFlushAllBlocks()
fs.closeAllMsgBlocks(false)
fs.cancelSyncTimer()
fs.cancelAgeChk()
// Release the state flusher loop.
close(fs.qch)
// Wait for the state flush loop to exit.
// The wait happens with fs.mu released.
fsld := fs.fsld
fs.mu.Unlock()
<-fsld
// Write full state if needed. If not dirty this is a no-op.
// This must happen before closed is set since writeFullState skips closed stores.
fs.writeFullState()
fs.mu.Lock()
// Mark as closed.
fs.closed = true
fs.lmb = nil
// We should update the upper usage layer on a stop.
cb, bytes := fs.scb, int64(fs.state.Bytes)
// Copy the consumer list so the consumers can be stopped without holding fs.mu.
var _cfs [256]ConsumerStore
cfs := append(_cfs[:0], fs.cfs...)
fs.cfs = nil
fs.mu.Unlock()
for _, o := range cfs {
o.Stop()
}
// Report the bytes as removed via a negative byte delta.
if bytes > 0 && cb != nil {
cb(0, -bytes, 0, _EMPTY_)
}
return nil
}
// errFile is the name of the tar entry streamSnapshot writes to report a failure.
const errFile = "errors.txt"
// Stream our snapshot through S2 compression and tar.
// The archive is written to w, which is always closed on return, and fs.sips
// is decremented on return. Entries are written in this order: the stream meta
// file and its checksum, the stream state file (when readable), every message
// block, and finally - when includeConsumers is set - each file based
// consumer's meta, checksum and state. Failures while gathering data are
// reported by writing an errFile entry into the archive and stopping.
// NOTE(review): the state parameter is never read; it is shadowed inside the consumer loop.
func (fs *fileStore) streamSnapshot(w io.WriteCloser, state *StreamState, includeConsumers bool) {
// Deferred calls run in reverse: tar writer, then the S2 encoder, then w itself.
defer w.Close()
enc := s2.NewWriter(w)
defer enc.Close()
tw := tar.NewWriter(enc)
defer tw.Close()
// Clear the snapshot-in-progress marker.
defer func() {
fs.mu.Lock()
fs.sips--
fs.mu.Unlock()
}()
modTime := time.Now().UTC()
// writeFile adds a single tar entry holding buf under the given name.
writeFile := func(name string, buf []byte) error {
hdr := &tar.Header{
Name: name,
Mode: 0600,
ModTime: modTime,
Uname: "nats",
Gname: "nats",
Size: int64(len(buf)),
Format: tar.FormatPAX,
}
if err := tw.WriteHeader(hdr); err != nil {
return err
}
if _, err := tw.Write(buf); err != nil {
return err
}
return nil
}
// writeErr records an error message in the archive; its own write error is ignored.
writeErr := func(err string) {
writeFile(errFile, []byte(err))
}
fs.mu.Lock()
blks := fs.blks
// Grab our general meta data.
// We do this now instead of pulling from files since they could be encrypted.
meta, err := json.Marshal(fs.cfg)
if err != nil {
fs.mu.Unlock()
writeErr(fmt.Sprintf("Could not gather stream meta file: %v", err))
return
}
hh := fs.hh
hh.Reset()
hh.Write(meta)
sum := []byte(hex.EncodeToString(fs.hh.Sum(nil)))
fs.mu.Unlock()
// Meta first.
if writeFile(JetStreamMetaFile, meta) != nil {
return
}
if writeFile(JetStreamMetaFileSum, sum) != nil {
return
}
// Can't use join path here, tar only recognizes relative paths with forward slashes.
msgPre := msgDir + "/"
var bbuf []byte
// State files shorter than this are skipped.
const minLen = 32
sfn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
if buf, err := os.ReadFile(sfn); err == nil && len(buf) >= minLen {
// NOTE(review): fs.aek and the shared hasher are used here without holding fs.mu,
// while writeFullState uses fs.hh under that lock - confirm this cannot race.
if fs.aek != nil {
ns := fs.aek.NonceSize()
// Layout is nonce, sealed data, then the trailing hash which is excluded from decryption.
buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:len(buf)-highwayhash.Size64], nil)
if err == nil {
// Redo hash checksum at end on plaintext.
hh.Reset()
hh.Write(buf)
buf = fs.hh.Sum(buf)
}
}
// A state file that fails to decrypt is silently left out of the archive.
if err == nil && writeFile(msgPre+streamStreamStateFile, buf) != nil {
return
}
}
// Now do messages themselves.
for _, mb := range blks {
if mb.pendingWriteSize() > 0 {
mb.flushPendingMsgs()
}
mb.mu.Lock()
// We could stream but don't want to hold the lock and prevent changes, so just read in and
// release the lock for now.
bbuf, err = mb.loadBlock(bbuf)
if err != nil {
mb.mu.Unlock()
writeErr(fmt.Sprintf("Could not read message block [%d]: %v", mb.index, err))
return
}
// Check for encryption.
if mb.bek != nil && len(bbuf) > 0 {
// A fresh key stream is generated from the block's seed and nonce to decrypt in place.
rbek, err := genBlockEncryptionKey(fs.fcfg.Cipher, mb.seed, mb.nonce)
if err != nil {
mb.mu.Unlock()
writeErr(fmt.Sprintf("Could not create encryption key for message block [%d]: %v", mb.index, err))
return
}
rbek.XORKeyStream(bbuf, bbuf)
}
// Check for compression.
if bbuf, err = mb.decompressIfNeeded(bbuf); err != nil {
mb.mu.Unlock()
writeErr(fmt.Sprintf("Could not decompress message block [%d]: %v", mb.index, err))
return
}
mb.mu.Unlock()
// Do this one unlocked.
if writeFile(msgPre+fmt.Sprintf(blkScan, mb.index), bbuf) != nil {
return
}
}
// Bail if no consumers requested.
if !includeConsumers {
return
}
// Do consumers' state last.
// The slice header is copied under the read lock and iterated after releasing it.
fs.mu.RLock()
cfs := fs.cfs
fs.mu.RUnlock()
for _, cs := range cfs {
// Only file based consumer stores are included.
o, ok := cs.(*consumerFileStore)
if !ok {
continue
}
o.mu.Lock()
// Grab our general meta data.
// We do this now instead of pulling from files since they could be encrypted.
meta, err := json.Marshal(o.cfg)
if err != nil {
o.mu.Unlock()
writeErr(fmt.Sprintf("Could not gather consumer meta file for %q: %v", o.name, err))
return
}
o.hh.Reset()
o.hh.Write(meta)
sum := []byte(hex.EncodeToString(o.hh.Sum(nil)))
// We can have the running state directly encoded now.
state, err := o.encodeState()
if err != nil {
o.mu.Unlock()
writeErr(fmt.Sprintf("Could not encode consumer state for %q: %v", o.name, err))
return
}
// NOTE(review): these entry names are built with filepath.Join even though the message
// entries above avoid it for tar compatibility - confirm this is intended on all platforms.
odirPre := filepath.Join(consumerDir, o.name)
o.mu.Unlock()
// Write all the consumer files.
if writeFile(filepath.Join(odirPre, JetStreamMetaFile), meta) != nil {
return
}
if writeFile(filepath.Join(odirPre, JetStreamMetaFileSum), sum) != nil {
return
}
// NOTE(review): unlike the writes above, an error here is ignored and the loop continues.
writeFile(filepath.Join(odirPre, consumerState), state)
}
}
// Snapshot creates a snapshot of this stream and its consumers' state along with messages.
//
// Only one snapshot may be in progress at a time: ErrStoreSnapshotInProgress is
// returned while another one is running and ErrStoreClosed if the store is closed.
// When checkMsgs is set the messages are verified first and an error is returned
// if any bad messages are found. A positive deadline is applied as the write
// deadline on the producing side of the pipe. On success the returned result
// holds the reader side of the pipe and the stream state captured before
// streaming started; the archive itself is produced by streamSnapshot in a
// separate goroutine, which also clears the in-progress marker when it ends.
func (fs *fileStore) Snapshot(deadline time.Duration, checkMsgs, includeConsumers bool) (*SnapshotResult, error) {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return nil, ErrStoreClosed
	}
	// Only allow one at a time.
	if fs.sips > 0 {
		fs.mu.Unlock()
		return nil, ErrStoreSnapshotInProgress
	}
	// Mark us as snapshotting
	fs.sips += 1
	fs.mu.Unlock()

	if checkMsgs {
		ld := fs.checkMsgs()
		if ld != nil && len(ld.Msgs) > 0 {
			// streamSnapshot never runs on this path, so the in-progress marker
			// has to be cleared here or all future snapshots would be refused.
			fs.mu.Lock()
			fs.sips--
			fs.mu.Unlock()
			return nil, fmt.Errorf("snapshot check detected %d bad messages", len(ld.Msgs))
		}
	}

	pr, pw := net.Pipe()

	// Set a write deadline here to protect ourselves.
	if deadline > 0 {
		pw.SetWriteDeadline(time.Now().Add(deadline))
	}

	// We can add to our stream while snapshotting but not "user" delete anything.
	var state StreamState
	fs.FastState(&state)

	// Stream in separate Go routine.
	go fs.streamSnapshot(pw, &state, includeConsumers)

	return &SnapshotResult{pr, state}, nil
}
// fileStoreConfig returns a copy of the file store configuration, read under the read lock.
func (fs *fileStore) fileStoreConfig() FileStoreConfig {
	fs.mu.RLock()
	cfg := fs.fcfg
	fs.mu.RUnlock()
	return cfg
}
// readLockAllMsgBlocks takes the read lock of every message block, in slice order.
// Lock held on entry.
func (fs *fileStore) readLockAllMsgBlocks() {
	blks := fs.blks
	for i := 0; i < len(blks); i++ {
		blks[i].mu.RLock()
	}
}
// readUnlockAllMsgBlocks releases the read lock of every message block, in slice order.
// Lock held on entry.
func (fs *fileStore) readUnlockAllMsgBlocks() {
	blks := fs.blks
	for i := 0; i < len(blks); i++ {
		blks[i].mu.RUnlock()
	}
}
// EncodedStreamState returns the binary encoded stream state, >= v2.10 server.
// The encoding is the magic/version header followed by Msgs, Bytes, FirstSeq,
// LastSeq, failed and the number of deleted messages as uvarints. When messages
// are deleted the encoded delete blocks follow: a run-length record for
// sequence gaps and the encoded sequence set for per-block deletes.
func (fs *fileStore) EncodedStreamState(failed uint64) ([]byte, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	st := &fs.state

	// Work out how many sequences inside [FirstSeq, LastSeq] are missing.
	var numDeleted int64
	if st.LastSeq > st.FirstSeq {
		if nd := int64(st.LastSeq-st.FirstSeq+1) - int64(st.Msgs); nd > 0 {
			numDeleted = nd
		}
	}

	// Fixed size backing array for the header portion.
	var hdr [1024]byte
	hdr[0], hdr[1] = streamStateMagic, streamStateVersion
	b := hdr[:hdrLen]
	for _, v := range [...]uint64{st.Msgs, st.Bytes, st.FirstSeq, st.LastSeq, failed, uint64(numDeleted)} {
		b = binary.AppendUvarint(b, v)
	}

	// Nothing deleted, so we are done.
	if numDeleted <= 0 {
		return b, nil
	}

	// deleteBlocks needs all blocks to be at least read locked.
	fs.readLockAllMsgBlocks()
	defer fs.readUnlockAllMsgBlocks()

	var scratch [4 * 1024]byte
	for _, blk := range fs.deleteBlocks() {
		switch v := blk.(type) {
		case *DeleteRange:
			first, _, num := v.State()
			scratch[0] = runLengthMagic
			n := 1
			n += binary.PutUvarint(scratch[n:], first)
			n += binary.PutUvarint(scratch[n:], num)
			b = append(b, scratch[:n]...)
		case *avl.SequenceSet:
			enc, err := v.Encode(scratch[:0])
			if err != nil {
				return nil, err
			}
			b = append(b, enc...)
		default:
			return nil, errors.New("no impl")
		}
	}
	return b, nil
}
// deleteBlocks collects the delete blocks for the stream in block order.
// A gap between the last sequence of one block and the first sequence of the
// next becomes a DeleteRange, and every block with a non-empty delete map
// contributes that map directly.
// All blocks should be at least read locked.
func (fs *fileStore) deleteBlocks() DeleteBlocks {
	var (
		out  DeleteBlocks
		last uint64
	)
	for _, mb := range fs.blks {
		// Missing sequences between the previous block and this one.
		if first := mb.first.seq; last > 0 && first != last+1 {
			out = append(out, &DeleteRange{First: last + 1, Num: first - last - 1})
		}
		if mb.dmap.Size() > 0 {
			out = append(out, &mb.dmap)
		}
		last = mb.last.seq
	}
	return out
}
// SyncDeleted will make sure this stream has same deleted state as dbs.
// Each incoming delete block is compared by position with our own delete
// blocks; any block whose first, last and count do not all match has each of
// its sequences removed from the store.
func (fs *fileStore) SyncDeleted(dbs DeleteBlocks) {
	if len(dbs) == 0 {
		return
	}
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Compare against our own view while the blocks are read locked.
	fs.readLockAllMsgBlocks()
	ours := fs.deleteBlocks()
	var pending DeleteBlocks
	for idx, theirs := range dbs {
		if idx < len(ours) {
			tFirst, tLast, tNum := theirs.State()
			oFirst, oLast, oNum := ours[idx].State()
			if tFirst == oFirst && tLast == oLast && tNum == oNum {
				// Identical, nothing to do for this one.
				continue
			}
		}
		pending = append(pending, theirs)
	}
	fs.readUnlockAllMsgBlocks()

	// Apply the removals for everything that did not match.
	remove := func(dseq uint64) bool {
		fs.removeMsg(dseq, false, true, false)
		return true
	}
	for _, db := range pending {
		db.Range(remove)
	}
}
////////////////////////////////////////////////////////////////////////////////
// Consumers
////////////////////////////////////////////////////////////////////////////////
// consumerFileStore is the file backed store for a single consumer's state.
type consumerFileStore struct {
// mu protects the fields below.
mu sync.Mutex
// fs is the owning stream file store.
fs *fileStore
// cfg is the consumer info written out as the consumer meta file.
cfg *FileConsumerInfo
// prf is the key generation function inherited from the stream; nil when not encrypting.
prf keyGen
// aek is the key used to encrypt the meta and state files; nil when not encrypting.
aek cipher.AEAD
// name is the consumer name.
name string
// odir is the consumer's directory; cleared on Stop and delete.
odir string
// ifn is the path of the consumer state file inside odir.
ifn string
// hh is the hasher used for the meta file checksum.
hh hash.Hash64
// state is the running consumer state.
state ConsumerState
// fch is the channel used to kick the flush loop.
fch chan struct{}
// qch is closed to stop the flush loop.
qch chan struct{}
// flusher is true while the flush loop is running.
flusher bool
// writing is true while writeState is writing the state file.
writing bool
// dirty is true when state has changed and has not been written out yet.
dirty bool
// closed is set by Stop and delete.
closed bool
}
// ConsumerStore creates, or recovers from disk, the store for the named consumer
// and registers it with this file store.
// A config with MemoryStorage set gets a memory based store instead. For file
// based consumers the consumer directory is created, encryption keys are
// recovered or generated when the stream is encrypted, the meta file is written
// if it does not exist yet, the flush loop is started and any existing state is
// loaded from disk.
// Returns ErrStoreClosed if the file store is closed, and an error for a nil
// receiver, a nil cfg or an empty name.
func (fs *fileStore) ConsumerStore(name string, cfg *ConsumerConfig) (ConsumerStore, error) {
if fs == nil {
return nil, fmt.Errorf("filestore is nil")
}
if fs.isClosed() {
return nil, ErrStoreClosed
}
if cfg == nil || name == _EMPTY_ {
return nil, fmt.Errorf("bad consumer config")
}
// We now allow overrides from a stream being a filestore type and forcing a consumer to be memory store.
if cfg.MemoryStorage {
// Create directly here.
o := &consumerMemStore{ms: fs, cfg: *cfg}
fs.AddConsumer(o)
return o, nil
}
odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, name)
if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
return nil, fmt.Errorf("could not create consumer directory - %v", err)
}
csi := &FileConsumerInfo{Name: name, Created: time.Now().UTC(), ConsumerConfig: *cfg}
o := &consumerFileStore{
fs: fs,
cfg: csi,
prf: fs.prf,
name: name,
odir: odir,
ifn: filepath.Join(odir, consumerState),
}
// The hash key is derived from the stream and consumer names.
key := sha256.Sum256([]byte(fs.cfg.Name + "/" + name))
hh, err := highwayhash.New64(key[:])
if err != nil {
return nil, fmt.Errorf("could not create hash: %v", err)
}
o.hh = hh
// Check for encryption.
if o.prf != nil {
// An existing key file means we are recovering an encrypted consumer.
if ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey)); err == nil {
if len(ekey) < minBlkKeySize {
return nil, errBadKeySize
}
// Recover key encryption key.
rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
if err != nil {
return nil, err
}
sc := fs.fcfg.Cipher
kek, err := genEncryptionKey(sc, rb)
if err != nil {
return nil, err
}
// The key file holds the nonce followed by the sealed seed.
ns := kek.NonceSize()
nonce := ekey[:ns]
seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
if err != nil {
// We may be here on a cipher conversion, so attempt to convert.
if err = o.convertCipher(); err != nil {
return nil, err
}
} else {
o.aek, err = genEncryptionKey(sc, seed)
}
if err != nil {
return nil, err
}
}
}
// Track if we are creating the directory so that we can clean up if we encounter an error.
var didCreate bool
// Write our meta data iff does not exist.
meta := filepath.Join(odir, JetStreamMetaFile)
if _, err := os.Stat(meta); err != nil && os.IsNotExist(err) {
didCreate = true
csi.Created = time.Now().UTC()
if err := o.writeConsumerMeta(); err != nil {
os.RemoveAll(odir)
return nil, err
}
}
// If we expect to be encrypted check that what we are restoring is not plaintext.
// This can happen on snapshot restores or conversions.
if o.prf != nil {
keyFile := filepath.Join(odir, JetStreamMetaFileKey)
// No key file at this point means the files on disk were not written encrypted.
if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
if err := o.writeConsumerMeta(); err != nil {
if didCreate {
os.RemoveAll(odir)
}
return nil, err
}
// Redo the state file as well here if we have one and we can tell it was plaintext.
if buf, err := os.ReadFile(o.ifn); err == nil {
// Successfully decoding the raw bytes is what identifies the file as plaintext.
if _, err := decodeConsumerState(buf); err == nil {
if err := os.WriteFile(o.ifn, o.encryptState(buf), defaultFilePerms); err != nil {
if didCreate {
os.RemoveAll(odir)
}
return nil, err
}
}
}
}
}
// Create channels to control our flush go routine.
o.fch = make(chan struct{}, 1)
o.qch = make(chan struct{})
go o.flushLoop(o.fch, o.qch)
// Make sure to load in our state from disk if needed.
// NOTE(review): loadState is documented as requiring o.mu but is called here without it,
// after the flush loop was started - confirm nothing can kick the flusher this early.
o.loadState()
// Assign to filestore.
fs.AddConsumer(o)
return o, nil
}
// convertCipher re-encrypts the consumer's meta and state files after the
// configured cipher was switched between AES and ChaCha.
// The key file and state file are decrypted with the cipher that is NOT
// currently configured, then the meta (which generates new keys, since o.aek
// is still unset when called during recovery) and the state are written out
// again. Returns an error if any file cannot be read or decrypted, including
// a state file that is too short to contain a nonce.
func (o *consumerFileStore) convertCipher() error {
	fs := o.fs
	odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, o.name)

	ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey))
	if err != nil {
		return err
	}
	if len(ekey) < minBlkKeySize {
		return errBadKeySize
	}
	// Recover key encryption key.
	rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
	if err != nil {
		return err
	}

	// Do these in reverse since converting.
	sc := fs.fcfg.Cipher
	osc := AES
	if sc == AES {
		osc = ChaCha
	}
	kek, err := genEncryptionKey(osc, rb)
	if err != nil {
		return err
	}
	// The key file holds the nonce followed by the sealed seed.
	ns := kek.NonceSize()
	nonce := ekey[:ns]
	seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
	if err != nil {
		return err
	}
	aek, err := genEncryptionKey(osc, seed)
	if err != nil {
		return err
	}
	// Now read in and decode our state using the old cipher.
	buf, err := os.ReadFile(o.ifn)
	if err != nil {
		return err
	}
	// Guard against a truncated state file, slicing below would panic otherwise.
	if len(buf) < ns {
		return fmt.Errorf("consumer state file too short: %w", io.ErrUnexpectedEOF)
	}
	buf, err = aek.Open(nil, buf[:ns], buf[ns:], nil)
	if err != nil {
		return err
	}

	// Since we are here we recovered our old state.
	// Now write our meta, which will generate the new keys with the new cipher.
	if err := o.writeConsumerMeta(); err != nil {
		return err
	}

	// Now write out our state with the new cipher.
	return o.writeState(buf)
}
// kickFlusher marks the state dirty and wakes the flush loop with a
// non-blocking send, so an already pending kick is not duplicated.
// Lock should be held.
func (o *consumerFileStore) kickFlusher() {
	if ch := o.fch; ch != nil {
		select {
		case ch <- struct{}{}:
		default:
			// A kick is already queued.
		}
	}
	o.dirty = true
}
// setInFlusher records that the flush loop is running.
func (o *consumerFileStore) setInFlusher() {
	o.mu.Lock()
	defer o.mu.Unlock()
	o.flusher = true
}
// clearInFlusher records that the flush loop is no longer running.
func (o *consumerFileStore) clearInFlusher() {
	o.mu.Lock()
	defer o.mu.Unlock()
	o.flusher = false
}
// inFlusher reports whether the flush loop is currently running.
func (o *consumerFileStore) inFlusher() bool {
	o.mu.Lock()
	running := o.flusher
	o.mu.Unlock()
	return running
}
// flushLoop watches for consumer updates and the quit channel.
// Each kick on fch encodes the current state and writes it to disk, waiting
// first if the previous successful write was less than minTime ago. The loop
// exits when qch fires, when the store is closed, or when encoding fails.
// The flusher flag is set for as long as the loop runs.
func (o *consumerFileStore) flushLoop(fch, qch chan struct{}) {
o.setInFlusher()
defer o.clearInFlusher()
// Maintain approximately 10 updates per second per consumer under load.
const minTime = 100 * time.Millisecond
var lastWrite time.Time
var dt *time.Timer
// setDelayTimer arms the delay timer, creating it on first use.
setDelayTimer := func(addWait time.Duration) {
if dt == nil {
dt = time.NewTimer(addWait)
return
}
// Drain a value that may already be sitting in the channel before resetting.
if !dt.Stop() {
select {
case <-dt.C:
default:
}
}
dt.Reset(addWait)
}
for {
select {
case <-fch:
// Wait out the remainder of minTime, but stay responsive to quit.
if ts := time.Since(lastWrite); ts < minTime {
setDelayTimer(minTime - ts)
select {
case <-dt.C:
case <-qch:
return
}
}
o.mu.Lock()
if o.closed {
o.mu.Unlock()
return
}
buf, err := o.encodeState()
o.mu.Unlock()
if err != nil {
return
}
// TODO(dlc) - if we error should start failing upwards.
// lastWrite only advances on success, so a failed write is not rate limited.
if err := o.writeState(buf); err == nil {
lastWrite = time.Now()
}
case <-qch:
return
}
}
}
// SetStarting sets our starting stream sequence and writes the resulting
// state to disk right away. The state is encoded under the lock and written
// after the lock has been released.
func (o *consumerFileStore) SetStarting(sseq uint64) error {
	encode := func() ([]byte, error) {
		o.mu.Lock()
		defer o.mu.Unlock()
		o.state.Delivered.Stream = sseq
		return o.encodeState()
	}

	buf, err := encode()
	if err != nil {
		return err
	}
	return o.writeState(buf)
}
// HasState reports whether the consumer state file exists on disk,
// i.e. whether os.Stat on it succeeds.
func (o *consumerFileStore) HasState() bool {
	o.mu.Lock()
	defer o.mu.Unlock()
	_, err := os.Stat(o.ifn)
	return err == nil
}
// UpdateDelivered is called whenever a new message has been delivered.
// dseq is the consumer (delivery) sequence, sseq the stream sequence, dc the
// delivery count and ts the delivery timestamp.
// Returns ErrNoAckPolicy when a redelivery (dc != 1) is reported for an
// AckNone consumer. Updates at or below the consumer ack floor are ignored.
// On any applied update the flusher is kicked.
func (o *consumerFileStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) error {
o.mu.Lock()
defer o.mu.Unlock()
if dc != 1 && o.cfg.AckPolicy == AckNone {
return ErrNoAckPolicy
}
// On restarts the old leader may get a replay from the raft logs that are old.
if dseq <= o.state.AckFloor.Consumer {
return nil
}
// See if we expect an ack for this.
if o.cfg.AckPolicy != AckNone {
// Need to create pending records here.
if o.state.Pending == nil {
o.state.Pending = make(map[uint64]*Pending)
}
var p *Pending
// Check for an update to a message already delivered.
if sseq <= o.state.Delivered.Stream {
// Only an existing pending entry is refreshed; an unknown sequence is not re-added.
if p = o.state.Pending[sseq]; p != nil {
p.Sequence, p.Timestamp = dseq, ts
}
} else {
// Add to pending.
o.state.Pending[sseq] = &Pending{dseq, ts}
}
// Update delivered as needed.
if dseq > o.state.Delivered.Consumer {
o.state.Delivered.Consumer = dseq
}
if sseq > o.state.Delivered.Stream {
o.state.Delivered.Stream = sseq
}
if dc > 1 {
if maxdc := uint64(o.cfg.MaxDeliver); maxdc > 0 && dc > maxdc {
// Make sure to remove from pending.
delete(o.state.Pending, sseq)
}
if o.state.Redelivered == nil {
o.state.Redelivered = make(map[uint64]uint64)
}
// Only update if greater then what we already have.
// The stored value is the number of redeliveries, i.e. one less than the delivery count.
if o.state.Redelivered[sseq] < dc-1 {
o.state.Redelivered[sseq] = dc - 1
}
}
} else {
// For AckNone just update delivered and ackfloor at the same time.
if dseq > o.state.Delivered.Consumer {
o.state.Delivered.Consumer = dseq
o.state.AckFloor.Consumer = dseq
}
if sseq > o.state.Delivered.Stream {
o.state.Delivered.Stream = sseq
o.state.AckFloor.Stream = sseq
}
}
// Make sure we flush to disk.
o.kickFlusher()
return nil
}
// UpdateAcks is called whenever a consumer with explicit ack or ack all acks a message.
// dseq is the consumer (delivery) sequence and sseq the stream sequence being acked.
// Returns ErrNoAckPolicy for AckNone consumers and ErrStoreMsgNotFound when
// sseq is not pending. Acks at or below the consumer ack floor are ignored.
// On any applied ack the flusher is kicked.
func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error {
o.mu.Lock()
defer o.mu.Unlock()
if o.cfg.AckPolicy == AckNone {
return ErrNoAckPolicy
}
// On restarts the old leader may get a replay from the raft logs that are old.
if dseq <= o.state.AckFloor.Consumer {
return nil
}
if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil {
return ErrStoreMsgNotFound
}
// Check for AckAll here.
if o.cfg.AckPolicy == AckAll {
// Everything between the previous stream ack floor (exclusive) and sseq (inclusive) is acked.
sgap := sseq - o.state.AckFloor.Stream
o.state.AckFloor.Consumer = dseq
o.state.AckFloor.Stream = sseq
for seq := sseq; seq > sseq-sgap; seq-- {
delete(o.state.Pending, seq)
if len(o.state.Redelivered) > 0 {
delete(o.state.Redelivered, seq)
}
}
o.kickFlusher()
return nil
}
// AckExplicit
// First delete from our pending state.
if p, ok := o.state.Pending[sseq]; ok {
delete(o.state.Pending, sseq)
dseq = p.Sequence // Use the original.
}
if len(o.state.Pending) == 0 {
// Nothing outstanding, the ack floor catches up with delivered.
o.state.AckFloor.Consumer = o.state.Delivered.Consumer
o.state.AckFloor.Stream = o.state.Delivered.Stream
} else if dseq == o.state.AckFloor.Consumer+1 {
// This ack is the next one in order, so the floor can advance.
o.state.AckFloor.Consumer = dseq
o.state.AckFloor.Stream = sseq
if o.state.Delivered.Consumer > dseq {
// Move the floor up to just below the next stream sequence that is still pending.
for ss := sseq + 1; ss <= o.state.Delivered.Stream; ss++ {
if p, ok := o.state.Pending[ss]; ok {
if p.Sequence > 0 {
o.state.AckFloor.Consumer = p.Sequence - 1
o.state.AckFloor.Stream = ss - 1
}
break
}
}
}
}
// We do these regardless.
delete(o.state.Redelivered, sseq)
o.kickFlusher()
return nil
}
// seqsHdrSize is the header length plus room for six maximum-length varints.
// NOTE(review): presumably the upper bound for the fixed part of an encoded consumer state - confirm against the encoder.
const seqsHdrSize = 6*binary.MaxVarintLen64 + hdrLen
// EncodedState returns our encoded consumer state.
// The lock is acquired here, callers must not hold it.
func (o *consumerFileStore) EncodedState() ([]byte, error) {
	o.mu.Lock()
	buf, err := o.encodeState()
	o.mu.Unlock()
	return buf, err
}
// encodeState returns the binary encoding of the current consumer state.
// The state is fetched through stateWithCopyLocked (without copying) rather than
// by reading o.state directly, so that it is loaded from disk first if needed.
// Lock should be held.
func (o *consumerFileStore) encodeState() ([]byte, error) {
	state, err := o.stateWithCopyLocked(false)
	if err == nil {
		return encodeConsumerState(state), nil
	}
	return nil, err
}
// UpdateConfig replaces the consumer config held in our consumer info and
// rewrites the consumer meta file.
// This is mostly unchecked here. We are assuming the upper layers have done sanity checking.
func (o *consumerFileStore) UpdateConfig(cfg *ConsumerConfig) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	o.cfg.ConsumerConfig = *cfg
	return o.writeConsumerMeta()
}
// Update replaces the running consumer state with a deep copy of state and
// kicks the flusher.
// An update that would move the delivered consumer sequence or the stream ack
// floor backwards is silently ignored. An error is returned, and nothing is
// changed, when an ack floor is ahead of its delivered sequence or a pending
// entry lies outside of (AckFloor.Stream, Delivered.Stream].
func (o *consumerFileStore) Update(state *ConsumerState) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	cur := &o.state

	// Ignore outdated updates.
	if state.Delivered.Consumer < cur.Delivered.Consumer || state.AckFloor.Stream < cur.AckFloor.Stream {
		return nil
	}

	// Sanity checks.
	switch {
	case state.AckFloor.Consumer > state.Delivered.Consumer:
		return fmt.Errorf("bad ack floor for consumer")
	case state.AckFloor.Stream > state.Delivered.Stream:
		return fmt.Errorf("bad ack floor for stream")
	}

	// Build private copies before touching our own state.
	var pendingCopy map[uint64]*Pending
	if n := len(state.Pending); n > 0 {
		pendingCopy = make(map[uint64]*Pending, n)
		floor, delivered := state.AckFloor.Stream, state.Delivered.Stream
		for seq, p := range state.Pending {
			entry := &Pending{p.Sequence, p.Timestamp}
			if seq <= floor || seq > delivered {
				return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq)
			}
			pendingCopy[seq] = entry
		}
	}

	var redeliveredCopy map[uint64]uint64
	if n := len(state.Redelivered); n > 0 {
		redeliveredCopy = make(map[uint64]uint64, n)
		for seq, dc := range state.Redelivered {
			redeliveredCopy[seq] = dc
		}
	}

	cur.Delivered, cur.AckFloor = state.Delivered, state.AckFloor
	cur.Pending, cur.Redelivered = pendingCopy, redeliveredCopy

	o.kickFlusher()
	return nil
}
// encryptState seals buf with our asset key and returns the nonce followed by
// the sealed data. When encryption is not enabled buf is returned untouched.
// Lock should be held.
func (o *consumerFileStore) encryptState(buf []byte) []byte {
	aek := o.aek
	if aek == nil {
		return buf
	}
	// TODO(dlc) - Optimize on space usage a bit?
	ns := aek.NonceSize()
	// Reserve capacity up front so Seal can append in place behind the nonce.
	nonce := make([]byte, ns, ns+len(buf)+aek.Overhead())
	rand.Read(nonce)
	return aek.Seal(nonce, nonce, buf, nil)
}
// dios is a counting semaphore limiting the number of disk IO calls in flight,
// since they could all be blocking an OS thread. A token is received before
// the IO call and sent back afterwards.
// https://github.com/nats-io/nats-server/issues/2742
var dios chan struct{}

// init sets up the semaphore as a buffered channel pre-filled with tokens.
// golang.org's semaphore seemed a bit heavy.
func init() {
	// Maximum number of concurrent blocking IO calls.
	const maxBlockingIO = 4

	dios = make(chan struct{}, maxBlockingIO)
	// Fill it up to start.
	for len(dios) < cap(dios) {
		dios <- struct{}{}
	}
}
// writeState writes the encoded state buf to the consumer state file,
// encrypting it first when encryption is enabled.
// It returns nil without writing when buf is empty or another write is already
// in progress. The dirty flag is cleared before the write and set again if the
// write fails. The file write itself happens without the lock held.
func (o *consumerFileStore) writeState(buf []byte) error {
	o.mu.Lock()
	if skip := o.writing || len(buf) == 0; skip {
		o.mu.Unlock()
		return nil
	}
	// Check on encryption.
	if o.aek != nil {
		buf = o.encryptState(buf)
	}
	ifn := o.ifn
	o.writing, o.dirty = true, false
	o.mu.Unlock()

	// Lock not held here but we do limit number of outstanding calls that could block OS threads.
	<-dios
	err := os.WriteFile(ifn, buf, defaultFilePerms)
	dios <- struct{}{}

	o.mu.Lock()
	defer o.mu.Unlock()
	o.writing = false
	if err != nil {
		// Still needs to be written out.
		o.dirty = true
	}
	return err
}
// updateConfig replaces our consumer info with a fresh one that carries only
// cfg and rewrites the consumer meta file. Only used when recovering ephemerals.
func (o *consumerFileStore) updateConfig(cfg ConsumerConfig) error {
	info := &FileConsumerInfo{ConsumerConfig: cfg}

	o.mu.Lock()
	defer o.mu.Unlock()
	o.cfg = info
	return o.writeConsumerMeta()
}
// writeConsumerMeta writes out the consumer meta file and its checksum file.
// When encryption is configured and no asset key is set yet, new keys are
// generated and the encrypted key is written to the key file first. The meta
// is encrypted when an asset key is present, and the checksum is computed over
// the bytes as written to disk.
// Lock should be held.
func (o *consumerFileStore) writeConsumerMeta() error {
	metaFile := filepath.Join(o.odir, JetStreamMetaFile)
	if _, err := os.Stat(metaFile); err != nil && !os.IsNotExist(err) {
		return err
	}

	// Generate our keys on first use.
	if needKeys := o.prf != nil && o.aek == nil; needKeys {
		fs := o.fs
		key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name + tsep + o.name)
		if err != nil {
			return err
		}
		o.aek = key
		keyFile := filepath.Join(o.odir, JetStreamMetaFileKey)
		if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
			return err
		}
		if err := os.WriteFile(keyFile, encrypted, defaultFilePerms); err != nil {
			return err
		}
	}

	b, err := json.Marshal(o.cfg)
	if err != nil {
		return err
	}
	// Encrypt if needed, the nonce goes in front of the sealed data.
	if o.aek != nil {
		ns := o.aek.NonceSize()
		nonce := make([]byte, ns, ns+len(b)+o.aek.Overhead())
		rand.Read(nonce)
		b = o.aek.Seal(nonce, nonce, b, nil)
	}
	if err := os.WriteFile(metaFile, b, defaultFilePerms); err != nil {
		return err
	}

	// Checksum file holds the hex encoded hash.
	o.hh.Reset()
	o.hh.Write(b)
	checksum := hex.EncodeToString(o.hh.Sum(nil))
	sumFile := filepath.Join(o.odir, JetStreamMetaFileSum)
	return os.WriteFile(sumFile, []byte(checksum), defaultFilePerms)
}
// checkConsumerHeader validates the two byte header of an encoded consumer
// state and returns its version.
// errCorruptState is returned when the header is too short or does not start
// with the magic byte. Only versions 1 and 2 are supported, any other version
// yields an "unsupported version" error.
func checkConsumerHeader(hdr []byte) (uint8, error) {
	if len(hdr) < 2 || hdr[0] != magic {
		return 0, errCorruptState
	}
	if v := hdr[1]; v == 1 || v == 2 {
		return v, nil
	}
	return 0, fmt.Errorf("unsupported version: %d", hdr[1])
}
// copyPending returns a deep copy of our pending map, with every entry
// duplicated so the result shares nothing with the running state.
func (o *consumerFileStore) copyPending() map[uint64]*Pending {
	src := o.state.Pending
	dst := make(map[uint64]*Pending, len(src))
	for seq, p := range src {
		entry := *p
		dst[seq] = &entry
	}
	return dst
}
// copyRedelivered returns a copy of our redelivered map.
func (o *consumerFileStore) copyRedelivered() map[uint64]uint64 {
	src := o.state.Redelivered
	dst := make(map[uint64]uint64, len(src))
	for seq, dc := range src {
		dst[seq] = dc
	}
	return dst
}
// Type returns the type of the underlying store, which is always FileStorage here.
func (o *consumerFileStore) Type() StorageType { return FileStorage }
// State returns the consumer state with pending and redelivered copied.
// The running state is used when present, otherwise it is read from the state file.
// This is not expected to be called in high performance code, only on startup.
func (o *consumerFileStore) State() (*ConsumerState, error) {
return o.stateWithCopy(true)
}
// BorrowState returns the consumer state without copying pending or redelivered,
// so the returned maps are shared with the store. It should only be done under the
// consumer owner's lock.
func (o *consumerFileStore) BorrowState() (*ConsumerState, error) {
return o.stateWithCopy(false)
}
// stateWithCopy acquires the lock and returns the consumer state,
// copying pending and redelivered when doCopy is set.
func (o *consumerFileStore) stateWithCopy(doCopy bool) (*ConsumerState, error) {
	o.mu.Lock()
	state, err := o.stateWithCopyLocked(doCopy)
	o.mu.Unlock()
	return state, err
}
// stateWithCopyLocked returns the consumer state.
// If we have a running state (a non-zero delivered sequence) it is returned
// directly, otherwise the state file is read, decrypted if needed, decoded
// and also installed as our running state. A missing or empty state file
// yields an empty state. With doCopy set the pending and redelivered maps in
// the running state and in the returned state never share storage; without it
// they are shared.
// Returns ErrStoreClosed when the store is closed, and an error when the file
// cannot be read, is too short to be decrypted, fails to decrypt or is corrupt.
// Lock should be held.
func (o *consumerFileStore) stateWithCopyLocked(doCopy bool) (*ConsumerState, error) {
	if o.closed {
		return nil, ErrStoreClosed
	}

	state := &ConsumerState{}

	// See if we have a running state or if we need to read in from disk.
	if o.state.Delivered.Consumer != 0 || o.state.Delivered.Stream != 0 {
		state.Delivered = o.state.Delivered
		state.AckFloor = o.state.AckFloor
		if len(o.state.Pending) > 0 {
			if doCopy {
				state.Pending = o.copyPending()
			} else {
				state.Pending = o.state.Pending
			}
		}
		if len(o.state.Redelivered) > 0 {
			if doCopy {
				state.Redelivered = o.copyRedelivered()
			} else {
				state.Redelivered = o.state.Redelivered
			}
		}
		return state, nil
	}

	// Read the state in here from disk..
	buf, err := os.ReadFile(o.ifn)
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}

	if len(buf) == 0 {
		return state, nil
	}

	// Check on encryption.
	if o.aek != nil {
		ns := o.aek.NonceSize()
		// A truncated file can be shorter than the nonce, slicing below would panic.
		if len(buf) < ns {
			return nil, fmt.Errorf("consumer state file too short: %w", io.ErrUnexpectedEOF)
		}
		buf, err = o.aek.Open(nil, buf[:ns], buf[ns:], nil)
		if err != nil {
			return nil, err
		}
	}

	state, err = decodeConsumerState(buf)
	if err != nil {
		return nil, err
	}

	// Copy this state into our own.
	o.state.Delivered = state.Delivered
	o.state.AckFloor = state.AckFloor
	if len(state.Pending) > 0 {
		if doCopy {
			o.state.Pending = make(map[uint64]*Pending, len(state.Pending))
			for seq, p := range state.Pending {
				o.state.Pending[seq] = &Pending{p.Sequence, p.Timestamp}
			}
		} else {
			o.state.Pending = state.Pending
		}
	}
	if len(state.Redelivered) > 0 {
		if doCopy {
			o.state.Redelivered = make(map[uint64]uint64, len(state.Redelivered))
			for seq, dc := range state.Redelivered {
				o.state.Redelivered[seq] = dc
			}
		} else {
			o.state.Redelivered = state.Redelivered
		}
	}

	return state, nil
}
// loadState loads our state in from disk when a state file exists.
// Any error while loading is ignored.
// Lock should be held. Called at startup.
func (o *consumerFileStore) loadState() {
	if _, err := os.Stat(o.ifn); err != nil {
		return
	}
	// This will load our state in from disk.
	o.stateWithCopyLocked(false)
}
// decodeConsumerState decodes a binary encoded consumer state, version 1 or 2.
//
// The layout after the header is: ack floor (consumer, stream), delivered
// (consumer, stream), the number of pending entries followed by a base
// timestamp and the entries themselves, then the number of redelivered entries
// followed by (sequence, count) pairs. Sequences are stored relative to the
// ack floor and timestamps, in seconds, relative to the base timestamp.
//
// An error is returned for a bad header, and errCorruptState when the fixed
// part or a pending entry cannot be read. A truncated redelivered section is
// tolerated: whatever could be read is returned.
//
// The element counts come from the input and are not trusted. Map size hints
// are capped by the number of bytes left in buf and the redelivered loop stops
// as soon as the buffer is exhausted, so a corrupt count cannot trigger a huge
// allocation or a near endless loop.
func decodeConsumerState(buf []byte) (*ConsumerState, error) {
	version, err := checkConsumerHeader(buf)
	if err != nil {
		return nil, err
	}

	bi := hdrLen
	// Helpers, will set bi to -1 on error.
	readSeq := func() uint64 {
		if bi < 0 {
			return 0
		}
		seq, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		return seq
	}
	readTimeStamp := func() int64 {
		if bi < 0 {
			return 0
		}
		ts, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return ts
	}
	// Just for clarity below.
	readLen := readSeq
	readCount := readSeq

	// sizeHint caps a decoded element count by the bytes left to read.
	// Every entry takes at least one byte so this never under-sizes valid input.
	sizeHint := func(n uint64) int {
		if bi < 0 {
			return 0
		}
		if rem := len(buf) - bi; n > uint64(rem) {
			return rem
		}
		return int(n)
	}

	state := &ConsumerState{}
	state.AckFloor.Consumer = readSeq()
	state.AckFloor.Stream = readSeq()
	state.Delivered.Consumer = readSeq()
	state.Delivered.Stream = readSeq()

	if bi == -1 {
		return nil, errCorruptState
	}
	if version == 1 {
		// Adjust back. Version 1 also stored delivered as next to be delivered,
		// so adjust that back down here.
		if state.AckFloor.Consumer > 1 {
			state.Delivered.Consumer += state.AckFloor.Consumer - 1
		}
		if state.AckFloor.Stream > 1 {
			state.Delivered.Stream += state.AckFloor.Stream - 1
		}
	}

	// We have additional stuff.
	if numPending := readLen(); numPending > 0 {
		mints := readTimeStamp()
		state.Pending = make(map[uint64]*Pending, sizeHint(numPending))
		for i := 0; i < int(numPending); i++ {
			sseq := readSeq()
			var dseq uint64
			if version == 2 {
				dseq = readSeq()
			}
			ts := readTimeStamp()
			// Check the state machine for corruption, not the value which could be -1.
			if bi == -1 {
				return nil, errCorruptState
			}
			// Adjust seq back.
			sseq += state.AckFloor.Stream
			if sseq == 0 {
				return nil, errCorruptState
			}
			if version == 2 {
				dseq += state.AckFloor.Consumer
			}
			// Adjust the timestamp back.
			if version == 1 {
				ts = (ts + mints) * int64(time.Second)
			} else {
				ts = (mints - ts) * int64(time.Second)
			}
			// Store in pending.
			state.Pending[sseq] = &Pending{dseq, ts}
		}
	}

	// We have redelivered entries here.
	if numRedelivered := readLen(); numRedelivered > 0 {
		state.Redelivered = make(map[uint64]uint64, sizeHint(numRedelivered))
		// Once a read has failed every further read yields zero and would be
		// skipped anyway, so stop instead of spinning through the rest.
		for i := 0; i < int(numRedelivered) && bi >= 0; i++ {
			if seq, n := readSeq(), readCount(); seq > 0 && n > 0 {
				// Adjust seq back.
				seq += state.AckFloor.Stream
				state.Redelivered[seq] = n
			}
		}
	}

	return state, nil
}
// Stop the processing of the consumer's state.
// The flush loop is told to quit, the store is marked closed and removed from
// the owning file store. If the state was dirty it is encoded (and encrypted
// when enabled) under the lock and written to the state file after a bounded
// wait for the flush loop to exit. Calling Stop on a closed store returns nil.
// Returns an encode or write error, if any.
func (o *consumerFileStore) Stop() error {
o.mu.Lock()
if o.closed {
o.mu.Unlock()
return nil
}
// Closing qch signals the flush loop to exit; nil it so it is never closed twice.
if o.qch != nil {
close(o.qch)
o.qch = nil
}
var err error
var buf []byte
if o.dirty {
// Make sure to write this out..
// This has to happen before closed is set, since encoding fails on a closed store.
if buf, err = o.encodeState(); err == nil && len(buf) > 0 {
if o.aek != nil {
buf = o.encryptState(buf)
}
}
}
o.odir = _EMPTY_
o.closed = true
ifn, fs := o.ifn, o.fs
o.mu.Unlock()
fs.RemoveConsumer(o)
if len(buf) > 0 {
o.waitOnFlusher()
// Limit the number of blocking IO calls in flight.
<-dios
err = os.WriteFile(ifn, buf, defaultFilePerms)
dios <- struct{}{}
}
return err
}
// waitOnFlusher blocks while o.inFlusher() reports true, polling every 10ms
// and giving up after roughly 100ms. It returns immediately if the flusher is
// not running.
func (o *consumerFileStore) waitOnFlusher() {
	const (
		maxWait      = 100 * time.Millisecond
		pollInterval = 10 * time.Millisecond
	)

	if !o.inFlusher() {
		return
	}
	for deadline := time.Now().Add(maxWait); time.Now().Before(deadline); time.Sleep(pollInterval) {
		if !o.inFlusher() {
			return
		}
	}
}
// Delete the consumer.
// This is the variant used when the stream itself is not being deleted: it
// calls delete(false), which closes the store, removes the consumer's
// directory and removes the store from the parent file store.
func (o *consumerFileStore) Delete() error {
return o.delete(false)
}
// StreamDelete closes the consumer store as part of a stream deletion.
// It calls delete(true), which skips removing the consumer's directory and
// does not remove the store from the parent file store.
func (o *consumerFileStore) StreamDelete() error {
return o.delete(true)
}
// delete closes the consumer store and, unless the owning stream is being
// deleted, removes the consumer's directory and removes the store from the
// parent file store.
//
// streamDeleted reports whether the stream itself is going away; in that case
// only the in-memory state is torn down and nil is returned.
// Calling delete on an already closed store is a no-op and returns nil.
// Otherwise the error from removing the directory, if any, is returned.
func (o *consumerFileStore) delete(streamDeleted bool) error {
	o.mu.Lock()
	if o.closed {
		o.mu.Unlock()
		return nil
	}

	if qch := o.qch; qch != nil {
		o.qch = nil
		close(qch)
	}

	dir, fs := o.odir, o.fs
	o.odir = _EMPTY_
	o.closed = true
	o.mu.Unlock()

	// With the stream gone there is nothing left for us to clean up.
	if streamDeleted {
		return nil
	}

	var err error
	// If our stream was not deleted this will remove the directories.
	if dir != _EMPTY_ {
		<-dios
		err = os.RemoveAll(dir)
		dios <- struct{}{}
	}
	fs.RemoveConsumer(o)
	return err
}
// AddConsumer appends o to the list of consumer stores tracked by this file
// store. It always returns nil.
func (fs *fileStore) AddConsumer(o ConsumerStore) error {
	fs.mu.Lock()
	fs.cfs = append(fs.cfs, o)
	fs.mu.Unlock()
	return nil
}
// RemoveConsumer drops the first entry equal to o from the list of consumer
// stores tracked by this file store. It is a no-op when o is not present and
// always returns nil.
func (fs *fileStore) RemoveConsumer(o ConsumerStore) error {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	for i := range fs.cfs {
		if fs.cfs[i] != o {
			continue
		}
		// Close the gap left by the removed entry; only the first match goes.
		fs.cfs = append(fs.cfs[:i], fs.cfs[i+1:]...)
		break
	}
	return nil
}
////////////////////////////////////////////////////////////////////////////////
// Templates
////////////////////////////////////////////////////////////////////////////////
// templateFileStore persists stream templates on disk, one subdirectory per
// template.
type templateFileStore struct {
// dir is the directory under which each template's subdirectory is created.
dir string
// hh is the hash used to compute the checksum stored next to a template's
// meta file.
hh hash.Hash64
}
// newTemplateFileStore returns a template store rooted at the tmplsDir
// subdirectory of storeDir. The checksum hash is keyed with the SHA-256 digest
// of the string "templates". It returns nil if the hash cannot be created.
func newTemplateFileStore(storeDir string) *templateFileStore {
	seed := sha256.Sum256([]byte("templates"))
	hasher, err := highwayhash.New64(seed[:])
	if err != nil {
		return nil
	}
	return &templateFileStore{
		dir: filepath.Join(storeDir, tmplsDir),
		hh:  hasher,
	}
}
// Store persists the template t in its own subdirectory of ts.dir, named
// after t.Name.
//
// The template is JSON-encoded into the meta file and a hex-encoded checksum
// of those bytes is written to the accompanying checksum file. If the meta
// file already exists nothing is written and nil is returned.
//
// Errors from creating the directory are wrapped so that callers can inspect
// the cause with errors.Is / errors.As; stat, marshal and write errors are
// returned as-is.
func (ts *templateFileStore) Store(t *streamTemplate) error {
	dir := filepath.Join(ts.dir, t.Name)
	if err := os.MkdirAll(dir, defaultDirPerms); err != nil {
		return fmt.Errorf("could not create templates storage directory for %q- %w", t.Name, err)
	}

	meta := filepath.Join(dir, JetStreamMetaFile)
	// Only proceed when the meta file is missing. An existing file yields a
	// nil error here (the template is left untouched); any other stat failure
	// is reported to the caller.
	if _, err := os.Stat(meta); err == nil || !os.IsNotExist(err) {
		return err
	}

	// Encode under the template's lock so we get a consistent view.
	t.mu.Lock()
	b, err := json.Marshal(t)
	t.mu.Unlock()
	if err != nil {
		return err
	}
	if err := os.WriteFile(meta, b, defaultFilePerms); err != nil {
		return err
	}

	// FIXME(dlc) - Do checksum
	ts.hh.Reset()
	ts.hh.Write(b)
	checksum := hex.EncodeToString(ts.hh.Sum(nil))
	sum := filepath.Join(dir, JetStreamMetaFileSum)
	return os.WriteFile(sum, []byte(checksum), defaultFilePerms)
}
// Delete removes the on-disk directory for template t (ts.dir joined with
// t.Name) and everything beneath it. It returns the error from os.RemoveAll.
func (ts *templateFileStore) Delete(t *streamTemplate) error {
return os.RemoveAll(filepath.Join(ts.dir, t.Name))
}
////////////////////////////////////////////////////////////////////////////////
// Compression
////////////////////////////////////////////////////////////////////////////////
// CompressionInfo is the compression metadata that MarshalMetadata encodes as
// a small header: the marker bytes "cmp", one algorithm byte and a uvarint
// size.
type CompressionInfo struct {
// Algorithm is the compression algorithm; it is encoded as a single byte.
Algorithm StoreCompression
// OriginalSize is encoded as a uvarint.
// NOTE(review): presumably the size of the data before compression - confirm
// against the callers.
OriginalSize uint64
}
// MarshalMetadata encodes c as a metadata header: the three marker bytes
// "cmp", one byte holding the algorithm, then OriginalSize as a uvarint.
// The returned slice is between 5 and 14 bytes long.
func (c *CompressionInfo) MarshalMetadata() []byte {
	// Marker (3) + algorithm (1), followed by the variable-length size.
	const hdrLen = 4

	meta := make([]byte, hdrLen+binary.MaxVarintLen64)
	copy(meta, "cmp")
	meta[3] = byte(c.Algorithm)
	n := binary.PutUvarint(meta[hdrLen:], c.OriginalSize)
	return meta[:hdrLen+n]
}
// UnmarshalMetadata decodes a metadata header from the start of b into c and
// returns the number of bytes the header occupies.
//
// c is always reset to NoCompression / size 0 first. If b is shorter than 5
// bytes or does not start with the "cmp" marker, no header is present and
// (0, nil) is returned. If the marker is present but the uvarint size cannot
// be decoded, an error is returned.
func (c *CompressionInfo) UnmarshalMetadata(b []byte) (int, error) {
	c.Algorithm, c.OriginalSize = NoCompression, 0

	// 4 header bytes + at least 1 byte of uvarint, starting with the marker.
	if len(b) < 5 || b[0] != 'c' || b[1] != 'm' || b[2] != 'p' {
		return 0, nil
	}

	c.Algorithm = StoreCompression(b[3])
	size, n := binary.Uvarint(b[4:])
	c.OriginalSize = size
	// n == 0 means the buffer was too small, n < 0 means the value overflowed.
	if n <= 0 {
		return 0, fmt.Errorf("metadata incomplete")
	}
	return 4 + n, nil
}
// Compress compresses buf with the algorithm alg.
//
// The final checksumSize bytes of buf are treated as a checksum: only the
// bytes before them are compressed, and the checksum is appended unchanged to
// the compressed output. With NoCompression buf is returned as-is.
// An error is returned when buf is shorter than checksumSize, when alg is not
// a known algorithm, or when compressing fails.
func (alg StoreCompression) Compress(buf []byte) ([]byte, error) {
	if len(buf) < checksumSize {
		return nil, fmt.Errorf("uncompressed buffer is too short")
	}

	var (
		out bytes.Buffer
		zw  io.WriteCloser
	)
	switch alg {
	case NoCompression:
		return buf, nil
	case S2Compression:
		zw = s2.NewWriter(&out)
	default:
		return nil, fmt.Errorf("compression algorithm not known")
	}

	// Split the block into the body to compress and the trailing checksum,
	// which is carried over verbatim.
	split := len(buf) - checksumSize
	body, trailer := buf[:split], buf[split:]
	want := int64(split)

	n, err := io.CopyN(zw, bytes.NewReader(body), want)
	if err != nil {
		return nil, fmt.Errorf("error writing to compression writer: %w", err)
	}
	if n != want {
		return nil, fmt.Errorf("short write on body (%d != %d)", n, want)
	}
	// Closing flushes whatever the compressor still has buffered.
	if err := zw.Close(); err != nil {
		return nil, fmt.Errorf("error closing compression writer: %w", err)
	}

	// Now put the checksum back onto the end of the block.
	m, err := out.Write(trailer)
	if err != nil {
		return nil, fmt.Errorf("error writing checksum: %w", err)
	}
	if m != checksumSize {
		return nil, fmt.Errorf("short write on checksum (%d != %d)", m, checksumSize)
	}
	return out.Bytes(), nil
}
// Decompress reverses Compress for the algorithm alg.
//
// The final checksumSize bytes of buf are treated as an uncompressed checksum:
// the bytes before them are decompressed and the checksum is appended
// unchanged to the result. With NoCompression buf is returned as-is.
// An error is returned when buf is shorter than checksumSize, when alg is not
// a known algorithm, or when decompressing fails.
func (alg StoreCompression) Decompress(buf []byte) ([]byte, error) {
	if len(buf) < checksumSize {
		return nil, fmt.Errorf("compressed buffer is too short")
	}

	// The checksum is not compressed, so keep it aside and only feed the
	// body to the decompressor.
	split := len(buf) - checksumSize
	body, trailer := buf[:split], buf[split:]

	var zr io.ReadCloser
	switch alg {
	case NoCompression:
		return buf, nil
	case S2Compression:
		zr = io.NopCloser(s2.NewReader(bytes.NewReader(body)))
	default:
		return nil, fmt.Errorf("compression algorithm not known")
	}

	plain, err := io.ReadAll(zr)
	if err != nil {
		return nil, fmt.Errorf("error reading compression reader: %w", err)
	}
	plain = append(plain, trailer...)
	return plain, zr.Close()
}