mirror of
https://github.com/gogrlx/nats-server.git
synced 2026-04-02 03:38:42 -07:00
Several strategies which are listed below. 1. Checking a RaftNode to see if it is the leader now uses atomics. 2. Checking if we are the JetStream meta leader from the server now uses an atomic. 3. Accessing the JetStream context no longer requires a server lock, uses atomic.Pointer. 4. Filestore syncBlocks would hold msgBlock locks during sync, now does not. Signed-off-by: Derek Collison <derek@nats.io>
8592 lines
209 KiB
Go
8592 lines
209 KiB
Go
// Copyright 2019-2023 The NATS Authors
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package server
|
|
|
|
import (
|
|
"archive/tar"
|
|
"bytes"
|
|
"crypto/aes"
|
|
"crypto/cipher"
|
|
"crypto/rand"
|
|
"crypto/sha256"
|
|
"encoding/binary"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"hash"
|
|
"io"
|
|
"math"
|
|
"net"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/klauspost/compress/s2"
|
|
"github.com/minio/highwayhash"
|
|
"github.com/nats-io/nats-server/v2/server/avl"
|
|
"golang.org/x/crypto/chacha20"
|
|
"golang.org/x/crypto/chacha20poly1305"
|
|
)
|
|
|
|
// FileStoreConfig holds the tunables used when creating a file-backed store.
type FileStoreConfig struct {
	// Where the parent directory for all storage will be located.
	StoreDir string
	// BlockSize is the file block size. This also represents the maximum overhead size.
	BlockSize uint64
	// CacheExpire is how long with no activity until we expire the cache.
	CacheExpire time.Duration
	// SyncInterval is how often we sync to disk in the background.
	SyncInterval time.Duration
	// SyncAlways is when the stream should sync all data writes.
	SyncAlways bool
	// AsyncFlush allows async flush to batch write operations.
	AsyncFlush bool
	// Cipher is the cipher to use when encrypting.
	Cipher StoreCipher
	// Compression is the algorithm to use when compressing.
	Compression StoreCompression

	// Internal reference to our server.
	srv *Server
}
|
|
|
|
// FileStreamInfo allows us to remember created time.
// It embeds the stream config so the pair can be persisted as one unit.
type FileStreamInfo struct {
	Created time.Time
	StreamConfig
}
|
|
|
|
// StoreCipher selects the cipher used for encryption at rest.
type StoreCipher int

const (
	// ChaCha uses ChaCha20-Poly1305.
	ChaCha StoreCipher = iota
	// AES uses AES-GCM.
	AES
	// NoCipher disables encryption.
	NoCipher
)
|
|
|
|
func (cipher StoreCipher) String() string {
|
|
switch cipher {
|
|
case ChaCha:
|
|
return "ChaCha20-Poly1305"
|
|
case AES:
|
|
return "AES-GCM"
|
|
case NoCipher:
|
|
return "None"
|
|
default:
|
|
return "Unknown StoreCipher"
|
|
}
|
|
}
|
|
|
|
// StoreCompression selects the compression algorithm for stored blocks.
type StoreCompression uint8

const (
	// NoCompression stores blocks uncompressed.
	NoCompression StoreCompression = iota
	// S2Compression uses the S2 (Snappy-compatible) algorithm.
	S2Compression
)
|
|
|
|
func (alg StoreCompression) String() string {
|
|
switch alg {
|
|
case NoCompression:
|
|
return "None"
|
|
case S2Compression:
|
|
return "S2"
|
|
default:
|
|
return "Unknown StoreCompression"
|
|
}
|
|
}
|
|
|
|
func (alg StoreCompression) MarshalJSON() ([]byte, error) {
|
|
var str string
|
|
switch alg {
|
|
case S2Compression:
|
|
str = "s2"
|
|
case NoCompression:
|
|
str = "none"
|
|
default:
|
|
return nil, fmt.Errorf("unknown compression algorithm")
|
|
}
|
|
return json.Marshal(str)
|
|
}
|
|
|
|
func (alg *StoreCompression) UnmarshalJSON(b []byte) error {
|
|
var str string
|
|
if err := json.Unmarshal(b, &str); err != nil {
|
|
return err
|
|
}
|
|
switch str {
|
|
case "s2":
|
|
*alg = S2Compression
|
|
case "none":
|
|
*alg = NoCompression
|
|
default:
|
|
return fmt.Errorf("unknown compression algorithm")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// FileConsumerInfo is used for creating consumer stores.
// It remembers the consumer's name and creation time along with its config.
type FileConsumerInfo struct {
	Created time.Time
	Name    string
	ConsumerConfig
}
|
|
|
|
// Default file and directory permissions.
const (
	defaultDirPerms  = os.FileMode(0750)
	defaultFilePerms = os.FileMode(0640)
)
|
|
|
|
// psi tracks per-subject information across message blocks.
type psi struct {
	total uint64 // Total messages for this subject.
	fblk  uint32 // First block index containing the subject — TODO confirm.
	lblk  uint32 // Last block index containing the subject — TODO confirm.
}
|
|
|
|
// fileStore is the file-backed implementation of a stream store.
// Mutable state is guarded by mu unless noted otherwise.
type fileStore struct {
	srv     *Server
	mu      sync.RWMutex
	state   StreamState          // Aggregate stream state (msgs, bytes, first/last seq).
	tombs   []uint64             // Tombstone sequences collected during recovery.
	ld      *LostStreamData      // Lost/corrupt data detected so far, nil if none.
	scb     StorageUpdateHandler // Storage update callback.
	ageChk  *time.Timer          // Timer driving MaxAge expiration checks.
	syncTmr *time.Timer          // Timer driving background syncBlocks.
	cfg     FileStreamInfo       // Stream config plus created time.
	fcfg    FileStoreConfig
	prf     keyGen      // PRF for encryption keys; nil means no encryption.
	oldprf  keyGen      // Previous PRF, used during cipher/key conversion.
	aek     cipher.AEAD // Asset encryption key for stream metadata.
	lmb     *msgBlock   // Last (write) message block.
	blks    []*msgBlock // All message blocks in order.
	bim     map[uint32]*msgBlock // Block index -> block.
	psim    map[string]*psi      // Per-subject info, see populateGlobalPerSubjectInfo.
	hh      hash.Hash64          // Highwayhash used for meta checksums.
	qch     chan struct{}        // Quit channel, passed to flushStreamStateLoop.
	fch     chan struct{}        // Flush kick channel (buffered, size 1).
	fsld    chan struct{}        // Used by flushStreamStateLoop for shutdown signaling.
	cfs     []ConsumerStore      // Consumer stores attached to this stream.
	sips    int                  // Presumably snapshots in progress — TODO confirm.
	dirty   int                  // Changes since last full stream-state write.
	closed  bool
	fip     bool // Flush in place (the inverse of AsyncFlush).
	receivedAny bool
}
|
|
|
|
// Represents a message store block and its data.
type msgBlock struct {
	// Here for 32bit systems and atomic.
	first msgId // First message seq/timestamp in this block.
	last  msgId // Last message seq/timestamp in this block.
	mu    sync.RWMutex
	fs    *fileStore
	aek   cipher.AEAD   // Asset encryption key for this block.
	bek   cipher.Stream // Block encryption stream cipher.
	seed  []byte        // Encryption key seed.
	nonce []byte        // Encryption nonce.
	mfn   string        // Message block file name.
	mfd   *os.File      // Message block file descriptor.
	cmp   StoreCompression // Effective compression at the time of loading the block
	liwsz int64            // NOTE(review): presumably last index write size — confirm.
	index uint32           // Block index (used in file names).
	bytes uint64           // User visible bytes count.
	rbytes uint64          // Total bytes (raw) including deleted. Used for rolling to new blk.
	msgs  uint64           // User visible message count.
	fss   map[string]*SimpleState // Per-subject state within this block.
	kfn   string                  // Presumably the key file name — confirm.
	lwts  int64                   // Presumably last write timestamp — confirm.
	llts  int64                   // Presumably last load timestamp — confirm.
	lrts  int64                   // Presumably last read timestamp — confirm.
	llseq uint64
	hh    hash.Hash64 // Highwayhash for record checksums.
	cache *cache      // Write-through read cache, see type cache.
	cloads uint64     // Presumably number of cache loads — confirm.
	cexp  time.Duration // Cache expiration interval.
	ctmr  *time.Timer   // Cache expiration timer.
	werr  error         // Presumably last write error — confirm.
	dmap  avl.SequenceSet // Deleted message sequences.
	fch   chan struct{}
	qch   chan struct{}
	lchk  [8]byte // Last checksum, see ensureLastChecksumLoaded.
	loading    bool
	flusher    bool
	noTrack    bool // When set, per-subject tracking is disabled.
	needSync   bool
	syncAlways bool
	closed     bool

	// Used to mock write failures.
	mockWriteErr bool
}
|
|
|
|
// Write through caching layer that is also used on loading messages.
type cache struct {
	buf  []byte   // Raw cached block data.
	off  int      // Presumably byte offset of buf within the block file — confirm.
	wp   int      // Presumably write position within buf — confirm.
	idx  []uint32 // Presumably per-message offsets into buf — confirm.
	lrl  uint32   // Presumably last record length — confirm.
	fseq uint64   // Presumably first sequence represented in the cache — confirm.
	nra  bool
}
|
|
|
|
// msgId pairs a message sequence with its timestamp (UnixNano).
type msgId struct {
	seq uint64
	ts  int64
}
|
|
|
|
const (
	// Magic is used to identify the file store files.
	magic = uint8(22)
	// Version
	version = uint8(1)
	// New IndexInfo Version
	newVersion = uint8(2)
	// hdrLen is the header length (magic byte plus version byte).
	hdrLen = 2
	// This is where we keep the streams.
	streamsDir = "streams"
	// This is where we keep the message store blocks.
	msgDir = "msgs"
	// This is where we temporarily move the messages dir.
	purgeDir = "__msgs__"
	// used to scan blk file names.
	blkScan = "%d.blk"
	// used for compacted blocks that are staged.
	newScan = "%d.new"
	// used to scan index file names.
	indexScan = "%d.idx"
	// to look for orphans
	indexScanAll = "*.idx"
	// to look for orphans
	fssScanAll = "*.fss"
	// used to store our block encryption key.
	keyScan = "%d.key"
	// to look for orphans
	keyScanAll = "*.key"
	// This is where we keep state on consumers.
	consumerDir = "obs"
	// Index file for a consumer.
	consumerState = "o.dat"
	// The suffix that will be given to a new temporary block during compression.
	compressTmpSuffix = ".tmp"
	// This is where we keep state on templates.
	tmplsDir = "templates"
	// Maximum size of a write buffer we may consider for re-use.
	maxBufReuse = 2 * 1024 * 1024
	// default cache buffer expiration
	defaultCacheBufferExpiration = 5 * time.Second
	// default sync interval
	defaultSyncInterval = 2 * time.Minute
	// default idle timeout to close FDs.
	closeFDsIdle = 30 * time.Second
	// coalesceMinimum
	coalesceMinimum = 16 * 1024
	// maxFlushWait is maximum we will wait to gather messages to flush.
	maxFlushWait = 8 * time.Millisecond

	// Metafiles for streams and consumers.
	JetStreamMetaFile    = "meta.inf"
	JetStreamMetaFileSum = "meta.sum"
	JetStreamMetaFileKey = "meta.key"

	// This is the full snapshotted state for the stream.
	streamStreamStateFile = "index.db"

	// AEK key sizes
	minMetaKeySize = 64
	minBlkKeySize  = 64

	// Default stream block size.
	defaultLargeBlockSize = 8 * 1024 * 1024 // 8MB
	// Default for workqueue or interest based.
	defaultMediumBlockSize = 4 * 1024 * 1024 // 4MB
	// For smaller reuse buffers. Usually being generated during contention on the lead write buffer.
	// E.g. mirrors/sources etc.
	defaultSmallBlockSize = 1 * 1024 * 1024 // 1MB
	// Maximum size for the encrypted head block.
	maximumEncryptedBlockSize = 2 * 1024 * 1024 // 2MB
	// Default for KV based
	defaultKVBlockSize = defaultMediumBlockSize
	// max block size for now.
	maxBlockSize = defaultLargeBlockSize
	// Compact minimum threshold.
	compactMinimum = 2 * 1024 * 1024 // 2MB
	// FileStoreMinBlkSize is minimum size we will do for a blk size.
	FileStoreMinBlkSize = 32 * 1000 // 32kb (decimal, not KiB)
	// FileStoreMaxBlkSize is maximum size we will do for a blk size.
	FileStoreMaxBlkSize = maxBlockSize
	// Check for bad record length value due to corrupt data.
	rlBadThresh = 32 * 1024 * 1024
	// Checksum size for hash for msg records.
	recordHashSize = 8
)
|
|
|
|
// newFileStore creates a file store for the given stream config with the
// creation time set to now (UTC) and encryption disabled (no PRFs).
func newFileStore(fcfg FileStoreConfig, cfg StreamConfig) (*fileStore, error) {
	return newFileStoreWithCreated(fcfg, cfg, time.Now().UTC(), nil, nil)
}
|
|
|
|
// newFileStoreWithCreated creates (or recovers) the file store for the stream
// described by cfg, using created as the stream creation time. prf enables
// encryption at rest when non-nil; oldprf supports key/cipher conversion.
// It validates config, probes the storage directory, recovers prior state,
// enforces limits, and starts the background sync and flush loops.
func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created time.Time, prf, oldprf keyGen) (*fileStore, error) {
	if cfg.Name == _EMPTY_ {
		return nil, fmt.Errorf("name required")
	}
	if cfg.Storage != FileStorage {
		return nil, fmt.Errorf("fileStore requires file storage type in config")
	}
	// Default values.
	if fcfg.BlockSize == 0 {
		fcfg.BlockSize = dynBlkSize(cfg.Retention, cfg.MaxBytes, prf != nil)
	}
	if fcfg.BlockSize > maxBlockSize {
		return nil, fmt.Errorf("filestore max block size is %s", friendlyBytes(maxBlockSize))
	}
	if fcfg.CacheExpire == 0 {
		fcfg.CacheExpire = defaultCacheBufferExpiration
	}
	if fcfg.SyncInterval == 0 {
		fcfg.SyncInterval = defaultSyncInterval
	}

	// Check the directory
	if stat, err := os.Stat(fcfg.StoreDir); os.IsNotExist(err) {
		if err := os.MkdirAll(fcfg.StoreDir, defaultDirPerms); err != nil {
			return nil, fmt.Errorf("could not create storage directory - %v", err)
		}
	} else if stat == nil || !stat.IsDir() {
		return nil, fmt.Errorf("storage directory is not a directory")
	}
	// Probe writability by creating, then removing, a temp file.
	tmpfile, err := os.CreateTemp(fcfg.StoreDir, "_test_")
	if err != nil {
		return nil, fmt.Errorf("storage directory is not writable")
	}

	tmpfile.Close()
	// NOTE(review): dios appears to be a disk-I/O limiting semaphore
	// declared elsewhere — confirm against its declaration.
	<-dios
	os.Remove(tmpfile.Name())
	dios <- struct{}{}

	fs := &fileStore{
		fcfg: fcfg,
		psim: make(map[string]*psi),
		bim:  make(map[uint32]*msgBlock),
		cfg:  FileStreamInfo{Created: created, StreamConfig: cfg},
		prf:  prf,
		oldprf: oldprf,
		qch:  make(chan struct{}),
		fch:  make(chan struct{}, 1),
		fsld: make(chan struct{}),
		srv:  fcfg.srv,
	}

	// Set flush in place to AsyncFlush which by default is false.
	fs.fip = !fcfg.AsyncFlush

	// Check if this is a new setup.
	mdir := filepath.Join(fcfg.StoreDir, msgDir)
	odir := filepath.Join(fcfg.StoreDir, consumerDir)
	if err := os.MkdirAll(mdir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create message storage directory - %v", err)
	}
	if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create consumer storage directory - %v", err)
	}

	// Create highway hash for message blocks. Use sha256 of directory as key.
	key := sha256.Sum256([]byte(cfg.Name))
	fs.hh, err = highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}

	keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
	// Make sure we do not have an encrypted store underneath of us but no main key.
	if fs.prf == nil {
		if _, err := os.Stat(keyFile); err == nil {
			return nil, errNoMainKey
		}
	}

	// Attempt to recover our state.
	err = fs.recoverFullState()
	if err != nil {
		// Hold onto state
		prior := fs.state
		// Reset anything that could have been set from above.
		fs.state = StreamState{}
		fs.psim = make(map[string]*psi)
		fs.bim = make(map[uint32]*msgBlock)
		fs.blks = nil
		fs.tombs = nil

		// Recover our message state the old way
		if err := fs.recoverMsgs(); err != nil {
			return nil, err
		}

		// Check if our prior remember a last past where we can see.
		if fs.ld != nil && prior.LastSeq > fs.state.LastSeq {
			fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime
			// Record the prior last sequence as a tombstone in a fresh block.
			if lmb, err := fs.newMsgBlockForWrite(); err == nil {
				lmb.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano())
			} else {
				return nil, err
			}
		}
		// Since we recovered here, make sure to kick ourselves to write out our stream state.
		fs.dirty++
		defer fs.kickFlushStateLoop()
	}

	// Also make sure we get rid of old idx and fss files on return.
	// Do this in separate go routine vs inline and at end of processing.
	defer func() {
		go func() {
			os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, indexScanAll))
			os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, fssScanAll))
		}()
	}()

	// Lock while do enforcements and removals.
	fs.mu.Lock()

	// Check if we have any left over tombstones to process.
	if len(fs.tombs) > 0 {
		for _, seq := range fs.tombs {
			fs.removeMsg(seq, false, false, false)
			fs.removeFromLostData(seq)
		}
		// Not needed after this phase.
		fs.tombs = nil
	}

	// Limits checks and enforcement.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Do age checks too, make sure to call in place.
	if fs.cfg.MaxAge != 0 {
		fs.expireMsgsOnRecover()
		fs.startAgeChk()
	}

	// If we have max msgs per subject make sure the is also enforced.
	if fs.cfg.MaxMsgsPer > 0 {
		fs.enforceMsgPerSubjectLimit()
	}

	// Grab first sequence for check below while we have lock.
	firstSeq := fs.state.FirstSeq
	fs.mu.Unlock()

	// If the stream has an initial sequence number then make sure we
	// have purged up until that point. We will do this only if the
	// recovered first sequence number is before our configured first
	// sequence. Need to do this locked as by now the age check timer
	// has started.
	if cfg.FirstSeq > 0 && firstSeq <= cfg.FirstSeq {
		if _, err := fs.purge(cfg.FirstSeq); err != nil {
			return nil, err
		}
	}

	// Write our meta data if it does not exist or is zero'd out.
	meta := filepath.Join(fcfg.StoreDir, JetStreamMetaFile)
	fi, err := os.Stat(meta)
	if err != nil && os.IsNotExist(err) || fi != nil && fi.Size() == 0 {
		if err := fs.writeStreamMeta(); err != nil {
			return nil, err
		}
	}

	// If we expect to be encrypted check that what we are restoring is not plaintext.
	// This can happen on snapshot restores or conversions.
	if fs.prf != nil {
		if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
			if err := fs.writeStreamMeta(); err != nil {
				return nil, err
			}
		}
	}

	fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks)

	// Spin up the go routine that will write out or full state stream index.
	go fs.flushStreamStateLoop(fs.fch, fs.qch, fs.fsld)

	return fs, nil
}
|
|
|
|
// Lock all existing message blocks.
|
|
// Lock held on entry.
|
|
func (fs *fileStore) lockAllMsgBlocks() {
|
|
for _, mb := range fs.blks {
|
|
mb.mu.Lock()
|
|
}
|
|
}
|
|
|
|
// Unlock all existing message blocks.
|
|
// Lock held on entry.
|
|
func (fs *fileStore) unlockAllMsgBlocks() {
|
|
for _, mb := range fs.blks {
|
|
mb.mu.Unlock()
|
|
}
|
|
}
|
|
|
|
// UpdateConfig swaps in a new stream configuration, persists the stream
// metadata, and re-applies limit enforcement (msg/byte limits, age timers,
// per-subject limits). On a metadata write failure the old config is restored.
func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error {
	if fs.isClosed() {
		return ErrStoreClosed
	}
	if cfg.Name == _EMPTY_ {
		return fmt.Errorf("name required")
	}
	if cfg.Storage != FileStorage {
		return fmt.Errorf("fileStore requires file storage type in config")
	}

	fs.mu.Lock()
	new_cfg := FileStreamInfo{Created: fs.cfg.Created, StreamConfig: *cfg}
	old_cfg := fs.cfg
	// Messages block reference fs.cfg.Subjects (in subjString) under the
	// mb's lock, not fs' lock. So do the switch here under all existing
	// message blocks' lock in order to silence the DATA RACE detector.
	fs.lockAllMsgBlocks()
	fs.cfg = new_cfg
	fs.unlockAllMsgBlocks()
	if err := fs.writeStreamMeta(); err != nil {
		// Roll back to the old config, again under all block locks.
		fs.lockAllMsgBlocks()
		fs.cfg = old_cfg
		fs.unlockAllMsgBlocks()
		fs.mu.Unlock()
		return err
	}

	// Limits checks and enforcement.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Do age timers.
	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
		fs.startAgeChk()
	}
	if fs.ageChk != nil && fs.cfg.MaxAge == 0 {
		fs.ageChk.Stop()
		fs.ageChk = nil
	}

	// Only re-enforce per-subject limits when they tightened.
	if fs.cfg.MaxMsgsPer > 0 && fs.cfg.MaxMsgsPer < old_cfg.MaxMsgsPer {
		fs.enforceMsgPerSubjectLimit()
	}
	fs.mu.Unlock()

	// Expire outside the fs lock.
	if cfg.MaxAge != 0 {
		fs.expireMsgs()
	}
	return nil
}
|
|
|
|
func dynBlkSize(retention RetentionPolicy, maxBytes int64, encrypted bool) uint64 {
|
|
if maxBytes > 0 {
|
|
blkSize := (maxBytes / 4) + 1 // (25% overhead)
|
|
// Round up to nearest 100
|
|
if m := blkSize % 100; m != 0 {
|
|
blkSize += 100 - m
|
|
}
|
|
if blkSize <= FileStoreMinBlkSize {
|
|
blkSize = FileStoreMinBlkSize
|
|
} else if blkSize >= FileStoreMaxBlkSize {
|
|
blkSize = FileStoreMaxBlkSize
|
|
} else {
|
|
blkSize = defaultMediumBlockSize
|
|
}
|
|
if encrypted && blkSize > maximumEncryptedBlockSize {
|
|
// Notes on this below.
|
|
blkSize = maximumEncryptedBlockSize
|
|
}
|
|
return uint64(blkSize)
|
|
}
|
|
|
|
switch {
|
|
case encrypted:
|
|
// In the case of encrypted stores, large blocks can result in worsened perf
|
|
// since many writes on disk involve re-encrypting the entire block. For now,
|
|
// we will enforce a cap on the block size when encryption is enabled to avoid
|
|
// this.
|
|
return maximumEncryptedBlockSize
|
|
case retention == LimitsPolicy:
|
|
// TODO(dlc) - Make the blocksize relative to this if set.
|
|
return defaultLargeBlockSize
|
|
default:
|
|
// TODO(dlc) - Make the blocksize relative to this if set.
|
|
return defaultMediumBlockSize
|
|
}
|
|
}
|
|
|
|
func genEncryptionKey(sc StoreCipher, seed []byte) (ek cipher.AEAD, err error) {
|
|
if sc == ChaCha {
|
|
ek, err = chacha20poly1305.NewX(seed)
|
|
} else if sc == AES {
|
|
block, e := aes.NewCipher(seed)
|
|
if e != nil {
|
|
return nil, err
|
|
}
|
|
ek, err = cipher.NewGCMWithNonceSize(block, block.BlockSize())
|
|
} else {
|
|
err = errUnknownCipher
|
|
}
|
|
return ek, err
|
|
}
|
|
|
|
// Generate an asset encryption key from the context and server PRF.
|
|
func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cipher.Stream, seed, encrypted []byte, err error) {
|
|
if fs.prf == nil {
|
|
return nil, nil, nil, nil, errNoEncryption
|
|
}
|
|
// Generate key encryption key.
|
|
rb, err := fs.prf([]byte(context))
|
|
if err != nil {
|
|
return nil, nil, nil, nil, err
|
|
}
|
|
|
|
sc := fs.fcfg.Cipher
|
|
|
|
kek, err := genEncryptionKey(sc, rb)
|
|
if err != nil {
|
|
return nil, nil, nil, nil, err
|
|
}
|
|
// Generate random asset encryption key seed.
|
|
|
|
const seedSize = 32
|
|
seed = make([]byte, seedSize)
|
|
if n, err := rand.Read(seed); err != nil || n != seedSize {
|
|
return nil, nil, nil, nil, err
|
|
}
|
|
|
|
aek, err = genEncryptionKey(sc, seed)
|
|
if err != nil {
|
|
return nil, nil, nil, nil, err
|
|
}
|
|
|
|
// Generate our nonce. Use same buffer to hold encrypted seed.
|
|
nonce := make([]byte, kek.NonceSize(), kek.NonceSize()+len(seed)+kek.Overhead())
|
|
rand.Read(nonce)
|
|
|
|
bek, err = genBlockEncryptionKey(sc, seed[:], nonce)
|
|
if err != nil {
|
|
return nil, nil, nil, nil, err
|
|
}
|
|
|
|
return aek, bek, seed, kek.Seal(nonce, nonce, seed, nil), nil
|
|
}
|
|
|
|
// Will generate the block encryption key.
|
|
func genBlockEncryptionKey(sc StoreCipher, seed, nonce []byte) (cipher.Stream, error) {
|
|
if sc == ChaCha {
|
|
return chacha20.NewUnauthenticatedCipher(seed, nonce)
|
|
} else if sc == AES {
|
|
block, err := aes.NewCipher(seed)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return cipher.NewCTR(block, nonce), nil
|
|
}
|
|
return nil, errUnknownCipher
|
|
}
|
|
|
|
// Lock should be held.
|
|
func (fs *fileStore) recoverAEK() error {
|
|
if fs.prf != nil && fs.aek == nil {
|
|
ekey, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rb, err := fs.prf([]byte(fs.cfg.Name))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
kek, err := genEncryptionKey(fs.fcfg.Cipher, rb)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
ns := kek.NonceSize()
|
|
seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
aek, err := genEncryptionKey(fs.fcfg.Cipher, seed)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
fs.aek = aek
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Lock should be held.
|
|
func (fs *fileStore) setupAEK() error {
|
|
if fs.prf != nil && fs.aek == nil {
|
|
key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
|
|
if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
|
|
return err
|
|
}
|
|
if err := os.WriteFile(keyFile, encrypted, defaultFilePerms); err != nil {
|
|
return err
|
|
}
|
|
// Set our aek.
|
|
fs.aek = key
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Write out meta and the checksum.
|
|
// Lock should be held.
|
|
func (fs *fileStore) writeStreamMeta() error {
|
|
if err := fs.setupAEK(); err != nil {
|
|
return err
|
|
}
|
|
|
|
meta := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile)
|
|
if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) {
|
|
return err
|
|
}
|
|
b, err := json.Marshal(fs.cfg)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Encrypt if needed.
|
|
if fs.aek != nil {
|
|
nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(b)+fs.aek.Overhead())
|
|
rand.Read(nonce)
|
|
b = fs.aek.Seal(nonce, nonce, b, nil)
|
|
}
|
|
|
|
if err := os.WriteFile(meta, b, defaultFilePerms); err != nil {
|
|
return err
|
|
}
|
|
fs.hh.Reset()
|
|
fs.hh.Write(b)
|
|
checksum := hex.EncodeToString(fs.hh.Sum(nil))
|
|
sum := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileSum)
|
|
if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Pools to recycle the blocks to help with memory pressure.
|
|
var blkPoolBig sync.Pool // 16MB
|
|
var blkPoolMedium sync.Pool // 8MB
|
|
var blkPoolSmall sync.Pool // 2MB
|
|
|
|
// Get a new msg block based on sz estimate.
|
|
func getMsgBlockBuf(sz int) (buf []byte) {
|
|
var pb interface{}
|
|
if sz <= defaultSmallBlockSize {
|
|
pb = blkPoolSmall.Get()
|
|
} else if sz <= defaultMediumBlockSize {
|
|
pb = blkPoolMedium.Get()
|
|
} else {
|
|
pb = blkPoolBig.Get()
|
|
}
|
|
if pb != nil {
|
|
buf = *(pb.(*[]byte))
|
|
} else {
|
|
// Here we need to make a new blk.
|
|
// If small leave as is..
|
|
if sz > defaultSmallBlockSize && sz <= defaultMediumBlockSize {
|
|
sz = defaultMediumBlockSize
|
|
} else if sz > defaultMediumBlockSize {
|
|
sz = defaultLargeBlockSize
|
|
}
|
|
buf = make([]byte, sz)
|
|
}
|
|
return buf[:0]
|
|
}
|
|
|
|
// Recycle the msg block.
|
|
func recycleMsgBlockBuf(buf []byte) {
|
|
if buf == nil || cap(buf) < defaultSmallBlockSize {
|
|
return
|
|
}
|
|
// Make sure to reset before placing back into pool.
|
|
buf = buf[:0]
|
|
|
|
// We need to make sure the load code gets a block that can fit the maximum for a size block.
|
|
// E.g. 8, 16 etc. otherwise we thrash and actually make things worse by pulling it out, and putting
|
|
// it right back in and making a new []byte.
|
|
// From above we know its already >= defaultSmallBlockSize
|
|
if sz := cap(buf); sz < defaultMediumBlockSize {
|
|
blkPoolSmall.Put(&buf)
|
|
} else if sz < defaultLargeBlockSize {
|
|
blkPoolMedium.Put(&buf)
|
|
} else {
|
|
blkPoolBig.Put(&buf)
|
|
}
|
|
}
|
|
|
|
// Sizes for message record layout on disk.
const (
	msgHdrSize     = 22
	checksumSize   = 8
	emptyRecordLen = msgHdrSize + checksumSize
)
|
|
|
|
// Lock should be held.
|
|
func (fs *fileStore) noTrackSubjects() bool {
|
|
return !(len(fs.psim) > 0 || len(fs.cfg.Subjects) > 0 || fs.cfg.Mirror != nil || len(fs.cfg.Sources) > 0)
|
|
}
|
|
|
|
// Will init the basics for a message block.
|
|
func (fs *fileStore) initMsgBlock(index uint32) *msgBlock {
|
|
mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects(), syncAlways: fs.fcfg.SyncAlways}
|
|
|
|
mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
|
|
mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, index))
|
|
|
|
if mb.hh == nil {
|
|
key := sha256.Sum256(fs.hashKeyForBlock(index))
|
|
mb.hh, _ = highwayhash.New64(key[:])
|
|
}
|
|
return mb
|
|
}
|
|
|
|
// Lock for fs should be held.
|
|
func (fs *fileStore) loadEncryptionForMsgBlock(mb *msgBlock) error {
|
|
if fs.prf == nil {
|
|
return nil
|
|
}
|
|
|
|
var createdKeys bool
|
|
mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
|
|
ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)))
|
|
if err != nil {
|
|
// We do not seem to have keys even though we should. Could be a plaintext conversion.
|
|
// Create the keys and we will double check below.
|
|
if err := fs.genEncryptionKeysForBlock(mb); err != nil {
|
|
return err
|
|
}
|
|
createdKeys = true
|
|
} else {
|
|
if len(ekey) < minBlkKeySize {
|
|
return errBadKeySize
|
|
}
|
|
// Recover key encryption key.
|
|
rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
sc := fs.fcfg.Cipher
|
|
kek, err := genEncryptionKey(sc, rb)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
ns := kek.NonceSize()
|
|
seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
|
|
if err != nil {
|
|
// We may be here on a cipher conversion, so attempt to convert.
|
|
if err = mb.convertCipher(); err != nil {
|
|
return err
|
|
}
|
|
} else {
|
|
mb.seed, mb.nonce = seed, ekey[:ns]
|
|
}
|
|
mb.aek, err = genEncryptionKey(sc, mb.seed)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// If we created keys here, let's check the data and if it is plaintext convert here.
|
|
if createdKeys {
|
|
if err := mb.convertToEncrypted(); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Load a last checksum if needed from the block file.
|
|
// Lock should be held.
|
|
func (mb *msgBlock) ensureLastChecksumLoaded() {
|
|
var empty [8]byte
|
|
if mb.lchk != empty {
|
|
return
|
|
}
|
|
copy(mb.lchk[0:], mb.lastChecksum())
|
|
}
|
|
|
|
// Lock held on entry
|
|
func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) {
|
|
mb := fs.initMsgBlock(index)
|
|
|
|
// Open up the message file, but we will try to recover from the index file.
|
|
// We will check that the last checksums match.
|
|
file, err := os.Open(mb.mfn)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer file.Close()
|
|
|
|
if fi, err := file.Stat(); fi != nil {
|
|
mb.rbytes = uint64(fi.Size())
|
|
} else {
|
|
return nil, err
|
|
}
|
|
|
|
// Make sure encryption loaded if needed.
|
|
fs.loadEncryptionForMsgBlock(mb)
|
|
|
|
// Grab last checksum from main block file.
|
|
var lchk [8]byte
|
|
if mb.rbytes >= checksumSize {
|
|
if mb.bek != nil {
|
|
if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize {
|
|
mb.bek.XORKeyStream(buf, buf)
|
|
copy(lchk[0:], buf[len(buf)-checksumSize:])
|
|
}
|
|
} else {
|
|
file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
|
|
}
|
|
}
|
|
|
|
file.Close()
|
|
|
|
// Read our index file. Use this as source of truth if possible.
|
|
if err := mb.readIndexInfo(); err == nil {
|
|
// Quick sanity check here.
|
|
// Note this only checks that the message blk file is not newer then this file, or is empty and we expect empty.
|
|
if (mb.rbytes == 0 && mb.msgs == 0) || bytes.Equal(lchk[:], mb.lchk[:]) {
|
|
if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
|
|
fs.populateGlobalPerSubjectInfo(mb)
|
|
// Try to dump any state we needed on recovery.
|
|
mb.tryForceExpireCacheLocked()
|
|
}
|
|
fs.addMsgBlock(mb)
|
|
return mb, nil
|
|
}
|
|
}
|
|
|
|
// If we get data loss rebuilding the message block state record that with the fs itself.
|
|
ld, tombs, _ := mb.rebuildState()
|
|
if ld != nil {
|
|
fs.addLostData(ld)
|
|
}
|
|
// Collect all tombstones.
|
|
if len(tombs) > 0 {
|
|
fs.tombs = append(fs.tombs, tombs...)
|
|
}
|
|
|
|
if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
|
|
fs.populateGlobalPerSubjectInfo(mb)
|
|
// Try to dump any state we needed on recovery.
|
|
mb.tryForceExpireCacheLocked()
|
|
}
|
|
|
|
mb.closeFDs()
|
|
fs.addMsgBlock(mb)
|
|
|
|
return mb, nil
|
|
}
|
|
|
|
func (fs *fileStore) lostData() *LostStreamData {
|
|
fs.mu.RLock()
|
|
defer fs.mu.RUnlock()
|
|
if fs.ld == nil {
|
|
return nil
|
|
}
|
|
nld := *fs.ld
|
|
return &nld
|
|
}
|
|
|
|
// Lock should be held.
|
|
func (fs *fileStore) addLostData(ld *LostStreamData) {
|
|
if ld == nil {
|
|
return
|
|
}
|
|
if fs.ld != nil {
|
|
var added bool
|
|
for _, seq := range ld.Msgs {
|
|
if _, found := fs.ld.exists(seq); !found {
|
|
fs.ld.Msgs = append(fs.ld.Msgs, seq)
|
|
added = true
|
|
}
|
|
}
|
|
if added {
|
|
msgs := fs.ld.Msgs
|
|
sort.Slice(msgs, func(i, j int) bool { return msgs[i] < msgs[j] })
|
|
fs.ld.Bytes += ld.Bytes
|
|
}
|
|
} else {
|
|
fs.ld = ld
|
|
}
|
|
}
|
|
|
|
// Helper to see if we already have this sequence reported in our lost data.
|
|
func (ld *LostStreamData) exists(seq uint64) (int, bool) {
|
|
i, found := sort.Find(len(ld.Msgs), func(i int) int {
|
|
tseq := ld.Msgs[i]
|
|
if tseq < seq {
|
|
return -1
|
|
}
|
|
if tseq > seq {
|
|
return +1
|
|
}
|
|
return 0
|
|
})
|
|
return i, found
|
|
}
|
|
|
|
func (fs *fileStore) removeFromLostData(seq uint64) {
|
|
if fs.ld == nil {
|
|
return
|
|
}
|
|
if i, found := fs.ld.exists(seq); found {
|
|
fs.ld.Msgs = append(fs.ld.Msgs[:i], fs.ld.Msgs[i+1:]...)
|
|
if len(fs.ld.Msgs) == 0 {
|
|
fs.ld = nil
|
|
}
|
|
}
|
|
}
|
|
|
|
// rebuildState recomputes the aggregate stream state from the message
// blocks, recording any lost data. Acquires the fs lock.
func (fs *fileStore) rebuildState(ld *LostStreamData) {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	fs.rebuildStateLocked(ld)
}
|
|
|
|
// Lock should be held.
|
|
func (fs *fileStore) rebuildStateLocked(ld *LostStreamData) {
|
|
fs.addLostData(ld)
|
|
|
|
fs.state.Msgs, fs.state.Bytes = 0, 0
|
|
fs.state.FirstSeq, fs.state.LastSeq = 0, 0
|
|
|
|
for _, mb := range fs.blks {
|
|
mb.mu.RLock()
|
|
fs.state.Msgs += mb.msgs
|
|
fs.state.Bytes += mb.bytes
|
|
if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq {
|
|
fs.state.FirstSeq = mb.first.seq
|
|
fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
|
|
}
|
|
fs.state.LastSeq = mb.last.seq
|
|
fs.state.LastTime = time.Unix(0, mb.last.ts).UTC()
|
|
mb.mu.RUnlock()
|
|
}
|
|
}
|
|
|
|
// Attempt to convert the cipher used for this message block.
// The block on disk is assumed encrypted with the "other" cipher (osc). We try
// each available prf with both ciphers until one recovers the keys, decrypt the
// block, generate fresh keys under the configured cipher, and rewrite the block.
// On key-generation failure the original keyfile is restored.
func (mb *msgBlock) convertCipher() error {
	fs := mb.fs
	sc := fs.fcfg.Cipher

	// osc is the opposite cipher from the one currently configured.
	var osc StoreCipher
	switch sc {
	case ChaCha:
		osc = AES
	case AES:
		osc = ChaCha
	}

	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)))
	if err != nil {
		return err
	}
	if len(ekey) < minBlkKeySize {
		return errBadKeySize
	}
	// Pair each available prf with each cipher so we can try all combinations.
	type prfWithCipher struct {
		keyGen
		StoreCipher
	}
	var prfs []prfWithCipher
	if fs.prf != nil {
		prfs = append(prfs, prfWithCipher{fs.prf, sc})
		prfs = append(prfs, prfWithCipher{fs.prf, osc})
	}
	if fs.oldprf != nil {
		prfs = append(prfs, prfWithCipher{fs.oldprf, sc})
		prfs = append(prfs, prfWithCipher{fs.oldprf, osc})
	}

	for _, prf := range prfs {
		// Recover key encryption key.
		rb, err := prf.keyGen([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
		if err != nil {
			continue
		}
		kek, err := genEncryptionKey(prf.StoreCipher, rb)
		if err != nil {
			continue
		}
		ns := kek.NonceSize()
		// Unseal the block seed with the kek; failure means wrong prf/cipher combo.
		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
		if err != nil {
			continue
		}
		nonce := ekey[:ns]
		bek, err := genBlockEncryptionKey(prf.StoreCipher, seed, nonce)
		if err != nil {
			return err
		}

		// Decrypt the whole block in place with the recovered block key.
		buf, _ := mb.loadBlock(nil)
		bek.XORKeyStream(buf, buf)
		// Make sure we can parse with old cipher and key file.
		if err = mb.indexCacheBuf(buf); err != nil {
			return err
		}
		// Reset the cache since we just read everything in.
		mb.cache = nil

		// Generate new keys. If we error for some reason then we will put
		// the old keyfile back.
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))
			os.WriteFile(keyFile, ekey, defaultFilePerms)
			return err
		}
		// Re-encrypt with the new block key and persist.
		mb.bek.XORKeyStream(buf, buf)
		if err := os.WriteFile(mb.mfn, buf, defaultFilePerms); err != nil {
			return err
		}
		return nil
	}
	return fmt.Errorf("unable to recover keys")
}
|
|
|
|
// Convert a plaintext block to encrypted.
// No-op if the block has no encryption key. Verifies the block parses as
// plaintext first; an indexCacheBuf failure likely means the block is already
// encrypted (or corrupt), in which case it is left untouched.
func (mb *msgBlock) convertToEncrypted() error {
	if mb.bek == nil {
		return nil
	}
	buf, err := mb.loadBlock(nil)
	if err != nil {
		return err
	}
	if err := mb.indexCacheBuf(buf); err != nil {
		// This likely indicates this was already encrypted or corrupt.
		mb.cache = nil
		return err
	}
	// Undo cache from above for later.
	mb.cache = nil
	// Encrypt in place and rewrite the block file.
	mb.bek.XORKeyStream(buf, buf)
	if err := os.WriteFile(mb.mfn, buf, defaultFilePerms); err != nil {
		return err
	}
	return nil
}
|
|
|
|
// Rebuild the state of the blk based on what we have on disk in the N.blk file.
// We will return any lost data, and we will return any delete tombstones we encountered.
// This is the locked entry point for rebuildStateLocked.
func (mb *msgBlock) rebuildState() (*LostStreamData, []uint64, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.rebuildStateLocked()
}
|
|
|
|
// Rebuild the state of the blk based on what we have on disk in the N.blk file.
// Walks every record in the block, verifying checksums, rebuilding msgs/bytes,
// first/last, the delete map and the per-subject index. Truncates the file at
// the first corrupt record and reports the tail as lost data. Also collects any
// delete tombstones encountered.
// Lock should be held.
func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) {
	startLastSeq := mb.last.seq

	// Remove the .fss file and clear any cache we have set.
	mb.clearCacheAndOffset()

	buf, err := mb.loadBlock(nil)
	if err != nil || len(buf) == 0 {
		var ld *LostStreamData
		// No data to rebuild from here.
		if mb.msgs > 0 {
			// We need to declare lost data here.
			ld = &LostStreamData{Msgs: make([]uint64, 0, mb.msgs), Bytes: mb.bytes}
			for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
				if !mb.dmap.Exists(seq) {
					ld.Msgs = append(ld.Msgs, seq)
				}
			}
			// Clear invalid state. We will let this blk be added in here.
			mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
			mb.dmap.Empty()
			mb.first.seq = mb.last.seq + 1
		}
		return ld, nil, err
	}

	// Clear state we need to rebuild.
	mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
	mb.last.seq, mb.last.ts = 0, 0
	firstNeedsSet := true

	// Check if we need to decrypt.
	if mb.bek != nil && len(buf) > 0 {
		// Recreate to reset counter.
		mb.bek, err = genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return nil, nil, err
		}
		mb.bek.XORKeyStream(buf, buf)
	}

	// Check for compression.
	if buf, err = mb.decompressIfNeeded(buf); err != nil {
		return nil, nil, err
	}

	mb.rbytes = uint64(len(buf))

	addToDmap := func(seq uint64) {
		if seq == 0 {
			return
		}
		mb.dmap.Insert(seq)
	}

	var le = binary.LittleEndian

	// truncate cuts the block file at index, refreshing the stored last
	// checksum from the 8 bytes immediately before the cut point.
	truncate := func(index uint32) {
		var fd *os.File
		if mb.mfd != nil {
			fd = mb.mfd
		} else {
			fd, err = os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
			if err == nil {
				defer fd.Close()
			}
		}
		if fd == nil {
			return
		}
		if err := fd.Truncate(int64(index)); err == nil {
			// Update our checksum.
			if index >= 8 {
				var lchk [8]byte
				fd.ReadAt(lchk[:], int64(index-8))
				copy(mb.lchk[0:], lchk[:])
			}
			fd.Sync()
		}
	}

	// gatherLost reports sequences past the last good record as lost data.
	gatherLost := func(lb uint32) *LostStreamData {
		var ld LostStreamData
		for seq := mb.last.seq + 1; seq <= startLastSeq; seq++ {
			ld.Msgs = append(ld.Msgs, seq)
		}
		ld.Bytes = uint64(lb)
		return &ld
	}

	// For tombstones that we find and collect.
	var (
		tombstones      []uint64
		minTombstoneSeq uint64
		minTombstoneTs  int64
	)

	for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
		// Partial header at the tail: truncate and report the remainder lost.
		if index+msgHdrSize > lbuf {
			truncate(index)
			return gatherLost(lbuf - index), tombstones, nil
		}

		hdr := buf[index : index+msgHdrSize]
		rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])

		hasHeaders := rl&hbit != 0
		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize
		// Do some quick sanity checks here.
		if dlen < 0 || int(slen) > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
			truncate(index)
			return gatherLost(lbuf - index), tombstones, errBadMsg
		}

		// Check for checksum failures before additional processing.
		data := buf[index+msgHdrSize : index+rl]
		if hh := mb.hh; hh != nil {
			hh.Reset()
			hh.Write(hdr[4:20])
			hh.Write(data[:slen])
			if hasHeaders {
				// Skip the 4-byte header length prefix when headers are present.
				hh.Write(data[slen+4 : dlen-recordHashSize])
			} else {
				hh.Write(data[slen : dlen-recordHashSize])
			}
			checksum := hh.Sum(nil)
			if !bytes.Equal(checksum, data[len(data)-recordHashSize:]) {
				truncate(index)
				return gatherLost(lbuf - index), tombstones, errBadMsg
			}
			copy(mb.lchk[0:], checksum)
		}

		// Grab our sequence and timestamp.
		seq := le.Uint64(hdr[4:])
		ts := int64(le.Uint64(hdr[12:]))

		// Check if this is a delete tombstone.
		if seq&tbit != 0 {
			seq = seq &^ tbit
			// Need to process this here and make sure we have accounted for this properly.
			tombstones = append(tombstones, seq)
			if minTombstoneSeq == 0 || seq < minTombstoneSeq {
				minTombstoneSeq, minTombstoneTs = seq, ts
			}
			index += rl
			continue
		}

		// This is an old erased message, or a new one that we can track.
		if seq == 0 || seq&ebit != 0 || seq < mb.first.seq {
			seq = seq &^ ebit
			if seq >= mb.first.seq {
				// Only add to dmap if past recorded first seq and non-zero.
				if seq != 0 {
					addToDmap(seq)
				}
				mb.last.seq = seq
				mb.last.ts = ts
				if mb.msgs == 0 {
					mb.first.seq, mb.first.ts = seq+1, 0
				}
			}
			index += rl
			continue
		}

		// This is for when we have index info that adjusts for deleted messages
		// at the head. So the first.seq will be already set here. If this is larger
		// replace what we have with this seq.
		if firstNeedsSet && seq >= mb.first.seq {
			firstNeedsSet, mb.first.seq, mb.first.ts = false, seq, ts
		}

		if !mb.dmap.Exists(seq) {
			mb.msgs++
			mb.bytes += uint64(rl)

			// Rebuild per subject info if needed.
			if slen > 0 {
				if mb.fss == nil {
					mb.fss = make(map[string]*SimpleState)
				}
				// For the lookup, we cast the byte slice and there won't be any copy
				if ss := mb.fss[string(data[:slen])]; ss != nil {
					ss.Msgs++
					ss.Last = seq
				} else {
					// This will either use a subject from the config, or make a copy
					// so we don't reference the underlying buffer.
					subj := mb.subjString(data[:slen])
					mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
				}
			}
		}

		// Always set last
		mb.last.seq = seq
		mb.last.ts = ts

		// Advance to next record.
		index += rl
	}

	// For empty msg blocks make sure we recover last seq correctly based off of first.
	// Or if we seem to have no messages but had a tombstone, which we use to remember
	// sequences and timestamps now, use that to properly setup the first and last.
	if mb.msgs == 0 {
		if mb.first.seq > 0 {
			mb.last.seq = mb.first.seq - 1
		} else if mb.first.seq == 0 && minTombstoneSeq > 0 {
			mb.first.seq, mb.first.ts = minTombstoneSeq+1, 0
			if mb.last.seq == 0 {
				mb.last.seq, mb.last.ts = minTombstoneSeq, minTombstoneTs
			}
		}
	}

	return nil, tombstones, nil
}
|
|
|
|
// For doing warn logging.
|
|
// Lock should be held.
|
|
func (fs *fileStore) warn(format string, args ...any) {
|
|
// No-op if no server configured.
|
|
if fs.srv == nil {
|
|
return
|
|
}
|
|
fs.srv.Warnf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...)
|
|
}
|
|
|
|
// recoverFullState will attempt to recover our last full state and re-process any state changes
// that happened afterwards. It validates the checksum, decrypts if needed, decodes the
// varint-encoded stream state, per-subject info and block index, then reconciles against
// the actual blocks on disk starting from the last known block.
func (fs *fileStore) recoverFullState() (rerr error) {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Check for any left over purged messages.
	// dios is a semaphore bounding concurrent disk I/O; held across the reads below.
	<-dios
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}

	// Grab our stream state file and load it in.
	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	buf, err := os.ReadFile(fn)
	dios <- struct{}{}

	if err != nil {
		if !os.IsNotExist(err) {
			fs.warn("Could not read stream state file: %v", err)
		}
		return err
	}

	const minLen = 32
	if len(buf) < minLen {
		os.Remove(fn)
		fs.warn("Stream state too short (%d bytes)", len(buf))
		return errCorruptState
	}

	// The highwayhash will be on the end. Check that it still matches.
	h := buf[len(buf)-highwayhash.Size64:]
	buf = buf[:len(buf)-highwayhash.Size64]
	fs.hh.Reset()
	fs.hh.Write(buf)
	if !bytes.Equal(h, fs.hh.Sum(nil)) {
		os.Remove(fn)
		fs.warn("Stream state checksum did not match")
		return errCorruptState
	}

	// Decrypt if needed.
	if fs.prf != nil {
		// We can be setup for encryption but if this is a snapshot restore we will be missing the keyfile
		// since snapshots strip encryption.
		if err := fs.recoverAEK(); err == nil {
			ns := fs.aek.NonceSize()
			buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:], nil)
			if err != nil {
				fs.warn("Stream state error reading encryption key: %v", err)
				return err
			}
		}
	}

	if buf[0] != fullStateMagic || buf[1] != fullStateVersion {
		os.Remove(fn)
		fs.warn("Stream state magic and version mismatch")
		return errCorruptState
	}

	// bi is our read offset into buf; a negative bi marks a decode error and
	// makes all subsequent reads no-ops so we can check once at the end.
	bi := hdrLen

	readU64 := func() uint64 {
		if bi < 0 {
			return 0
		}
		v, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		return v
	}
	readI64 := func() int64 {
		if bi < 0 {
			return 0
		}
		v, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return v
	}

	setTime := func(t *time.Time, ts int64) {
		if ts == 0 {
			*t = time.Time{}
		} else {
			*t = time.Unix(0, ts).UTC()
		}
	}

	var state StreamState
	state.Msgs = readU64()
	state.Bytes = readU64()
	state.FirstSeq = readU64()
	// baseTime is also the base for per-block timestamp deltas below.
	baseTime := readI64()
	setTime(&state.FirstTime, baseTime)
	state.LastSeq = readU64()
	setTime(&state.LastTime, readI64())

	// Check for per subject info.
	if numSubjects := int(readU64()); numSubjects > 0 {
		fs.psim = make(map[string]*psi, numSubjects)
		for i := 0; i < numSubjects; i++ {
			if lsubj := int(readU64()); lsubj > 0 {
				if bi+lsubj > len(buf) {
					os.Remove(fn)
					fs.warn("Stream state bad subject len (%d)", lsubj)
					return errCorruptState
				}
				subj := fs.subjString(buf[bi : bi+lsubj])
				bi += lsubj
				psi := &psi{total: readU64(), fblk: uint32(readU64())}
				if psi.total > 1 {
					psi.lblk = uint32(readU64())
				} else {
					// Single message: first and last block are the same.
					psi.lblk = psi.fblk
				}
				fs.psim[subj] = psi
			}
		}
	}

	if numBlocks := readU64(); numBlocks > 0 {
		lastIndex := int(numBlocks - 1)
		fs.blks = make([]*msgBlock, 0, numBlocks)
		for i := 0; i < int(numBlocks); i++ {
			index, nbytes, fseq, fts, lseq, lts, numDeleted := uint32(readU64()), readU64(), readU64(), readI64(), readU64(), readI64(), readU64()
			if bi < 0 {
				break
			}
			mb := fs.initMsgBlock(index)
			mb.first.seq, mb.last.seq, mb.msgs, mb.bytes = fseq, lseq, lseq-fseq+1, nbytes
			// Timestamps are stored as deltas from baseTime.
			mb.first.ts, mb.last.ts = fts+baseTime, lts+baseTime
			if numDeleted > 0 {
				dmap, n, err := avl.Decode(buf[bi:])
				if err != nil {
					os.Remove(fn)
					fs.warn("Stream state error decoding avl dmap: %v", err)
					return errCorruptState
				}
				mb.dmap = *dmap
				if mb.msgs > numDeleted {
					mb.msgs -= numDeleted
				} else {
					mb.msgs = 0
				}
				bi += n
			}
			// Only add in if not empty or the lmb.
			if mb.msgs > 0 || i == lastIndex {
				fs.addMsgBlock(mb)
			} else {
				// Mark dirty to cleanup.
				fs.dirty++
			}
		}
	}

	// Pull in last block index for the block that had last checksum when we wrote the full state.
	blkIndex := uint32(readU64())
	var lchk [8]byte
	if bi+len(lchk) > len(buf) {
		bi = -1
	} else {
		copy(lchk[0:], buf[bi:bi+len(lchk)])
	}

	// Check if we had any errors.
	if bi < 0 {
		os.Remove(fn)
		fs.warn("Stream state has no checksum present")
		return errCorruptState
	}

	// Move into place our state, msgBlks and subject info.
	fs.state = state

	// First let's check the happy path, open the blk file that was the lmb when we created the full state.
	// See if we have the last block available.
	var matched bool
	var mb *msgBlock
	if mb = fs.bim[blkIndex]; mb != nil {
		if _, err := os.Stat(mb.mfn); err != nil && os.IsNotExist(err) {
			// If our saved state is past what we see on disk, fallback and rebuild.
			if ld, _, _ := mb.rebuildState(); ld != nil {
				fs.addLostData(ld)
			}
			fs.warn("Stream state detected prior state, could not locate msg block %d", blkIndex)
			return errPriorState
		}
		if matched = bytes.Equal(mb.lastChecksum(), lchk[:]); !matched {
			// Remove the last message block since we will re-process below.
			fs.removeMsgBlockFromList(mb)
		}
	}

	// We may need to check other blocks. Even if we matched last checksum we will see if there is another block.
	// If we did not match we re-process the last block.
	start := blkIndex
	if matched {
		start++
	}

	for bi := start; ; bi++ {
		nmb, err := fs.recoverMsgBlock(bi)
		if err != nil {
			if os.IsNotExist(err) {
				// No more blocks on disk, recovery is complete.
				return nil
			}
			os.Remove(fn)
			fs.warn("Stream state could not recover msg block %d", bi)
			return err
		}
		if nmb != nil {
			// Check if we have to account for a partial message block.
			if !matched && mb != nil && mb.index == nmb.index {
				if err := fs.adjustAccounting(mb, nmb); err != nil {
					fs.warn("Stream state could not adjust accounting: %v", err)
					return err
				}
			}
			// Update top level accounting.
			if fs.state.FirstSeq == 0 || nmb.first.seq < fs.state.FirstSeq {
				fs.state.FirstSeq = nmb.first.seq
				fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC()
			}
			if nmb.last.seq > fs.state.LastSeq {
				fs.state.LastSeq = nmb.last.seq
				fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC()
			}
			fs.state.Msgs += nmb.msgs
			fs.state.Bytes += nmb.bytes
		}
	}
}
|
|
|
|
// adjustAccounting will be called when a stream state was only partially accounted for
// with a message block, e.g. additional records were added after the stream state.
// mb carries the counts already folded into fs.state; nmb is the freshly recovered
// block whose messages will be re-added by the caller, so every message present in
// both must be subtracted here to avoid double counting.
// Lock should be held.
func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) error {
	nmb.mu.Lock()
	defer nmb.mu.Unlock()

	// First make sure the new block is loaded.
	if nmb.cacheNotLoaded() {
		nmb.loadMsgsWithLock()
	}
	nmb.ensurePerSubjectInfoLoaded()

	// Subtract one message's worth of accounting from fs totals and psim.
	lookupAndAdjust := func(seq uint64) error {
		var smv StoreMsg
		// Lookup the message.
		sm, err := nmb.cacheLookup(seq, &smv)
		if err != nil {
			return err
		}
		// Since we found it we just need to adjust fs totals and psim.
		fs.state.Msgs--
		fs.state.Bytes -= fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
		if len(sm.subj) > 0 && fs.psim != nil {
			fs.removePerSubject(sm.subj)
		}
		return nil
	}

	// Walk all the original mb's sequences that were included in the stream state.
	for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
		// If we had already declared it deleted we can move on since you can not undelete.
		if mb.dmap.Exists(seq) {
			continue
		}
		// Lookup the message.
		if err := lookupAndAdjust(seq); err != nil {
			return err
		}
	}

	// Now check to see if we had a higher first for the recovered state mb vs nmb.
	if nmb.first.seq < mb.first.seq {
		for seq := nmb.first.seq; seq < mb.first.seq; seq++ {
			// Lookup the message.
			if err := lookupAndAdjust(seq); err != nil {
				return err
			}
		}
		// Now set first for nmb.
		nmb.first = mb.first
	}

	return nil
}
|
|
|
|
// Grabs last checksum for the named block file.
// Takes into account encryption etc.
// Side effects: refreshes mb.rbytes from the file size and may (re)load the
// block encryption keys and reset mb.bek. NOTE(review): mutates mb without
// taking mb.mu — presumably callers hold the appropriate lock; confirm.
func (mb *msgBlock) lastChecksum() []byte {
	f, err := os.Open(mb.mfn)
	if err != nil {
		return nil
	}
	defer f.Close()

	var lchk [8]byte
	if fi, _ := f.Stat(); fi != nil {
		mb.rbytes = uint64(fi.Size())
	}
	if mb.rbytes < checksumSize {
		return nil
	}
	// Encrypted?
	// Check for encryption, we do not load keys on startup anymore so might need to load them here.
	if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) {
		if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil {
			return nil
		}
	}
	if mb.bek != nil {
		// Encrypted: must decrypt the whole block (stream cipher) to read the tail.
		if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize {
			// Recreate the key to reset the stream cipher counter.
			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
			if err != nil {
				return nil
			}
			mb.bek = bek
			mb.bek.XORKeyStream(buf, buf)
			copy(lchk[0:], buf[len(buf)-checksumSize:])
		}
	} else {
		// Plaintext: read the trailing checksum bytes directly.
		f.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
	}
	return lchk[:]
}
|
|
|
|
// recoverMsgs rebuilds the full store state by scanning the message directory,
// recovering each N.blk file in index order and accumulating top level
// accounting. Also prunes empty blocks created by lost data and removes
// orphaned key files.
func (fs *fileStore) recoverMsgs() error {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Check for any left over purged messages.
	// dios bounds concurrent disk I/O; held while reading the directory.
	<-dios
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	f, err := os.Open(mdir)
	if err != nil {
		dios <- struct{}{}
		return errNotReadable
	}
	dirs, err := f.ReadDir(-1)
	f.Close()
	dios <- struct{}{}

	if err != nil {
		return errNotReadable
	}

	// Collect and sort the numeric block indices so recovery is ordered.
	indices := make(sort.IntSlice, 0, len(dirs))
	var index int
	for _, fi := range dirs {
		if n, err := fmt.Sscanf(fi.Name(), blkScan, &index); err == nil && n == 1 {
			indices = append(indices, index)
		}
	}
	indices.Sort()

	// Recover all of the msg blocks.
	// We now guarantee they are coming in order.
	for _, index := range indices {
		if mb, err := fs.recoverMsgBlock(uint32(index)); err == nil && mb != nil {
			// This is a truncate block with possibly no index. If the OS got shutdown
			// out from underneath of us this is possible.
			if mb.first.seq == 0 {
				mb.dirtyCloseWithRemove(true)
				fs.removeMsgBlockFromList(mb)
				continue
			}
			if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq {
				fs.state.FirstSeq = mb.first.seq
				if mb.first.ts == 0 {
					fs.state.FirstTime = time.Time{}
				} else {
					fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
				}
			}
			if mb.last.seq > fs.state.LastSeq {
				fs.state.LastSeq = mb.last.seq
				fs.state.LastTime = time.Unix(0, mb.last.ts).UTC()
			}
			fs.state.Msgs += mb.msgs
			fs.state.Bytes += mb.bytes
		} else {
			return err
		}
	}

	if len(fs.blks) > 0 {
		fs.lmb = fs.blks[len(fs.blks)-1]
	} else {
		// No blocks recovered; create a fresh write block.
		_, err = fs.newMsgBlockForWrite()
	}

	// Check if we encountered any lost data.
	if fs.ld != nil {
		// Remove any blocks that ended up fully empty due to lost data.
		var emptyBlks []*msgBlock
		for _, mb := range fs.blks {
			if mb.msgs == 0 && mb.rbytes == 0 {
				emptyBlks = append(emptyBlks, mb)
			}
		}
		for _, mb := range emptyBlks {
			// Need the mb lock here.
			mb.mu.Lock()
			fs.removeMsgBlock(mb)
			mb.mu.Unlock()
		}
	}

	if err != nil {
		return err
	}

	// Check for keyfiles orphans.
	if kms, err := filepath.Glob(filepath.Join(mdir, keyScanAll)); err == nil && len(kms) > 0 {
		valid := make(map[uint32]bool)
		for _, mb := range fs.blks {
			valid[mb.index] = true
		}
		for _, fn := range kms {
			var index uint32
			shouldRemove := true
			if n, err := fmt.Sscanf(filepath.Base(fn), keyScan, &index); err == nil && n == 1 && valid[index] {
				shouldRemove = false
			}
			if shouldRemove {
				os.Remove(fn)
			}
		}
	}

	return nil
}
|
|
|
|
// Will expire msgs that have aged out on restart.
|
|
// We will treat this differently in case we have a recovery
|
|
// that will expire alot of messages on startup.
|
|
// Should only be called on startup.
|
|
func (fs *fileStore) expireMsgsOnRecover() {
|
|
if fs.state.Msgs == 0 {
|
|
return
|
|
}
|
|
|
|
var minAge = time.Now().UnixNano() - int64(fs.cfg.MaxAge)
|
|
var purged, bytes uint64
|
|
var deleted int
|
|
var nts int64
|
|
|
|
// If we expire all make sure to write out a tombstone. Need to be done by hand here,
|
|
// usually taken care of by fs.removeMsgBlock() but we do not call that here.
|
|
var last msgId
|
|
|
|
deleteEmptyBlock := func(mb *msgBlock) {
|
|
// If we are the last keep state to remember first/last sequence.
|
|
// Do this part by hand since not deleting one by one.
|
|
if mb == fs.lmb {
|
|
last = mb.last
|
|
}
|
|
// Make sure we do subject cleanup as well.
|
|
mb.ensurePerSubjectInfoLoaded()
|
|
for subj := range mb.fss {
|
|
fs.removePerSubject(subj)
|
|
}
|
|
// Make sure we do subject cleanup as well.
|
|
mb.ensurePerSubjectInfoLoaded()
|
|
for subj := range mb.fss {
|
|
fs.removePerSubject(subj)
|
|
}
|
|
mb.dirtyCloseWithRemove(true)
|
|
deleted++
|
|
}
|
|
|
|
for _, mb := range fs.blks {
|
|
mb.mu.Lock()
|
|
if minAge < mb.first.ts {
|
|
nts = mb.first.ts
|
|
mb.mu.Unlock()
|
|
break
|
|
}
|
|
// Can we remove whole block here?
|
|
if mb.last.ts <= minAge {
|
|
purged += mb.msgs
|
|
bytes += mb.bytes
|
|
deleteEmptyBlock(mb)
|
|
mb.mu.Unlock()
|
|
continue
|
|
}
|
|
|
|
// If we are here we have to process the interior messages of this blk.
|
|
if err := mb.loadMsgsWithLock(); err != nil {
|
|
mb.mu.Unlock()
|
|
break
|
|
}
|
|
|
|
var smv StoreMsg
|
|
var needNextFirst bool
|
|
|
|
// Walk messages and remove if expired.
|
|
mb.ensurePerSubjectInfoLoaded()
|
|
for seq := mb.first.seq; seq <= mb.last.seq; seq++ {
|
|
sm, err := mb.cacheLookup(seq, &smv)
|
|
// Process interior deleted msgs.
|
|
if err == errDeletedMsg {
|
|
// Update dmap.
|
|
if mb.dmap.Exists(seq) {
|
|
mb.dmap.Delete(seq)
|
|
}
|
|
// Keep this updated just in case since we are removing dmap entries.
|
|
mb.first.seq, needNextFirst = seq, true
|
|
continue
|
|
}
|
|
// Break on other errors.
|
|
if err != nil || sm == nil {
|
|
mb.first.seq, needNextFirst = seq, true
|
|
break
|
|
}
|
|
|
|
// No error and sm != nil from here onward.
|
|
|
|
// Check for done.
|
|
if minAge < sm.ts {
|
|
mb.first.seq, needNextFirst = sm.seq, false
|
|
mb.first.seq = sm.seq
|
|
mb.first.ts = sm.ts
|
|
nts = sm.ts
|
|
break
|
|
}
|
|
|
|
// Delete the message here.
|
|
if mb.msgs > 0 {
|
|
mb.first.seq, needNextFirst = seq, true
|
|
sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
|
|
if sz > mb.bytes {
|
|
sz = mb.bytes
|
|
}
|
|
mb.bytes -= sz
|
|
bytes += sz
|
|
mb.msgs--
|
|
purged++
|
|
}
|
|
// Update fss
|
|
// Make sure we have fss loaded.
|
|
mb.removeSeqPerSubject(sm.subj, seq)
|
|
fs.removePerSubject(sm.subj)
|
|
}
|
|
// Make sure we have a proper next first sequence.
|
|
if needNextFirst {
|
|
mb.selectNextFirst()
|
|
}
|
|
// Check if empty after processing, could happen if tail of messages are all deleted.
|
|
if mb.msgs == 0 {
|
|
deleteEmptyBlock(mb)
|
|
}
|
|
mb.mu.Unlock()
|
|
break
|
|
}
|
|
|
|
if nts > 0 {
|
|
// Make sure to set age check based on this value.
|
|
fs.resetAgeChk(nts - minAge)
|
|
}
|
|
|
|
if deleted > 0 {
|
|
// Update block map.
|
|
if fs.bim != nil {
|
|
for _, mb := range fs.blks[:deleted] {
|
|
delete(fs.bim, mb.index)
|
|
}
|
|
}
|
|
// Update blks slice.
|
|
fs.blks = copyMsgBlocks(fs.blks[deleted:])
|
|
if lb := len(fs.blks); lb == 0 {
|
|
fs.lmb = nil
|
|
} else {
|
|
fs.lmb = fs.blks[lb-1]
|
|
}
|
|
}
|
|
// Update top level accounting.
|
|
if purged < fs.state.Msgs {
|
|
fs.state.Msgs -= purged
|
|
} else {
|
|
fs.state.Msgs = 0
|
|
}
|
|
if bytes < fs.state.Bytes {
|
|
fs.state.Bytes -= bytes
|
|
} else {
|
|
fs.state.Bytes = 0
|
|
}
|
|
// Make sure to we properly set the fs first sequence and timestamp.
|
|
fs.selectNextFirst()
|
|
|
|
// Check if we have no messages and blocks left.
|
|
if fs.lmb == nil && last.seq != 0 {
|
|
if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil {
|
|
lmb.writeTombstone(last.seq, last.ts)
|
|
}
|
|
// Clear any global subject state.
|
|
fs.psim = make(map[string]*psi)
|
|
}
|
|
|
|
// If we purged anything, make sure we kick flush state loop.
|
|
if purged > 0 {
|
|
fs.dirty++
|
|
fs.kickFlushStateLoop()
|
|
}
|
|
}
|
|
|
|
func copyMsgBlocks(src []*msgBlock) []*msgBlock {
|
|
if src == nil {
|
|
return nil
|
|
}
|
|
dst := make([]*msgBlock, len(src))
|
|
copy(dst, src)
|
|
return dst
|
|
}
|
|
|
|
// GetSeqFromTime looks for the first sequence number that has
// the message with >= timestamp.
// Returns lastSeq+1 when no block could contain the time, and 0 when the store
// is closed or no message at or after t exists in the selected block.
// FIXME(dlc) - inefficient, and dumb really. Make this better.
func (fs *fileStore) GetSeqFromTime(t time.Time) uint64 {
	fs.mu.RLock()
	lastSeq := fs.state.LastSeq
	closed := fs.closed
	fs.mu.RUnlock()

	if closed {
		return 0
	}

	// Pick the block whose time range could contain t.
	mb := fs.selectMsgBlockForStart(t)
	if mb == nil {
		return lastSeq + 1
	}

	mb.mu.RLock()
	fseq := mb.first.seq
	lseq := mb.last.seq
	mb.mu.RUnlock()

	var smv StoreMsg

	// Linear search, hence the dumb part..
	ts := t.UnixNano()
	for seq := fseq; seq <= lseq; seq++ {
		sm, _, _ := mb.fetchMsg(seq, &smv)
		if sm != nil && sm.ts >= ts {
			return sm.seq
		}
	}
	return 0
}
|
|
|
|
// Find the first matching message.
// filter may be empty or the full wildcard to mean "all"; wc says whether
// filter contains wildcards; start is the minimum sequence to consider.
// Returns the message, and whether the caller may expire the cache (we just
// read the last message in the block and it was the last one loaded).
func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	fseq, isAll, subs := start, filter == _EMPTY_ || filter == fwcs, []string{filter}

	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return nil, false, err
		}
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			return nil, false, err
		}
	}

	// If we only have 1 subject currently and it matches our filter we can also set isAll.
	if !isAll && len(mb.fss) == 1 {
		_, isAll = mb.fss[filter]
	}
	// Skip scan of mb.fss if number of messages in the block are less than
	// 1/2 the number of subjects in mb.fss. Or we have a wc and lots of fss entries.
	const linearScanMaxFSS = 32
	doLinearScan := isAll || 2*int(mb.last.seq-start) < len(mb.fss) || (wc && len(mb.fss) > linearScanMaxFSS)

	if !doLinearScan {
		// If we have a wildcard match against all tracked subjects we know about.
		if wc {
			subs = subs[:0]
			for subj := range mb.fss {
				if subjectIsSubsetMatch(subj, filter) {
					subs = append(subs, subj)
				}
			}
		}
		// Use per-subject state to shrink the scan window to the lowest
		// candidate first sequence across all matching subjects.
		fseq = mb.last.seq + 1
		for _, subj := range subs {
			ss := mb.fss[subj]
			if ss != nil && ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			if ss == nil || start > ss.Last || ss.First >= fseq {
				continue
			}
			if ss.First < start {
				fseq = start
			} else {
				fseq = ss.First
			}
		}
	}

	if fseq > mb.last.seq {
		return nil, false, ErrStoreMsgNotFound
	}

	if sm == nil {
		sm = new(StoreMsg)
	}

	for seq := fseq; seq <= mb.last.seq; seq++ {
		// Remember llseq so we can restore it if this lookup is not a match;
		// cacheLookup updates it as a side effect.
		llseq := mb.llseq
		fsm, err := mb.cacheLookup(seq, sm)
		if err != nil {
			continue
		}
		expireOk := seq == mb.last.seq && mb.llseq == seq
		if isAll {
			return fsm, expireOk, nil
		}
		if doLinearScan {
			if wc && subjectIsSubsetMatch(fsm.subj, filter) {
				return fsm, expireOk, nil
			} else if !wc && fsm.subj == filter {
				return fsm, expireOk, nil
			}
		} else {
			for _, subj := range subs {
				if fsm.subj == subj {
					return fsm, expireOk, nil
				}
			}
		}
		// If we are here we did not match, so put the llseq back.
		mb.llseq = llseq
	}

	return nil, false, ErrStoreMsgNotFound
}
|
|
|
|
// This will traverse a message block and generate the filtered pending.
// Locked entry point for filteredPendingLocked.
func (mb *msgBlock) filteredPending(subj string, wc bool, seq uint64) (total, first, last uint64) {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.filteredPendingLocked(subj, wc, seq)
}
|
|
|
|
// This will traverse a message block and generate the filtered pending.
// First tries to answer purely from per-subject metadata (mb.fss); falls back
// to a full cache scan only when some matching subject is partially covered by
// the starting sequence.
// Lock should be held.
func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (total, first, last uint64) {
	isAll := filter == _EMPTY_ || filter == fwcs

	// First check if we can optimize this part.
	// This means we want all and the starting sequence was before this block.
	if isAll && sseq <= mb.first.seq {
		return mb.msgs, mb.first.seq, mb.last.seq
	}

	update := func(ss *SimpleState) {
		total += ss.Msgs
		if first == 0 || ss.First < first {
			first = ss.First
		}
		if ss.Last > last {
			last = ss.Last
		}
	}

	// Make sure we have fss loaded.
	mb.ensurePerSubjectInfoLoaded()

	tsa := [32]string{}
	fsa := [32]string{}
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)

	// 1. See if we match any subs from fss.
	// 2. If we match and the sseq is past ss.Last then we can use meta only.
	// 3. If we match and we need to do a partial, break and clear any totals and do a full scan like num pending.

	isMatch := func(subj string) bool {
		if !wc {
			return subj == filter
		}
		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
		return isSubsetMatchTokenized(tts, fts)
	}

	var havePartial bool
	for subj, ss := range mb.fss {
		if isAll || isMatch(subj) {
			if ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			if sseq <= ss.First {
				update(ss)
			} else if sseq <= ss.Last {
				// We matched but its a partial.
				havePartial = true
				break
			}
		}
	}

	// If we did not encounter any partials we can return here.
	if !havePartial {
		return total, first, last
	}

	// If we are here we need to scan the msgs.
	// Clear what we had.
	total, first, last = 0, 0, 0

	// If we load the cache for a linear scan we want to expire that cache upon exit.
	var shouldExpire bool
	if mb.cacheNotLoaded() {
		mb.loadMsgsWithLock()
		shouldExpire = true
	}

	var smv StoreMsg
	for seq := sseq; seq <= mb.last.seq; seq++ {
		sm, _ := mb.cacheLookup(seq, &smv)
		if sm == nil {
			continue
		}
		if isAll || isMatch(sm.subj) {
			total++
			if first == 0 || seq < first {
				first = seq
			}
			if seq > last {
				last = seq
			}
		}
	}
	// If we loaded this block for this operation go ahead and expire it here.
	if shouldExpire {
		mb.tryForceExpireCacheLocked()
	}

	return total, first, last
}
|
|
|
|
// FilteredState will return the SimpleState associated with the filtered subject and a proposed starting sequence.
func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	// Clamp the requested start into our current sequence range.
	lseq := fs.state.LastSeq
	if sseq < fs.state.FirstSeq {
		sseq = fs.state.FirstSeq
	}

	// Returned state.
	var ss SimpleState

	// If past the end no results.
	if sseq > lseq {
		return ss
	}

	// If we want all msgs that match we can shortcircuit.
	// TODO(dlc) - This can be extended for all cases but would
	// need to be careful on total msgs calculations etc.
	if sseq == fs.state.FirstSeq {
		fs.numFilteredPending(subj, &ss)
	} else {
		wc := subjectHasWildcard(subj)
		// Tracking subject state.
		// TODO(dlc) - Optimize for 2.10 with avl tree and no atomics per block.
		for _, mb := range fs.blks {
			// Skip blocks that are less than our starting sequence.
			if sseq > atomic.LoadUint64(&mb.last.seq) {
				continue
			}
			// Aggregate this block's matches into the result.
			t, f, l := mb.filteredPending(subj, wc, sseq)
			ss.Msgs += t
			if ss.First == 0 || (f > 0 && f < ss.First) {
				ss.First = f
			}
			if l > ss.Last {
				ss.Last = l
			}
		}
	}

	return ss
}
|
|
|
|
// Optimized way for getting all num pending matching a filter subject.
|
|
// Lock should be held.
|
|
func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) {
|
|
isAll := filter == _EMPTY_ || filter == fwcs
|
|
|
|
// If isAll we do not need to do anything special to calculate the first and last and total.
|
|
if isAll {
|
|
ss.First = fs.state.FirstSeq
|
|
ss.Last = fs.state.LastSeq
|
|
ss.Msgs = fs.state.Msgs
|
|
return
|
|
}
|
|
|
|
tsa := [32]string{}
|
|
fsa := [32]string{}
|
|
fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
|
|
|
|
start, stop := uint32(math.MaxUint32), uint32(0)
|
|
for subj, psi := range fs.psim {
|
|
if isAll {
|
|
ss.Msgs += psi.total
|
|
} else {
|
|
tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
|
|
if isSubsetMatchTokenized(tts, fts) {
|
|
ss.Msgs += psi.total
|
|
// Keep track of start and stop indexes for this subject.
|
|
if psi.fblk < start {
|
|
start = psi.fblk
|
|
}
|
|
if psi.lblk > stop {
|
|
stop = psi.lblk
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// If not collecting all we do need to figure out the first and last sequences.
|
|
if !isAll {
|
|
wc := subjectHasWildcard(filter)
|
|
// Do start
|
|
mb := fs.bim[start]
|
|
if mb != nil {
|
|
_, f, _ := mb.filteredPending(filter, wc, 0)
|
|
ss.First = f
|
|
}
|
|
if ss.First == 0 {
|
|
// This is a miss. This can happen since psi.fblk is lazy, but should be very rare.
|
|
for i := start + 1; i <= stop; i++ {
|
|
mb := fs.bim[i]
|
|
if mb == nil {
|
|
continue
|
|
}
|
|
if _, f, _ := mb.filteredPending(filter, wc, 0); f > 0 {
|
|
ss.First = f
|
|
break
|
|
}
|
|
}
|
|
}
|
|
// Now last
|
|
if mb = fs.bim[stop]; mb != nil {
|
|
_, _, l := mb.filteredPending(filter, wc, 0)
|
|
ss.Last = l
|
|
}
|
|
}
|
|
}
|
|
|
|
// SubjectsState returns a map of SimpleState for all matching subjects.
func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.state.Msgs == 0 {
		return nil
	}

	// Default scan range is every block.
	start, stop := fs.blks[0], fs.lmb
	// We can short circuit if not a wildcard using psim for start and stop.
	if !subjectHasWildcard(subject) {
		info := fs.psim[subject]
		if info == nil {
			return nil
		}
		start, stop = fs.bim[info.fblk], fs.bim[info.lblk]
	}

	// Aggregate fss.
	fss := make(map[string]SimpleState)
	var startFound bool

	for _, mb := range fs.blks {
		// Skip blocks until we reach the computed start block.
		if !startFound {
			if mb != start {
				continue
			}
			startFound = true
		}

		mb.mu.Lock()
		// Make sure we have fss loaded.
		mb.ensurePerSubjectInfoLoaded()
		for subj, ss := range mb.fss {
			if subject == _EMPTY_ || subject == fwcs || subjectIsSubsetMatch(subj, subject) {
				if ss.firstNeedsUpdate {
					mb.recalculateFirstForSubj(subj, ss.First, ss)
				}
				oss := fss[subj]
				if oss.First == 0 { // New
					fss[subj] = *ss
				} else {
					// Merge here.
					// Blocks are iterated in ascending order, so the First from
					// the earliest block is kept; only Last and Msgs are updated.
					oss.Last, oss.Msgs = ss.Last, oss.Msgs+ss.Msgs
					fss[subj] = oss
				}
			}
		}
		mb.mu.Unlock()

		if mb == stop {
			break
		}
	}

	return fss
}
|
|
|
|
// NumPending will return the number of pending messages matching the filter subject starting at sequence.
// Optimized for stream num pending calculations for consumers.
// If lastPerSubject is set only the latest message per subject counts.
func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) (total, validThrough uint64) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	// This can always be last for these purposes.
	validThrough = fs.state.LastSeq

	if fs.state.Msgs == 0 || sseq > fs.state.LastSeq {
		return 0, validThrough
	}

	// Track starting for both block for the sseq and staring block that matches any subject.
	var seqStart, subjStart int

	// See if we need to figure out starting block per sseq.
	if sseq > fs.state.FirstSeq {
		// This should not, but can return -1, so make sure we check to avoid panic below.
		if seqStart, _ = fs.selectMsgBlockWithIndex(sseq); seqStart < 0 {
			seqStart = 0
		}
	}

	// Stack-backed token slices to avoid per-call heap allocations.
	var tsa, fsa [32]string
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
	isAll := filter == _EMPTY_ || filter == fwcs
	wc := subjectHasWildcard(filter)

	// See if filter was provided but its the only subject.
	if !isAll && !wc && len(fs.psim) == 1 && fs.psim[filter] != nil {
		isAll = true
	}

	// If we are isAll and have no deleted we can do a simpler calculation.
	if isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs {
		if sseq == 0 {
			return fs.state.Msgs, validThrough
		}
		return fs.state.LastSeq - sseq + 1, validThrough
	}

	isMatch := func(subj string) bool {
		if isAll {
			return true
		}
		if !wc {
			return subj == filter
		}
		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
		return isSubsetMatchTokenized(tts, fts)
	}

	// If we would need to scan more from the beginning, revert back to calculating directly here.
	// TODO(dlc) - Redo properly with sublists etc for subject-based filtering.
	if lastPerSubject || seqStart >= (len(fs.blks)/2) {
		// If we need to track seen for last per subject.
		// When lastPerSubject is false, seen stays nil; reads on a nil map
		// below safely return false.
		var seen map[string]bool
		if lastPerSubject {
			seen = make(map[string]bool)
		}

		for i := seqStart; i < len(fs.blks); i++ {
			mb := fs.blks[i]
			mb.mu.Lock()
			var t uint64
			// Fast path: entire block is at or after sseq and everything matches.
			if isAll && sseq <= mb.first.seq {
				if lastPerSubject {
					mb.ensurePerSubjectInfoLoaded()
					for subj := range mb.fss {
						if !seen[subj] {
							total++
							seen[subj] = true
						}
					}
				} else {
					total += mb.msgs
				}
				mb.mu.Unlock()
				continue
			}

			// If we are here we need to at least scan the subject fss.
			// Make sure we have fss loaded.
			mb.ensurePerSubjectInfoLoaded()
			var havePartial bool
			for subj, ss := range mb.fss {
				if !seen[subj] && isMatch(subj) {
					if lastPerSubject {
						// Can't have a partials with last by subject.
						if sseq <= ss.Last {
							t++
							seen[subj] = true
						}
					} else {
						if ss.firstNeedsUpdate {
							mb.recalculateFirstForSubj(subj, ss.First, ss)
						}
						if sseq <= ss.First {
							t += ss.Msgs
						} else if sseq <= ss.Last {
							// We matched but its a partial.
							havePartial = true
							break
						}
					}
				}
			}
			// See if we need to scan msgs here.
			if havePartial {
				// Clear on partial.
				t = 0
				// If we load the cache for a linear scan we want to expire that cache upon exit.
				var shouldExpire bool
				if mb.cacheNotLoaded() {
					mb.loadMsgsWithLock()
					shouldExpire = true
				}
				var smv StoreMsg
				for seq := sseq; seq <= mb.last.seq; seq++ {
					if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && (isAll || isMatch(sm.subj)) {
						t++
					}
				}
				// If we loaded this block for this operation go ahead and expire it here.
				if shouldExpire {
					mb.tryForceExpireCacheLocked()
				}
			}
			mb.mu.Unlock()
			total += t
		}
		return total, validThrough
	}

	// If we are here its better to calculate totals from psim and adjust downward by scanning less blocks.
	// TODO(dlc) - Eventually when sublist uses generics, make this sublist driven instead.
	start := uint32(math.MaxUint32)
	for subj, psi := range fs.psim {
		if isMatch(subj) {
			if lastPerSubject {
				total++
				// Keep track of start index for this subject.
				// Use last block in this case.
				if psi.lblk < start {
					start = psi.lblk
				}
			} else {
				total += psi.total
				// Keep track of start index for this subject.
				if psi.fblk < start {
					start = psi.fblk
				}
			}
		}
	}
	// See if we were asked for all, if so we are done.
	if sseq <= fs.state.FirstSeq {
		return total, validThrough
	}

	// If we are here we need to calculate partials for the first blocks.
	subjStart = int(start)
	firstSubjBlk := fs.bim[uint32(subjStart)]
	var firstSubjBlkFound bool
	var smv StoreMsg

	// Adjust in case not found.
	if firstSubjBlk == nil {
		firstSubjBlkFound = true
	}

	// Track how many we need to adjust against the total.
	var adjust uint64

	for i := 0; i <= seqStart; i++ {
		mb := fs.blks[i]

		// We can skip blks if we know they are below the first one that has any subject matches.
		if !firstSubjBlkFound {
			if mb == firstSubjBlk {
				firstSubjBlkFound = true
			} else {
				continue
			}
		}

		// We need to scan this block.
		var shouldExpire bool
		mb.mu.Lock()
		// Check if we should include all of this block in adjusting. If so work with metadata.
		if sseq > mb.last.seq {
			if isAll && !lastPerSubject {
				adjust += mb.msgs
			} else {
				// We need to adjust for all matches in this block.
				// We will scan fss state vs messages themselves.
				// Make sure we have fss loaded.
				mb.ensurePerSubjectInfoLoaded()
				for subj, ss := range mb.fss {
					if isMatch(subj) {
						if lastPerSubject {
							adjust++
						} else {
							adjust += ss.Msgs
						}
					}
				}
			}
		} else {
			// This is the last block. We need to scan per message here.
			if mb.cacheNotLoaded() {
				if err := mb.loadMsgsWithLock(); err != nil {
					mb.mu.Unlock()
					return 0, 0
				}
				shouldExpire = true
			}

			var last = mb.last.seq
			if sseq < last {
				last = sseq
			}
			for seq := mb.first.seq; seq < last; seq++ {
				sm, _ := mb.cacheLookup(seq, &smv)
				if sm == nil {
					continue
				}
				// Check if it matches our filter.
				if isMatch(sm.subj) && sm.seq < sseq {
					adjust++
				}
			}
		}
		// If we loaded the block try to force expire.
		if shouldExpire {
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()
	}
	// Make final adjustment.
	total -= adjust

	return total, validThrough
}
|
|
|
|
// SubjectsTotal return message totals per subject.
|
|
func (fs *fileStore) SubjectsTotals(filter string) map[string]uint64 {
|
|
fs.mu.RLock()
|
|
defer fs.mu.RUnlock()
|
|
|
|
if len(fs.psim) == 0 {
|
|
return nil
|
|
}
|
|
|
|
tsa := [32]string{}
|
|
fsa := [32]string{}
|
|
fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
|
|
isAll := filter == _EMPTY_ || filter == fwcs
|
|
wc := subjectHasWildcard(filter)
|
|
|
|
isMatch := func(subj string) bool {
|
|
if !wc {
|
|
return subj == filter
|
|
}
|
|
tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
|
|
return isSubsetMatchTokenized(tts, fts)
|
|
}
|
|
|
|
fst := make(map[string]uint64)
|
|
for subj, psi := range fs.psim {
|
|
if isAll || isMatch(subj) {
|
|
fst[subj] = psi.total
|
|
}
|
|
}
|
|
return fst
|
|
}
|
|
|
|
// RegisterStorageUpdates registers a callback for updates to storage changes.
|
|
// It will present number of messages and bytes as a signed integer and an
|
|
// optional sequence number of the message if a single.
|
|
func (fs *fileStore) RegisterStorageUpdates(cb StorageUpdateHandler) {
|
|
fs.mu.Lock()
|
|
fs.scb = cb
|
|
bsz := fs.state.Bytes
|
|
fs.mu.Unlock()
|
|
if cb != nil && bsz > 0 {
|
|
cb(0, int64(bsz), 0, _EMPTY_)
|
|
}
|
|
}
|
|
|
|
// Helper to get hash key for specific message block.
|
|
// Lock should be held
|
|
func (fs *fileStore) hashKeyForBlock(index uint32) []byte {
|
|
return []byte(fmt.Sprintf("%s-%d", fs.cfg.Name, index))
|
|
}
|
|
|
|
func (mb *msgBlock) setupWriteCache(buf []byte) {
|
|
// Make sure we have a cache setup.
|
|
if mb.cache != nil {
|
|
return
|
|
}
|
|
|
|
// Setup simple cache.
|
|
mb.cache = &cache{buf: buf}
|
|
// Make sure we set the proper cache offset if we have existing data.
|
|
var fi os.FileInfo
|
|
if mb.mfd != nil {
|
|
fi, _ = mb.mfd.Stat()
|
|
} else if mb.mfn != _EMPTY_ {
|
|
fi, _ = os.Stat(mb.mfn)
|
|
}
|
|
if fi != nil {
|
|
mb.cache.off = int(fi.Size())
|
|
}
|
|
mb.llts = time.Now().UnixNano()
|
|
mb.startCacheExpireTimer()
|
|
}
|
|
|
|
// This rolls to a new append msg block.
// Creates the block file on disk, sets up its cache, hashing and (if
// configured) encryption keys, and appends it to fs.blks as the new lmb.
// Lock should be held.
func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) {
	index := uint32(1)
	var rbuf []byte

	if lmb := fs.lmb; lmb != nil {
		index = lmb.index + 1

		// Determine if we can reclaim any resources here.
		if fs.fip {
			lmb.mu.Lock()
			lmb.closeFDsLocked()
			if lmb.cache != nil {
				// Reset write timestamp and see if we can expire this cache.
				// A reclaimed buffer is reused for the new block's cache below.
				rbuf = lmb.tryExpireWriteCache()
			}
			lmb.mu.Unlock()
		}
	}

	mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects(), syncAlways: fs.fcfg.SyncAlways}

	// Lock should be held to quiet race detector.
	mb.mu.Lock()
	mb.setupWriteCache(rbuf)
	mb.fss = make(map[string]*SimpleState)

	// Set cache time to creation time to start.
	ts := time.Now().UnixNano()
	mb.llts, mb.lwts = 0, ts
	// Remember our last sequence number.
	// first.seq is one past last.seq, marking the block as currently empty.
	mb.first.seq = fs.state.LastSeq + 1
	mb.last.seq = fs.state.LastSeq
	mb.mu.Unlock()

	// Now do local hash.
	key := sha256.Sum256(fs.hashKeyForBlock(index))
	hh, err := highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}
	mb.hh = hh

	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, mb.index))
	mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
	if err != nil {
		// Clean up the partially-created block before reporting failure.
		mb.dirtyCloseWithRemove(true)
		return nil, fmt.Errorf("Error creating msg block file [%q]: %v", mb.mfn, err)
	}
	mb.mfd = mfd

	// Check if encryption is enabled.
	if fs.prf != nil {
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			return nil, err
		}
	}

	// If we know we will need this so go ahead and spin up.
	if !fs.fip {
		mb.spinUpFlushLoop()
	}

	// Add to our list of blocks and mark as last.
	fs.addMsgBlock(mb)

	// Persist pending stream state if we have unflushed changes.
	if fs.dirty > 0 {
		fs.kickFlushStateLoop()
	}

	return mb, nil
}
|
|
|
|
// Generate the keys for this message block and write them out.
// Sets the block's aek/bek/seed/nonce and writes the encrypted key file
// alongside the block file. Nil blocks are a no-op.
func (fs *fileStore) genEncryptionKeysForBlock(mb *msgBlock) error {
	if mb == nil {
		return nil
	}
	// Key material is derived per block, keyed by stream name and block index.
	key, bek, seed, encrypted, err := fs.genEncryptionKeys(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))
	if err != nil {
		return err
	}
	mb.aek, mb.bek, mb.seed, mb.nonce = key, bek, seed, encrypted[:key.NonceSize()]
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))
	// Only a stat error other than not-exist is fatal; an existing key file
	// will be overwritten by the write below.
	if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
		return err
	}
	if err := os.WriteFile(keyFile, encrypted, defaultFilePerms); err != nil {
		return err
	}
	mb.kfn = keyFile
	return nil
}
|
|
|
|
// Stores a raw message with expected sequence number and timestamp.
// Enforces discard policy, per-subject limits and global msg/byte limits
// after the write. Lock should be held.
func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) (err error) {
	if fs.closed {
		return ErrStoreClosed
	}

	// Per subject max check needed.
	mmp := uint64(fs.cfg.MaxMsgsPer)
	var psmc uint64
	psmax := mmp > 0 && len(subj) > 0
	if psmax {
		if info, ok := fs.psim[subj]; ok {
			psmc = info.total
		}
	}

	var fseq uint64
	// Check if we are discarding new messages when we reach the limit.
	if fs.cfg.Discard == DiscardNew {
		var asl bool
		if psmax && psmc >= mmp {
			// If we are instructed to discard new per subject, this is an error.
			if fs.cfg.DiscardNewPer {
				return ErrMaxMsgsPerSubject
			}
			if fseq, err = fs.firstSeqForSubj(subj); err != nil {
				return err
			}
			// asl: accepting by replacing the oldest msg for this subject.
			asl = true
		}
		if fs.cfg.MaxMsgs > 0 && fs.state.Msgs >= uint64(fs.cfg.MaxMsgs) && !asl {
			return ErrMaxMsgs
		}
		if fs.cfg.MaxBytes > 0 && fs.state.Bytes+uint64(len(msg)+len(hdr)) >= uint64(fs.cfg.MaxBytes) {
			// Even when replacing, reject if removing the oldest per-subject
			// msg would not free enough bytes.
			if !asl || fs.sizeForSeq(fseq) <= len(msg)+len(hdr) {
				return ErrMaxBytes
			}
		}
	}

	// Check sequence.
	if seq != fs.state.LastSeq+1 {
		if seq > 0 {
			return ErrSequenceMismatch
		}
		// seq == 0 means let the store assign the next sequence.
		seq = fs.state.LastSeq + 1
	}

	// Write msg record.
	n, err := fs.writeMsgRecord(seq, ts, subj, hdr, msg)
	if err != nil {
		return err
	}

	// Adjust top level tracking of per subject msg counts.
	if len(subj) > 0 {
		index := fs.lmb.index
		if info, ok := fs.psim[subj]; ok {
			info.total++
			if index > info.lblk {
				info.lblk = index
			}
		} else {
			fs.psim[subj] = &psi{total: 1, fblk: index, lblk: index}
		}
	}

	// Adjust first if needed.
	now := time.Unix(0, ts).UTC()
	if fs.state.Msgs == 0 {
		fs.state.FirstSeq = seq
		fs.state.FirstTime = now
	}

	fs.state.Msgs++
	fs.state.Bytes += n
	fs.state.LastSeq = seq
	fs.state.LastTime = now

	// Enforce per message limits.
	// We snapshotted psmc before our actual write, so >= comparison needed.
	if psmax && psmc >= mmp {
		// We may have done this above.
		if fseq == 0 {
			fseq, _ = fs.firstSeqForSubj(subj)
		}
		if ok, _ := fs.removeMsgViaLimits(fseq); ok {
			// Make sure we are below the limit.
			if psmc--; psmc >= mmp {
				// Keep removing the oldest msg for this subject until the
				// tracked total drops to the limit or a removal fails.
				for info, ok := fs.psim[subj]; ok && info.total > mmp; info, ok = fs.psim[subj] {
					if seq, _ := fs.firstSeqForSubj(subj); seq > 0 {
						if ok, _ := fs.removeMsgViaLimits(seq); !ok {
							break
						}
					} else {
						break
					}
				}
			}
		} else if mb := fs.selectMsgBlock(fseq); mb != nil {
			// If we are here we could not remove fseq from above, so rebuild.
			var ld *LostStreamData
			if ld, _, _ = mb.rebuildState(); ld != nil {
				fs.rebuildStateLocked(ld)
			}
		}
	}

	// Limits checks and enforcement.
	// If they do any deletions they will update the
	// byte count on their own, so no need to compensate.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Check if we have and need the age expiration timer running.
	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
		fs.startAgeChk()
	}

	return nil
}
|
|
|
|
// StoreRawMsg stores a raw message with expected sequence number and timestamp.
// Public locking wrapper around storeRawMsg; invokes the storage update
// callback outside the lock on success.
func (fs *fileStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) error {
	fs.mu.Lock()
	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
	// Snapshot the callback under the lock; it is invoked after unlock.
	cb := fs.scb
	// Check if first message timestamp requires expiry
	// sooner than initial replica expiry timer set to MaxAge when initializing.
	if !fs.receivedAny && fs.cfg.MaxAge != 0 && ts > 0 {
		fs.receivedAny = true
		// don't block here by calling expireMsgs directly.
		// Instead, set short timeout.
		fs.resetAgeChk(int64(time.Millisecond * 50))
	}
	fs.mu.Unlock()

	if err == nil && cb != nil {
		cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
	}

	return err
}
|
|
|
|
// Store stores a message. We hold the main filestore lock for any write operation.
// Assigns the next sequence and current timestamp; returns (0, 0, err) on failure.
func (fs *fileStore) StoreMsg(subj string, hdr, msg []byte) (uint64, int64, error) {
	fs.mu.Lock()
	seq, ts := fs.state.LastSeq+1, time.Now().UnixNano()
	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
	// Snapshot the callback under the lock; it is invoked after unlock.
	cb := fs.scb
	fs.mu.Unlock()

	if err != nil {
		seq, ts = 0, 0
	} else if cb != nil {
		cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
	}

	return seq, ts, err
}
|
|
|
|
// skipMsg will update this message block for a skipped message.
// If we do not have any messages, just update the metadata, otherwise
// we will place an empty record marking the sequence as used. The
// sequence will be marked erased.
// fs lock should be held.
func (mb *msgBlock) skipMsg(seq uint64, now time.Time) {
	if mb == nil {
		return
	}
	var needsRecord bool

	nowts := now.UnixNano()

	mb.mu.Lock()
	// If we are empty can just do meta.
	if mb.msgs == 0 {
		mb.last.seq = seq
		mb.last.ts = nowts
		// Keep first one past last to signal an empty block.
		mb.first.seq = seq + 1
		mb.first.ts = nowts
	} else {
		// Non-empty block: write an erased record and track it as deleted.
		needsRecord = true
		mb.dmap.Insert(seq)
	}
	mb.mu.Unlock()

	if needsRecord {
		// ebit marks the sequence as erased in the record.
		mb.writeMsgRecord(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, nowts, true)
	} else {
		mb.kickFlusher()
	}
}
|
|
|
|
// SkipMsg will use the next sequence number but not store anything.
// Returns the sequence number that was consumed.
func (fs *fileStore) SkipMsg() uint64 {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Grab time and last seq.
	now, seq := time.Now().UTC(), fs.state.LastSeq+1
	fs.state.LastSeq, fs.state.LastTime = seq, now
	if fs.state.Msgs == 0 {
		fs.state.FirstSeq, fs.state.FirstTime = seq, now
	}
	// A skipped sequence never holds a message, so advance first past it.
	if seq == fs.state.FirstSeq {
		fs.state.FirstSeq, fs.state.FirstTime = seq+1, now
	}
	fs.lmb.skipMsg(seq, now)

	return seq
}
|
|
|
|
// rebuildFirst rebuilds state from the first message block, removing it if
// it turns out to be empty, then reselects the store's first sequence.
// Lock should be held.
func (fs *fileStore) rebuildFirst() {
	if len(fs.blks) == 0 {
		return
	}
	fmb := fs.blks[0]
	if fmb == nil {
		return
	}

	// Rebuild the block's state, capturing any lost data report.
	ld, _, _ := fmb.rebuildState()
	fmb.mu.RLock()
	isEmpty := fmb.msgs == 0
	fmb.mu.RUnlock()
	if isEmpty {
		fmb.mu.Lock()
		fs.removeMsgBlock(fmb)
		fmb.mu.Unlock()
	}
	fs.selectNextFirst()
	fs.rebuildStateLocked(ld)
}
|
|
|
|
// Optimized helper function to return first sequence.
|
|
// subj will always be publish subject here, meaning non-wildcard.
|
|
// We assume a fast check that this subj even exists already happened.
|
|
// Lock should be held.
|
|
func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) {
|
|
if len(fs.blks) == 0 {
|
|
return 0, nil
|
|
}
|
|
|
|
// See if we can optimize where we start.
|
|
start, stop := fs.blks[0].index, fs.lmb.index
|
|
if info, ok := fs.psim[subj]; ok {
|
|
start, stop = info.fblk, info.lblk
|
|
}
|
|
|
|
for i := start; i <= stop; i++ {
|
|
mb := fs.bim[i]
|
|
if mb == nil {
|
|
continue
|
|
}
|
|
mb.mu.Lock()
|
|
if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
|
|
mb.mu.Unlock()
|
|
return 0, err
|
|
}
|
|
if ss := mb.fss[subj]; ss != nil {
|
|
// Adjust first if it was not where we thought it should be.
|
|
if i != start {
|
|
if info, ok := fs.psim[subj]; ok {
|
|
info.fblk = i
|
|
}
|
|
}
|
|
if ss.firstNeedsUpdate {
|
|
mb.recalculateFirstForSubj(subj, ss.First, ss)
|
|
}
|
|
mb.mu.Unlock()
|
|
return ss.First, nil
|
|
}
|
|
mb.mu.Unlock()
|
|
}
|
|
return 0, nil
|
|
}
|
|
|
|
// Will check the msg limit and drop firstSeq msg if needed.
|
|
// Lock should be held.
|
|
func (fs *fileStore) enforceMsgLimit() {
|
|
if fs.cfg.MaxMsgs <= 0 || fs.state.Msgs <= uint64(fs.cfg.MaxMsgs) {
|
|
return
|
|
}
|
|
for nmsgs := fs.state.Msgs; nmsgs > uint64(fs.cfg.MaxMsgs); nmsgs = fs.state.Msgs {
|
|
if removed, err := fs.deleteFirstMsg(); err != nil || !removed {
|
|
fs.rebuildFirst()
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// Will check the bytes limit and drop msgs if needed.
|
|
// Lock should be held.
|
|
func (fs *fileStore) enforceBytesLimit() {
|
|
if fs.cfg.MaxBytes <= 0 || fs.state.Bytes <= uint64(fs.cfg.MaxBytes) {
|
|
return
|
|
}
|
|
for bs := fs.state.Bytes; bs > uint64(fs.cfg.MaxBytes); bs = fs.state.Bytes {
|
|
if removed, err := fs.deleteFirstMsg(); err != nil || !removed {
|
|
fs.rebuildFirst()
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// Will make sure we have limits honored for max msgs per subject on recovery or config update.
// We will make sure to go through all msg blocks etc. but in practice this
// will most likely only be the last one, so can take a more conservative approach.
// Lock should be held.
func (fs *fileStore) enforceMsgPerSubjectLimit() {
	maxMsgsPer := uint64(fs.cfg.MaxMsgsPer)

	// We want to suppress callbacks from remove during this process
	// since these should have already been deleted and accounted for.
	cb := fs.scb
	fs.scb = nil
	defer func() { fs.scb = cb }()

	var numMsgs uint64

	// collect all that are not correct.
	needAttention := make(map[string]*psi)
	for subj, psi := range fs.psim {
		numMsgs += psi.total
		if psi.total > maxMsgsPer {
			needAttention[subj] = psi
		}
	}

	// We had an issue with a use case where psim (and hence fss) were correct but idx was not and was not properly being caught.
	// So do a quick sanity check here. If we detect a skew do a rebuild then re-check.
	if numMsgs != fs.state.Msgs {
		fs.warn("Detected skew in subject-based total (%d) vs raw total (%d), rebuilding", numMsgs, fs.state.Msgs)
		// Clear any global subject state.
		fs.psim = make(map[string]*psi)
		for _, mb := range fs.blks {
			ld, _, err := mb.rebuildState()
			if err != nil && ld != nil {
				fs.addLostData(ld)
			}
			fs.populateGlobalPerSubjectInfo(mb)
		}
		// Rebuild fs state too.
		fs.rebuildStateLocked(nil)
		// Need to redo blocks that need attention.
		needAttention = make(map[string]*psi)
		for subj, psi := range fs.psim {
			if psi.total > maxMsgsPer {
				needAttention[subj] = psi
			}
		}
	}

	// Collect all the msgBlks we alter.
	blks := make(map[*msgBlock]struct{})

	// For re-use below.
	var sm StoreMsg

	// Walk all subjects that need attention here.
	for subj, info := range needAttention {
		total, start, stop := info.total, info.fblk, info.lblk

		for i := start; i <= stop; i++ {
			mb := fs.bim[i]
			if mb == nil {
				continue
			}
			// Grab the ss entry for this subject in case sparse.
			mb.mu.Lock()
			mb.ensurePerSubjectInfoLoaded()
			ss := mb.fss[subj]
			if ss != nil && ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			mb.mu.Unlock()
			if ss == nil {
				continue
			}
			// NOTE(review): ss.First/ss.Last are read here after releasing
			// mb.mu — presumed safe because this runs under the fs lock
			// during recovery/config update; verify no concurrent mutators.
			// Remove oldest matching msgs until this subject is within limit.
			for seq := ss.First; seq <= ss.Last && total > maxMsgsPer; {
				m, _, err := mb.firstMatching(subj, false, seq, &sm)
				if err == nil {
					seq = m.seq + 1
					if removed, _ := fs.removeMsgViaLimits(m.seq); removed {
						total--
						blks[mb] = struct{}{}
					}
				} else {
					// On error just do single increment.
					seq++
				}
			}
		}
	}

	// Expire the cache if we can.
	for mb := range blks {
		mb.mu.Lock()
		if mb.msgs > 0 {
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()
	}
}
|
|
|
|
// Lock should be held.
|
|
func (fs *fileStore) deleteFirstMsg() (bool, error) {
|
|
return fs.removeMsgViaLimits(fs.state.FirstSeq)
|
|
}
|
|
|
|
// If we remove via limits that can always be recovered on a restart we
|
|
// do not force the system to update the index file.
|
|
// Lock should be held.
|
|
func (fs *fileStore) removeMsgViaLimits(seq uint64) (bool, error) {
|
|
return fs.removeMsg(seq, false, true, false)
|
|
}
|
|
|
|
// RemoveMsg will remove the message from this store.
|
|
// Will return the number of bytes removed.
|
|
func (fs *fileStore) RemoveMsg(seq uint64) (bool, error) {
|
|
return fs.removeMsg(seq, false, false, true)
|
|
}
|
|
|
|
func (fs *fileStore) EraseMsg(seq uint64) (bool, error) {
|
|
return fs.removeMsg(seq, true, false, true)
|
|
}
|
|
|
|
// Convenience function to remove per subject tracking at the filestore level.
|
|
// Lock should be held.
|
|
func (fs *fileStore) removePerSubject(subj string) {
|
|
if len(subj) == 0 {
|
|
return
|
|
}
|
|
// We do not update sense of fblk here but will do so when we resolve during lookup.
|
|
if info, ok := fs.psim[subj]; ok {
|
|
info.total--
|
|
if info.total == 0 {
|
|
delete(fs.psim, subj)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Remove a message, optionally rewriting the mb file.
|
|
func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (bool, error) {
|
|
if seq == 0 {
|
|
return false, ErrStoreMsgNotFound
|
|
}
|
|
fsLock := func() {
|
|
if needFSLock {
|
|
fs.mu.Lock()
|
|
}
|
|
}
|
|
fsUnlock := func() {
|
|
if needFSLock {
|
|
fs.mu.Unlock()
|
|
}
|
|
}
|
|
|
|
fsLock()
|
|
|
|
if fs.closed {
|
|
fsUnlock()
|
|
return false, ErrStoreClosed
|
|
}
|
|
if !viaLimits && fs.sips > 0 {
|
|
fsUnlock()
|
|
return false, ErrStoreSnapshotInProgress
|
|
}
|
|
// If in encrypted mode negate secure rewrite here.
|
|
if secure && fs.prf != nil {
|
|
secure = false
|
|
}
|
|
|
|
if fs.state.Msgs == 0 {
|
|
var err = ErrStoreEOF
|
|
if seq <= fs.state.LastSeq {
|
|
err = ErrStoreMsgNotFound
|
|
}
|
|
fsUnlock()
|
|
return false, err
|
|
}
|
|
|
|
mb := fs.selectMsgBlock(seq)
|
|
if mb == nil {
|
|
var err = ErrStoreEOF
|
|
if seq <= fs.state.LastSeq {
|
|
err = ErrStoreMsgNotFound
|
|
}
|
|
fsUnlock()
|
|
return false, err
|
|
}
|
|
|
|
mb.mu.Lock()
|
|
|
|
// See if we are closed or the sequence number is still relevant.
|
|
if mb.closed || seq < mb.first.seq {
|
|
mb.mu.Unlock()
|
|
fsUnlock()
|
|
return false, nil
|
|
}
|
|
|
|
// Now check dmap if it is there.
|
|
if mb.dmap.Exists(seq) {
|
|
mb.mu.Unlock()
|
|
fsUnlock()
|
|
return false, nil
|
|
}
|
|
|
|
// We used to not have to load in the messages except with callbacks or the filtered subject state (which is now always on).
|
|
// Now just load regardless.
|
|
// TODO(dlc) - Figure out a way not to have to load it in, we need subject tracking outside main data block.
|
|
if mb.cacheNotLoaded() {
|
|
// We do not want to block possible activity within another msg block.
|
|
// We have to unlock both locks and acquire the mb lock in the loadMsgs() call to avoid a deadlock if another
|
|
// go routine was trying to get fs then this mb lock at the same time. E.g. another call to remove for same block.
|
|
mb.mu.Unlock()
|
|
fsUnlock()
|
|
if err := mb.loadMsgs(); err != nil {
|
|
return false, err
|
|
}
|
|
fsLock()
|
|
// We need to check if things changed out from underneath us.
|
|
if fs.closed {
|
|
fsUnlock()
|
|
return false, ErrStoreClosed
|
|
}
|
|
mb.mu.Lock()
|
|
if mb.closed || seq < mb.first.seq {
|
|
mb.mu.Unlock()
|
|
fsUnlock()
|
|
return false, nil
|
|
}
|
|
// cacheLookup below will do dmap check so no need to repeat here.
|
|
}
|
|
|
|
var smv StoreMsg
|
|
sm, err := mb.cacheLookup(seq, &smv)
|
|
if err != nil {
|
|
mb.mu.Unlock()
|
|
fsUnlock()
|
|
// Mimic err behavior from above check to dmap. No error returned if already removed.
|
|
if err == errDeletedMsg {
|
|
err = nil
|
|
}
|
|
return false, err
|
|
}
|
|
// Grab size
|
|
msz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
|
|
|
|
// Set cache timestamp for last remove.
|
|
mb.lrts = time.Now().UnixNano()
|
|
|
|
// Global stats
|
|
if fs.state.Msgs > 0 {
|
|
fs.state.Msgs--
|
|
}
|
|
if msz < fs.state.Bytes {
|
|
fs.state.Bytes -= msz
|
|
} else {
|
|
fs.state.Bytes = 0
|
|
}
|
|
|
|
// Now local mb updates.
|
|
if mb.msgs > 0 {
|
|
mb.msgs--
|
|
}
|
|
if msz < mb.bytes {
|
|
mb.bytes -= msz
|
|
} else {
|
|
mb.bytes = 0
|
|
}
|
|
|
|
// Mark as dirty for stream state.
|
|
fs.dirty++
|
|
|
|
// If we are tracking subjects here make sure we update that accounting.
|
|
mb.ensurePerSubjectInfoLoaded()
|
|
|
|
// If we are tracking multiple subjects here make sure we update that accounting.
|
|
mb.removeSeqPerSubject(sm.subj, seq)
|
|
fs.removePerSubject(sm.subj)
|
|
|
|
if secure {
|
|
// Grab record info.
|
|
ri, rl, _, _ := mb.slotInfo(int(seq - mb.cache.fseq))
|
|
mb.eraseMsg(seq, int(ri), int(rl))
|
|
}
|
|
|
|
fifo := seq == mb.first.seq
|
|
isLastBlock := mb == fs.lmb
|
|
isEmpty := mb.msgs == 0
|
|
|
|
if fifo {
|
|
mb.selectNextFirst()
|
|
if !isEmpty {
|
|
// Can update this one in place.
|
|
if seq == fs.state.FirstSeq {
|
|
fs.state.FirstSeq = mb.first.seq // new one.
|
|
if mb.first.ts == 0 {
|
|
fs.state.FirstTime = time.Time{}
|
|
} else {
|
|
fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
|
|
}
|
|
}
|
|
}
|
|
} else if !isEmpty {
|
|
if mb.dmap.IsEmpty() {
|
|
// Mark initial base for delete set.
|
|
mb.dmap.SetInitialMin(mb.first.seq)
|
|
}
|
|
// Out of order delete.
|
|
mb.dmap.Insert(seq)
|
|
// Check if <25% utilization and minimum size met.
|
|
if mb.rbytes > compactMinimum && !isLastBlock {
|
|
// Remove the interior delete records
|
|
rbytes := mb.rbytes - uint64(mb.dmap.Size()*emptyRecordLen)
|
|
if rbytes>>2 > mb.bytes {
|
|
mb.compact()
|
|
fs.kickFlushStateLoop()
|
|
}
|
|
}
|
|
}
|
|
|
|
if secure {
|
|
if ld, _ := mb.flushPendingMsgsLocked(); ld != nil {
|
|
// We have the mb lock here, this needs the mb locks so do in its own go routine.
|
|
go fs.rebuildState(ld)
|
|
}
|
|
}
|
|
|
|
// If empty remove this block and check if we need to update first sequence.
|
|
// We will write a tombstone at the end.
|
|
var firstSeqNeedsUpdate bool
|
|
if isEmpty {
|
|
// This writes tombstone iff mb == lmb, so no need to do below.
|
|
fs.removeMsgBlock(mb)
|
|
firstSeqNeedsUpdate = seq == fs.state.FirstSeq
|
|
}
|
|
mb.mu.Unlock()
|
|
|
|
// If we emptied the current message block and the seq was state.FirstSeq
|
|
// then we need to jump message blocks. We will also write the index so
|
|
// we don't lose track of the first sequence.
|
|
if firstSeqNeedsUpdate {
|
|
fs.selectNextFirst()
|
|
}
|
|
|
|
// Check if we need to write a deleted record tombstone.
|
|
// This is for user initiated removes or to hold the first seq
|
|
// when the last block is empty.
|
|
|
|
// If not via limits and not empty and last (empty writes tombstone above if last) write tombstone.
|
|
if !viaLimits && !(isEmpty && isLastBlock) {
|
|
if lmb := fs.lmb; sm != nil && lmb != nil {
|
|
lmb.writeTombstone(sm.seq, sm.ts)
|
|
}
|
|
}
|
|
|
|
if cb := fs.scb; cb != nil {
|
|
// If we have a callback registered we need to release lock regardless since cb might need it to lookup msg, etc.
|
|
fs.mu.Unlock()
|
|
// Storage updates.
|
|
var subj string
|
|
if sm != nil {
|
|
subj = sm.subj
|
|
}
|
|
delta := int64(msz)
|
|
cb(-1, -delta, seq, subj)
|
|
|
|
if !needFSLock {
|
|
fs.mu.Lock()
|
|
}
|
|
} else if needFSLock {
|
|
// We acquired it so release it.
|
|
fs.mu.Unlock()
|
|
}
|
|
|
|
return true, nil
|
|
}
|
|
|
|
// This will compact and rewrite this block. This should only be called when we know we want to rewrite this block.
// This should not be called on the lmb since we will prune tail deleted messages which could cause issues with
// writing new messages. We will silently bail on any issues with the underlying block and let someone else detect.
// Compaction scans every record in the cached buffer, copies live records into a new buffer, writes that buffer
// to a temp file and renames it over the original, then rebuilds block state from disk.
// Write lock needs to be held.
func (mb *msgBlock) compact() {
	// Remember whether the cache was loaded on entry so we can restore that state on exit.
	wasLoaded := mb.cacheAlreadyLoaded()
	if !wasLoaded {
		if err := mb.loadMsgsWithLock(); err != nil {
			return
		}
	}

	buf := mb.cache.buf
	// Destination buffer for surviving records; at most as large as the original.
	nbuf := make([]byte, 0, len(buf))

	var le = binary.LittleEndian
	var firstSet bool

	// A record is considered deleted if its sequence is zero, carries the erase bit,
	// precedes this block's first sequence, or appears in the delete map.
	isDeleted := func(seq uint64) bool {
		if seq == 0 || seq&ebit != 0 || seq < mb.first.seq {
			return true
		}
		return mb.dmap.Exists(seq)
	}

	// Walk the raw buffer record by record. Record layout:
	// total_len(4) sequence(8) timestamp(8) subj_len(2) payload hash(8).
	for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
		if index+msgHdrSize > lbuf {
			return
		}
		hdr := buf[index : index+msgHdrSize]
		rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])
		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize
		// Do some quick sanity checks here.
		if dlen < 0 || int(slen) > dlen || dlen > int(rl) || rl > rlBadThresh || index+rl > lbuf {
			return
		}
		// Only need to process non-deleted messages.
		seq := le.Uint64(hdr[4:])

		if !isDeleted(seq) {
			// Check for tombstones.
			if seq&tbit != 0 {
				// If we are last mb we should consider to keep these unless the tombstone reflects a seq in this mb.
				if mb == mb.fs.lmb && seq < mb.first.seq {
					nbuf = append(nbuf, buf[index:index+rl]...)
				}
			} else {
				// Normal message here.
				nbuf = append(nbuf, buf[index:index+rl]...)
				if !firstSet {
					firstSet = true
					mb.first.seq = seq
				}
			}
		}
		// Always set last as long as not a tombstone.
		if seq&tbit == 0 {
			mb.last.seq = seq &^ ebit
		}
		// Advance to next record.
		index += rl
	}

	// Check for encryption.
	if mb.bek != nil && len(nbuf) > 0 {
		// Recreate to reset counter.
		rbek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return
		}
		rbek.XORKeyStream(nbuf, nbuf)
	}

	// Close FDs first.
	mb.closeFDsLocked()

	// We will write to a new file and mv/rename it in case of failure.
	mfn := filepath.Join(filepath.Join(mb.fs.fcfg.StoreDir, msgDir), fmt.Sprintf(newScan, mb.index))
	if err := os.WriteFile(mfn, nbuf, defaultFilePerms); err != nil {
		os.Remove(mfn)
		return
	}
	if err := os.Rename(mfn, mb.mfn); err != nil {
		os.Remove(mfn)
		return
	}

	// Remove index file and wipe delete map, then rebuild.
	mb.dmap.Empty()
	mb.rebuildStateLocked()

	// If we entered with the msgs loaded make sure to reload them.
	if wasLoaded {
		mb.loadMsgsWithLock()
	}
}
|
|
|
|
// Grab info from a slot.
|
|
// Lock should be held.
|
|
func (mb *msgBlock) slotInfo(slot int) (uint32, uint32, bool, error) {
|
|
if mb.cache == nil || slot >= len(mb.cache.idx) {
|
|
return 0, 0, false, errPartialCache
|
|
}
|
|
|
|
bi := mb.cache.idx[slot]
|
|
ri, hashChecked := (bi &^ hbit), (bi&hbit) != 0
|
|
|
|
// If this is a deleted slot return here.
|
|
if bi == dbit {
|
|
return 0, 0, false, errDeletedMsg
|
|
}
|
|
|
|
// Determine record length
|
|
var rl uint32
|
|
if len(mb.cache.idx) > slot+1 {
|
|
ni := mb.cache.idx[slot+1] &^ hbit
|
|
rl = ni - ri
|
|
} else {
|
|
rl = mb.cache.lrl
|
|
}
|
|
if rl < msgHdrSize {
|
|
return 0, 0, false, errBadMsg
|
|
}
|
|
return uint32(ri), rl, hashChecked, nil
|
|
}
|
|
|
|
func (fs *fileStore) isClosed() bool {
|
|
fs.mu.RLock()
|
|
closed := fs.closed
|
|
fs.mu.RUnlock()
|
|
return closed
|
|
}
|
|
|
|
// Will spin up our flush loop.
|
|
func (mb *msgBlock) spinUpFlushLoop() {
|
|
mb.mu.Lock()
|
|
defer mb.mu.Unlock()
|
|
|
|
// Are we already running or closed?
|
|
if mb.flusher || mb.closed {
|
|
return
|
|
}
|
|
mb.flusher = true
|
|
mb.fch = make(chan struct{}, 1)
|
|
mb.qch = make(chan struct{})
|
|
fch, qch := mb.fch, mb.qch
|
|
|
|
go mb.flushLoop(fch, qch)
|
|
}
|
|
|
|
// kickFlusher performs a non-blocking signal on the given flush channel.
// A nil channel, or a channel whose buffer is already full, is a no-op.
func kickFlusher(fch chan struct{}) {
	if fch == nil {
		return
	}
	select {
	case fch <- struct{}{}:
	default:
	}
}
|
|
|
|
// Kick flusher for this message block.
|
|
func (mb *msgBlock) kickFlusher() {
|
|
mb.mu.RLock()
|
|
defer mb.mu.RUnlock()
|
|
kickFlusher(mb.fch)
|
|
}
|
|
|
|
func (mb *msgBlock) setInFlusher() {
|
|
mb.mu.Lock()
|
|
mb.flusher = true
|
|
mb.mu.Unlock()
|
|
}
|
|
|
|
func (mb *msgBlock) clearInFlusher() {
|
|
mb.mu.Lock()
|
|
mb.flusher = false
|
|
mb.mu.Unlock()
|
|
}
|
|
|
|
// flushLoop watches for messages, index info, or recently closed msg block updates.
// On each kick it coalesces pending writes (sleeping with exponential backoff up
// to maxFlushWait) before flushing, then exits once this block is no longer the
// last message block and its FDs could be closed. qch signals shutdown.
func (mb *msgBlock) flushLoop(fch, qch chan struct{}) {
	mb.setInFlusher()
	defer mb.clearInFlusher()

	for {
		select {
		case <-fch:
			// If we have pending messages process them first.
			if waiting := mb.pendingWriteSize(); waiting != 0 {
				ts := 1 * time.Millisecond
				var waited time.Duration

				// Coalesce: wait for more data to accumulate, doubling the sleep
				// each round, until we hit coalesceMinimum bytes, exceed
				// maxFlushWait, or the pending size stops growing.
				for waiting < coalesceMinimum {
					time.Sleep(ts)
					select {
					case <-qch:
						return
					default:
					}
					newWaiting := mb.pendingWriteSize()
					if waited = waited + ts; waited > maxFlushWait || newWaiting <= waiting {
						break
					}
					waiting = newWaiting
					ts *= 2
				}
				mb.flushPendingMsgs()
				// Check if we are no longer the last message block. If we are
				// not we can close FDs and exit.
				mb.fs.mu.RLock()
				notLast := mb != mb.fs.lmb
				mb.fs.mu.RUnlock()
				if notLast {
					// closeFDs fails (and we keep looping) if data is still pending.
					if err := mb.closeFDs(); err == nil {
						return
					}
				}
			}
		case <-qch:
			return
		}
	}
}
|
|
|
|
// eraseMsg securely erases the record for seq at offset ri with record length rl.
// It rewrites the record in place (both in the cache and on disk) with the erase
// bit set on the sequence, random payload bytes, and a freshly computed checksum,
// so the original contents cannot be recovered.
// Lock should be held.
func (mb *msgBlock) eraseMsg(seq uint64, ri, rl int) error {
	var le = binary.LittleEndian
	var hdr [msgHdrSize]byte

	// Header: total_len(4) seq-with-ebit(8) zeroed-timestamp(8) zero subj_len(2).
	le.PutUint32(hdr[0:], uint32(rl))
	le.PutUint64(hdr[4:], seq|ebit)
	le.PutUint64(hdr[12:], 0)
	le.PutUint16(hdr[20:], 0)

	// Randomize record
	data := make([]byte, rl-emptyRecordLen)
	rand.Read(data)

	// Now write to underlying buffer.
	var b bytes.Buffer
	b.Write(hdr[:])
	b.Write(data)

	// Calculate hash.
	mb.hh.Reset()
	mb.hh.Write(hdr[4:20])
	mb.hh.Write(data)
	checksum := mb.hh.Sum(nil)
	// Write to msg record.
	b.Write(checksum)

	// Update both cache and disk.
	nbytes := b.Bytes()

	// Cache: only if the record offset falls within the currently cached region.
	if ri >= mb.cache.off {
		li := ri - mb.cache.off
		buf := mb.cache.buf[li : li+rl]
		copy(buf, nbytes)
	}

	// Disk: only if the record has already been written out (before the write pointer).
	if mb.cache.off+mb.cache.wp > ri {
		mfd, err := os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
		if err != nil {
			return err
		}
		defer mfd.Close()
		if _, err = mfd.WriteAt(nbytes, int64(ri)); err == nil {
			mfd.Sync()
		}
		if err != nil {
			return err
		}
	}
	return nil
}
|
|
|
|
// Truncate this message block to the storedMsg.
|
|
func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) {
|
|
// Make sure we are loaded to process messages etc.
|
|
if err := mb.loadMsgs(); err != nil {
|
|
return 0, 0, err
|
|
}
|
|
|
|
// Calculate new eof using slot info from our new last sm.
|
|
ri, rl, _, err := mb.slotInfo(int(sm.seq - mb.cache.fseq))
|
|
if err != nil {
|
|
return 0, 0, err
|
|
}
|
|
// Calculate new eof.
|
|
eof := int64(ri + rl)
|
|
|
|
var purged, bytes uint64
|
|
|
|
mb.mu.Lock()
|
|
|
|
checkDmap := mb.dmap.Size() > 0
|
|
var smv StoreMsg
|
|
|
|
for seq := mb.last.seq; seq > sm.seq; seq-- {
|
|
if checkDmap {
|
|
if mb.dmap.Exists(seq) {
|
|
// Delete and skip to next.
|
|
mb.dmap.Delete(seq)
|
|
checkDmap = !mb.dmap.IsEmpty()
|
|
continue
|
|
}
|
|
}
|
|
// We should have a valid msg to calculate removal stats.
|
|
if m, err := mb.cacheLookup(seq, &smv); err == nil {
|
|
if mb.msgs > 0 {
|
|
rl := fileStoreMsgSize(m.subj, m.hdr, m.msg)
|
|
mb.msgs--
|
|
if rl > mb.bytes {
|
|
rl = mb.bytes
|
|
}
|
|
mb.bytes -= rl
|
|
mb.rbytes -= rl
|
|
// For return accounting.
|
|
purged++
|
|
bytes += uint64(rl)
|
|
}
|
|
}
|
|
}
|
|
|
|
// If the block is compressed then we have to load it into memory
|
|
// and decompress it, truncate it and then write it back out.
|
|
// Otherwise, truncate the file itself and close the descriptor.
|
|
if mb.cmp != NoCompression {
|
|
buf, err := mb.loadBlock(nil)
|
|
if err != nil {
|
|
return 0, 0, fmt.Errorf("failed to load block from disk: %w", err)
|
|
}
|
|
if mb.bek != nil && len(buf) > 0 {
|
|
bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
|
|
if err != nil {
|
|
return 0, 0, err
|
|
}
|
|
mb.bek = bek
|
|
mb.bek.XORKeyStream(buf, buf)
|
|
}
|
|
buf, err = mb.decompressIfNeeded(buf)
|
|
if err != nil {
|
|
return 0, 0, fmt.Errorf("failed to decompress block: %w", err)
|
|
}
|
|
buf = buf[:eof]
|
|
copy(mb.lchk[0:], buf[:len(buf)-checksumSize])
|
|
buf, err = mb.cmp.Compress(buf)
|
|
if err != nil {
|
|
return 0, 0, fmt.Errorf("failed to recompress block: %w", err)
|
|
}
|
|
meta := &CompressionInfo{
|
|
Algorithm: mb.cmp,
|
|
OriginalSize: uint64(eof),
|
|
}
|
|
buf = append(meta.MarshalMetadata(), buf...)
|
|
if mb.bek != nil && len(buf) > 0 {
|
|
bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
|
|
if err != nil {
|
|
return 0, 0, err
|
|
}
|
|
mb.bek = bek
|
|
mb.bek.XORKeyStream(buf, buf)
|
|
}
|
|
n, err := mb.writeAt(buf, 0)
|
|
if err != nil {
|
|
return 0, 0, fmt.Errorf("failed to rewrite compressed block: %w", err)
|
|
}
|
|
if n != len(buf) {
|
|
return 0, 0, fmt.Errorf("short write (%d != %d)", n, len(buf))
|
|
}
|
|
mb.mfd.Truncate(int64(len(buf)))
|
|
mb.mfd.Sync()
|
|
} else if mb.mfd != nil {
|
|
mb.mfd.Truncate(eof)
|
|
mb.mfd.Sync()
|
|
// Update our checksum.
|
|
var lchk [8]byte
|
|
mb.mfd.ReadAt(lchk[:], eof-8)
|
|
copy(mb.lchk[0:], lchk[:])
|
|
} else {
|
|
mb.mu.Unlock()
|
|
return 0, 0, fmt.Errorf("failed to truncate msg block %d, file not open", mb.index)
|
|
}
|
|
|
|
// Update our last msg.
|
|
mb.last.seq = sm.seq
|
|
mb.last.ts = sm.ts
|
|
|
|
// Clear our cache.
|
|
mb.clearCacheAndOffset()
|
|
|
|
// Redo per subject info for this block.
|
|
mb.resetPerSubjectInfo()
|
|
|
|
mb.mu.Unlock()
|
|
|
|
// Load msgs again.
|
|
mb.loadMsgs()
|
|
|
|
return purged, bytes, nil
|
|
}
|
|
|
|
// Lock should be held.
|
|
func (mb *msgBlock) isEmpty() bool {
|
|
return mb.first.seq > mb.last.seq
|
|
}
|
|
|
|
// selectNextFirst advances mb.first past the current first sequence, skipping
// (and pruning from the delete map) any deleted sequences, then refreshes the
// first timestamp from the cache or, if necessary, from disk.
// Lock should be held.
func (mb *msgBlock) selectNextFirst() {
	var seq uint64
	for seq = mb.first.seq + 1; seq <= mb.last.seq; seq++ {
		if mb.dmap.Exists(seq) {
			// We will move past this so we can delete the entry.
			mb.dmap.Delete(seq)
		} else {
			break
		}
	}
	// Set new first sequence.
	mb.first.seq = seq

	// Check if we are empty..
	if mb.isEmpty() {
		mb.first.ts = 0
		return
	}

	// Need to get the timestamp.
	// We will try the cache direct and fallback if needed.
	var smv StoreMsg
	sm, _ := mb.cacheLookup(seq, &smv)
	if sm == nil {
		// Slow path, need to unlock.
		// NOTE: fetchMsg acquires the mb lock itself, so we must release it here;
		// state may change while unlocked.
		mb.mu.Unlock()
		sm, _, _ = mb.fetchMsg(seq, &smv)
		mb.mu.Lock()
	}
	if sm != nil {
		mb.first.ts = sm.ts
	} else {
		mb.first.ts = 0
	}
}
|
|
|
|
// Select the next FirstSeq
|
|
// Lock should be held.
|
|
func (fs *fileStore) selectNextFirst() {
|
|
if len(fs.blks) > 0 {
|
|
mb := fs.blks[0]
|
|
mb.mu.RLock()
|
|
fs.state.FirstSeq = mb.first.seq
|
|
fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
|
|
mb.mu.RUnlock()
|
|
} else {
|
|
// Could not find anything, so treat like purge
|
|
fs.state.FirstSeq = fs.state.LastSeq + 1
|
|
fs.state.FirstTime = time.Time{}
|
|
}
|
|
}
|
|
|
|
// Lock should be held.
|
|
func (mb *msgBlock) resetCacheExpireTimer(td time.Duration) {
|
|
if td == 0 {
|
|
td = mb.cexp
|
|
}
|
|
if mb.ctmr == nil {
|
|
mb.ctmr = time.AfterFunc(td, mb.expireCache)
|
|
} else {
|
|
mb.ctmr.Reset(td)
|
|
}
|
|
}
|
|
|
|
// Lock should be held.
|
|
func (mb *msgBlock) startCacheExpireTimer() {
|
|
mb.resetCacheExpireTimer(0)
|
|
}
|
|
|
|
// Used when we load in a message block.
|
|
// Lock should be held.
|
|
func (mb *msgBlock) clearCacheAndOffset() {
|
|
// Reset linear scan tracker.
|
|
mb.llseq = 0
|
|
if mb.cache != nil {
|
|
mb.cache.off = 0
|
|
mb.cache.wp = 0
|
|
}
|
|
mb.clearCache()
|
|
}
|
|
|
|
// Lock should be held.
|
|
func (mb *msgBlock) clearCache() {
|
|
if mb.ctmr != nil && mb.fss == nil {
|
|
mb.ctmr.Stop()
|
|
mb.ctmr = nil
|
|
}
|
|
|
|
if mb.cache == nil {
|
|
return
|
|
}
|
|
|
|
buf := mb.cache.buf
|
|
if mb.cache.off == 0 {
|
|
mb.cache = nil
|
|
} else {
|
|
// Clear msgs and index.
|
|
mb.cache.buf = nil
|
|
mb.cache.idx = nil
|
|
mb.cache.wp = 0
|
|
}
|
|
recycleMsgBlockBuf(buf)
|
|
}
|
|
|
|
// Called to possibly expire a message block cache.
|
|
func (mb *msgBlock) expireCache() {
|
|
mb.mu.Lock()
|
|
defer mb.mu.Unlock()
|
|
mb.expireCacheLocked()
|
|
}
|
|
|
|
func (mb *msgBlock) tryForceExpireCache() {
|
|
mb.mu.Lock()
|
|
defer mb.mu.Unlock()
|
|
mb.tryForceExpireCacheLocked()
|
|
}
|
|
|
|
// We will attempt to force expire this by temporarily clearing the last load time.
|
|
func (mb *msgBlock) tryForceExpireCacheLocked() {
|
|
llts := mb.llts
|
|
mb.llts = 0
|
|
mb.expireCacheLocked()
|
|
mb.llts = llts
|
|
}
|
|
|
|
// This is for expiration of the write cache, which will be partial with fip.
// So we want to bypass the Pools here.
// Returns the (emptied) old buffer for reuse if the cache could be expired and
// the block had seen no loads at all, nil otherwise.
// Lock should be held.
func (mb *msgBlock) tryExpireWriteCache() []byte {
	if mb.cache == nil {
		return nil
	}
	// Save state we temporarily clear/override so it can be restored below.
	lwts, buf, llts, nra := mb.lwts, mb.cache.buf, mb.llts, mb.cache.nra
	// Zero the last write time and mark no-recycle so expireCacheLocked will
	// consider the cache idle and will not return the buffer to the pool.
	mb.lwts, mb.cache.nra = 0, true
	mb.expireCacheLocked()
	mb.lwts = lwts
	if mb.cache != nil {
		mb.cache.nra = nra
	}
	// We could check for a certain time since last load, but to be safe just reuse if no loads at all.
	if llts == 0 && (mb.cache == nil || mb.cache.buf == nil) {
		// Clear last write time since we now are about to move on to a new lmb.
		mb.lwts = 0
		return buf[:0]
	}
	return nil
}
|
|
|
|
// expireCacheLocked expires the cached message buffer (and per-subject state)
// if there are no pending writes and no recent read/write activity within the
// configured expiry window; otherwise it re-arms the expire timer.
// Lock should be held.
func (mb *msgBlock) expireCacheLocked() {
	// Nothing cached at all: just make sure the timer is stopped.
	if mb.cache == nil && mb.fss == nil {
		if mb.ctmr != nil {
			mb.ctmr.Stop()
			mb.ctmr = nil
		}
		return
	}

	// Can't expire if we still have pending.
	if mb.cache != nil && len(mb.cache.buf)-int(mb.cache.wp) > 0 {
		mb.resetCacheExpireTimer(mb.cexp)
		return
	}

	// Grab timestamp to compare.
	tns := time.Now().UnixNano()

	// For the core buffer of messages, we care about reads and writes, but not removes.
	bufts := mb.llts
	if mb.lwts > bufts {
		bufts = mb.lwts
	}

	// Check for activity on the cache that would prevent us from expiring.
	// Re-arm for the remainder of the expiry window.
	if tns-bufts <= int64(mb.cexp) {
		mb.resetCacheExpireTimer(mb.cexp - time.Duration(tns-bufts))
		return
	}

	// If we are here we will at least expire the core msg buffer.
	// We need to capture offset in case we do a write next before a full load.
	if mb.cache != nil {
		mb.cache.off += len(mb.cache.buf)
		// nra (no-recycle) means the buffer must not go back to the pool.
		if !mb.cache.nra {
			recycleMsgBlockBuf(mb.cache.buf)
		}
		mb.cache.buf = nil
		mb.cache.wp = 0
	}

	// Check if we can clear out our fss and idx unless under force expire.
	// We used to hold onto the idx longer but removes need buf now so no point.
	mb.fss = nil
	mb.clearCache()
}
|
|
|
|
func (fs *fileStore) startAgeChk() {
|
|
if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
|
|
fs.ageChk = time.AfterFunc(fs.cfg.MaxAge, fs.expireMsgs)
|
|
}
|
|
}
|
|
|
|
// Lock should be held.
|
|
func (fs *fileStore) resetAgeChk(delta int64) {
|
|
if fs.cfg.MaxAge == 0 {
|
|
return
|
|
}
|
|
|
|
fireIn := fs.cfg.MaxAge
|
|
if delta > 0 && time.Duration(delta) < fireIn {
|
|
fireIn = time.Duration(delta)
|
|
}
|
|
if fs.ageChk != nil {
|
|
fs.ageChk.Reset(fireIn)
|
|
} else {
|
|
fs.ageChk = time.AfterFunc(fireIn, fs.expireMsgs)
|
|
}
|
|
}
|
|
|
|
// Lock should be held.
|
|
func (fs *fileStore) cancelAgeChk() {
|
|
if fs.ageChk != nil {
|
|
fs.ageChk.Stop()
|
|
fs.ageChk = nil
|
|
}
|
|
}
|
|
|
|
// Will expire msgs that are too old. Runs as the ageChk timer callback.
func (fs *fileStore) expireMsgs() {
	// We need to delete one by one here and can not optimize for the time being.
	// Reason is that we need more information to adjust ack pending in consumers.
	var smv StoreMsg
	var sm *StoreMsg
	fs.mu.RLock()
	maxAge := int64(fs.cfg.MaxAge)
	// Messages with timestamps at or before minAge have exceeded MaxAge.
	minAge := time.Now().UnixNano() - maxAge
	fs.mu.RUnlock()

	// Remove the first message repeatedly while it is past its age limit.
	// Lock is taken per removal so other operations can interleave.
	for sm, _ = fs.msgForSeq(0, &smv); sm != nil && sm.ts <= minAge; sm, _ = fs.msgForSeq(0, &smv) {
		fs.mu.Lock()
		fs.removeMsgViaLimits(sm.seq)
		fs.mu.Unlock()
		// Recalculate in case we are expiring a bunch.
		minAge = time.Now().UnixNano() - maxAge
	}

	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Only cancel if no message left, not on potential lookup error that would result in sm == nil.
	if fs.state.Msgs == 0 {
		fs.cancelAgeChk()
	} else {
		if sm == nil {
			// Lookup failed; retry after a full MaxAge interval.
			fs.resetAgeChk(0)
		} else {
			// Re-arm for when the current first message will age out.
			fs.resetAgeChk(sm.ts - minAge)
		}
	}
}
|
|
|
|
// Lock should be held.
|
|
func (fs *fileStore) checkAndFlushAllBlocks() {
|
|
for _, mb := range fs.blks {
|
|
if mb.pendingWriteSize() > 0 {
|
|
// Since fs lock is held need to pull this apart in case we need to rebuild state.
|
|
mb.mu.Lock()
|
|
ld, _ := mb.flushPendingMsgsLocked()
|
|
mb.mu.Unlock()
|
|
if ld != nil {
|
|
fs.rebuildStateLocked(ld)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// This will check all the checksums on messages and report back any sequence numbers with errors.
|
|
func (fs *fileStore) checkMsgs() *LostStreamData {
|
|
fs.mu.Lock()
|
|
defer fs.mu.Unlock()
|
|
|
|
fs.checkAndFlushAllBlocks()
|
|
|
|
// Clear any global subject state.
|
|
fs.psim = make(map[string]*psi)
|
|
|
|
for _, mb := range fs.blks {
|
|
// Make sure encryption loaded if needed for the block.
|
|
fs.loadEncryptionForMsgBlock(mb)
|
|
// FIXME(dlc) - check tombstones here too?
|
|
if ld, _, err := mb.rebuildState(); err != nil && ld != nil {
|
|
// Rebuild fs state too.
|
|
mb.fs.rebuildStateLocked(ld)
|
|
}
|
|
fs.populateGlobalPerSubjectInfo(mb)
|
|
}
|
|
|
|
return fs.ld
|
|
}
|
|
|
|
// Lock should be held.
|
|
func (mb *msgBlock) enableForWriting(fip bool) error {
|
|
if mb == nil {
|
|
return errNoMsgBlk
|
|
}
|
|
if mb.mfd != nil {
|
|
return nil
|
|
}
|
|
mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
|
|
if err != nil {
|
|
return fmt.Errorf("error opening msg block file [%q]: %v", mb.mfn, err)
|
|
}
|
|
mb.mfd = mfd
|
|
|
|
// Spin up our flusher loop if needed.
|
|
if !fip {
|
|
mb.spinUpFlushLoop()
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Helper function to place a delete tombstone.
|
|
func (mb *msgBlock) writeTombstone(seq uint64, ts int64) error {
|
|
return mb.writeMsgRecord(emptyRecordLen, seq|tbit, _EMPTY_, nil, nil, ts, true)
|
|
}
|
|
|
|
// Will write the message record to the underlying message block.
// The record is appended to the write-through cache; actual disk I/O happens
// here when flush is set (or a prior write error is pending), otherwise the
// flusher loop is kicked. Tombstone records (tbit set on seq) skip per-subject
// tracking, indexing and accounting.
// filestore lock will be held.
func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush bool) error {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	// Enable for writing if our mfd is not open.
	if mb.mfd == nil {
		if err := mb.enableForWriting(flush); err != nil {
			return err
		}
	}

	// Make sure we have a cache setup.
	if mb.cache == nil {
		mb.setupWriteCache(nil)
	}

	// Check if we are tracking per subject for our simple state.
	// Do this before changing the cache that would trigger a flush pending msgs call
	// if we needed to regenerate the per subject info.
	// Note that tombstones have no subject so will not trigger here.
	if len(subj) > 0 && !mb.noTrack {
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			return err
		}
		if ss := mb.fss[subj]; ss != nil {
			ss.Msgs++
			ss.Last = seq
		} else {
			mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
		}
	}

	// Indexing: absolute offset of this record within the block file.
	index := len(mb.cache.buf) + int(mb.cache.off)

	// Formats
	// Format with no header
	// total_len(4) sequence(8) timestamp(8) subj_len(2) subj msg hash(8)
	// With headers, high bit on total length will be set.
	// total_len(4) sequence(8) timestamp(8) subj_len(2) subj hdr_len(4) hdr msg hash(8)

	// First write header, etc.
	var le = binary.LittleEndian
	var hdr [msgHdrSize]byte

	l := uint32(rl)
	hasHeaders := len(mhdr) > 0
	if hasHeaders {
		l |= hbit
	}

	le.PutUint32(hdr[0:], l)
	le.PutUint64(hdr[4:], seq)
	le.PutUint64(hdr[12:], uint64(ts))
	le.PutUint16(hdr[20:], uint16(len(subj)))

	// Now write to underlying buffer.
	mb.cache.buf = append(mb.cache.buf, hdr[:]...)
	mb.cache.buf = append(mb.cache.buf, subj...)

	if hasHeaders {
		var hlen [4]byte
		le.PutUint32(hlen[0:], uint32(len(mhdr)))
		mb.cache.buf = append(mb.cache.buf, hlen[:]...)
		mb.cache.buf = append(mb.cache.buf, mhdr...)
	}
	mb.cache.buf = append(mb.cache.buf, msg...)

	// Calculate hash.
	// Hash covers seq+timestamp+subj_len fields plus subject, headers and payload.
	mb.hh.Reset()
	mb.hh.Write(hdr[4:20])
	mb.hh.Write([]byte(subj))
	if hasHeaders {
		mb.hh.Write(mhdr)
	}
	mb.hh.Write(msg)
	checksum := mb.hh.Sum(nil)
	// Grab last checksum
	copy(mb.lchk[0:], checksum)

	// Update write through cache.
	// Write to msg record.
	mb.cache.buf = append(mb.cache.buf, checksum...)
	mb.cache.lrl = uint32(rl)

	// Set cache timestamp for last store.
	mb.lwts = ts

	// Only update index and do accounting if not a delete tombstone.
	if seq&tbit == 0 {
		// Accounting, do this before stripping ebit, it is ebit aware.
		mb.updateAccounting(seq, ts, rl)
		// Strip ebit if set.
		seq = seq &^ ebit
		if mb.cache.fseq == 0 {
			mb.cache.fseq = seq
		}
		// Write index
		mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit)
	}

	fch, werr := mb.fch, mb.werr

	// If we should be flushing, or had a write error, do so here.
	if flush || werr != nil {
		ld, err := mb.flushPendingMsgsLocked()
		if ld != nil && mb.fs != nil {
			// We have the mb lock here, this needs the mb locks so do in its own go routine.
			go mb.fs.rebuildState(ld)
		}
		if err != nil {
			return err
		}
	} else {
		// Kick the flusher here.
		kickFlusher(fch)
	}

	return nil
}
|
|
|
|
// How many bytes pending to be written for this message block.
|
|
func (mb *msgBlock) pendingWriteSize() int {
|
|
if mb == nil {
|
|
return 0
|
|
}
|
|
mb.mu.RLock()
|
|
defer mb.mu.RUnlock()
|
|
return mb.pendingWriteSizeLocked()
|
|
}
|
|
|
|
// How many bytes pending to be written for this message block.
|
|
func (mb *msgBlock) pendingWriteSizeLocked() int {
|
|
if mb == nil {
|
|
return 0
|
|
}
|
|
var pending int
|
|
if !mb.closed && mb.mfd != nil && mb.cache != nil {
|
|
pending = len(mb.cache.buf) - int(mb.cache.wp)
|
|
}
|
|
return pending
|
|
}
|
|
|
|
// Try to close our FDs if we can.
|
|
func (mb *msgBlock) closeFDs() error {
|
|
mb.mu.Lock()
|
|
defer mb.mu.Unlock()
|
|
return mb.closeFDsLocked()
|
|
}
|
|
|
|
func (mb *msgBlock) closeFDsLocked() error {
|
|
if buf, _ := mb.bytesPending(); len(buf) > 0 {
|
|
return errPendingData
|
|
}
|
|
mb.closeFDsLockedNoCheck()
|
|
return nil
|
|
}
|
|
|
|
func (mb *msgBlock) closeFDsLockedNoCheck() {
|
|
if mb.mfd != nil {
|
|
mb.mfd.Close()
|
|
mb.mfd = nil
|
|
}
|
|
}
|
|
|
|
// bytesPending returns the buffer to be used for writing to the underlying file.
|
|
// This marks we are in flush and will return nil if asked again until cleared.
|
|
// Lock should be held.
|
|
func (mb *msgBlock) bytesPending() ([]byte, error) {
|
|
if mb == nil || mb.mfd == nil {
|
|
return nil, errNoPending
|
|
}
|
|
if mb.cache == nil {
|
|
return nil, errNoCache
|
|
}
|
|
if len(mb.cache.buf) <= mb.cache.wp {
|
|
return nil, errNoPending
|
|
}
|
|
buf := mb.cache.buf[mb.cache.wp:]
|
|
if len(buf) == 0 {
|
|
return nil, errNoPending
|
|
}
|
|
return buf, nil
|
|
}
|
|
|
|
// Returns the current blkSize including deleted msgs etc.
|
|
func (mb *msgBlock) blkSize() uint64 {
|
|
mb.mu.RLock()
|
|
nb := mb.rbytes
|
|
mb.mu.RUnlock()
|
|
return nb
|
|
}
|
|
|
|
// Update accounting on a write msg.
|
|
// Lock should be held.
|
|
func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) {
|
|
isDeleted := seq&ebit != 0
|
|
if isDeleted {
|
|
seq = seq &^ ebit
|
|
}
|
|
|
|
if (mb.first.seq == 0 || mb.first.ts == 0) && seq >= mb.first.seq {
|
|
mb.first.seq = seq
|
|
mb.first.ts = ts
|
|
}
|
|
// Need atomics here for selectMsgBlock speed.
|
|
atomic.StoreUint64(&mb.last.seq, seq)
|
|
mb.last.ts = ts
|
|
mb.rbytes += rl
|
|
if !isDeleted {
|
|
mb.bytes += rl
|
|
mb.msgs++
|
|
}
|
|
}
|
|
|
|
// Lock should be held.
|
|
func (fs *fileStore) writeMsgRecord(seq uint64, ts int64, subj string, hdr, msg []byte) (uint64, error) {
|
|
var err error
|
|
|
|
// Get size for this message.
|
|
rl := fileStoreMsgSize(subj, hdr, msg)
|
|
if rl&hbit != 0 {
|
|
return 0, ErrMsgTooLarge
|
|
}
|
|
// Grab our current last message block.
|
|
mb := fs.lmb
|
|
|
|
// Mark as dirty for stream state.
|
|
fs.dirty++
|
|
|
|
if mb == nil || mb.msgs > 0 && mb.blkSize()+rl > fs.fcfg.BlockSize {
|
|
if mb != nil && fs.fcfg.Compression != NoCompression {
|
|
// We've now reached the end of this message block, if we want
|
|
// to compress blocks then now's the time to do it.
|
|
go mb.recompressOnDiskIfNeeded()
|
|
}
|
|
if mb, err = fs.newMsgBlockForWrite(); err != nil {
|
|
return 0, err
|
|
}
|
|
}
|
|
|
|
// Ask msg block to store in write through cache.
|
|
err = mb.writeMsgRecord(rl, seq, subj, hdr, msg, ts, fs.fip)
|
|
|
|
return rl, err
|
|
}
|
|
|
|
func (mb *msgBlock) recompressOnDiskIfNeeded() error {
|
|
// Wait for disk I/O slots to become available. This prevents us from
|
|
// running away with system resources.
|
|
<-dios
|
|
defer func() {
|
|
dios <- struct{}{}
|
|
}()
|
|
|
|
alg := mb.fs.fcfg.Compression
|
|
mb.mu.Lock()
|
|
defer mb.mu.Unlock()
|
|
|
|
origFN := mb.mfn // The original message block on disk.
|
|
tmpFN := mb.mfn + compressTmpSuffix // The compressed block will be written here.
|
|
|
|
// Open up the file block and read in the entire contents into memory.
|
|
// One of two things will happen:
|
|
// 1. The block will be compressed already and have a valid metadata
|
|
// header, in which case we do nothing.
|
|
// 2. The block will be uncompressed, in which case we will compress it
|
|
// and then write it back out to disk, reencrypting if necessary.
|
|
origBuf, err := os.ReadFile(origFN)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to read original block from disk: %w", err)
|
|
}
|
|
|
|
// If the block is encrypted then we will need to decrypt it before
|
|
// doing anything. We always encrypt after compressing because then the
|
|
// compression can be as efficient as possible on the raw data, whereas
|
|
// the encrypted ciphertext will not compress anywhere near as well.
|
|
// The block encryption also covers the optional compression metadata.
|
|
if mb.bek != nil && len(origBuf) > 0 {
|
|
bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
mb.bek = bek
|
|
mb.bek.XORKeyStream(origBuf, origBuf)
|
|
}
|
|
|
|
meta := &CompressionInfo{}
|
|
if _, err := meta.UnmarshalMetadata(origBuf); err != nil {
|
|
// An error is only returned here if there's a problem with parsing
|
|
// the metadata. If the file has no metadata at all, no error is
|
|
// returned and the algorithm defaults to no compression.
|
|
return fmt.Errorf("failed to read existing metadata header: %w", err)
|
|
}
|
|
if meta.Algorithm == alg {
|
|
// The block is already compressed with the chosen algorithm so there
|
|
// is nothing else to do. This is not a common case, it is here only
|
|
// to ensure we don't do unnecessary work in case something asked us
|
|
// to recompress an already compressed block with the same algorithm.
|
|
return nil
|
|
} else if alg != NoCompression {
|
|
// The block is already compressed using some algorithm, so we need
|
|
// to decompress the block using the existing algorithm before we can
|
|
// recompress it with the new one.
|
|
if origBuf, err = meta.Algorithm.Decompress(origBuf); err != nil {
|
|
return fmt.Errorf("failed to decompress original block: %w", err)
|
|
}
|
|
}
|
|
|
|
// Rather than modifying the existing block on disk (which is a dangerous
|
|
// operation if something goes wrong), create a new temporary file. We will
|
|
// write out the new block here and then swap the files around afterwards
|
|
// once everything else has succeeded correctly.
|
|
tmpFD, err := os.OpenFile(tmpFN, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, defaultFilePerms)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create temporary file: %w", err)
|
|
}
|
|
|
|
// The original buffer at this point is uncompressed, so we will now compress
|
|
// it if needed. Note that if the selected algorithm is NoCompression, the
|
|
// Compress function will just return the input buffer unmodified.
|
|
cmpBuf, err := alg.Compress(origBuf)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to compress block: %w", err)
|
|
}
|
|
|
|
// We only need to write out the metadata header if compression is enabled.
|
|
// If we're trying to uncompress the file on disk at this point, don't bother
|
|
// writing metadata.
|
|
if alg != NoCompression {
|
|
meta := &CompressionInfo{
|
|
Algorithm: alg,
|
|
OriginalSize: uint64(len(origBuf)),
|
|
}
|
|
cmpBuf = append(meta.MarshalMetadata(), cmpBuf...)
|
|
}
|
|
|
|
// Re-encrypt the block if necessary.
|
|
if mb.bek != nil && len(cmpBuf) > 0 {
|
|
bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
mb.bek = bek
|
|
mb.bek.XORKeyStream(cmpBuf, cmpBuf)
|
|
}
|
|
|
|
// Write the new block data (which might be compressed or encrypted) to the
|
|
// temporary file.
|
|
errorCleanup := func(err error) error {
|
|
tmpFD.Close()
|
|
os.Remove(tmpFN)
|
|
return err
|
|
}
|
|
if n, err := tmpFD.Write(cmpBuf); err != nil {
|
|
return errorCleanup(fmt.Errorf("failed to write to temporary file: %w", err))
|
|
} else if n != len(cmpBuf) {
|
|
return errorCleanup(fmt.Errorf("short write to temporary file (%d != %d)", n, len(cmpBuf)))
|
|
}
|
|
if err := tmpFD.Sync(); err != nil {
|
|
return errorCleanup(fmt.Errorf("failed to sync temporary file: %w", err))
|
|
}
|
|
if err := tmpFD.Close(); err != nil {
|
|
return errorCleanup(fmt.Errorf("failed to close temporary file: %w", err))
|
|
}
|
|
|
|
// Now replace the original file with the newly updated temp file.
|
|
if err := os.Rename(tmpFN, origFN); err != nil {
|
|
return fmt.Errorf("failed to move temporary file into place: %w", err)
|
|
}
|
|
|
|
// Since the message block might be retained in memory, make sure the
|
|
// compression algorithm is up-to-date, since this will be needed when
|
|
// compacting or truncating.
|
|
mb.cmp = alg
|
|
return nil
|
|
}
|
|
|
|
func (mb *msgBlock) decompressIfNeeded(buf []byte) ([]byte, error) {
|
|
var meta CompressionInfo
|
|
if n, err := meta.UnmarshalMetadata(buf); err != nil {
|
|
// There was a problem parsing the metadata header of the block.
|
|
// If there's no metadata header, an error isn't returned here,
|
|
// we will instead just use default values of no compression.
|
|
return nil, err
|
|
} else if n == 0 {
|
|
// There were no metadata bytes, so we assume the block is not
|
|
// compressed and return it as-is.
|
|
return buf, nil
|
|
} else {
|
|
// Metadata was present so it's quite likely the block contents
|
|
// are compressed. If by any chance the metadata claims that the
|
|
// block is uncompressed, then the input slice is just returned
|
|
// unmodified.
|
|
return meta.Algorithm.Decompress(buf[n:])
|
|
}
|
|
}
|
|
|
|
// Sync msg and index files as needed. This is called from a timer.
// Syncs are performed without holding block locks: we snapshot what needs
// syncing under lock, then open/sync/close the files lock-free.
func (fs *fileStore) syncBlocks() {
	fs.mu.RLock()
	if fs.closed {
		fs.mu.RUnlock()
		return
	}
	// Snapshot the block list so we do not hold the filestore lock while
	// walking and syncing individual blocks.
	blks := append([]*msgBlock(nil), fs.blks...)
	fs.mu.RUnlock()

	for _, mb := range blks {
		// Do actual sync. Hold lock for consistency.
		mb.mu.Lock()
		if mb.closed {
			mb.mu.Unlock()
			continue
		}
		// See if we can close FDs due to being idle.
		if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle {
			mb.dirtyCloseWithRemove(false)
		}
		// Check if we need to sync. We will not hold lock during actual sync.
		var fn string
		if mb.needSync {
			// Flush anything that may be pending.
			if mb.pendingWriteSizeLocked() > 0 {
				mb.flushPendingMsgsLocked()
			}
			fn = mb.mfn
			mb.needSync = false
		}
		mb.mu.Unlock()

		// Check if we need to sync.
		// This is done not holding any locks.
		if fn != _EMPTY_ {
			// Open a fresh fd for the sync since the block's own fd may
			// have been closed above while idle.
			if fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms); fd != nil {
				fd.Sync()
				fd.Close()
			}
		}
	}

	// Rearm the sync timer and snapshot what we need for the stream state file.
	fs.mu.Lock()
	fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks)
	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	syncAlways := fs.fcfg.SyncAlways
	fs.mu.Unlock()

	// In sync-always mode the state file is already synced on write, so
	// only sync it here otherwise.
	if !syncAlways {
		if fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms); fd != nil {
			fd.Sync()
			fd.Close()
		}
	}
}
|
|
|
|
// Select the message block where this message should be found.
// Return nil if not in the set.
// Read lock should be held.
func (fs *fileStore) selectMsgBlock(seq uint64) *msgBlock {
	// Convenience wrapper that discards the block index.
	_, mb := fs.selectMsgBlockWithIndex(seq)
	return mb
}
|
|
|
|
// selectMsgBlockWithIndex returns the index into fs.blks and the block that
// should contain seq, or (-1, nil) if seq is outside the stream's range.
// Uses a linear scan for small block counts, binary search otherwise.
// Lock should be held.
func (fs *fileStore) selectMsgBlockWithIndex(seq uint64) (int, *msgBlock) {
	// Check for out of range.
	if seq < fs.state.FirstSeq || seq > fs.state.LastSeq {
		return -1, nil
	}

	// Below this number of blocks a simple scan is cheaper than the
	// binary search bookkeeping.
	const linearThresh = 32
	nb := len(fs.blks) - 1

	if nb < linearThresh {
		for i, mb := range fs.blks {
			if seq <= atomic.LoadUint64(&mb.last.seq) {
				return i, mb
			}
		}
		return -1, nil
	}

	// Do traditional binary search here since we know the blocks are sorted by sequence first and last.
	for low, high, mid := 0, nb, nb/2; low <= high; mid = (low + high) / 2 {
		mb := fs.blks[mid]
		// Right now these atomic loads do not factor in, so fine to leave. Was considering
		// uplifting these to fs scope to avoid atomic load but not needed.
		first, last := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
		if seq > last {
			low = mid + 1
		} else if seq < first {
			// A message block's first sequence can change here meaning we could find a gap.
			// We want to behave like above, which if inclusive (we check at start) should
			// always return an index and a valid mb.
			// If we have a gap then our seq would be > fs.blks[mid-1].last.seq
			if mid == 0 || seq > atomic.LoadUint64(&fs.blks[mid-1].last.seq) {
				return mid, mb
			}
			high = mid - 1
		} else {
			return mid, mb
		}
	}

	return -1, nil
}
|
|
|
|
// Select the message block where this message should be found.
|
|
// Return nil if not in the set.
|
|
func (fs *fileStore) selectMsgBlockForStart(minTime time.Time) *msgBlock {
|
|
fs.mu.RLock()
|
|
defer fs.mu.RUnlock()
|
|
|
|
t := minTime.UnixNano()
|
|
for _, mb := range fs.blks {
|
|
mb.mu.RLock()
|
|
found := t <= mb.last.ts
|
|
mb.mu.RUnlock()
|
|
if found {
|
|
return mb
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Index a raw msg buffer.
// Walks the raw records in buf building the per-sequence offset index,
// the delete map for holes/erased records, and (optionally) the
// per-subject state (fss). Appends to an existing cache if present.
// Lock should be held.
func (mb *msgBlock) indexCacheBuf(buf []byte) error {
	var le = binary.LittleEndian

	var fseq uint64
	var idx []uint32
	var index uint32

	if mb.cache == nil {
		// Approximation, may adjust below.
		fseq = mb.first.seq
		idx = make([]uint32, 0, mb.msgs)
		mb.cache = &cache{}
	} else {
		// Appending to an existing cache: continue indexing at the current
		// end of the cached buffer.
		fseq = mb.cache.fseq
		idx = mb.cache.idx
		if len(idx) == 0 {
			idx = make([]uint32, 0, mb.msgs)
		}
		index = uint32(len(mb.cache.buf))
		buf = append(mb.cache.buf, buf...)
	}

	// Create FSS if we should track.
	if !mb.noTrack {
		mb.fss = make(map[string]*SimpleState)
	}

	lbuf := uint32(len(buf))
	for index < lbuf {
		// Every record starts with a fixed-size header.
		if index+msgHdrSize > lbuf {
			return errCorruptState
		}
		hdr := buf[index : index+msgHdrSize]
		rl, seq, slen := le.Uint32(hdr[0:]), le.Uint64(hdr[4:]), int(le.Uint16(hdr[20:]))

		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize

		// Do some quick sanity checks here.
		if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
			// This means something is off.
			// TODO(dlc) - Add into bad list?
			return errCorruptState
		}

		// Check for tombstones which we can skip in terms of indexing.
		if seq&tbit != 0 {
			index += rl
			continue
		}

		// Clear any erase bits.
		erased := seq&ebit != 0
		seq = seq &^ ebit

		// We defer checksum checks to individual msg cache lookups to amortorize costs and
		// not introduce latency for first message from a newly loaded block.
		if seq >= mb.first.seq {
			// Track that we do not have holes.
			if slot := int(seq - mb.first.seq); slot != len(idx) {
				// If we have a hole fill it.
				for dseq := mb.first.seq + uint64(len(idx)); dseq < seq; dseq++ {
					idx = append(idx, dbit)
					mb.dmap.Insert(dseq)
				}
			}
			// Add to our index.
			idx = append(idx, index)
			mb.cache.lrl = uint32(rl)
			// Adjust if we guessed wrong.
			if seq != 0 && seq < fseq {
				fseq = seq
			}

			// Make sure our dmap has this entry if it was erased.
			if erased {
				mb.dmap.Insert(seq)
			}

			// Handle FSS inline here.
			if slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) {
				bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)]
				if ss := mb.fss[string(bsubj)]; ss != nil {
					ss.Msgs++
					ss.Last = seq
				} else {
					// First message for this subject; intern the string.
					subj := mb.subjString(bsubj)
					mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
				}
			}
		}
		index += rl
	}

	// Commit the results back to the cache.
	mb.cache.buf = buf
	mb.cache.idx = idx
	mb.cache.fseq = fseq
	mb.cache.wp += int(lbuf)

	return nil
}
|
|
|
|
// flushPendingMsgs writes out any messages for this message block.
|
|
func (mb *msgBlock) flushPendingMsgs() error {
|
|
mb.mu.Lock()
|
|
fsLostData, err := mb.flushPendingMsgsLocked()
|
|
fs := mb.fs
|
|
mb.mu.Unlock()
|
|
|
|
// Signals us that we need to rebuild filestore state.
|
|
if fsLostData != nil && fs != nil {
|
|
// Rebuild fs state too.
|
|
fs.rebuildState(fsLostData)
|
|
}
|
|
return err
|
|
}
|
|
|
|
// Write function for actual data.
|
|
// mb.mfd should not be nil.
|
|
// Lock should held.
|
|
func (mb *msgBlock) writeAt(buf []byte, woff int64) (int, error) {
|
|
// Used to mock write failures.
|
|
if mb.mockWriteErr {
|
|
// Reset on trip.
|
|
mb.mockWriteErr = false
|
|
return 0, errors.New("mock write error")
|
|
}
|
|
return mb.mfd.WriteAt(buf, woff)
|
|
}
|
|
|
|
// flushPendingMsgsLocked writes out any messages for this message block.
// Returns a lost-data report if a failed write forced a state rebuild;
// the caller is responsible for passing it to fs.rebuildState.
// Lock should be held.
func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) {
	// Signals us that we need to rebuild filestore state.
	var fsLostData *LostStreamData

	if mb.cache == nil || mb.mfd == nil {
		return nil, nil
	}

	buf, err := mb.bytesPending()
	// If we got an error back return here.
	if err != nil {
		// No pending data to be written is not an error.
		if err == errNoPending || err == errNoCache {
			err = nil
		}
		return nil, err
	}

	// Write offset is the cache's file offset plus its write pointer.
	woff := int64(mb.cache.off + mb.cache.wp)
	lob := len(buf)

	// TODO(dlc) - Normally we would not hold the lock across I/O so we can improve performance.
	// We will hold to stabilize the code base, as we have had a few anomalies with partial cache errors
	// under heavy load.

	// Check if we need to encrypt.
	if mb.bek != nil && lob > 0 {
		// Need to leave original alone.
		var dst []byte
		if lob <= defaultLargeBlockSize {
			dst = getMsgBlockBuf(lob)[:lob]
		} else {
			dst = make([]byte, lob)
		}
		mb.bek.XORKeyStream(dst, buf)
		buf = dst
	}

	// Append new data to the message block file.
	for lbb := lob; lbb > 0; lbb = len(buf) {
		n, err := mb.writeAt(buf, woff)
		if err != nil {
			// Write failed: close out the block and rebuild its state to
			// determine what, if anything, was lost.
			mb.dirtyCloseWithRemove(false)
			ld, _, _ := mb.rebuildStateLocked()
			mb.werr = err
			return ld, err
		}
		// Update our write offset.
		woff += int64(n)
		// Partial write.
		if n != lbb {
			buf = buf[n:]
		} else {
			// Done.
			break
		}
	}

	// Clear any error.
	mb.werr = nil

	// Cache may be gone.
	if mb.cache == nil || mb.mfd == nil {
		return fsLostData, mb.werr
	}

	// Check if we are in sync always mode.
	if mb.syncAlways {
		mb.mfd.Sync()
	} else {
		mb.needSync = true
	}

	// Check for additional writes while we were writing to the disk.
	moreBytes := len(mb.cache.buf) - mb.cache.wp - lob

	// Decide what we want to do with the buffer in hand. If we have load interest
	// we will hold onto the whole thing, otherwise empty the buffer, possibly reusing it.
	if ts := time.Now().UnixNano(); ts < mb.llts || (ts-mb.llts) <= int64(mb.cexp) {
		mb.cache.wp += lob
	} else {
		// No recent load interest; shrink/recycle the cache buffer.
		if cap(mb.cache.buf) <= maxBufReuse {
			buf = mb.cache.buf[:0]
		} else {
			recycleMsgBlockBuf(mb.cache.buf)
			buf = nil
		}
		if moreBytes > 0 {
			// Preserve bytes written into the cache while we were on disk.
			nbuf := mb.cache.buf[len(mb.cache.buf)-moreBytes:]
			if moreBytes > (len(mb.cache.buf)/4*3) && cap(nbuf) <= maxBufReuse {
				buf = nbuf
			} else {
				buf = append(buf, nbuf...)
			}
		}
		// Update our cache offset.
		mb.cache.off = int(woff)
		// Reset write pointer.
		mb.cache.wp = 0
		// Place buffer back in the cache structure.
		mb.cache.buf = buf
		// Mark fseq to 0
		mb.cache.fseq = 0
	}

	return fsLostData, mb.werr
}
|
|
|
|
// clearLoading resets the loading-in-progress flag, typically via defer
// from loadMsgsWithLock.
// Lock should be held.
func (mb *msgBlock) clearLoading() {
	mb.loading = false
}
|
|
|
|
// Will load msgs from disk.
// Convenience wrapper around loadMsgsWithLock that handles the locking.
func (mb *msgBlock) loadMsgs() error {
	// We hold the lock here the whole time by design.
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.loadMsgsWithLock()
}
|
|
|
|
// Lock should be held.
|
|
func (mb *msgBlock) cacheAlreadyLoaded() bool {
|
|
if mb.cache == nil || mb.cache.off != 0 || mb.cache.fseq == 0 || len(mb.cache.buf) == 0 {
|
|
return false
|
|
}
|
|
numEntries := mb.msgs + uint64(mb.dmap.Size()) + (mb.first.seq - mb.cache.fseq)
|
|
return numEntries == uint64(len(mb.cache.idx))
|
|
}
|
|
|
|
// cacheNotLoaded reports whether the cache needs (re)loading from disk.
// Lock should be held.
func (mb *msgBlock) cacheNotLoaded() bool {
	return !mb.cacheAlreadyLoaded()
}
|
|
|
|
// Used to load in the block contents.
// Reads the entire block file into buf (reallocating or pulling from the
// buffer pool as needed) and returns the filled slice.
// Lock should be held and all conditionals satisfied prior.
func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) {
	f, err := os.Open(mb.mfn)
	if err != nil {
		// Missing file is reported with a dedicated sentinel so callers
		// can trigger a state rebuild.
		if os.IsNotExist(err) {
			err = errNoBlkData
		}
		return nil, err
	}
	defer f.Close()

	// Determine the file size to size the read buffer, guarding against
	// sizes that do not fit in an int on this platform.
	var sz int
	if info, err := f.Stat(); err == nil {
		sz64 := info.Size()
		if int64(int(sz64)) == sz64 {
			sz = int(sz64)
		} else {
			return nil, errMsgBlkTooBig
		}
	}

	if buf == nil {
		buf = getMsgBlockBuf(sz)
		if sz > cap(buf) {
			// We know we will make a new one so just recycle for now.
			recycleMsgBlockBuf(buf)
			buf = nil
		}
	}

	if sz > cap(buf) {
		buf = make([]byte, sz)
	} else {
		buf = buf[:sz]
	}

	n, err := io.ReadFull(f, buf)
	// On success capture raw bytes size.
	if err == nil {
		mb.rbytes = uint64(n)
	}
	return buf[:n], err
}
|
|
|
|
// loadMsgsWithLock reads the block contents from disk, decrypting and
// decompressing as needed, and indexes them into the cache. Retries (via
// checkCache) after flushing pending data or rebuilding corrupt state.
// Lock should be held.
func (mb *msgBlock) loadMsgsWithLock() error {
	// Check for encryption, we do not load keys on startup anymore so might need to load them here.
	if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) {
		if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil {
			return err
		}
	}

	// Check to see if we are loading already.
	if mb.loading {
		return nil
	}

	// Set loading status.
	mb.loading = true
	defer mb.clearLoading()

	// Bound the number of retries so we can not loop forever on a block
	// we fail to index.
	var nchecks int

checkCache:
	nchecks++
	if nchecks > 8 {
		return errCorruptState
	}

	// Check to see if we have a full cache.
	if mb.cacheAlreadyLoaded() {
		return nil
	}

	// Record the last load time for cache expiry decisions.
	mb.llts = time.Now().UnixNano()

	// FIXME(dlc) - We could be smarter here.
	if buf, _ := mb.bytesPending(); len(buf) > 0 {
		ld, err := mb.flushPendingMsgsLocked()
		if ld != nil && mb.fs != nil {
			// We do not know if fs is locked or not at this point.
			// This should be an exceptional condition so do so in Go routine.
			go mb.fs.rebuildState(ld)
		}
		if err != nil {
			return err
		}
		goto checkCache
	}

	// Load in the whole block.
	// We want to hold the mb lock here to avoid any changes to state.
	buf, err := mb.loadBlock(nil)
	if err != nil {
		if err == errNoBlkData {
			if ld, _, err := mb.rebuildStateLocked(); err != nil && ld != nil {
				// Rebuild fs state too.
				go mb.fs.rebuildState(ld)
			}
		}
		return err
	}

	// Reset the cache since we just read everything in.
	// Make sure this is cleared in case we had a partial when we started.
	mb.clearCacheAndOffset()

	// Check if we need to decrypt.
	if mb.bek != nil && len(buf) > 0 {
		// Recreate the stream cipher so decryption starts from offset 0.
		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return err
		}
		mb.bek = bek
		mb.bek.XORKeyStream(buf, buf)
	}

	// Check for compression.
	if buf, err = mb.decompressIfNeeded(buf); err != nil {
		return err
	}

	if err := mb.indexCacheBuf(buf); err != nil {
		if err == errCorruptState {
			var ld *LostStreamData
			if ld, _, err = mb.rebuildStateLocked(); ld != nil {
				// We do not know if fs is locked or not at this point.
				// This should be an exceptional condition so do so in Go routine.
				go mb.fs.rebuildState(ld)
			}
		}
		if err != nil {
			return err
		}
		goto checkCache
	}

	if len(buf) > 0 {
		// Count the cache load and arm the expiry timer.
		mb.cloads++
		mb.startCacheExpireTimer()
	}

	return nil
}
|
|
|
|
// Fetch a message from this block, possibly reading in and caching the messages.
|
|
// We assume the block was selected and is correct, so we do not do range checks.
|
|
func (mb *msgBlock) fetchMsg(seq uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
|
|
mb.mu.Lock()
|
|
defer mb.mu.Unlock()
|
|
|
|
if mb.cacheNotLoaded() {
|
|
if err := mb.loadMsgsWithLock(); err != nil {
|
|
return nil, false, err
|
|
}
|
|
}
|
|
fsm, err := mb.cacheLookup(seq, sm)
|
|
if err != nil {
|
|
return nil, false, err
|
|
}
|
|
expireOk := seq == mb.last.seq && mb.llseq == seq
|
|
return fsm, expireOk, err
|
|
}
|
|
|
|
// Sentinel errors used throughout the filestore implementation.
var (
	errNoCache       = errors.New("no message cache")
	errBadMsg        = errors.New("malformed or corrupt message")
	errDeletedMsg    = errors.New("deleted message")
	errPartialCache  = errors.New("partial cache")
	errNoPending     = errors.New("message block does not have pending data")
	errNotReadable   = errors.New("storage directory not readable")
	errCorruptState  = errors.New("corrupt state file")
	errPriorState    = errors.New("prior state file")
	errPendingData   = errors.New("pending data still present")
	errNoEncryption  = errors.New("encryption not enabled")
	errBadKeySize    = errors.New("encryption bad key size")
	errNoMsgBlk      = errors.New("no message block")
	errMsgBlkTooBig  = errors.New("message block size exceeded int capacity")
	errUnknownCipher = errors.New("unknown cipher")
	errNoMainKey     = errors.New("encrypted store encountered with no main key")
	errNoBlkData     = errors.New("message block data missing")
)
|
|
|
|
const (
	// hbit serves double duty: in a record header it signals the message
	// record has headers; in a cache index entry it marks the record as
	// having had its checksum already checked.
	hbit = 1 << 31
	// Used for marking erased messages sequences.
	ebit = 1 << 63
	// Used for marking tombstone sequences.
	tbit = 1 << 62
	// Used to mark a bad index as deleted.
	dbit = 1 << 30
)
|
|
|
|
// Will do a lookup from cache.
// Returns the decoded message for seq, or a sentinel error (deleted,
// no cache, partial cache, not found). Updates cache activity timestamps
// and the linear-scan tracking sequence as a side effect.
// Lock should be held.
func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	if seq < mb.first.seq || seq > mb.last.seq {
		return nil, ErrStoreMsgNotFound
	}

	// If we have a delete map check it.
	if mb.dmap.Exists(seq) {
		mb.llts = time.Now().UnixNano()
		return nil, errDeletedMsg
	}

	// Detect no cache loaded.
	if mb.cache == nil || mb.cache.fseq == 0 || len(mb.cache.idx) == 0 || len(mb.cache.buf) == 0 {
		return nil, errNoCache
	}
	// Check partial cache status.
	if seq < mb.cache.fseq {
		return nil, errPartialCache
	}

	bi, _, hashChecked, err := mb.slotInfo(int(seq - mb.cache.fseq))
	if err != nil {
		return nil, err
	}

	// Update cache activity.
	mb.llts = time.Now().UnixNano()
	// The llseq signals us when we can expire a cache at the end of a linear scan.
	// We want to only update when we know the last reads (multiple consumers) are sequential.
	if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 {
		mb.llseq = seq
	}

	// Translate the file offset into a cache buffer offset.
	li := int(bi) - mb.cache.off
	if li >= len(mb.cache.buf) {
		return nil, errPartialCache
	}
	buf := mb.cache.buf[li:]

	// We use the high bit to denote we have already checked the checksum.
	var hh hash.Hash64
	if !hashChecked {
		hh = mb.hh // This will force the hash check in msgFromBuf.
	}

	// Parse from the raw buffer.
	fsm, err := mb.msgFromBuf(buf, sm, hh)
	if err != nil || fsm == nil {
		return nil, err
	}

	// Deleted messages that are decoded return a 0 for sequence.
	if fsm.seq == 0 {
		return nil, errDeletedMsg
	}

	// A mismatch means the cache contents are bad; drop the buffer so the
	// cache gets reloaded.
	if seq != fsm.seq {
		recycleMsgBlockBuf(mb.cache.buf)
		mb.cache.buf = nil
		return nil, fmt.Errorf("sequence numbers for cache load did not match, %d vs %d", seq, fsm.seq)
	}

	// Clear the check bit here after we know all is good.
	if !hashChecked {
		mb.cache.idx[seq-mb.cache.fseq] = (bi | hbit)
	}

	return fsm, nil
}
|
|
|
|
// Used when we are checking if discarding a message due to max msgs per subject will give us
|
|
// enough room for a max bytes condition.
|
|
// Lock should be already held.
|
|
func (fs *fileStore) sizeForSeq(seq uint64) int {
|
|
if seq == 0 {
|
|
return 0
|
|
}
|
|
var smv StoreMsg
|
|
if mb := fs.selectMsgBlock(seq); mb != nil {
|
|
if sm, _, _ := mb.fetchMsg(seq, &smv); sm != nil {
|
|
return int(fileStoreMsgSize(sm.subj, sm.hdr, sm.msg))
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// Will return message for the given sequence number.
// A seq of 0 means the first message in the stream.
func (fs *fileStore) msgForSeq(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	// TODO(dlc) - Since Store, Remove, Skip all hold the write lock on fs this will
	// be stalled. Need another lock if want to happen in parallel.
	fs.mu.RLock()
	if fs.closed {
		fs.mu.RUnlock()
		return nil, ErrStoreClosed
	}
	// Indicates we want first msg.
	if seq == 0 {
		seq = fs.state.FirstSeq
	}
	// Make sure to snapshot here.
	mb, lmb, lseq := fs.selectMsgBlock(seq), fs.lmb, fs.state.LastSeq
	fs.mu.RUnlock()

	if mb == nil {
		// No block covers seq: past the end means EOF, in range means the
		// message is missing/deleted.
		var err = ErrStoreEOF
		if seq <= lseq {
			err = ErrStoreMsgNotFound
		}
		return nil, err
	}

	fsm, expireOk, err := mb.fetchMsg(seq, sm)
	if err != nil {
		return nil, err
	}

	// We detected a linear scan and access to the last message.
	// If we are not the last message block we can try to expire the cache.
	if mb != lmb && expireOk {
		mb.tryForceExpireCache()
	}

	return fsm, nil
}
|
|
|
|
// Internal function to return msg parts from a raw buffer.
// Record layout: rl(4) seq(8) ts(8) slen(2) subj [hlen(4) hdr] msg hash(8).
// A non-nil hh forces verification of the record checksum.
// Lock should be held.
func (mb *msgBlock) msgFromBuf(buf []byte, sm *StoreMsg, hh hash.Hash64) (*StoreMsg, error) {
	if len(buf) < emptyRecordLen {
		return nil, errBadMsg
	}
	var le = binary.LittleEndian

	hdr := buf[:msgHdrSize]
	rl := le.Uint32(hdr[0:])
	hasHeaders := rl&hbit != 0
	rl &^= hbit // clear header bit
	dlen := int(rl) - msgHdrSize
	slen := int(le.Uint16(hdr[20:]))
	// Simple sanity check.
	if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || int(rl) > len(buf) {
		return nil, errBadMsg
	}
	data := buf[msgHdrSize : msgHdrSize+dlen]
	// Do checksum tests here if requested.
	if hh != nil {
		hh.Reset()
		hh.Write(hdr[4:20])
		hh.Write(data[:slen])
		if hasHeaders {
			// Skip over the 4 byte header length field when hashing.
			hh.Write(data[slen+4 : dlen-recordHashSize])
		} else {
			hh.Write(data[slen : dlen-recordHashSize])
		}
		if !bytes.Equal(hh.Sum(nil), data[len(data)-8:]) {
			return nil, errBadMsg
		}
	}
	seq := le.Uint64(hdr[4:])
	if seq&ebit != 0 {
		// Erased record; report sequence 0 so callers treat it as deleted.
		seq = 0
	}
	ts := int64(le.Uint64(hdr[12:]))

	// Create a StoreMsg if needed.
	if sm == nil {
		sm = new(StoreMsg)
	} else {
		sm.clear()
	}
	// To recycle the large blocks we can never pass back a reference, so need to copy for the upper
	// layers and for us to be safe to expire, and recycle, the large msgBlocks.
	end := dlen - 8

	if hasHeaders {
		hl := le.Uint32(data[slen:])
		bi := slen + 4
		li := bi + int(hl)
		sm.buf = append(sm.buf, data[bi:end]...)
		li, end = li-bi, end-bi
		// hdr and msg are views into the single copied buffer.
		sm.hdr = sm.buf[0:li:li]
		sm.msg = sm.buf[li:end]
	} else {
		sm.buf = append(sm.buf, data[slen:end]...)
		sm.msg = sm.buf[0 : end-slen]
	}
	sm.seq, sm.ts = seq, ts
	// Treat subject a bit different to not reference underlying buf.
	if slen > 0 {
		sm.subj = mb.subjString(data[:slen])
	}

	return sm, nil
}
|
|
|
|
// Used to intern strings for subjects.
// Based on idea from https://github.com/josharian/intern/blob/master/intern.go
var subjPool = sync.Pool{
	New: func() any {
		return make(map[string]string)
	},
}

// subjFromBytes returns an interned string whose contents equal b, reusing a
// previously allocated copy when one exists in the pooled cache.
func subjFromBytes(b []byte) string {
	cache := subjPool.Get().(map[string]string)
	defer subjPool.Put(cache)
	// The string(b) cast in a map lookup does not allocate.
	if s, found := cache[string(b)]; found {
		return s
	}
	interned := string(b)
	cache[interned] = interned
	return interned
}
|
|
|
|
// Given the `key` byte slice, this function will return the subject
|
|
// as an interned string of `key` or a configured subject as to minimize memory allocations.
|
|
// Lock should be held.
|
|
func (fs *fileStore) subjString(skey []byte) string {
|
|
if fs == nil || len(skey) == 0 {
|
|
return _EMPTY_
|
|
}
|
|
|
|
if lsubjs := len(fs.cfg.Subjects); lsubjs > 0 {
|
|
if lsubjs == 1 {
|
|
// The cast for the comparison does not make a copy
|
|
if string(skey) == fs.cfg.Subjects[0] {
|
|
return fs.cfg.Subjects[0]
|
|
}
|
|
} else {
|
|
for _, subj := range fs.cfg.Subjects {
|
|
if string(skey) == subj {
|
|
return subj
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return subjFromBytes(skey)
|
|
}
|
|
|
|
// Given the `key` byte slice, this function will return the subject
// as an interned string of `key` or a configured subject as to minimize memory allocations.
// Lock should be held.
func (mb *msgBlock) subjString(skey []byte) string {
	// Delegate to the filestore-level interning helper.
	return mb.fs.subjString(skey)
}
|
|
|
|
// LoadMsg will lookup the message by sequence number and return it if found.
// A seq of 0 returns the first message in the stream.
func (fs *fileStore) LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	return fs.msgForSeq(seq, sm)
}
|
|
|
|
// loadLast will load the last message for a subject. Subject should be non empty and not ">".
// Walks blocks from newest to oldest, narrowing the range via the
// per-subject index for literal subjects.
func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.closed || fs.lmb == nil {
		return nil, ErrStoreClosed
	}

	if len(fs.blks) == 0 {
		return nil, ErrStoreMsgNotFound
	}

	// Default scan range covers the newest block down to the oldest.
	start, stop := fs.lmb.index, fs.blks[0].index
	wc := subjectHasWildcard(subj)
	// If literal subject check for presence.
	if !wc {
		if info := fs.psim[subj]; info == nil {
			return nil, ErrStoreMsgNotFound
		} else {
			// Narrow the scan to blocks known to contain this subject.
			start, stop = info.lblk, info.fblk
		}
	}

	// Walk blocks backwards.
	for i := start; i >= stop; i-- {
		mb := fs.bim[i]
		if mb == nil {
			continue
		}
		mb.mu.Lock()
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			mb.mu.Unlock()
			return nil, err
		}
		var l uint64
		// Optimize if subject is not a wildcard.
		if !wc {
			if ss := mb.fss[subj]; ss != nil {
				l = ss.Last
			}
		}
		if l == 0 {
			// Fall back to a filtered scan within the block.
			_, _, l = mb.filteredPendingLocked(subj, wc, mb.first.seq)
		}
		if l > 0 {
			// Found a candidate sequence; load the cache if needed.
			if mb.cacheNotLoaded() {
				if err := mb.loadMsgsWithLock(); err != nil {
					mb.mu.Unlock()
					return nil, err
				}
			}
			lsm, err = mb.cacheLookup(l, sm)
		}
		mb.mu.Unlock()
		if l > 0 {
			break
		}
	}
	return lsm, err
}
|
|
|
|
// LoadLastMsg will return the last message we have that matches a given subject.
|
|
// The subject can be a wildcard.
|
|
func (fs *fileStore) LoadLastMsg(subject string, smv *StoreMsg) (sm *StoreMsg, err error) {
|
|
if subject == _EMPTY_ || subject == fwcs {
|
|
sm, err = fs.msgForSeq(fs.lastSeq(), smv)
|
|
} else {
|
|
sm, err = fs.loadLast(subject, smv)
|
|
}
|
|
if sm == nil || (err != nil && err != ErrStoreClosed) {
|
|
err = ErrStoreMsgNotFound
|
|
}
|
|
return sm, err
|
|
}
|
|
|
|
// LoadNextMsg returns the next message at or after start that matches the
// filter (which may be a wildcard when wc is true), along with its sequence.
// Returns ErrStoreEOF with the current last sequence when nothing matches.
func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, uint64, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.closed {
		return nil, 0, ErrStoreClosed
	}
	if start < fs.state.FirstSeq {
		start = fs.state.FirstSeq
	}

	// Walk blocks forward starting at the one containing start.
	if bi, _ := fs.selectMsgBlockWithIndex(start); bi >= 0 {
		for i := bi; i < len(fs.blks); i++ {
			mb := fs.blks[i]
			if sm, expireOk, err := mb.firstMatching(filter, wc, start, sm); err == nil {
				// Expire the cache if the block signals it is safe and it
				// is not the active last block.
				if expireOk && mb != fs.lmb {
					mb.tryForceExpireCache()
				}
				return sm, sm.seq, nil
			} else if err != ErrStoreMsgNotFound {
				return nil, 0, err
			}
		}
	}

	return nil, fs.state.LastSeq, ErrStoreEOF
}
|
|
|
|
// Type returns the type of the underlying store, always FileStorage.
func (fs *fileStore) Type() StorageType {
	return FileStorage
}
|
|
|
|
// Returns number of subjects in this store.
// Lock should be held.
func (fs *fileStore) numSubjects() int {
	// psim holds one entry per distinct subject.
	return len(fs.psim)
}
|
|
|
|
// FastState will fill in state with only the following.
|
|
// Msgs, Bytes, First and Last Sequence and Time and NumDeleted.
|
|
func (fs *fileStore) FastState(state *StreamState) {
|
|
fs.mu.RLock()
|
|
state.Msgs = fs.state.Msgs
|
|
state.Bytes = fs.state.Bytes
|
|
state.FirstSeq = fs.state.FirstSeq
|
|
state.FirstTime = fs.state.FirstTime
|
|
state.LastSeq = fs.state.LastSeq
|
|
state.LastTime = fs.state.LastTime
|
|
if state.LastSeq > state.FirstSeq {
|
|
state.NumDeleted = int((state.LastSeq - state.FirstSeq + 1) - state.Msgs)
|
|
if state.NumDeleted < 0 {
|
|
state.NumDeleted = 0
|
|
}
|
|
}
|
|
state.Consumers = len(fs.cfs)
|
|
state.NumSubjects = fs.numSubjects()
|
|
fs.mu.RUnlock()
|
|
}
|
|
|
|
// State returns the current state of the stream, including a sorted list
// of deleted sequences (interior deletes and inter-block gaps).
func (fs *fileStore) State() StreamState {
	fs.mu.RLock()
	state := fs.state
	state.Consumers = len(fs.cfs)
	state.NumSubjects = fs.numSubjects()
	state.Deleted = nil // make sure.

	if numDeleted := int((state.LastSeq - state.FirstSeq + 1) - state.Msgs); numDeleted > 0 {
		state.Deleted = make([]uint64, 0, numDeleted)
		cur := fs.state.FirstSeq

		// Walk all blocks collecting interior deletes plus any gaps
		// between consecutive blocks.
		for _, mb := range fs.blks {
			mb.mu.Lock()
			fseq := mb.first.seq
			// Account for messages missing from the head.
			if fseq > cur {
				for seq := cur; seq < fseq; seq++ {
					state.Deleted = append(state.Deleted, seq)
				}
			}
			cur = mb.last.seq + 1 // Expected next first.

			mb.dmap.Range(func(seq uint64) bool {
				if seq < fseq {
					// Stale dmap entry below the block's first seq; prune it.
					mb.dmap.Delete(seq)
				} else {
					state.Deleted = append(state.Deleted, seq)
				}
				return true
			})
			mb.mu.Unlock()
		}
	}
	fs.mu.RUnlock()

	// NOTE(review): lostData is invoked after releasing fs.mu — confirm it
	// performs its own synchronization.
	state.Lost = fs.lostData()

	// Can not be guaranteed to be sorted.
	if len(state.Deleted) > 0 {
		sort.Slice(state.Deleted, func(i, j int) bool {
			return state.Deleted[i] < state.Deleted[j]
		})
		state.NumDeleted = len(state.Deleted)
	}
	return state
}
|
|
|
|
func (fs *fileStore) Utilization() (total, reported uint64, err error) {
|
|
fs.mu.RLock()
|
|
defer fs.mu.RUnlock()
|
|
for _, mb := range fs.blks {
|
|
mb.mu.RLock()
|
|
reported += mb.bytes
|
|
total += mb.rbytes
|
|
mb.mu.RUnlock()
|
|
}
|
|
return total, reported, nil
|
|
}
|
|
|
|
// fileStoreMsgSize returns the on-disk record size for a message with the
// given subject, optional headers and payload.
// Record layout: length(4) + seq(8) + ts(8) + subj_len(2) + subj
// [+ hdr_len(4) + hdr] + msg + hash(8).
func fileStoreMsgSize(subj string, hdr, msg []byte) uint64 {
	// Fixed overhead: rl(4) + seq(8) + ts(8) + slen(2) = 22 bytes, plus the
	// trailing 8 byte hash.
	size := uint64(22 + len(subj) + len(msg) + 8)
	if len(hdr) > 0 {
		// Headers carry an extra 4 byte length prefix.
		size += uint64(4 + len(hdr))
	}
	return size
}
|
|
|
|
// fileStoreMsgSizeEstimate returns an upper bound on the record size for a
// message with a subject of length slen and a payload of up to maxPayload
// bytes, always including the 4 byte header length prefix.
func fileStoreMsgSizeEstimate(slen, maxPayload int) uint64 {
	return uint64(emptyRecordLen + slen + 4 + maxPayload)
}
|
|
|
|
// Determine time since last write or remove of a message.
|
|
// Read lock should be held.
|
|
func (mb *msgBlock) sinceLastWriteActivity() time.Duration {
|
|
if mb.closed {
|
|
return 0
|
|
}
|
|
last := mb.lwts
|
|
if mb.lrts > last {
|
|
last = mb.lrts
|
|
}
|
|
return time.Since(time.Unix(0, last).UTC())
|
|
}
|
|
|
|
// checkNewHeader validates a file header: it must be at least 2 bytes,
// start with the magic byte, and carry either the old or new version byte.
// Returns errCorruptState otherwise.
func checkNewHeader(hdr []byte) error {
	if hdr == nil || len(hdr) < 2 || hdr[0] != magic ||
		(hdr[1] != version && hdr[1] != newVersion) {
		return errCorruptState
	}
	return nil
}
|
|
|
|
// readIndexInfo will read in the index information for the message block.
// This parses the legacy per-block index file (indexScan), decrypting it if
// needed, and populates msgs/bytes, first/last seq and ts, the checksum and
// the delete map. On any inconsistency the index file is removed so the
// block will be rebuilt from the raw data.
func (mb *msgBlock) readIndexInfo() error {
	ifn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, mb.index))
	buf, err := os.ReadFile(ifn)
	if err != nil {
		return err
	}

	// Set if first time.
	if mb.liwsz == 0 {
		mb.liwsz = int64(len(buf))
	}

	// Decrypt if needed.
	if mb.aek != nil {
		buf, err = mb.aek.Open(buf[:0], mb.nonce, buf, nil)
		if err != nil {
			return err
		}
	}

	if err := checkNewHeader(buf); err != nil {
		defer os.Remove(ifn)
		return fmt.Errorf("bad index file")
	}

	bi := hdrLen

	// Helpers, will set bi to -1 on error so later reads become no-ops.
	readSeq := func() uint64 {
		if bi < 0 {
			return 0
		}
		seq, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		// Strip the erase bit.
		return seq &^ ebit
	}
	readCount := readSeq
	readTimeStamp := func() int64 {
		if bi < 0 {
			return 0
		}
		ts, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return ts
	}
	// Field order here is the on-disk format; do not reorder.
	mb.msgs = readCount()
	mb.bytes = readCount()
	mb.first.seq = readSeq()
	mb.first.ts = readTimeStamp()
	mb.last.seq = readSeq()
	mb.last.ts = readTimeStamp()
	dmapLen := readCount()

	// Check if this is a short write index file.
	if bi < 0 || bi+checksumSize > len(buf) {
		os.Remove(ifn)
		return fmt.Errorf("short index file")
	}

	// Check for consistency if accounting. If something is off bail and we will rebuild.
	if mb.msgs != (mb.last.seq-mb.first.seq+1)-dmapLen {
		os.Remove(ifn)
		return fmt.Errorf("accounting inconsistent")
	}

	// Checksum
	copy(mb.lchk[0:], buf[bi:bi+checksumSize])
	bi += checksumSize

	// Now check for presence of a delete map
	if dmapLen > 0 {
		// New version is encoded avl seqset.
		if buf[1] == newVersion {
			dmap, _, err := avl.Decode(buf[bi:])
			if err != nil {
				return fmt.Errorf("could not decode avl dmap: %v", err)
			}
			mb.dmap = *dmap
		} else {
			// This is the old version: delta-encoded sequences relative to first.seq.
			for i := 0; i < int(dmapLen); i++ {
				seq := readSeq()
				if seq == 0 {
					break
				}
				mb.dmap.Insert(seq + mb.first.seq)
			}
		}
	}

	return nil
}
|
|
|
|
// Will return total number of cache loads.
|
|
func (fs *fileStore) cacheLoads() uint64 {
|
|
var tl uint64
|
|
fs.mu.RLock()
|
|
for _, mb := range fs.blks {
|
|
tl += mb.cloads
|
|
}
|
|
fs.mu.RUnlock()
|
|
return tl
|
|
}
|
|
|
|
// Will return total number of cached bytes.
|
|
func (fs *fileStore) cacheSize() uint64 {
|
|
var sz uint64
|
|
fs.mu.RLock()
|
|
for _, mb := range fs.blks {
|
|
mb.mu.RLock()
|
|
if mb.cache != nil {
|
|
sz += uint64(len(mb.cache.buf))
|
|
}
|
|
mb.mu.RUnlock()
|
|
}
|
|
fs.mu.RUnlock()
|
|
return sz
|
|
}
|
|
|
|
// Will return total number of dmapEntries for all msg blocks.
|
|
func (fs *fileStore) dmapEntries() int {
|
|
var total int
|
|
fs.mu.RLock()
|
|
for _, mb := range fs.blks {
|
|
total += mb.dmap.Size()
|
|
}
|
|
fs.mu.RUnlock()
|
|
return total
|
|
}
|
|
|
|
// Fixed helper for iterating.
// subjectsEqual reports whether the two subjects are byte-identical.
func subjectsEqual(a, b string) bool {
	return a == b
}
|
|
|
|
// subjectsAll matches any pair of subjects; used for full-wildcard filters.
func subjectsAll(a, b string) bool {
	return true
}
|
|
|
|
func compareFn(subject string) func(string, string) bool {
|
|
if subject == _EMPTY_ || subject == fwcs {
|
|
return subjectsAll
|
|
}
|
|
if subjectHasWildcard(subject) {
|
|
return subjectIsSubsetMatch
|
|
}
|
|
return subjectsEqual
|
|
}
|
|
|
|
// PurgeEx will remove messages based on subject filters, sequence and number of messages to keep.
// Will return the number of purged messages.
// subject filters which messages are candidates; sequence (if > 1) bounds
// removal to messages below it; keep retains that many newest matches.
func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint64, err error) {
	// Fast paths: an all-subjects purge can delegate to Purge or Compact.
	if subject == _EMPTY_ || subject == fwcs {
		if keep == 0 && (sequence == 0 || sequence == 1) {
			return fs.Purge()
		}
		if sequence > 1 {
			return fs.Compact(sequence)
		}
	}

	eq, wc := compareFn(subject), subjectHasWildcard(subject)
	var firstSeqNeedsUpdate bool
	var bytes uint64

	// If we have a "keep" designation need to get full filtered state so we know how many to purge.
	var maxp uint64
	if keep > 0 {
		ss := fs.FilteredState(1, subject)
		if keep >= ss.Msgs {
			return 0, nil
		}
		maxp = ss.Msgs - keep
	}

	var smv StoreMsg

	fs.mu.Lock()
	// We may remove blocks as we purge, so don't range directly on fs.blks
	// otherwise we may jump over some (see https://github.com/nats-io/nats-server/issues/3528)
	for i := 0; i < len(fs.blks); i++ {
		mb := fs.blks[i]
		mb.mu.Lock()
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			mb.mu.Unlock()
			continue
		}
		// t = total matches in this block, f/l = first/last matching seq.
		t, f, l := mb.filteredPendingLocked(subject, wc, mb.first.seq)
		if t == 0 {
			mb.mu.Unlock()
			continue
		}

		var shouldExpire bool
		if mb.cacheNotLoaded() {
			mb.loadMsgsWithLock()
			shouldExpire = true
		}
		// Clamp the scan below the requested sequence bound.
		if sequence > 1 && sequence <= l {
			l = sequence - 1
		}

		for seq := f; seq <= l; seq++ {
			if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && eq(sm.subj, subject) {
				rl := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
				// Do fast in place remove.
				// Stats
				if mb.msgs > 0 {
					// Msgs
					fs.state.Msgs--
					mb.msgs--
					// Bytes, make sure to not go negative.
					if rl > fs.state.Bytes {
						rl = fs.state.Bytes
					}
					if rl > mb.bytes {
						rl = mb.bytes
					}
					fs.state.Bytes -= rl
					mb.bytes -= rl
					// Totals
					purged++
					bytes += rl
				}
				// FSS updates.
				mb.removeSeqPerSubject(sm.subj, seq)
				fs.removePerSubject(sm.subj)

				// Check for first message.
				if seq == mb.first.seq {
					mb.selectNextFirst()
					if mb.isEmpty() {
						fs.removeMsgBlock(mb)
						// Block removed from fs.blks; re-examine this index.
						i--
						// keep flag set, if set previously
						firstSeqNeedsUpdate = firstSeqNeedsUpdate || seq == fs.state.FirstSeq
					} else if seq == fs.state.FirstSeq {
						fs.state.FirstSeq = mb.first.seq // new one.
						fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
					}
				} else {
					// Out of order delete.
					mb.dmap.Insert(seq)
				}

				if maxp > 0 && purged >= maxp {
					break
				}
			}
		}
		// Expire if we were responsible for loading.
		if shouldExpire {
			// Expire this cache before moving on.
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()

		// Check if we should break out of top level too.
		if maxp > 0 && purged >= maxp {
			break
		}
	}
	if firstSeqNeedsUpdate {
		fs.selectNextFirst()
	}

	fs.dirty++
	cb := fs.scb
	fs.mu.Unlock()

	fs.kickFlushStateLoop()

	// Report accounting changes outside the lock.
	if cb != nil {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return purged, nil
}
|
|
|
|
// Purge will remove all messages from this store.
// Will return the number of purged messages.
// Equivalent to purge(0), which keeps the current last sequence.
func (fs *fileStore) Purge() (uint64, error) {
	return fs.purge(0)
}
|
|
|
|
// purge removes all messages by renaming the msgs directory aside and
// recreating it, optionally advancing the first sequence to fseq.
// Returns the number of purged messages.
func (fs *fileStore) purge(fseq uint64) (uint64, error) {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return 0, ErrStoreClosed
	}

	purged := fs.state.Msgs
	rbytes := int64(fs.state.Bytes)

	fs.state.FirstSeq = fs.state.LastSeq + 1
	fs.state.FirstTime = time.Time{}

	fs.state.Bytes = 0
	fs.state.Msgs = 0

	// Close all blocks without flushing; the files are removed wholesale below.
	for _, mb := range fs.blks {
		mb.dirtyClose()
	}

	fs.blks = nil
	fs.lmb = nil
	fs.bim = make(map[uint32]*msgBlock)
	// Clear any per subject tracking.
	fs.psim = make(map[string]*psi)
	// Mark dirty
	fs.dirty++

	// Move the msgs directory out of the way, will delete out of band.
	// FIXME(dlc) - These can error and we need to change api above to propagate?
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	// If purge directory still exists then we need to wait
	// in place and remove since rename would fail.
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}
	os.Rename(mdir, pdir)

	// Actual removal of the old data happens asynchronously.
	go os.RemoveAll(pdir)

	// Create new one.
	os.MkdirAll(mdir, defaultDirPerms)

	// Make sure we have a lmb to write to.
	if _, err := fs.newMsgBlockForWrite(); err != nil {
		fs.mu.Unlock()
		return purged, err
	}

	// Check if we need to set the first seq to a new number.
	if fseq > fs.state.FirstSeq {
		fs.state.FirstSeq = fseq
		fs.state.LastSeq = fseq - 1
	}

	lmb := fs.lmb
	lmb.first.seq = fs.state.FirstSeq
	lmb.last.seq = fs.state.LastSeq
	lmb.last.ts = fs.state.LastTime.UnixNano()

	if fs.lmb.last.seq > 1 {
		// Leave a tombstone so we can remember our starting sequence in case
		// full state becomes corrupted.
		lmb.writeTombstone(fs.lmb.last.seq, fs.lmb.last.ts)
	}

	cb := fs.scb
	fs.mu.Unlock()

	// Report the accounting change outside the lock.
	if cb != nil {
		cb(-int64(purged), -rbytes, 0, _EMPTY_)
	}

	return purged, nil
}
|
|
|
|
// Compact will remove all messages from this store up to
|
|
// but not including the seq parameter.
|
|
// Will return the number of purged messages.
|
|
func (fs *fileStore) Compact(seq uint64) (uint64, error) {
|
|
if seq == 0 {
|
|
return fs.purge(seq)
|
|
}
|
|
|
|
var purged, bytes uint64
|
|
|
|
fs.mu.Lock()
|
|
// Same as purge all.
|
|
if lseq := fs.state.LastSeq; seq > lseq {
|
|
fs.mu.Unlock()
|
|
return fs.purge(seq)
|
|
}
|
|
// We have to delete interior messages.
|
|
smb := fs.selectMsgBlock(seq)
|
|
if smb == nil {
|
|
fs.mu.Unlock()
|
|
return 0, nil
|
|
}
|
|
|
|
// All msgblocks up to this one can be thrown away.
|
|
var deleted int
|
|
for _, mb := range fs.blks {
|
|
if mb == smb {
|
|
break
|
|
}
|
|
mb.mu.Lock()
|
|
purged += mb.msgs
|
|
bytes += mb.bytes
|
|
// Make sure we do subject cleanup as well.
|
|
mb.ensurePerSubjectInfoLoaded()
|
|
for subj := range mb.fss {
|
|
fs.removePerSubject(subj)
|
|
}
|
|
// Now close.
|
|
mb.dirtyCloseWithRemove(true)
|
|
mb.mu.Unlock()
|
|
deleted++
|
|
}
|
|
|
|
var smv StoreMsg
|
|
var err error
|
|
var isEmpty bool
|
|
|
|
smb.mu.Lock()
|
|
if smb.first.seq == seq {
|
|
goto SKIP
|
|
}
|
|
|
|
// Make sure we have the messages loaded.
|
|
if smb.cacheNotLoaded() {
|
|
if err = smb.loadMsgsWithLock(); err != nil {
|
|
goto SKIP
|
|
}
|
|
}
|
|
for mseq := smb.first.seq; mseq < seq; mseq++ {
|
|
sm, err := smb.cacheLookup(mseq, &smv)
|
|
if err == errDeletedMsg {
|
|
// Update dmap.
|
|
if !smb.dmap.IsEmpty() {
|
|
smb.dmap.Delete(seq)
|
|
}
|
|
} else if sm != nil {
|
|
sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
|
|
if smb.msgs > 0 {
|
|
smb.msgs--
|
|
if sz > smb.bytes {
|
|
sz = smb.bytes
|
|
}
|
|
smb.bytes -= sz
|
|
bytes += sz
|
|
purged++
|
|
}
|
|
// Update fss
|
|
smb.removeSeqPerSubject(sm.subj, mseq)
|
|
fs.removePerSubject(sm.subj)
|
|
}
|
|
}
|
|
|
|
// Check if empty after processing, could happen if tail of messages are all deleted.
|
|
isEmpty = smb.msgs == 0
|
|
if isEmpty {
|
|
smb.dirtyCloseWithRemove(true)
|
|
// Update fs first here as well.
|
|
fs.state.FirstSeq = smb.last.seq + 1
|
|
fs.state.FirstTime = time.Time{}
|
|
deleted++
|
|
} else {
|
|
// Make sure to sync changes.
|
|
smb.needSync = true
|
|
// Update fs first seq and time.
|
|
smb.first.seq = seq - 1 // Just for start condition for selectNextFirst.
|
|
smb.selectNextFirst()
|
|
|
|
fs.state.FirstSeq = smb.first.seq
|
|
fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC()
|
|
|
|
// Check if we should reclaim the head space from this block.
|
|
// This will be optimistic only, so don't continue if we encounter any errors here.
|
|
if smb.rbytes > compactMinimum && smb.bytes*2 < smb.rbytes {
|
|
var moff uint32
|
|
moff, _, _, err = smb.slotInfo(int(smb.first.seq - smb.cache.fseq))
|
|
if err != nil || moff >= uint32(len(smb.cache.buf)) {
|
|
goto SKIP
|
|
}
|
|
buf := smb.cache.buf[moff:]
|
|
// Don't reuse, copy to new recycled buf.
|
|
nbuf := getMsgBlockBuf(len(buf))
|
|
nbuf = append(nbuf, buf...)
|
|
smb.closeFDsLockedNoCheck()
|
|
// Check for encryption.
|
|
if smb.bek != nil && len(nbuf) > 0 {
|
|
// Recreate to reset counter.
|
|
bek, err := genBlockEncryptionKey(smb.fs.fcfg.Cipher, smb.seed, smb.nonce)
|
|
if err != nil {
|
|
goto SKIP
|
|
}
|
|
// For future writes make sure to set smb.bek to keep counter correct.
|
|
smb.bek = bek
|
|
smb.bek.XORKeyStream(nbuf, nbuf)
|
|
}
|
|
// Recompress if necessary (smb.cmp contains the algorithm used when
|
|
// the block was loaded from disk, or defaults to NoCompression if not)
|
|
if nbuf, err = smb.cmp.Compress(nbuf); err != nil {
|
|
goto SKIP
|
|
}
|
|
if err = os.WriteFile(smb.mfn, nbuf, defaultFilePerms); err != nil {
|
|
goto SKIP
|
|
}
|
|
// Make sure to remove fss state.
|
|
smb.fss = nil
|
|
smb.clearCacheAndOffset()
|
|
smb.rbytes = uint64(len(nbuf))
|
|
}
|
|
}
|
|
|
|
SKIP:
|
|
smb.mu.Unlock()
|
|
|
|
if deleted > 0 {
|
|
// Update block map.
|
|
if fs.bim != nil {
|
|
for _, mb := range fs.blks[:deleted] {
|
|
delete(fs.bim, mb.index)
|
|
}
|
|
}
|
|
// Update blks slice.
|
|
fs.blks = copyMsgBlocks(fs.blks[deleted:])
|
|
if lb := len(fs.blks); lb == 0 {
|
|
fs.lmb = nil
|
|
} else {
|
|
fs.lmb = fs.blks[lb-1]
|
|
}
|
|
}
|
|
|
|
// Update top level accounting.
|
|
if purged > fs.state.Msgs {
|
|
purged = fs.state.Msgs
|
|
}
|
|
fs.state.Msgs -= purged
|
|
|
|
if bytes > fs.state.Bytes {
|
|
bytes = fs.state.Bytes
|
|
}
|
|
fs.state.Bytes -= bytes
|
|
|
|
fs.dirty++
|
|
fs.kickFlushStateLoop()
|
|
|
|
cb := fs.scb
|
|
fs.mu.Unlock()
|
|
|
|
if cb != nil && purged > 0 {
|
|
cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
|
|
}
|
|
|
|
return purged, err
|
|
}
|
|
|
|
// Will completely reset our store.
// Drops all message blocks and accounting but, unlike purge, does not
// advance the first sequence; state restarts from zero.
func (fs *fileStore) reset() error {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return ErrStoreClosed
	}
	// Refuse while a snapshot is in progress.
	if fs.sips > 0 {
		fs.mu.Unlock()
		return ErrStoreSnapshotInProgress
	}

	var purged, bytes uint64
	cb := fs.scb

	for _, mb := range fs.blks {
		mb.mu.Lock()
		purged += mb.msgs
		bytes += mb.bytes
		// Remove the block and its files from disk.
		mb.dirtyCloseWithRemove(true)
		mb.mu.Unlock()
	}

	// Reset
	fs.state.FirstSeq = 0
	fs.state.FirstTime = time.Time{}
	fs.state.LastSeq = 0
	fs.state.LastTime = time.Now().UTC()
	// Update msgs and bytes.
	fs.state.Msgs = 0
	fs.state.Bytes = 0

	// Reset blocks.
	fs.blks, fs.lmb = nil, nil

	// Reset subject mappings.
	fs.psim = make(map[string]*psi)
	fs.bim = make(map[uint32]*msgBlock)

	// If we purged anything, make sure we kick flush state loop.
	if purged > 0 {
		fs.dirty++
		fs.kickFlushStateLoop()
	}

	fs.mu.Unlock()

	// Report the accounting change outside the lock.
	if cb != nil {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return nil
}
|
|
|
|
// Truncate will truncate a stream store up to seq. Sequence needs to be valid.
|
|
func (fs *fileStore) Truncate(seq uint64) error {
|
|
// Check for request to reset.
|
|
if seq == 0 {
|
|
return fs.reset()
|
|
}
|
|
|
|
fs.mu.Lock()
|
|
|
|
if fs.closed {
|
|
fs.mu.Unlock()
|
|
return ErrStoreClosed
|
|
}
|
|
if fs.sips > 0 {
|
|
fs.mu.Unlock()
|
|
return ErrStoreSnapshotInProgress
|
|
}
|
|
|
|
nlmb := fs.selectMsgBlock(seq)
|
|
if nlmb == nil {
|
|
fs.mu.Unlock()
|
|
return ErrInvalidSequence
|
|
}
|
|
lsm, _, _ := nlmb.fetchMsg(seq, nil)
|
|
if lsm == nil {
|
|
fs.mu.Unlock()
|
|
return ErrInvalidSequence
|
|
}
|
|
|
|
// Set lmb to nlmb and make sure writeable.
|
|
fs.lmb = nlmb
|
|
if err := nlmb.enableForWriting(fs.fip); err != nil {
|
|
return err
|
|
}
|
|
|
|
var purged, bytes uint64
|
|
|
|
// Truncate our new last message block.
|
|
nmsgs, nbytes, err := nlmb.truncate(lsm)
|
|
if err != nil {
|
|
fs.mu.Unlock()
|
|
return fmt.Errorf("nlmb.truncate: %w", err)
|
|
}
|
|
// Account for the truncated msgs and bytes.
|
|
purged += nmsgs
|
|
bytes += nbytes
|
|
|
|
// Remove any left over msg blocks.
|
|
getLastMsgBlock := func() *msgBlock { return fs.blks[len(fs.blks)-1] }
|
|
for mb := getLastMsgBlock(); mb != nlmb; mb = getLastMsgBlock() {
|
|
mb.mu.Lock()
|
|
purged += mb.msgs
|
|
bytes += mb.bytes
|
|
fs.removeMsgBlock(mb)
|
|
mb.mu.Unlock()
|
|
}
|
|
|
|
// Reset last.
|
|
fs.state.LastSeq = lsm.seq
|
|
fs.state.LastTime = time.Unix(0, lsm.ts).UTC()
|
|
// Update msgs and bytes.
|
|
if purged > fs.state.Msgs {
|
|
purged = fs.state.Msgs
|
|
}
|
|
fs.state.Msgs -= purged
|
|
if bytes > fs.state.Bytes {
|
|
bytes = fs.state.Bytes
|
|
}
|
|
fs.state.Bytes -= bytes
|
|
|
|
// Reset our subject lookup info.
|
|
fs.resetGlobalPerSubjectInfo()
|
|
|
|
fs.dirty++
|
|
fs.kickFlushStateLoop()
|
|
|
|
cb := fs.scb
|
|
fs.mu.Unlock()
|
|
|
|
if cb != nil {
|
|
cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (fs *fileStore) lastSeq() uint64 {
|
|
fs.mu.RLock()
|
|
seq := fs.state.LastSeq
|
|
fs.mu.RUnlock()
|
|
return seq
|
|
}
|
|
|
|
// Returns number of msg blks.
|
|
func (fs *fileStore) numMsgBlocks() int {
|
|
fs.mu.RLock()
|
|
defer fs.mu.RUnlock()
|
|
return len(fs.blks)
|
|
}
|
|
|
|
// Will add a new msgBlock.
// Appends mb to the block list, makes it the current last (write) block,
// and registers it in the index map.
// Lock should be held.
func (fs *fileStore) addMsgBlock(mb *msgBlock) {
	fs.blks = append(fs.blks, mb)
	fs.lmb = mb
	fs.bim[mb.index] = mb
}
|
|
|
|
// Remove from our list of blks.
// Removes mb from fs.blks (copying the slice to avoid aliasing) and from
// the block index map, marking the store dirty.
// Both locks should be held.
func (fs *fileStore) removeMsgBlockFromList(mb *msgBlock) {
	// Remove from list.
	for i, omb := range fs.blks {
		if mb == omb {
			fs.dirty++
			blks := append(fs.blks[:i], fs.blks[i+1:]...)
			// Copy so callers holding the old slice are not affected.
			fs.blks = copyMsgBlocks(blks)
			if fs.bim != nil {
				delete(fs.bim, mb.index)
			}
			break
		}
	}
}
|
|
|
|
// Removes the msgBlock
// Closes and deletes mb's files and removes it from the store lists. If mb
// was the last block, a fresh write block is created and a tombstone for
// mb's last sequence is written so the sequence survives restarts.
// Both locks should be held.
func (fs *fileStore) removeMsgBlock(mb *msgBlock) {
	mb.dirtyCloseWithRemove(true)
	fs.removeMsgBlockFromList(mb)
	// Check for us being last message block
	if mb == fs.lmb {
		last := mb.last
		// Creating a new message write block requires that the lmb lock is not held.
		// NOTE: mb.mu is released and reacquired here; callers must not rely on
		// mb state staying fixed across this call.
		mb.mu.Unlock()
		// Write the tombstone to remember since this was last block.
		if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil {
			lmb.writeTombstone(last.seq, last.ts)
		}
		mb.mu.Lock()
	}
}
|
|
|
|
// Called by purge to simply get rid of the cache and close our fds.
// Locked wrapper around dirtyCloseWithRemove(false): files stay on disk.
// Lock should not be held.
func (mb *msgBlock) dirtyClose() {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	mb.dirtyCloseWithRemove(false)
}
|
|
|
|
// Should be called with lock held.
// Tears down the block without flushing: stops the cache timer, drops
// per-subject state and the cache, stops loops and closes the fd. When
// remove is true the message file and key file are deleted from disk and
// the state flusher is kicked.
func (mb *msgBlock) dirtyCloseWithRemove(remove bool) {
	if mb == nil {
		return
	}
	// Stop cache expiration timer.
	if mb.ctmr != nil {
		mb.ctmr.Stop()
		mb.ctmr = nil
	}
	// Clear any tracking by subject.
	mb.fss = nil
	// Close cache
	mb.clearCacheAndOffset()
	// Quit our loops.
	if mb.qch != nil {
		close(mb.qch)
		mb.qch = nil
	}
	if mb.mfd != nil {
		mb.mfd.Close()
		mb.mfd = nil
	}
	if remove {
		if mb.mfn != _EMPTY_ {
			os.Remove(mb.mfn)
			mb.mfn = _EMPTY_
		}
		if mb.kfn != _EMPTY_ {
			os.Remove(mb.kfn)
		}
		// Since we are removing a block kick the state flusher.
		mb.fs.kickFlushStateLoop()
	}
}
|
|
|
|
// Remove a seq from the fss and select new first.
// Decrements the per-subject count for subj; when one message remains the
// First/Last bounds are collapsed, otherwise recalculating First is
// deferred (firstNeedsUpdate) until someone actually needs it.
// Lock should be held.
func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) {
	mb.ensurePerSubjectInfoLoaded()
	ss := mb.fss[subj]
	if ss == nil {
		return
	}

	// Last message for this subject: drop the entry entirely.
	if ss.Msgs == 1 {
		delete(mb.fss, subj)
		return
	}

	ss.Msgs--

	// Only one left.
	if ss.Msgs == 1 {
		if seq == ss.Last {
			ss.Last = ss.First
		} else {
			ss.First = ss.Last
		}
		ss.firstNeedsUpdate = false
		return
	}

	// We can lazily calculate the first sequence when needed.
	ss.firstNeedsUpdate = seq == ss.First || ss.firstNeedsUpdate
}
|
|
|
|
// Will recalculate the first sequence for this subject in this block.
// Will avoid slower path message lookups and scan the cache directly instead.
// Scans forward from startSeq+1 through the cache index, skipping erased
// and dmap-deleted entries, and sets ss.First to the first live match
// (falling back to ss.Last when none is found).
func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *SimpleState) {
	// Need to make sure messages are loaded.
	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return
		}
	}

	// Mark first as updated.
	ss.firstNeedsUpdate = false
	startSeq++

	startSlot := int(startSeq - mb.cache.fseq)
	if startSlot >= len(mb.cache.idx) {
		ss.First = ss.Last
		return
	} else if startSlot < 0 {
		startSlot = 0
	}

	var le = binary.LittleEndian
	for slot := startSlot; slot < len(mb.cache.idx); slot++ {
		// Slot offset with the header bit masked off, adjusted by cache offset.
		li := int(mb.cache.idx[slot]&^hbit) - mb.cache.off
		if li >= len(mb.cache.buf) {
			ss.First = ss.Last
			return
		}
		buf := mb.cache.buf[li:]
		hdr := buf[:msgHdrSize]
		slen := int(le.Uint16(hdr[20:]))
		if subj == string(buf[msgHdrSize:msgHdrSize+slen]) {
			seq := le.Uint64(hdr[4:])
			// Skip entries before first or carrying the erase bit.
			if seq < mb.first.seq || seq&ebit != 0 {
				continue
			}
			// Skip interior deletes.
			if mb.dmap.Exists(seq) {
				continue
			}
			ss.First = seq
			return
		}
	}
}
|
|
|
|
// Lock should be held.
// Rebuilds the global per-subject index map (psim) from scratch by
// repopulating it from every message block.
func (fs *fileStore) resetGlobalPerSubjectInfo() {
	// Clear any global subject state.
	fs.psim = make(map[string]*psi)
	for _, mb := range fs.blks {
		fs.populateGlobalPerSubjectInfo(mb)
	}
}
|
|
|
|
// Lock should be held.
// Drops this block's per-subject state (fss) and regenerates it from the
// raw message data.
func (mb *msgBlock) resetPerSubjectInfo() error {
	mb.fss = nil
	return mb.generatePerSubjectInfo()
}
|
|
|
|
// generatePerSubjectInfo will generate the per subject info via the raw msg block.
// Walks every sequence in the block, counting messages per subject and
// tracking first/last sequences into mb.fss.
// Lock should be held.
func (mb *msgBlock) generatePerSubjectInfo() error {
	// Check if this mb is empty. This can happen when its the last one and we are holding onto it for seq and timestamp info.
	if mb.msgs == 0 {
		return nil
	}

	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return err
		}
		// Loading the cache can produce fss as a side effect, so if non-nil we are good.
		if mb.fss != nil {
			return nil
		}
	}

	// Create new one regardless.
	mb.fss = make(map[string]*SimpleState)

	var smv StoreMsg
	fseq, lseq := mb.first.seq, mb.last.seq
	for seq := fseq; seq <= lseq; seq++ {
		sm, err := mb.cacheLookup(seq, &smv)
		if err != nil {
			// Since we are walking by sequence we can ignore some errors that are benign to rebuilding our state.
			if err == ErrStoreMsgNotFound || err == errDeletedMsg {
				continue
			}
			if err == errNoCache {
				return nil
			}
			return err
		}
		if sm != nil && len(sm.subj) > 0 {
			if ss := mb.fss[sm.subj]; ss != nil {
				ss.Msgs++
				ss.Last = seq
			} else {
				mb.fss[sm.subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
			}
		}
	}

	if len(mb.fss) > 0 {
		// Make sure we run the cache expire timer.
		mb.llts = time.Now().UnixNano()
		mb.startCacheExpireTimer()
	}
	return nil
}
|
|
|
|
// Helper to make sure fss loaded if we are tracking.
// No-op when fss already exists or subject tracking is disabled (noTrack).
// An empty block gets an empty map rather than a scan.
// Lock should be held
func (mb *msgBlock) ensurePerSubjectInfoLoaded() error {
	if mb.fss != nil || mb.noTrack {
		return nil
	}
	if mb.msgs == 0 {
		mb.fss = make(map[string]*SimpleState)
		return nil
	}
	return mb.generatePerSubjectInfo()
}
|
|
|
|
// Called on recovery to populate the global psim state.
// Folds mb's per-subject counts into fs.psim, tracking the first (fblk)
// and last (lblk) block index each subject appears in.
// Lock should be held.
func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
		return
	}

	// Now populate psim.
	for subj, ss := range mb.fss {
		if len(subj) > 0 {
			if info, ok := fs.psim[subj]; ok {
				info.total += ss.Msgs
				// Track the highest block index containing this subject.
				if mb.index > info.lblk {
					info.lblk = mb.index
				}
			} else {
				fs.psim[subj] = &psi{total: ss.Msgs, fblk: mb.index, lblk: mb.index}
			}
		}
	}
}
|
|
|
|
// Close the message block.
// Stops timers and loops, drops per-subject and cache state, optionally
// syncs before closing the fd, and marks the block closed. Idempotent.
func (mb *msgBlock) close(sync bool) {
	if mb == nil {
		return
	}
	mb.mu.Lock()
	defer mb.mu.Unlock()

	if mb.closed {
		return
	}

	// Stop cache expiration timer.
	if mb.ctmr != nil {
		mb.ctmr.Stop()
		mb.ctmr = nil
	}

	mb.fss = nil

	// Close cache
	mb.clearCacheAndOffset()
	// Quit our loops.
	if mb.qch != nil {
		close(mb.qch)
		mb.qch = nil
	}
	if mb.mfd != nil {
		if sync {
			mb.mfd.Sync()
		}
		mb.mfd.Close()
	}
	mb.mfd = nil
	// Mark as closed.
	mb.closed = true
}
|
|
|
|
// closeAllMsgBlocks closes every message block, optionally syncing each
// block's file before close.
func (fs *fileStore) closeAllMsgBlocks(sync bool) {
	for _, mb := range fs.blks {
		mb.close(sync)
	}
}
|
|
|
|
// Delete removes the filestore from disk: purges, stops the store and then
// removes the store directory, retrying removal briefly since files may
// still be settling.
func (fs *fileStore) Delete() error {
	if fs.isClosed() {
		// Always attempt to remove since we could have been closed beforehand.
		os.RemoveAll(fs.fcfg.StoreDir)
		// Since we did remove, if we did have anything remaining make sure to
		// call into any storage updates that had been registered.
		fs.mu.Lock()
		cb, msgs, bytes := fs.scb, int64(fs.state.Msgs), int64(fs.state.Bytes)
		// Guard against double accounting if called twice.
		fs.state.Msgs, fs.state.Bytes = 0, 0
		fs.mu.Unlock()
		if msgs > 0 && cb != nil {
			cb(-msgs, -bytes, 0, _EMPTY_)
		}
		return ErrStoreClosed
	}

	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	// If purge directory still exists then we need to wait
	// in place and remove since rename would fail.
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}

	// Do Purge() since if we have lots of blocks uses a mv/rename.
	fs.Purge()

	if err := fs.Stop(); err != nil {
		return err
	}

	err := os.RemoveAll(fs.fcfg.StoreDir)
	if err == nil {
		return nil
	}
	// Retry for up to a second; removal can transiently fail.
	ttl := time.Now().Add(time.Second)
	for time.Now().Before(ttl) {
		time.Sleep(10 * time.Millisecond)
		if err = os.RemoveAll(fs.fcfg.StoreDir); err == nil {
			return nil
		}
	}
	return err
}
|
|
|
|
// Lock should be held.
// Stops and clears the periodic sync timer if present.
func (fs *fileStore) cancelSyncTimer() {
	if fs.syncTmr != nil {
		fs.syncTmr.Stop()
		fs.syncTmr = nil
	}
}
|
|
|
|
// Magic and version bytes for the full stream state file written by
// writeFullState and checked on recovery.
const (
	fullStateMagic   = uint8(11)
	fullStateVersion = uint8(1)
)
|
|
|
|
// This go routine runs and receives kicks to write out our full stream state index.
// This will get kicked when we create a new block or when we delete a block in general.
// This is also called during Stop().
// fch receives flush kicks, qch signals shutdown, done is closed on exit so
// Stop() can wait for the loop.
func (fs *fileStore) flushStreamStateLoop(fch, qch, done chan struct{}) {
	for {
		select {
		case <-fch:
			fs.writeFullState()
		case <-qch:
			close(done)
			return
		}
	}
}
|
|
|
|
// Kick the flusher.
// Non-blocking signal to flushStreamStateLoop to write the full state.
func (fs *fileStore) kickFlushStateLoop() {
	kickFlusher(fs.fch)
}
|
|
|
|
// Helper since unixnano of zero time undefined.
|
|
func timestampNormalized(t time.Time) int64 {
|
|
if t.IsZero() {
|
|
return 0
|
|
}
|
|
return t.UnixNano()
|
|
}
|
|
|
|
// This will write the full binary state for the stream.
// This plus everything new since last hash will be the total recovered state.
// This state dump will have the following.
// 1. Stream summary - Msgs, Bytes, First and Last (Sequence and Timestamp)
// 2. PSIM - Per Subject Index Map - Tracks first and last blocks with subjects present.
// 3. MBs - Index, Bytes, First and Last Sequence and Timestamps, and the deleted map (avl.seqset).
// 4. Last block index and hash of record inclusive to this stream state.
// The encode order below IS the on-disk format; do not reorder fields.
func (fs *fileStore) writeFullState() error {
	fs.mu.Lock()

	// Nothing to do if closed or not dirty.
	if fs.closed || fs.dirty == 0 {
		fs.mu.Unlock()
		return nil
	}

	var _buf [32 * 1024]byte
	_buf[0], _buf[1] = fullStateMagic, fullStateVersion
	buf := _buf[:hdrLen]

	buf = binary.AppendUvarint(buf, fs.state.Msgs)
	buf = binary.AppendUvarint(buf, fs.state.Bytes)
	buf = binary.AppendUvarint(buf, fs.state.FirstSeq)
	buf = binary.AppendVarint(buf, timestampNormalized(fs.state.FirstTime))
	buf = binary.AppendUvarint(buf, fs.state.LastSeq)
	buf = binary.AppendVarint(buf, timestampNormalized(fs.state.LastTime))

	// Do per subject information map if applicable.
	numSubjects := len(fs.psim)
	buf = binary.AppendUvarint(buf, uint64(numSubjects))

	if numSubjects > 0 {
		for subj, psi := range fs.psim {
			buf = binary.AppendUvarint(buf, uint64(len(subj)))
			buf = append(buf, subj...)
			buf = binary.AppendUvarint(buf, psi.total)
			buf = binary.AppendUvarint(buf, uint64(psi.fblk))
			// lblk only encoded when more than one message; decoder mirrors this.
			if psi.total > 1 {
				buf = binary.AppendUvarint(buf, uint64(psi.lblk))
			}
		}
	}

	// Now walk all blocks and write out first and last and optional dmap encoding.
	var lbi uint32
	var lchk [8]byte

	nb := len(fs.blks)
	buf = binary.AppendUvarint(buf, uint64(nb))

	// Use basetime to save some space.
	baseTime := timestampNormalized(fs.state.FirstTime)

	for _, mb := range fs.blks {
		mb.mu.RLock()
		buf = binary.AppendUvarint(buf, uint64(mb.index))
		buf = binary.AppendUvarint(buf, mb.bytes)
		buf = binary.AppendUvarint(buf, mb.first.seq)
		buf = binary.AppendVarint(buf, mb.first.ts-baseTime)
		buf = binary.AppendUvarint(buf, mb.last.seq)
		buf = binary.AppendVarint(buf, mb.last.ts-baseTime)

		numDeleted := mb.dmap.Size()
		buf = binary.AppendUvarint(buf, uint64(numDeleted))
		if numDeleted > 0 {
			var scratch [8 * 1024]byte
			dmap, _ := mb.dmap.Encode(scratch[:0])
			buf = append(buf, dmap...)
		}
		// If this is the last one grab the last checksum and the block index, e.g. 22.blk, 22 is the block index.
		// We use this to quickly open this file on recovery.
		if mb == fs.lmb {
			lbi = mb.index
			mb.ensureLastChecksumLoaded()
			copy(lchk[0:], mb.lchk[:])
		}
		mb.mu.RUnlock()
	}

	// Place block index and hash onto the end.
	buf = binary.AppendUvarint(buf, uint64(lbi))
	buf = append(buf, lchk[:]...)

	// Encrypt if needed.
	if fs.prf != nil {
		if err := fs.setupAEK(); err != nil {
			fs.mu.Unlock()
			return err
		}
		nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(buf)+fs.aek.Overhead())
		rand.Read(nonce)
		buf = fs.aek.Seal(nonce, nonce, buf, nil)
	}

	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)

	// Append our hash so the file can be validated on recovery.
	fs.hh.Reset()
	fs.hh.Write(buf)
	buf = fs.hh.Sum(buf)

	// Snapshot prior dirty count.
	priorDirty := fs.dirty
	// Release lock while doing file I/O.
	fs.mu.Unlock()

	// Write to a tmp file and rename.
	const tmpPre = streamStreamStateFile + tsep
	f, err := os.CreateTemp(filepath.Join(fs.fcfg.StoreDir, msgDir), tmpPre)
	if err != nil {
		return err
	}
	tmpName := f.Name()
	defer os.Remove(tmpName)
	if _, err = f.Write(buf); err == nil && fs.fcfg.SyncAlways {
		f.Sync()
	}
	f.Close()
	if err != nil {
		return err
	}

	// Rename into position under our lock, clear prior dirty pending on success.
	fs.mu.Lock()
	if !fs.closed {
		if err := os.Rename(tmpName, fn); err != nil {
			fs.mu.Unlock()
			return err
		}
		// Only subtract what we snapshotted; new dirtiness since then remains.
		fs.dirty -= priorDirty
	}
	fs.mu.Unlock()

	return nil
}
|
|
|
|
// Stop the current filestore.
// Flushes and closes all message blocks, stops background timers and the
// state flusher loop, persists the full stream state one last time, stops
// all attached consumer stores, and reports released bytes upward.
// Returns ErrStoreClosed if already stopped.
func (fs *fileStore) Stop() error {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return ErrStoreClosed
	}

	fs.checkAndFlushAllBlocks()
	fs.closeAllMsgBlocks(false)

	fs.cancelSyncTimer()
	fs.cancelAgeChk()

	// Release the state flusher loop.
	close(fs.qch)

	// Wait for the state flush loop to exit.
	// Done outside our lock so the loop can acquire it while draining.
	fsld := fs.fsld
	fs.mu.Unlock()
	<-fsld
	// Write full state if needed. If not dirty this is a no-op.
	// Note fs.closed is deliberately not yet set, so this write can proceed.
	fs.writeFullState()
	fs.mu.Lock()

	// Mark as closed.
	fs.closed = true
	fs.lmb = nil

	// We should update the upper usage layer on a stop.
	cb, bytes := fs.scb, int64(fs.state.Bytes)

	// Snapshot consumer stores so we can stop them without holding our lock.
	var _cfs [256]ConsumerStore
	cfs := append(_cfs[:0], fs.cfs...)
	fs.cfs = nil
	fs.mu.Unlock()

	for _, o := range cfs {
		o.Stop()
	}

	// Report the released bytes to the accounting callback.
	if bytes > 0 && cb != nil {
		cb(0, -bytes, 0, _EMPTY_)
	}

	return nil
}
|
|
|
|
// errFile is the tar entry name used to report snapshot errors in-band
// inside the snapshot stream (see streamSnapshot).
const errFile = "errors.txt"
|
|
|
// Stream our snapshot through S2 compression and tar.
// Runs in its own goroutine (launched from Snapshot); w is closed when done
// so the reading side sees EOF. Errors are reported in-band by writing an
// errFile entry into the tar stream rather than being returned.
// Also responsible for clearing the snapshot-in-progress count (fs.sips)
// that Snapshot incremented.
func (fs *fileStore) streamSnapshot(w io.WriteCloser, state *StreamState, includeConsumers bool) {
	defer w.Close()

	enc := s2.NewWriter(w)
	defer enc.Close()

	tw := tar.NewWriter(enc)
	defer tw.Close()

	// Clear the in-progress marker set by Snapshot() on exit.
	defer func() {
		fs.mu.Lock()
		fs.sips--
		fs.mu.Unlock()
	}()

	modTime := time.Now().UTC()

	// Helper to add one file entry to the tar archive.
	writeFile := func(name string, buf []byte) error {
		hdr := &tar.Header{
			Name:    name,
			Mode:    0600,
			ModTime: modTime,
			Uname:   "nats",
			Gname:   "nats",
			Size:    int64(len(buf)),
			Format:  tar.FormatPAX,
		}
		if err := tw.WriteHeader(hdr); err != nil {
			return err
		}
		if _, err := tw.Write(buf); err != nil {
			return err
		}
		return nil
	}

	// Surface an error to the snapshot consumer via the errFile entry.
	writeErr := func(err string) {
		writeFile(errFile, []byte(err))
	}

	fs.mu.Lock()
	blks := fs.blks
	// Grab our general meta data.
	// We do this now instead of pulling from files since they could be encrypted.
	meta, err := json.Marshal(fs.cfg)
	if err != nil {
		fs.mu.Unlock()
		writeErr(fmt.Sprintf("Could not gather stream meta file: %v", err))
		return
	}
	hh := fs.hh
	hh.Reset()
	hh.Write(meta)
	sum := []byte(hex.EncodeToString(fs.hh.Sum(nil)))
	fs.mu.Unlock()

	// Meta first.
	if writeFile(JetStreamMetaFile, meta) != nil {
		return
	}
	if writeFile(JetStreamMetaFileSum, sum) != nil {
		return
	}

	// Can't use join path here, tar only recognizes relative paths with forward slashes.
	msgPre := msgDir + "/"
	var bbuf []byte

	// Include the full stream state file if present and plausibly sized.
	const minLen = 32
	sfn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	if buf, err := os.ReadFile(sfn); err == nil && len(buf) >= minLen {
		if fs.aek != nil {
			// Decrypt so the snapshot carries plaintext state.
			ns := fs.aek.NonceSize()
			buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:len(buf)-highwayhash.Size64], nil)
			if err == nil {
				// Redo hash checksum at end on plaintext.
				fs.mu.Lock()
				hh.Reset()
				hh.Write(buf)
				buf = fs.hh.Sum(buf)
				fs.mu.Unlock()
			}
		}
		if err == nil && writeFile(msgPre+streamStreamStateFile, buf) != nil {
			return
		}
	}

	// Now do messages themselves.
	for _, mb := range blks {
		if mb.pendingWriteSize() > 0 {
			mb.flushPendingMsgs()
		}
		mb.mu.Lock()
		// We could stream but don't want to hold the lock and prevent changes, so just read in and
		// release the lock for now.
		bbuf, err = mb.loadBlock(bbuf)
		if err != nil {
			mb.mu.Unlock()
			writeErr(fmt.Sprintf("Could not read message block [%d]: %v", mb.index, err))
			return
		}
		// Check for encryption.
		if mb.bek != nil && len(bbuf) > 0 {
			rbek, err := genBlockEncryptionKey(fs.fcfg.Cipher, mb.seed, mb.nonce)
			if err != nil {
				mb.mu.Unlock()
				writeErr(fmt.Sprintf("Could not create encryption key for message block [%d]: %v", mb.index, err))
				return
			}
			rbek.XORKeyStream(bbuf, bbuf)
		}
		// Check for compression.
		if bbuf, err = mb.decompressIfNeeded(bbuf); err != nil {
			mb.mu.Unlock()
			writeErr(fmt.Sprintf("Could not decompress message block [%d]: %v", mb.index, err))
			return
		}
		mb.mu.Unlock()

		// Do this one unlocked.
		if writeFile(msgPre+fmt.Sprintf(blkScan, mb.index), bbuf) != nil {
			return
		}
	}

	// Bail if no consumers requested.
	if !includeConsumers {
		return
	}

	// Do consumers' state last.
	fs.mu.RLock()
	cfs := fs.cfs
	fs.mu.RUnlock()

	for _, cs := range cfs {
		o, ok := cs.(*consumerFileStore)
		if !ok {
			// Skip non file-backed consumer stores.
			continue
		}
		o.mu.Lock()
		// Grab our general meta data.
		// We do this now instead of pulling from files since they could be encrypted.
		meta, err := json.Marshal(o.cfg)
		if err != nil {
			o.mu.Unlock()
			writeErr(fmt.Sprintf("Could not gather consumer meta file for %q: %v", o.name, err))
			return
		}
		o.hh.Reset()
		o.hh.Write(meta)
		sum := []byte(hex.EncodeToString(o.hh.Sum(nil)))

		// We can have the running state directly encoded now.
		state, err := o.encodeState()
		if err != nil {
			o.mu.Unlock()
			writeErr(fmt.Sprintf("Could not encode consumer state for %q: %v", o.name, err))
			return
		}
		odirPre := filepath.Join(consumerDir, o.name)
		o.mu.Unlock()

		// Write all the consumer files.
		if writeFile(filepath.Join(odirPre, JetStreamMetaFile), meta) != nil {
			return
		}
		if writeFile(filepath.Join(odirPre, JetStreamMetaFileSum), sum) != nil {
			return
		}
		writeFile(filepath.Join(odirPre, consumerState), state)
	}
}
|
|
|
|
// Create a snapshot of this stream and its consumer's state along with messages.
|
|
func (fs *fileStore) Snapshot(deadline time.Duration, checkMsgs, includeConsumers bool) (*SnapshotResult, error) {
|
|
fs.mu.Lock()
|
|
if fs.closed {
|
|
fs.mu.Unlock()
|
|
return nil, ErrStoreClosed
|
|
}
|
|
// Only allow one at a time.
|
|
if fs.sips > 0 {
|
|
fs.mu.Unlock()
|
|
return nil, ErrStoreSnapshotInProgress
|
|
}
|
|
// Mark us as snapshotting
|
|
fs.sips += 1
|
|
fs.mu.Unlock()
|
|
|
|
if checkMsgs {
|
|
ld := fs.checkMsgs()
|
|
if ld != nil && len(ld.Msgs) > 0 {
|
|
return nil, fmt.Errorf("snapshot check detected %d bad messages", len(ld.Msgs))
|
|
}
|
|
}
|
|
|
|
pr, pw := net.Pipe()
|
|
|
|
// Set a write deadline here to protect ourselves.
|
|
if deadline > 0 {
|
|
pw.SetWriteDeadline(time.Now().Add(deadline))
|
|
}
|
|
|
|
// We can add to our stream while snapshotting but not "user" delete anything.
|
|
var state StreamState
|
|
fs.FastState(&state)
|
|
|
|
// Stream in separate Go routine.
|
|
go fs.streamSnapshot(pw, &state, includeConsumers)
|
|
|
|
return &SnapshotResult{pr, state}, nil
|
|
}
|
|
|
|
// Helper to return the config.
|
|
func (fs *fileStore) fileStoreConfig() FileStoreConfig {
|
|
fs.mu.RLock()
|
|
defer fs.mu.RUnlock()
|
|
return fs.fcfg
|
|
}
|
|
|
|
// Read lock all existing message blocks.
|
|
// Lock held on entry.
|
|
func (fs *fileStore) readLockAllMsgBlocks() {
|
|
for _, mb := range fs.blks {
|
|
mb.mu.RLock()
|
|
}
|
|
}
|
|
|
|
// Read unlock all existing message blocks.
|
|
// Lock held on entry.
|
|
func (fs *fileStore) readUnlockAllMsgBlocks() {
|
|
for _, mb := range fs.blks {
|
|
mb.mu.RUnlock()
|
|
}
|
|
}
|
|
|
|
// Binary encoded state snapshot, >= v2.10 server.
// Layout: 2-byte magic/version header, then uvarints for Msgs, Bytes,
// FirstSeq, LastSeq, failed and numDeleted, optionally followed by the
// encoded delete blocks (run-length ranges and/or avl sequence sets).
func (fs *fileStore) EncodedStreamState(failed uint64) ([]byte, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	// Calculate deleted as the holes in the [FirstSeq, LastSeq] span.
	var numDeleted int64
	if fs.state.LastSeq > fs.state.FirstSeq {
		numDeleted = int64(fs.state.LastSeq-fs.state.FirstSeq+1) - int64(fs.state.Msgs)
		if numDeleted < 0 {
			numDeleted = 0
		}
	}

	// Encoded is Msgs, Bytes, FirstSeq, LastSeq, Failed, NumDeleted and optional DeletedBlocks
	var buf [1024]byte
	buf[0], buf[1] = streamStateMagic, streamStateVersion
	n := hdrLen
	n += binary.PutUvarint(buf[n:], fs.state.Msgs)
	n += binary.PutUvarint(buf[n:], fs.state.Bytes)
	n += binary.PutUvarint(buf[n:], fs.state.FirstSeq)
	n += binary.PutUvarint(buf[n:], fs.state.LastSeq)
	n += binary.PutUvarint(buf[n:], failed)
	n += binary.PutUvarint(buf[n:], uint64(numDeleted))

	b := buf[0:n]

	if numDeleted > 0 {
		var scratch [4 * 1024]byte

		// deleteBlocks requires all message blocks to be read locked.
		fs.readLockAllMsgBlocks()
		defer fs.readUnlockAllMsgBlocks()

		for _, db := range fs.deleteBlocks() {
			switch db := db.(type) {
			case *DeleteRange:
				// Run-length encoding: magic byte, first sequence, count.
				first, _, num := db.State()
				scratch[0] = runLengthMagic
				i := 1
				i += binary.PutUvarint(scratch[i:], first)
				i += binary.PutUvarint(scratch[i:], num)
				b = append(b, scratch[0:i]...)
			case *avl.SequenceSet:
				// Sequence sets carry their own encoding.
				buf, err := db.Encode(scratch[:0])
				if err != nil {
					return nil, err
				}
				b = append(b, buf...)
			default:
				return nil, errors.New("no impl")
			}
		}
	}

	return b, nil
}
|
|
|
|
// We used to be more sophisticated to save memory, but speed is more important.
|
|
// All blocks should be at least read locked.
|
|
func (fs *fileStore) deleteBlocks() DeleteBlocks {
|
|
var dbs DeleteBlocks
|
|
var prevLast uint64
|
|
|
|
for _, mb := range fs.blks {
|
|
// Detect if we have a gap between these blocks.
|
|
if prevLast > 0 && prevLast+1 != mb.first.seq {
|
|
gap := mb.first.seq - prevLast - 1
|
|
dbs = append(dbs, &DeleteRange{First: prevLast + 1, Num: gap})
|
|
}
|
|
if mb.dmap.Size() > 0 {
|
|
dbs = append(dbs, &mb.dmap)
|
|
}
|
|
prevLast = mb.last.seq
|
|
}
|
|
return dbs
|
|
}
|
|
|
|
// SyncDeleted will make sure this stream has same deleted state as dbs.
// dbs is the authoritative set of delete blocks; any entries whose state
// does not match ours positionally are applied locally via removeMsg.
func (fs *fileStore) SyncDeleted(dbs DeleteBlocks) {
	if len(dbs) == 0 {
		return
	}

	fs.mu.Lock()
	defer fs.mu.Unlock()

	var needsCheck DeleteBlocks

	// deleteBlocks requires all message blocks to be read locked.
	fs.readLockAllMsgBlocks()
	mdbs := fs.deleteBlocks()
	for i, db := range dbs {
		// If the block is same as what we have we can skip.
		if i < len(mdbs) {
			first, last, num := db.State()
			eFirst, eLast, eNum := mdbs[i].State()
			if first == eFirst && last == eLast && num == eNum {
				continue
			}
		}
		// Need to insert these.
		needsCheck = append(needsCheck, db)
	}
	fs.readUnlockAllMsgBlocks()

	// Apply each differing delete block sequence by sequence.
	for _, db := range needsCheck {
		db.Range(func(dseq uint64) bool {
			fs.removeMsg(dseq, false, true, false)
			return true
		})
	}
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Consumers
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// consumerFileStore is the file-backed ConsumerStore implementation.
// State writes are coalesced by a per-consumer flusher goroutine (flushLoop).
type consumerFileStore struct {
	mu      sync.Mutex
	fs      *fileStore        // Parent stream's filestore.
	cfg     *FileConsumerInfo // Consumer config plus creation time.
	prf     keyGen            // Key generation function; non-nil when encryption is enabled.
	aek     cipher.AEAD       // Asset encryption key for state and meta files.
	name    string            // Consumer name.
	odir    string            // Consumer directory on disk.
	ifn     string            // Path to the consumer state file.
	hh      hash.Hash64       // Checksum hash for meta files.
	state   ConsumerState     // Running in-memory state.
	fch     chan struct{}     // Kicks the flusher.
	qch     chan struct{}     // Signals the flusher to quit.
	flusher bool              // True while flushLoop is running.
	writing bool              // True while a state write is in flight.
	dirty   bool              // State has changes not yet persisted.
	closed  bool              // Store has been stopped/deleted.
}
|
|
|
|
// ConsumerStore creates (or recovers) the consumer store for name under this
// filestore. Returns a memory-backed store if cfg.MemoryStorage is set.
// Handles encryption key recovery (including cipher conversion), writes meta
// files on first creation, starts the flusher goroutine, and loads any
// existing state from disk.
func (fs *fileStore) ConsumerStore(name string, cfg *ConsumerConfig) (ConsumerStore, error) {
	if fs == nil {
		return nil, fmt.Errorf("filestore is nil")
	}
	if fs.isClosed() {
		return nil, ErrStoreClosed
	}
	if cfg == nil || name == _EMPTY_ {
		return nil, fmt.Errorf("bad consumer config")
	}

	// We now allow overrides from a stream being a filestore type and forcing a consumer to be memory store.
	if cfg.MemoryStorage {
		// Create directly here.
		o := &consumerMemStore{ms: fs, cfg: *cfg}
		fs.AddConsumer(o)
		return o, nil
	}

	odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, name)
	if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create consumer directory - %v", err)
	}
	csi := &FileConsumerInfo{Name: name, Created: time.Now().UTC(), ConsumerConfig: *cfg}
	o := &consumerFileStore{
		fs:   fs,
		cfg:  csi,
		prf:  fs.prf,
		name: name,
		odir: odir,
		ifn:  filepath.Join(odir, consumerState),
	}
	// Per-consumer checksum hash keyed off stream and consumer name.
	key := sha256.Sum256([]byte(fs.cfg.Name + "/" + name))
	hh, err := highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}
	o.hh = hh

	// Check for encryption.
	if o.prf != nil {
		if ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey)); err == nil {
			if len(ekey) < minBlkKeySize {
				return nil, errBadKeySize
			}
			// Recover key encryption key.
			rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
			if err != nil {
				return nil, err
			}

			sc := fs.fcfg.Cipher
			kek, err := genEncryptionKey(sc, rb)
			if err != nil {
				return nil, err
			}
			ns := kek.NonceSize()
			nonce := ekey[:ns]
			seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
			if err != nil {
				// We may be here on a cipher conversion, so attempt to convert.
				if err = o.convertCipher(); err != nil {
					return nil, err
				}
			} else {
				o.aek, err = genEncryptionKey(sc, seed)
			}
			if err != nil {
				return nil, err
			}
		}
	}

	// Track if we are creating the directory so that we can clean up if we encounter an error.
	var didCreate bool

	// Write our meta data iff does not exist.
	meta := filepath.Join(odir, JetStreamMetaFile)
	if _, err := os.Stat(meta); err != nil && os.IsNotExist(err) {
		didCreate = true
		csi.Created = time.Now().UTC()
		if err := o.writeConsumerMeta(); err != nil {
			os.RemoveAll(odir)
			return nil, err
		}
	}

	// If we expect to be encrypted check that what we are restoring is not plaintext.
	// This can happen on snapshot restores or conversions.
	if o.prf != nil {
		keyFile := filepath.Join(odir, JetStreamMetaFileKey)
		if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
			// Rewrites meta encrypted and generates the key file.
			if err := o.writeConsumerMeta(); err != nil {
				if didCreate {
					os.RemoveAll(odir)
				}
				return nil, err
			}
			// Redo the state file as well here if we have one and we can tell it was plaintext.
			if buf, err := os.ReadFile(o.ifn); err == nil {
				if _, err := decodeConsumerState(buf); err == nil {
					if err := os.WriteFile(o.ifn, o.encryptState(buf), defaultFilePerms); err != nil {
						if didCreate {
							os.RemoveAll(odir)
						}
						return nil, err
					}
				}
			}
		}
	}

	// Create channels to control our flush go routine.
	o.fch = make(chan struct{}, 1)
	o.qch = make(chan struct{})
	go o.flushLoop(o.fch, o.qch)

	// Make sure to load in our state from disk if needed.
	o.loadState()

	// Assign to filestore.
	fs.AddConsumer(o)

	return o, nil
}
|
|
|
|
// convertCipher attempts to recover consumer state that was encrypted with
// the other supported cipher (AES <-> ChaCha) and rewrite the meta and state
// files using the currently configured cipher. Called when opening the key
// file with the configured cipher fails.
func (o *consumerFileStore) convertCipher() error {
	fs := o.fs
	odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, o.name)

	ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey))
	if err != nil {
		return err
	}
	if len(ekey) < minBlkKeySize {
		return errBadKeySize
	}
	// Recover key encryption key.
	rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
	if err != nil {
		return err
	}

	// Do these in reverse since converting: use the *other* cipher to read.
	sc := fs.fcfg.Cipher
	osc := AES
	if sc == AES {
		osc = ChaCha
	}
	kek, err := genEncryptionKey(osc, rb)
	if err != nil {
		return err
	}
	ns := kek.NonceSize()
	nonce := ekey[:ns]
	seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
	if err != nil {
		return err
	}
	aek, err := genEncryptionKey(osc, seed)
	if err != nil {
		return err
	}
	// Now read in and decode our state using the old cipher.
	buf, err := os.ReadFile(o.ifn)
	if err != nil {
		return err
	}
	buf, err = aek.Open(nil, buf[:ns], buf[ns:], nil)
	if err != nil {
		return err
	}

	// Since we are here we recovered our old state.
	// Now write our meta, which will generate the new keys with the new cipher.
	if err := o.writeConsumerMeta(); err != nil {
		return err
	}

	// Now write out our state with the new cipher.
	return o.writeState(buf)
}
|
|
|
|
// Kick flusher for this consumer.
|
|
// Lock should be held.
|
|
func (o *consumerFileStore) kickFlusher() {
|
|
if o.fch != nil {
|
|
select {
|
|
case o.fch <- struct{}{}:
|
|
default:
|
|
}
|
|
}
|
|
o.dirty = true
|
|
}
|
|
|
|
// Set in flusher status
|
|
func (o *consumerFileStore) setInFlusher() {
|
|
o.mu.Lock()
|
|
o.flusher = true
|
|
o.mu.Unlock()
|
|
}
|
|
|
|
// Clear in flusher status
|
|
func (o *consumerFileStore) clearInFlusher() {
|
|
o.mu.Lock()
|
|
o.flusher = false
|
|
o.mu.Unlock()
|
|
}
|
|
|
|
// Report in flusher status
|
|
func (o *consumerFileStore) inFlusher() bool {
|
|
o.mu.Lock()
|
|
defer o.mu.Unlock()
|
|
return o.flusher
|
|
}
|
|
|
|
// flushLoop watches for consumer updates and the quit channel.
// Coalesces state writes to roughly one per 100ms per consumer; exits on
// qch, on store close, or on an encode error.
func (o *consumerFileStore) flushLoop(fch, qch chan struct{}) {

	o.setInFlusher()
	defer o.clearInFlusher()

	// Maintain approximately 10 updates per second per consumer under load.
	const minTime = 100 * time.Millisecond
	var lastWrite time.Time
	var dt *time.Timer

	// Lazily create or safely re-arm the shared delay timer.
	setDelayTimer := func(addWait time.Duration) {
		if dt == nil {
			dt = time.NewTimer(addWait)
			return
		}
		// Per the time.Timer contract, drain a possibly fired timer
		// before Reset to avoid a stale tick.
		if !dt.Stop() {
			select {
			case <-dt.C:
			default:
			}
		}
		dt.Reset(addWait)
	}

	for {
		select {
		case <-fch:
			// Throttle: if we wrote recently, wait out the remainder of minTime.
			if ts := time.Since(lastWrite); ts < minTime {
				setDelayTimer(minTime - ts)
				select {
				case <-dt.C:
				case <-qch:
					return
				}
			}
			o.mu.Lock()
			if o.closed {
				o.mu.Unlock()
				return
			}
			buf, err := o.encodeState()
			o.mu.Unlock()
			if err != nil {
				return
			}
			// TODO(dlc) - if we error should start failing upwards.
			if err := o.writeState(buf); err == nil {
				lastWrite = time.Now()
			}
		case <-qch:
			return
		}
	}
}
|
|
|
|
// SetStarting sets our starting stream sequence.
|
|
func (o *consumerFileStore) SetStarting(sseq uint64) error {
|
|
o.mu.Lock()
|
|
o.state.Delivered.Stream = sseq
|
|
buf, err := o.encodeState()
|
|
o.mu.Unlock()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return o.writeState(buf)
|
|
}
|
|
|
|
// HasState returns if this store has a recorded state.
|
|
func (o *consumerFileStore) HasState() bool {
|
|
o.mu.Lock()
|
|
_, err := os.Stat(o.ifn)
|
|
o.mu.Unlock()
|
|
return err == nil
|
|
}
|
|
|
|
// UpdateDelivered is called whenever a new message has been delivered.
// dseq/sseq are the consumer and stream sequences, dc the delivery count,
// ts the delivery timestamp. Maintains Pending and Redelivered maps and the
// Delivered (and, for AckNone, AckFloor) cursors, then kicks the flusher.
func (o *consumerFileStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	// A redelivery (dc != 1) makes no sense with AckNone.
	if dc != 1 && o.cfg.AckPolicy == AckNone {
		return ErrNoAckPolicy
	}

	// On restarts the old leader may get a replay from the raft logs that are old.
	if dseq <= o.state.AckFloor.Consumer {
		return nil
	}

	// See if we expect an ack for this.
	if o.cfg.AckPolicy != AckNone {
		// Need to create pending records here.
		if o.state.Pending == nil {
			o.state.Pending = make(map[uint64]*Pending)
		}
		var p *Pending
		// Check for an update to a message already delivered.
		if sseq <= o.state.Delivered.Stream {
			if p = o.state.Pending[sseq]; p != nil {
				// Redelivery: refresh consumer seq and timestamp in place.
				p.Sequence, p.Timestamp = dseq, ts
			}
		} else {
			// Add to pending.
			o.state.Pending[sseq] = &Pending{dseq, ts}
		}
		// Update delivered as needed.
		if dseq > o.state.Delivered.Consumer {
			o.state.Delivered.Consumer = dseq
		}
		if sseq > o.state.Delivered.Stream {
			o.state.Delivered.Stream = sseq
		}

		if dc > 1 {
			if maxdc := uint64(o.cfg.MaxDeliver); maxdc > 0 && dc > maxdc {
				// Make sure to remove from pending.
				delete(o.state.Pending, sseq)
			}
			if o.state.Redelivered == nil {
				o.state.Redelivered = make(map[uint64]uint64)
			}
			// Only update if greater then what we already have.
			if o.state.Redelivered[sseq] < dc-1 {
				o.state.Redelivered[sseq] = dc - 1
			}
		}
	} else {
		// For AckNone just update delivered and ackfloor at the same time.
		if dseq > o.state.Delivered.Consumer {
			o.state.Delivered.Consumer = dseq
			o.state.AckFloor.Consumer = dseq
		}
		if sseq > o.state.Delivered.Stream {
			o.state.Delivered.Stream = sseq
			o.state.AckFloor.Stream = sseq
		}
	}
	// Make sure we flush to disk.
	o.kickFlusher()

	return nil
}
|
|
|
|
// UpdateAcks is called whenever a consumer with explicit ack or ack all acks a message.
// For AckAll, everything up to sseq is cleared; for explicit acks, the ack
// floors are advanced as far as the remaining pending entries allow.
func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	if o.cfg.AckPolicy == AckNone {
		return ErrNoAckPolicy
	}

	// On restarts the old leader may get a replay from the raft logs that are old.
	if dseq <= o.state.AckFloor.Consumer {
		return nil
	}

	if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil {
		return ErrStoreMsgNotFound
	}

	// Check for AckAll here.
	if o.cfg.AckPolicy == AckAll {
		sgap := sseq - o.state.AckFloor.Stream
		o.state.AckFloor.Consumer = dseq
		o.state.AckFloor.Stream = sseq
		// Clear everything from the old floor up to and including sseq.
		for seq := sseq; seq > sseq-sgap; seq-- {
			delete(o.state.Pending, seq)
			if len(o.state.Redelivered) > 0 {
				delete(o.state.Redelivered, seq)
			}
		}
		o.kickFlusher()
		return nil
	}

	// AckExplicit

	// First delete from our pending state.
	if p, ok := o.state.Pending[sseq]; ok {
		delete(o.state.Pending, sseq)
		dseq = p.Sequence // Use the original.
	}
	if len(o.state.Pending) == 0 {
		// Nothing outstanding, floors catch up to delivered.
		o.state.AckFloor.Consumer = o.state.Delivered.Consumer
		o.state.AckFloor.Stream = o.state.Delivered.Stream
	} else if dseq == o.state.AckFloor.Consumer+1 {
		// This ack extends the floor; move it up.
		o.state.AckFloor.Consumer = dseq
		o.state.AckFloor.Stream = sseq

		if o.state.Delivered.Consumer > dseq {
			// Walk forward to the next still-pending entry and place the
			// floor just below it.
			for ss := sseq + 1; ss <= o.state.Delivered.Stream; ss++ {
				if p, ok := o.state.Pending[ss]; ok {
					if p.Sequence > 0 {
						o.state.AckFloor.Consumer = p.Sequence - 1
						o.state.AckFloor.Stream = ss - 1
					}
					break
				}
			}
		}
	}
	// We do these regardless.
	delete(o.state.Redelivered, sseq)

	o.kickFlusher()
	return nil
}
|
|
|
|
// seqsHdrSize is the maximum encoded size of the fixed portion of consumer
// state: the header plus six varint-encoded sequence/count fields.
const seqsHdrSize = 6*binary.MaxVarintLen64 + hdrLen
|
|
|
// Encode our consumer state, version 2.
|
|
// Lock should be held.
|
|
|
|
func (o *consumerFileStore) EncodedState() ([]byte, error) {
|
|
o.mu.Lock()
|
|
defer o.mu.Unlock()
|
|
return o.encodeState()
|
|
}
|
|
|
|
func (o *consumerFileStore) encodeState() ([]byte, error) {
|
|
// Grab reference to state, but make sure we load in if needed, so do not reference o.state directly.
|
|
state, err := o.stateWithCopyLocked(false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return encodeConsumerState(state), nil
|
|
}
|
|
|
|
func (o *consumerFileStore) UpdateConfig(cfg *ConsumerConfig) error {
|
|
o.mu.Lock()
|
|
defer o.mu.Unlock()
|
|
|
|
// This is mostly unchecked here. We are assuming the upper layers have done sanity checking.
|
|
csi := o.cfg
|
|
csi.ConsumerConfig = *cfg
|
|
|
|
return o.writeConsumerMeta()
|
|
}
|
|
|
|
// Update replaces our running state with a full ConsumerState, copying the
// Pending and Redelivered maps. Outdated updates (older delivered or ack
// floor than we already have) are silently ignored; inconsistent states are
// rejected with an error.
func (o *consumerFileStore) Update(state *ConsumerState) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	// Check to see if this is an outdated update.
	if state.Delivered.Consumer < o.state.Delivered.Consumer || state.AckFloor.Stream < o.state.AckFloor.Stream {
		return nil
	}

	// Sanity checks.
	if state.AckFloor.Consumer > state.Delivered.Consumer {
		return fmt.Errorf("bad ack floor for consumer")
	}
	if state.AckFloor.Stream > state.Delivered.Stream {
		return fmt.Errorf("bad ack floor for stream")
	}

	// Copy to our state.
	var pending map[uint64]*Pending
	var redelivered map[uint64]uint64
	if len(state.Pending) > 0 {
		pending = make(map[uint64]*Pending, len(state.Pending))
		for seq, p := range state.Pending {
			pending[seq] = &Pending{p.Sequence, p.Timestamp}
			// Pending entries must lie between the ack floor and delivered.
			if seq <= state.AckFloor.Stream || seq > state.Delivered.Stream {
				return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq)
			}
		}
	}
	if len(state.Redelivered) > 0 {
		redelivered = make(map[uint64]uint64, len(state.Redelivered))
		for seq, dc := range state.Redelivered {
			redelivered[seq] = dc
		}
	}

	o.state.Delivered = state.Delivered
	o.state.AckFloor = state.AckFloor
	o.state.Pending = pending
	o.state.Redelivered = redelivered

	// Make sure we flush to disk.
	o.kickFlusher()

	return nil
}
|
|
|
|
// Will encrypt the state with our asset key. Will be a no-op if encryption not enabled.
|
|
// Lock should be held.
|
|
func (o *consumerFileStore) encryptState(buf []byte) []byte {
|
|
if o.aek == nil {
|
|
return buf
|
|
}
|
|
// TODO(dlc) - Optimize on space usage a bit?
|
|
nonce := make([]byte, o.aek.NonceSize(), o.aek.NonceSize()+len(buf)+o.aek.Overhead())
|
|
rand.Read(nonce)
|
|
return o.aek.Seal(nonce, nonce, buf, nil)
|
|
}
|
|
|
|
// Used to limit number of disk IO calls in flight since they could all be blocking an OS thread.
// https://github.com/nats-io/nats-server/issues/2742
// Acquire a token by receiving from dios, release by sending it back.
var dios chan struct{}

// Used to setup our simplistic counting semaphore using buffered channels.
// golang.org's semaphore seemed a bit heavy.
func init() {
	// Limit ourselves to a max of 4 blocking IO calls.
	const nIO = 4
	dios = make(chan struct{}, nIO)
	// Fill it up to start so nIO tokens are immediately available.
	for i := 0; i < nIO; i++ {
		dios <- struct{}{}
	}
}
|
|
|
|
// writeState persists an encoded consumer state buffer to disk, encrypting
// it first when enabled. Returns nil without writing if a write is already
// in flight or buf is empty; on write failure the dirty flag is restored so
// a later flush retries.
func (o *consumerFileStore) writeState(buf []byte) error {
	// Check if we have the index file open.
	o.mu.Lock()
	if o.writing || len(buf) == 0 {
		o.mu.Unlock()
		return nil
	}

	// Check on encryption.
	if o.aek != nil {
		buf = o.encryptState(buf)
	}

	o.writing = true
	o.dirty = false
	ifn := o.ifn
	o.mu.Unlock()

	// Lock not held here but we do limit number of outstanding calls that could block OS threads.
	<-dios
	err := os.WriteFile(ifn, buf, defaultFilePerms)
	dios <- struct{}{}

	o.mu.Lock()
	if err != nil {
		// Re-mark dirty so the state is written again on the next flush.
		o.dirty = true
	}
	o.writing = false
	o.mu.Unlock()

	return err
}
|
|
|
|
// Will upodate the config. Only used when recovering ephemerals.
|
|
func (o *consumerFileStore) updateConfig(cfg ConsumerConfig) error {
|
|
o.mu.Lock()
|
|
defer o.mu.Unlock()
|
|
o.cfg = &FileConsumerInfo{ConsumerConfig: cfg}
|
|
return o.writeConsumerMeta()
|
|
}
|
|
|
|
// Write out the consumer meta data, i.e. state.
// Lock should be held.
// Generates encryption keys on first use when enabled, writes the meta file
// (encrypted if configured) and its checksum file.
func (cfs *consumerFileStore) writeConsumerMeta() error {
	meta := filepath.Join(cfs.odir, JetStreamMetaFile)
	if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) {
		return err
	}

	// Encryption is configured but we have no asset key yet: generate and
	// persist the encrypted key file.
	if cfs.prf != nil && cfs.aek == nil {
		fs := cfs.fs
		key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name + tsep + cfs.name)
		if err != nil {
			return err
		}
		cfs.aek = key
		keyFile := filepath.Join(cfs.odir, JetStreamMetaFileKey)
		if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
			return err
		}
		if err := os.WriteFile(keyFile, encrypted, defaultFilePerms); err != nil {
			return err
		}
	}

	b, err := json.Marshal(cfs.cfg)
	if err != nil {
		return err
	}
	// Encrypt if needed.
	if cfs.aek != nil {
		nonce := make([]byte, cfs.aek.NonceSize(), cfs.aek.NonceSize()+len(b)+cfs.aek.Overhead())
		rand.Read(nonce)
		b = cfs.aek.Seal(nonce, nonce, b, nil)
	}

	if err := os.WriteFile(meta, b, defaultFilePerms); err != nil {
		return err
	}
	// Write the checksum of the (possibly encrypted) meta bytes alongside.
	cfs.hh.Reset()
	cfs.hh.Write(b)
	checksum := hex.EncodeToString(cfs.hh.Sum(nil))
	sum := filepath.Join(cfs.odir, JetStreamMetaFileSum)
	if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil {
		return err
	}
	return nil
}
|
|
|
|
// Consumer version.
|
|
func checkConsumerHeader(hdr []byte) (uint8, error) {
|
|
if hdr == nil || len(hdr) < 2 || hdr[0] != magic {
|
|
return 0, errCorruptState
|
|
}
|
|
version := hdr[1]
|
|
switch version {
|
|
case 1, 2:
|
|
return version, nil
|
|
}
|
|
return 0, fmt.Errorf("unsupported version: %d", version)
|
|
}
|
|
|
|
func (o *consumerFileStore) copyPending() map[uint64]*Pending {
|
|
pending := make(map[uint64]*Pending, len(o.state.Pending))
|
|
for seq, p := range o.state.Pending {
|
|
pending[seq] = &Pending{p.Sequence, p.Timestamp}
|
|
}
|
|
return pending
|
|
}
|
|
|
|
func (o *consumerFileStore) copyRedelivered() map[uint64]uint64 {
|
|
redelivered := make(map[uint64]uint64, len(o.state.Redelivered))
|
|
for seq, dc := range o.state.Redelivered {
|
|
redelivered[seq] = dc
|
|
}
|
|
return redelivered
|
|
}
|
|
|
|
// Type returns the type of the underlying store. Always FileStorage for
// this implementation.
func (o *consumerFileStore) Type() StorageType { return FileStorage }
|
|
|
|
// State retrieves the state from the state file.
|
|
// This is not expected to be called in high performance code, only on startup.
|
|
func (o *consumerFileStore) State() (*ConsumerState, error) {
|
|
return o.stateWithCopy(true)
|
|
}
|
|
|
|
// This will not copy pending or redelivered, so should only be done under the
|
|
// consumer owner's lock.
|
|
func (o *consumerFileStore) BorrowState() (*ConsumerState, error) {
|
|
return o.stateWithCopy(false)
|
|
}
|
|
|
|
func (o *consumerFileStore) stateWithCopy(doCopy bool) (*ConsumerState, error) {
|
|
o.mu.Lock()
|
|
defer o.mu.Unlock()
|
|
return o.stateWithCopyLocked(doCopy)
|
|
}
|
|
|
|
// Lock should be held.
// Returns the consumer state, serving it from the in-memory running state
// when present, otherwise reading and decoding the state file (decrypting if
// needed) and caching it into o.state. When doCopy is set the returned maps
// are deep copies; otherwise they alias our internal state.
func (o *consumerFileStore) stateWithCopyLocked(doCopy bool) (*ConsumerState, error) {
	if o.closed {
		return nil, ErrStoreClosed
	}

	state := &ConsumerState{}

	// See if we have a running state or if we need to read in from disk.
	if o.state.Delivered.Consumer != 0 || o.state.Delivered.Stream != 0 {
		state.Delivered = o.state.Delivered
		state.AckFloor = o.state.AckFloor
		if len(o.state.Pending) > 0 {
			if doCopy {
				state.Pending = o.copyPending()
			} else {
				state.Pending = o.state.Pending
			}
		}
		if len(o.state.Redelivered) > 0 {
			if doCopy {
				state.Redelivered = o.copyRedelivered()
			} else {
				state.Redelivered = o.state.Redelivered
			}
		}
		return state, nil
	}

	// Read the state in here from disk..
	buf, err := os.ReadFile(o.ifn)
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}

	// No file (or empty): return the zero state.
	if len(buf) == 0 {
		return state, nil
	}

	// Check on encryption.
	if o.aek != nil {
		ns := o.aek.NonceSize()
		buf, err = o.aek.Open(nil, buf[:ns], buf[ns:], nil)
		if err != nil {
			return nil, err
		}
	}

	state, err = decodeConsumerState(buf)
	if err != nil {
		return nil, err
	}

	// Copy this state into our own.
	o.state.Delivered = state.Delivered
	o.state.AckFloor = state.AckFloor
	if len(state.Pending) > 0 {
		if doCopy {
			o.state.Pending = make(map[uint64]*Pending, len(state.Pending))
			for seq, p := range state.Pending {
				o.state.Pending[seq] = &Pending{p.Sequence, p.Timestamp}
			}
		} else {
			o.state.Pending = state.Pending
		}
	}
	if len(state.Redelivered) > 0 {
		if doCopy {
			o.state.Redelivered = make(map[uint64]uint64, len(state.Redelivered))
			for seq, dc := range state.Redelivered {
				o.state.Redelivered[seq] = dc
			}
		} else {
			o.state.Redelivered = state.Redelivered
		}
	}

	return state, nil
}
|
|
|
|
// Lock should be held. Called at startup.
|
|
func (o *consumerFileStore) loadState() {
|
|
if _, err := os.Stat(o.ifn); err == nil {
|
|
// This will load our state in from disk.
|
|
o.stateWithCopyLocked(false)
|
|
}
|
|
}
|
|
|
|
// Decode consumer state.
// decodeConsumerState parses the binary on-disk consumer state format.
// Layout after the header: ack floor (consumer, stream), delivered
// (consumer, stream), then optional pending entries and redelivered counts,
// all varint-encoded as deltas against the ack floor. Versions 1 and 2 are
// both supported; v1 stored absolute/next-delivery values that are adjusted
// back below. Returns errCorruptState on any malformed input.
func decodeConsumerState(buf []byte) (*ConsumerState, error) {
	version, err := checkConsumerHeader(buf)
	if err != nil {
		return nil, err
	}

	bi := hdrLen
	// Helpers, will set i to -1 on error.
	// readSeq decodes the next uvarint; bi == -1 acts as a sticky error flag.
	readSeq := func() uint64 {
		if bi < 0 {
			return 0
		}
		seq, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		return seq
	}
	// readTimeStamp decodes the next signed varint, same sticky-error scheme.
	readTimeStamp := func() int64 {
		if bi < 0 {
			return 0
		}
		ts, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return ts
	}
	// Just for clarity below.
	readLen := readSeq
	readCount := readSeq

	state := &ConsumerState{}
	state.AckFloor.Consumer = readSeq()
	state.AckFloor.Stream = readSeq()
	state.Delivered.Consumer = readSeq()
	state.Delivered.Stream = readSeq()

	if bi == -1 {
		return nil, errCorruptState
	}
	if version == 1 {
		// Adjust back. Version 1 also stored delivered as next to be delivered,
		// so adjust that back down here.
		if state.AckFloor.Consumer > 1 {
			state.Delivered.Consumer += state.AckFloor.Consumer - 1
		}
		if state.AckFloor.Stream > 1 {
			state.Delivered.Stream += state.AckFloor.Stream - 1
		}
	}

	// We have additional stuff.
	// Pending entries: count, then a base timestamp, then per-entry fields.
	if numPending := readLen(); numPending > 0 {
		mints := readTimeStamp()
		state.Pending = make(map[uint64]*Pending, numPending)
		for i := 0; i < int(numPending); i++ {
			sseq := readSeq()
			var dseq uint64
			if version == 2 {
				// v2 also stores the delivery sequence per pending entry.
				dseq = readSeq()
			}
			ts := readTimeStamp()
			// Check the state machine for corruption, not the value which could be -1.
			if bi == -1 {
				return nil, errCorruptState
			}
			// Adjust seq back.
			// Sequences are stored as deltas off the ack floor.
			sseq += state.AckFloor.Stream
			if sseq == 0 {
				return nil, errCorruptState
			}
			if version == 2 {
				dseq += state.AckFloor.Consumer
			}
			// Adjust the timestamp back.
			// Timestamps are stored in seconds relative to mints; the offset
			// direction flipped between v1 and v2.
			if version == 1 {
				ts = (ts + mints) * int64(time.Second)
			} else {
				ts = (mints - ts) * int64(time.Second)
			}
			// Store in pending.
			state.Pending[sseq] = &Pending{dseq, ts}
		}
	}

	// We have redelivered entries here.
	if numRedelivered := readLen(); numRedelivered > 0 {
		state.Redelivered = make(map[uint64]uint64, numRedelivered)
		for i := 0; i < int(numRedelivered); i++ {
			if seq, n := readSeq(), readCount(); seq > 0 && n > 0 {
				// Adjust seq back.
				seq += state.AckFloor.Stream
				state.Redelivered[seq] = n
			}
		}
	}

	return state, nil
}
|
|
|
|
// Stop the processing of the consumers's state.
// Stop closes the store, signaling the flusher via qch, and if there is dirty
// in-memory state it is encoded (and encrypted when applicable) and written
// out one last time. Safe to call more than once; subsequent calls return nil.
func (o *consumerFileStore) Stop() error {
	o.mu.Lock()
	if o.closed {
		o.mu.Unlock()
		return nil
	}
	// Signal the flusher goroutine to quit.
	if o.qch != nil {
		close(o.qch)
		o.qch = nil
	}

	var err error
	var buf []byte

	if o.dirty {
		// Make sure to write this out..
		// Encode (and encrypt if we have an asset encryption key) while we
		// still hold the lock; the actual disk write happens after unlock.
		if buf, err = o.encodeState(); err == nil && len(buf) > 0 {
			if o.aek != nil {
				buf = o.encryptState(buf)
			}
		}
	}

	o.odir = _EMPTY_
	o.closed = true
	// Capture what we need before releasing the lock.
	ifn, fs := o.ifn, o.fs
	o.mu.Unlock()

	fs.RemoveConsumer(o)

	if len(buf) > 0 {
		// Give any in-flight flusher a chance to finish before we write.
		o.waitOnFlusher()
		// dios appears to act as a semaphore bounding concurrent disk I/O —
		// acquire before the write, release after.
		<-dios
		err = os.WriteFile(ifn, buf, defaultFilePerms)
		dios <- struct{}{}
	}
	return err
}
|
|
|
|
func (o *consumerFileStore) waitOnFlusher() {
|
|
if !o.inFlusher() {
|
|
return
|
|
}
|
|
|
|
timeout := time.Now().Add(100 * time.Millisecond)
|
|
for time.Now().Before(timeout) {
|
|
if !o.inFlusher() {
|
|
return
|
|
}
|
|
time.Sleep(10 * time.Millisecond)
|
|
}
|
|
}
|
|
|
|
// Delete the consumer.
|
|
func (o *consumerFileStore) Delete() error {
|
|
return o.delete(false)
|
|
}
|
|
|
|
func (o *consumerFileStore) StreamDelete() error {
|
|
return o.delete(true)
|
|
}
|
|
|
|
// delete closes the store and removes its on-disk directory. When
// streamDeleted is true the parent stream deletion is responsible for both
// the directories and for detaching us from the file store, so we skip those
// steps here. Idempotent; returns nil if already closed.
func (o *consumerFileStore) delete(streamDeleted bool) error {
	o.mu.Lock()
	if o.closed {
		o.mu.Unlock()
		return nil
	}
	// Signal the flusher goroutine to quit.
	if o.qch != nil {
		close(o.qch)
		o.qch = nil
	}

	var err error
	// Capture the directory and file store before releasing the lock.
	odir := o.odir
	o.odir = _EMPTY_
	o.closed = true
	fs := o.fs
	o.mu.Unlock()

	// If our stream was not deleted this will remove the directories.
	if odir != _EMPTY_ && !streamDeleted {
		// dios appears to act as a semaphore bounding concurrent disk I/O.
		<-dios
		err = os.RemoveAll(odir)
		dios <- struct{}{}
	}

	if !streamDeleted {
		fs.RemoveConsumer(o)
	}

	return err
}
|
|
|
|
func (fs *fileStore) AddConsumer(o ConsumerStore) error {
|
|
fs.mu.Lock()
|
|
defer fs.mu.Unlock()
|
|
fs.cfs = append(fs.cfs, o)
|
|
return nil
|
|
}
|
|
|
|
func (fs *fileStore) RemoveConsumer(o ConsumerStore) error {
|
|
fs.mu.Lock()
|
|
defer fs.mu.Unlock()
|
|
for i, cfs := range fs.cfs {
|
|
if o == cfs {
|
|
fs.cfs = append(fs.cfs[:i], fs.cfs[i+1:]...)
|
|
break
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Templates
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// templateFileStore persists stream template metadata under the templates
// directory, alongside a checksum file for each template.
type templateFileStore struct {
	// dir is the templates directory under the store directory.
	dir string
	// hh computes the checksum written next to each template's metadata.
	hh hash.Hash64
}
|
|
|
|
func newTemplateFileStore(storeDir string) *templateFileStore {
|
|
tdir := filepath.Join(storeDir, tmplsDir)
|
|
key := sha256.Sum256([]byte("templates"))
|
|
hh, err := highwayhash.New64(key[:])
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
return &templateFileStore{dir: tdir, hh: hh}
|
|
}
|
|
|
|
func (ts *templateFileStore) Store(t *streamTemplate) error {
|
|
dir := filepath.Join(ts.dir, t.Name)
|
|
if err := os.MkdirAll(dir, defaultDirPerms); err != nil {
|
|
return fmt.Errorf("could not create templates storage directory for %q- %v", t.Name, err)
|
|
}
|
|
meta := filepath.Join(dir, JetStreamMetaFile)
|
|
if _, err := os.Stat(meta); (err != nil && !os.IsNotExist(err)) || err == nil {
|
|
return err
|
|
}
|
|
t.mu.Lock()
|
|
b, err := json.Marshal(t)
|
|
t.mu.Unlock()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := os.WriteFile(meta, b, defaultFilePerms); err != nil {
|
|
return err
|
|
}
|
|
// FIXME(dlc) - Do checksum
|
|
ts.hh.Reset()
|
|
ts.hh.Write(b)
|
|
checksum := hex.EncodeToString(ts.hh.Sum(nil))
|
|
sum := filepath.Join(dir, JetStreamMetaFileSum)
|
|
if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (ts *templateFileStore) Delete(t *streamTemplate) error {
|
|
return os.RemoveAll(filepath.Join(ts.dir, t.Name))
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Compression
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// CompressionInfo records how a block was compressed so it can be restored.
type CompressionInfo struct {
	// Algorithm is the compression algorithm applied to the block.
	Algorithm StoreCompression
	// OriginalSize is the uncompressed size of the block body.
	OriginalSize uint64
}
|
|
|
|
func (c *CompressionInfo) MarshalMetadata() []byte {
|
|
b := make([]byte, 14) // 4 + potentially up to 10 for uint64
|
|
b[0], b[1], b[2] = 'c', 'm', 'p'
|
|
b[3] = byte(c.Algorithm)
|
|
n := binary.PutUvarint(b[4:], c.OriginalSize)
|
|
return b[:4+n]
|
|
}
|
|
|
|
func (c *CompressionInfo) UnmarshalMetadata(b []byte) (int, error) {
|
|
c.Algorithm = NoCompression
|
|
c.OriginalSize = 0
|
|
if len(b) < 5 { // 4 + min 1 for uvarint uint64
|
|
return 0, nil
|
|
}
|
|
if b[0] != 'c' || b[1] != 'm' || b[2] != 'p' {
|
|
return 0, nil
|
|
}
|
|
var n int
|
|
c.Algorithm = StoreCompression(b[3])
|
|
c.OriginalSize, n = binary.Uvarint(b[4:])
|
|
if n <= 0 {
|
|
return 0, fmt.Errorf("metadata incomplete")
|
|
}
|
|
return 4 + n, nil
|
|
}
|
|
|
|
func (alg StoreCompression) Compress(buf []byte) ([]byte, error) {
|
|
if len(buf) < checksumSize {
|
|
return nil, fmt.Errorf("uncompressed buffer is too short")
|
|
}
|
|
bodyLen := int64(len(buf) - checksumSize)
|
|
var output bytes.Buffer
|
|
var writer io.WriteCloser
|
|
switch alg {
|
|
case NoCompression:
|
|
return buf, nil
|
|
case S2Compression:
|
|
writer = s2.NewWriter(&output)
|
|
default:
|
|
return nil, fmt.Errorf("compression algorithm not known")
|
|
}
|
|
|
|
input := bytes.NewReader(buf[:bodyLen])
|
|
checksum := buf[bodyLen:]
|
|
|
|
// Compress the block content, but don't compress the checksum.
|
|
// We will preserve it at the end of the block as-is.
|
|
if n, err := io.CopyN(writer, input, bodyLen); err != nil {
|
|
return nil, fmt.Errorf("error writing to compression writer: %w", err)
|
|
} else if n != bodyLen {
|
|
return nil, fmt.Errorf("short write on body (%d != %d)", n, bodyLen)
|
|
}
|
|
if err := writer.Close(); err != nil {
|
|
return nil, fmt.Errorf("error closing compression writer: %w", err)
|
|
}
|
|
|
|
// Now add the checksum back onto the end of the block.
|
|
if n, err := output.Write(checksum); err != nil {
|
|
return nil, fmt.Errorf("error writing checksum: %w", err)
|
|
} else if n != checksumSize {
|
|
return nil, fmt.Errorf("short write on checksum (%d != %d)", n, checksumSize)
|
|
}
|
|
|
|
return output.Bytes(), nil
|
|
}
|
|
|
|
func (alg StoreCompression) Decompress(buf []byte) ([]byte, error) {
|
|
if len(buf) < checksumSize {
|
|
return nil, fmt.Errorf("compressed buffer is too short")
|
|
}
|
|
bodyLen := int64(len(buf) - checksumSize)
|
|
input := bytes.NewReader(buf[:bodyLen])
|
|
|
|
var reader io.ReadCloser
|
|
switch alg {
|
|
case NoCompression:
|
|
return buf, nil
|
|
case S2Compression:
|
|
reader = io.NopCloser(s2.NewReader(input))
|
|
default:
|
|
return nil, fmt.Errorf("compression algorithm not known")
|
|
}
|
|
|
|
// Decompress the block content. The checksum isn't compressed so
|
|
// we can preserve it from the end of the block as-is.
|
|
checksum := buf[bodyLen:]
|
|
output, err := io.ReadAll(reader)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error reading compression reader: %w", err)
|
|
}
|
|
output = append(output, checksum...)
|
|
|
|
return output, reader.Close()
|
|
}
|