diff --git a/adapter/redis.go b/adapter/redis.go index df76dbc5..7bf4fb6f 100644 --- a/adapter/redis.go +++ b/adapter/redis.go @@ -4,11 +4,8 @@ import ( "bytes" "context" "fmt" - "io" "log" "log/slog" - "maps" - "math" "net" "os" "sort" @@ -132,45 +129,6 @@ const ( listPopDeltaOverhead = 1 ) -var redisTxnKeyPrefix = []byte("!txn|") - -type redisSetOptions struct { - existsCond bool - missingCond bool - returnOld bool - ttl *time.Time -} - -type redisSetState struct { - rawTyp redisValueType // TTL-unaware type, for internal-key cleanup - typ redisValueType // TTL-aware type, for NX/XX/GET semantics - oldValue []byte -} - -type redisSetExecution struct { - state redisSetState - wroteNull bool - wroteOldBulk bool -} - -type txnCommandHandler func(*txnContext, redcon.Command) (redisResult, error) - -var txnApplyHandlers = map[string]txnCommandHandler{ - cmdSet: (*txnContext).applySet, - cmdDel: (*txnContext).applyDel, - cmdGet: (*txnContext).applyGet, - cmdExists: (*txnContext).applyExists, - cmdRPush: (*txnContext).applyRPush, - cmdLRange: (*txnContext).applyLRange, - cmdZIncrBy: (*txnContext).applyZIncrBy, - cmdExpire: (*txnContext).applyExpireSeconds, - cmdPExpire: (*txnContext).applyExpireMilliseconds, -} - -// argsLen is derived from redisCommandSpecs in adapter/redis_command_specs.go. -// See that file for the canonical row list and the rationale for the -// single source of truth. 
- type RedisServer struct { listen net.Listener store store.MVCCStore @@ -817,151 +775,6 @@ func formatTraceArgs(args [][]byte) string { return "[" + strings.Join(parts, " ") + "]" } -func parseRedisSetOptions(args [][]byte, now time.Time) (redisSetOptions, error) { - opts := redisSetOptions{} - for i := 0; i < len(args); i++ { - opt := strings.ToUpper(string(args[i])) - switch opt { - case "EX", "PX": - ttl, nextIndex, err := parseRedisSetTTL(args, i, opt, now) - if err != nil { - return redisSetOptions{}, err - } - opts.ttl = ttl - i = nextIndex - case "NX": - opts.missingCond = true - case "XX": - opts.existsCond = true - case "GET": - opts.returnOld = true - default: - return redisSetOptions{}, errors.New("ERR syntax error") - } - } - if opts.existsCond && opts.missingCond { - return redisSetOptions{}, errors.New("ERR syntax error") - } - return opts, nil -} - -func parseRedisSetTTL(args [][]byte, index int, opt string, now time.Time) (*time.Time, int, error) { - if index+1 >= len(args) { - return nil, index, errors.New("ERR syntax error") - } - n, err := strconv.ParseInt(string(args[index+1]), 10, 64) - if err != nil { - // Match Redis behavior: invalid numeric TTL value should not expose - // internal parsing errors, but return a stable protocol error. 
- return nil, index, errors.New("ERR value is not an integer or out of range") - } - if n <= 0 { - return nil, index, errors.New("ERR invalid expire time in 'set' command") - } - - unit := time.Millisecond - if opt == "EX" { - unit = time.Second - } - if n > math.MaxInt64/int64(unit) { - return nil, index, errors.New("ERR invalid expire time in 'set' command") - } - - expireAt := now.Add(time.Duration(n) * unit) - return &expireAt, index + 1, nil -} - -func (o redisSetOptions) isFastPath() bool { - return !o.returnOld && !o.existsCond && !o.missingCond -} - -func (o redisSetOptions) allows(exists bool) bool { - if o.existsCond && !exists { - return false - } - if o.missingCond && exists { - return false - } - return true -} - -func (r *RedisServer) loadRedisSetState(ctx context.Context, key []byte, readTS uint64, returnOld bool) (redisSetState, error) { - // Probe type ONCE (rawKeyTypeAt issues up to ~17 pebble seeks), - // then derive both the raw and TTL-filtered views from it. The - // previous implementation called rawKeyTypeAt + keyTypeAt, which - // called rawKeyTypeAt again inside -- doubling every SET to ~34 - // seeks for purely redundant work. - rawTyp, err := r.rawKeyTypeAt(ctx, key, readTS) - if err != nil { - return redisSetState{}, err - } - // typ (TTL-aware) drives NX/XX/GET Redis semantics: expired keys are "gone". 
- typ, err := r.applyTTLFilter(ctx, key, readTS, rawTyp) - if err != nil { - return redisSetState{}, err - } - - state := redisSetState{rawTyp: rawTyp, typ: typ} - if !returnOld || typ != redisTypeString { - return state, nil - } - - oldValue, _, err := r.readRedisStringAt(key, readTS) - if err != nil && !errors.Is(err, store.ErrKeyNotFound) { - return redisSetState{}, err - } - state.oldValue = oldValue - return state, nil -} - -func (r *RedisServer) replaceWithStringTxn(ctx context.Context, key, value []byte, ttl *time.Time, typ redisValueType, readTS uint64) error { - var elems []*kv.Elem[kv.OP] - if isNonStringCollectionType(typ) { - delElems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) - if err != nil { - return err - } - elems = append(elems, delElems...) - } - // Embed TTL in the string value; write !redis|ttl| as a secondary scan index. - encoded := encodeRedisStr(bytes.Clone(value), ttl) - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisStrKey(key), Value: encoded}) - if ttl != nil { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisTTLKey(key), Value: encodeRedisTTL(*ttl)}) - } else { - // Clear any prior scan index so a persistent string is not later expired. - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisTTLKey(key)}) - } - return r.dispatchElems(ctx, true, readTS, elems) -} - -func (r *RedisServer) executeSet(ctx context.Context, key, value []byte, opts redisSetOptions) (redisSetExecution, error) { - var result redisSetExecution - err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - state, err := r.loadRedisSetState(ctx, key, readTS, opts.returnOld) - if err != nil { - return err - } - - exists := state.typ != redisTypeNone - if !opts.allows(exists) { - result = redisSetExecution{wroteNull: true} - return nil - } - if opts.returnOld && exists && state.typ != redisTypeString { - return wrongTypeError() - } - // Use rawTyp for cleanup so expired-but-lingering internal keys are deleted. 
- if err := r.replaceWithStringTxn(ctx, key, value, opts.ttl, state.rawTyp, readTS); err != nil { - return err - } - result = redisSetExecution{state: state, wroteOldBulk: opts.returnOld} - return nil - }) - return result, err -} - func (r *RedisServer) observeRedisError(command string, dur time.Duration) { if r.requestObserver == nil { return @@ -1120,3364 +933,3 @@ func (r *RedisServer) validateCmd(cmd redcon.Command) error { func (r *RedisServer) ping(conn redcon.Conn, _ redcon.Command) { conn.WriteString("PONG") } - -// trySetFastPath attempts the fast-path for SET (no NX/XX/GET flags) when the -// key is a string or absent. Returns true if the fast-path handled the command. -// When the key holds a non-string type, returns false so the caller can fall -// through to executeSet which cleans up internal keys before overwriting. -func (r *RedisServer) trySetFastPath(conn redcon.Conn, ctx context.Context, key, value []byte, ttl *time.Time) bool { - // Only use the fast path when we are the leader for this key so the local - // type check is authoritative. On followers, stale MVCC state could miss a - // non-string type, leaving orphaned internal keys after overwrite. - if !r.coordinator.IsLeaderForKey(key) { - return false - } - readTS := r.readTS() - // Use rawKeyTypeAt (TTL-unaware) so that expired keys whose internal data - // still exists are detected and routed through the full cleanup path. 
- typ, err := r.rawKeyTypeAt(context.Background(), key, readTS) - if err != nil { - writeRedisError(conn, err) - return true - } - if isNonStringCollectionType(typ) { - return false - } - if err := r.saveString(ctx, key, value, ttl); err != nil { - writeRedisError(conn, err) - return true - } - conn.WriteString("OK") - return true -} - -func (r *RedisServer) set(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - // Option-2 dedup for standalone SET: route through runTransactionWithDedup - // as a single-mop EXEC body when the gate is on. SET inside MULTI/EXEC - // already has full dedup coverage via applySet (§M3 in the design doc), - // so we just reuse that machinery instead of building a per-handler - // reusableSetTxn + dispatchSetReuse shape. The fast-path optimization is - // intentionally bypassed under the gate — dedup is opt-in, and a - // non-dedup'd fast path under a dedup-on cluster would split the - // idempotency contract. - // - // Result translation: runTransactionWithDedup returns []redisResult; for - // SET there is exactly one element with the same redisResult shape as - // the standalone reply (resultString OK / resultNil for NX/XX miss / - // resultBulk for GET). - // Both gates must be on to route standalone SET through the dedup path. - // onePhaseTxnDedup covers the MULTI/EXEC and list-push retries that the - // parent design's M4 validated; standaloneSetDedup is a separate sub-gate - // (default off) because applySet diverges from executeSet on SET-over- - // collection — flipping onePhaseTxnDedup default-on without this guard - // would change normal Redis overwrite behaviour (PR #943 round-1 codex P1). - if r.onePhaseTxnDedup && r.standaloneSetDedup { - // Call runTransactionWithDedup directly instead of going through - // runTransaction. 
runTransaction re-checks the same - // r.onePhaseTxnDedup gate and routes here anyway; the indirection - // would make the call chain misleading ("dispatches via - // runTransactionWithDedup" being true only by indirection). - // Direct call makes the intent explicit and removes the double - // gate check. - results, err := r.runTransactionWithDedup([]redcon.Command{cmd}) - if err != nil { - writeRedisError(conn, err) - return - } - writeRedisStandaloneResult(conn, results) - return - } - r.setLegacy(conn, cmd) -} - -// setLegacy is the pre-dedup standalone SET path. Extracted from set() so -// the gate-on routing through runTransactionWithDedup keeps set() under the -// cyclop budget (the gate-off branch's parse + fast-path + executeSet -// shape carries its own decision points). Behaviour is byte-identical to -// the pre-PR set() body. -func (r *RedisServer) setLegacy(conn redcon.Conn, cmd redcon.Command) { - opts, err := parseRedisSetOptions(cmd.Args[3:], time.Now()) - if err != nil { - writeRedisError(conn, err) - return - } - - ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - - if opts.isFastPath() && r.trySetFastPath(conn, ctx, cmd.Args[1], cmd.Args[2], opts.ttl) { - return - } - - result, err := r.executeSet(ctx, cmd.Args[1], cmd.Args[2], opts) - if err != nil { - writeRedisError(conn, err) - return - } - if result.wroteNull { - conn.WriteNull() - return - } - if result.wroteOldBulk { - if result.state.oldValue == nil { - conn.WriteNull() - return - } - conn.WriteBulk(result.state.oldValue) - return - } - conn.WriteString("OK") -} - -// writeRedisStandaloneResult translates a single-element results array from -// runTransactionWithDedup into a redcon response, mirroring the shape a -// standalone handler would write directly. Used by SET / future standalone -// commands routed through the dedup loop. 
Differs from writeResults in NOT -// wrapping the response in conn.WriteArray — the standalone protocol returns -// the bare element. -// -// Empty or multi-element input is degenerate for standalone callers; we -// default to nil so a misuse never leaks a malformed reply to the wire. -// -// Array-element constraint: the resultArray arm writes each element via -// WriteBulkString, which is correct for flat arrays of strings (the -// shape applySet / future SET-pattern callers produce). It does NOT -// recurse into nested arrays. A future caller whose applyXxx emits -// resultArray with non-string elements (e.g. HGETALL-like nested -// responses) must either pre-flatten its result or extend this switch -// with a recursive arm; reusing it as-is would silently mangle the -// wire reply. -func writeRedisStandaloneResult(conn redcon.Conn, results []redisResult) { - if len(results) != 1 { - conn.WriteNull() - return - } - res := results[0] - switch res.typ { - case resultNil: - conn.WriteNull() - case resultError: - writeRedisError(conn, res.err) - case resultBulk: - conn.WriteBulk(res.bulk) - case resultString: - conn.WriteString(res.str) - case resultArray: - conn.WriteArray(len(res.arr)) - for _, s := range res.arr { - conn.WriteBulkString(s) - } - case resultInt: - conn.WriteInt64(res.integer) - default: - conn.WriteNull() - } -} - -func (r *RedisServer) get(conn redcon.Conn, cmd redcon.Command) { - key := cmd.Args[1] - if r.proxyToLeader(conn, cmd, key) { - return - } - - // Single bounded context for the slow paths in this handler, - // derived from the server's base context so Close() cancels any - // in-flight handler instead of leaving it running on a detached - // context.Background(). Only LeaseReadForKey and keyTypeAt accept - // a context; readRedisStringAt is a local-store read that does - // not take one. The shared deadline bounds the only branches - // that can actually block on quorum / I/O. 
- ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - if _, err := kv.LeaseReadForKeyThrough(r.coordinator, ctx, key); err != nil { - writeRedisError(conn, err) - return - } - readTS := r.readTS() - - // Fast path: attempt the string read directly instead of probing - // every possible Redis encoding first. rawKeyTypeAt issues up to - // ~17 pebble seeks (list meta + list delta + 3×wide-column probes - // each doing 3 seeks + hash/set/zset/stream/HLL/str/bare); that - // overhead dominated every GET on a hot cluster (see - // docs/design/2026_04_20_implemented_lease_read.md). A live string key resolves in 1-2 - // seeks here, and we only fall back to keyTypeAt when the string - // path returns ErrKeyNotFound (meaning either missing, expired, - // or a non-string type is present under this user-key). - // - // Use the snapshot variant: LeaseReadForKeyThrough above already - // established the ReadIndex fence, so a per-call VerifyLeaderForKey - // (inside leaderAwareGetAt) would duplicate the quorum work. - v, _, err := r.readRedisStringAtSnapshot(key, readTS) - if err == nil { - conn.WriteBulk(v) - return - } - if !errors.Is(err, store.ErrKeyNotFound) { - writeRedisError(conn, err) - return - } - - // Slow path: disambiguate "missing / expired" from WRONGTYPE. - // keyTypeAt applies the TTL filter, so an expired string reports - // as redisTypeNone here and we return nil -- matching the - // pre-optimisation behaviour. - typ, err := r.keyTypeAt(ctx, key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if typ == redisTypeNone { - conn.WriteNull() - return - } - // If keyTypeAt disagrees with the fast path and classifies the key - // as a live string (e.g. a rare TTL-filter discrepancy between - // decodePrefixedStringWith/readBareLegacyStringWith and - // hasExpiredTTLAt), match the pre-optimisation behaviour and - // return nil rather than WRONGTYPE. 
- if typ == redisTypeString { - conn.WriteNull() - return - } - conn.WriteError(wrongTypeMessage) -} - -// leaderEmbeddedTTLExpired looks at !redis|str| on the leader and, if the -// payload is in new format, returns the embedded-TTL expiry verdict. The bool -// indicates whether the caller should use this verdict (true) or fall through -// to the legacy !redis|ttl| index (false). -func (r *RedisServer) leaderEmbeddedTTLExpired(key []byte) (bool, bool) { - raw, err := r.tryLeaderGetAt(redisStrKey(key), 0) - if err != nil || !isNewRedisStrFormat(raw) { - return false, false - } - _, expireAt, decErr := decodeRedisStr(raw) - if decErr != nil { - // Malformed new-format payload: treat as expired rather than silently alive. - return true, true - } - if expireAt == nil { - return false, true - } - return !expireAt.After(time.Now()), true -} - -// isLeaderKeyExpired checks whether the key has an expired TTL on the leader. -func (r *RedisServer) isLeaderKeyExpired(key []byte) bool { - // For string keys with new encoding: check embedded TTL. - if expired, ok := r.leaderEmbeddedTTLExpired(key); ok { - return expired - } - raw, err := r.tryLeaderGetAt(redisTTLKey(key), 0) - if err != nil { - return false - } - ttl, err := decodeRedisTTL(raw) - if err != nil { - return false - } - return !ttl.After(time.Now()) -} - -// tryLeaderNonStringExists checks whether the key exists as a non-string type -// (hash, set, zset, stream, HLL, or list) on the leader. Returns false if the -// key has an expired TTL. -func (r *RedisServer) tryLeaderNonStringExists(key []byte) bool { - // Check TTL first: if expired, the key is logically gone. 
- if raw, err := r.tryLeaderGetAt(redisTTLKey(key), 0); err == nil { - if ttl, decErr := decodeRedisTTL(raw); decErr == nil && !ttl.After(time.Now()) { - return false - } - } - for _, internalKey := range [][]byte{ - redisHashKey(key), - redisSetKey(key), - redisHLLKey(key), - redisZSetKey(key), - redisStreamKey(key), - } { - if _, err := r.tryLeaderGetAt(internalKey, 0); err == nil { - return true - } - } - if _, err := r.tryLeaderGetAt(listMetaKey(key), 0); err == nil { - return true - } - return false -} - -// tryLeaderLogicalExists checks whether the key exists as any type on the leader. -func (r *RedisServer) tryLeaderLogicalExists(key []byte) bool { - // Prefer asking the leader's Redis command path directly: it evaluates - // existence with ttlAt() semantics (including the in-memory TTL buffer). - // If this path is unavailable we fall back to raw-KV probing, which is - // best-effort and may lag unflushed buffer-only TTL updates. - if cli, err := r.leaderClientForKey(key); err == nil { - ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - if count, existsErr := cli.Exists(ctx, string(key)).Result(); existsErr == nil { - return count > 0 - } - } - - // Fallback to raw KV probing if Redis command proxying is unavailable. - if r.isLeaderKeyExpired(key) { - return false - } - // String type (raw user key). - if _, err := r.tryLeaderGetAt(key, 0); err == nil { - return true - } - return r.tryLeaderNonStringExists(key) -} - -func (r *RedisServer) del(conn redcon.Conn, cmd redcon.Command) { - // DEL discovers internal keys via local MVCC state. On followers this state - // may lag, producing incomplete deletes. Check per-key leadership and proxy - // non-local keys to the correct leader for accurate internal-key discovery. 
- localKeys := make([][]byte, 0, len(cmd.Args)-1) - proxyKeys := make([][]byte, 0) - for _, key := range cmd.Args[1:] { - if r.coordinator.IsLeaderForKey(key) { - localKeys = append(localKeys, key) - } else { - proxyKeys = append(proxyKeys, key) - } - } - - var removed int64 - - // Proxy non-local keys to the appropriate leader. - if len(proxyKeys) > 0 { - proxied, err := r.proxyDel(proxyKeys) - if err != nil { - writeRedisError(conn, err) - return - } - removed += proxied - } - - // Delete local keys directly. - if len(localKeys) > 0 { - localRemoved, err := r.delLocal(localKeys) - if err != nil { - writeRedisError(conn, err) - return - } - removed += int64(localRemoved) - } - - conn.WriteInt64(removed) -} - -func (r *RedisServer) delLocal(keys [][]byte) (int, error) { - ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - var removed int - err := r.retryRedisWrite(ctx, func() error { - elems := []*kv.Elem[kv.OP]{} - nextRemoved := 0 - readTS := r.readTS() - for _, key := range keys { - keyElems, existed, err := r.deleteLogicalKeyElems(ctx, key, readTS) - if err != nil { - return err - } - if existed { - nextRemoved++ - } - elems = append(elems, keyElems...) - } - if err := r.dispatchElems(ctx, true, readTS, elems); err != nil { - return err - } - removed = nextRemoved - return nil - }) - return removed, err -} - -func (r *RedisServer) exists(conn redcon.Conn, cmd redcon.Command) { - readTS := r.readTS() - // Derive ctx from the server's base context so work in this handler - // that honors context deadlines is bounded and cancels on shutdown. - // Local Pebble reads (store.GetAt / ExistsAt / ScanAt) currently - // ignore the context parameter, so cancellation does not interrupt - // an in-flight local probe. The negative-result follower fallback - // currently calls tryLeaderLogicalExists(), which manages its own - // timeout/context rather than using this ctx. 
- ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - count := 0 - for _, key := range cmd.Args[1:] { - ok, err := r.existsAtFast(ctx, key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if ok { - count++ - } else if !r.coordinator.IsLeaderForKey(key) { - // Local MVCC may be stale on a follower; proxy to the leader. - if r.tryLeaderLogicalExists(key) { - count++ - } - } - } - conn.WriteInt(count) -} - -// existsAtFast is a string-first fast path for EXISTS-style liveness -// checks. Strings dominate real workloads, and a live string key -// resolves here in 1-2 seeks against redisStrKey (with TTL filtering -// applied inline) versus the ~17 seeks of a full logicalExistsAt -// probe. When the redisStrKey probe misses we fall back to the full -// type-probe. -// -// The probe goes directly to the local store. EXISTS tolerates stale- -// positive reads on followers by design -- the pre-optimisation flow -// (logicalExistsAt → keyTypeAt → local store.ExistsAt) never proxied -// to the leader for the probe itself; proxying is reserved for the -// negative-result fallback (tryLeaderLogicalExists in the caller). -// Routing through readRedisStringAt here would instead issue a Raft -// round-trip per key on every follower, regressing EXISTS latency on -// workloads that were previously all-local. -func (r *RedisServer) existsAtFast(ctx context.Context, key []byte, readTS uint64) (bool, error) { - raw, err := r.store.GetAt(ctx, redisStrKey(key), readTS) - if err == nil { - alive, decErr := r.stringPayloadIsLive(ctx, key, raw, readTS) - if decErr != nil { - return false, errors.WithStack(decErr) - } - if alive { - return true, nil - } - // Expired: fall through so other encodings still get their - // chance. Undecodable payloads are already propagated as an - // error by stringPayloadIsLive above -- they're a corruption - // signal, not a "try something else" case. 
- } else if !errors.Is(err, store.ErrKeyNotFound) { - return false, errors.WithStack(err) - } - return r.logicalExistsAt(ctx, key, readTS) -} - -// stringPayloadIsLive reports whether a redisStrKey payload is still -// TTL-alive. New-format payloads carry their expiry inline; legacy- -// format payloads need the !redis|ttl| index consulted for the TTL. -// Both paths use the LOCAL store, matching existsAtFast's no-proxy -// contract. -func (r *RedisServer) stringPayloadIsLive(ctx context.Context, key, raw []byte, readTS uint64) (bool, error) { - if isNewRedisStrFormat(raw) { - _, expireAt, err := decodeRedisStr(raw) - if err != nil { - return false, err - } - return expireAt == nil || expireAt.After(time.Now()), nil - } - ttl, err := r.legacyIndexTTLAt(ctx, key, readTS) - if err != nil { - return false, err - } - return ttl == nil || ttl.After(time.Now()), nil -} - -func (r *RedisServer) keys(conn redcon.Conn, cmd redcon.Command) { - pattern := cmd.Args[1] - - if r.coordinator.IsLeader() { - // Per-call ctx with redisDispatchTimeout instead of the - // long-lived handlerContext: a stalled VerifyLeader on KEYS - // must not pin the command handler indefinitely. The same - // bound the rest of the dispatch path (sadd, set, …) uses; - // see Codex P1 review on PR #749. 
- ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - if err := r.coordinator.VerifyLeader(ctx); err != nil { - writeRedisError(conn, err) - return - } - keys, err := r.visibleKeys(pattern) - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteArray(len(keys)) - for _, k := range keys { - conn.WriteBulk(k) - } - return - } - - keys, err := r.proxyKeys(pattern) - if err != nil { - writeRedisError(conn, err) - return - } - - conn.WriteArray(len(keys)) - for _, k := range keys { - conn.WriteBulkString(k) - } -} - -func (r *RedisServer) localKeys(pattern []byte) ([][]byte, error) { - if !bytes.Contains(pattern, []byte("*")) { - return r.localKeysExact(pattern) - } - return r.localKeysPattern(pattern) -} - -func (r *RedisServer) localKeysExact(pattern []byte) ([][]byte, error) { - typ, err := r.keyTypeAt(context.Background(), pattern, r.readTS()) - if err != nil { - return nil, err - } - if typ != redisTypeNone { - return [][]byte{bytes.Clone(pattern)}, nil - } - return [][]byte{}, nil -} - -// mergeInternalNamespaces scans all internal key namespaces (list, hash, set, -// zset, and other internal prefixes) for keys that match pattern and merges -// them into the caller's keyset via mergeScannedKeys. Called only when the -// pattern is bounded (start != nil) because unbounded scans already cover the -// full keyspace. 
-func (r *RedisServer) mergeInternalNamespaces(start []byte, pattern []byte, mergeScannedKeys func([]byte, []byte) error) error { - metaStart, metaEnd := listPatternScanBounds(store.ListMetaPrefix, pattern) - if err := mergeScannedKeys(metaStart, metaEnd); err != nil { - return err - } - itemStart, itemEnd := listPatternScanBounds(store.ListItemPrefix, pattern) - if err := mergeScannedKeys(itemStart, itemEnd); err != nil { - return err - } - for _, prefix := range redisInternalPrefixes { - // !stream|meta| keys are length-prefixed (see store.StreamMetaKey): - // a pattern-bound scan over the raw prefix would mask out every - // migrated stream because the user-key bytes do not start at - // prefix[len(prefix):]. Delegate to the wide-column scan below, - // which uses streamMetaScanStart(start) to place the user-key - // lower bound past the length field. - if prefix == store.StreamMetaPrefix { - continue - } - internalStart, internalEnd := listPatternScanBounds(prefix, pattern) - if err := mergeScannedKeys(internalStart, internalEnd); err != nil { - return err - } - } - // Wide-column hash/set/zset keys embed the user-key as - // <4-byte-len>, so the binary length - // prefix makes straightforward bounds-based scanning non-trivial. - // Use the user-key prefix as the lower bound and scan to the end of each - // namespace; collectUserKeys filters false positives by pattern. 
- hashFieldStart := store.HashFieldScanPrefix(start) - hashFieldEnd := prefixScanEnd([]byte(store.HashFieldPrefix)) - if err := mergeScannedKeys(hashFieldStart, hashFieldEnd); err != nil { - return err - } - setMemberStart := store.SetMemberScanPrefix(start) - setMemberEnd := prefixScanEnd([]byte(store.SetMemberPrefix)) - if err := mergeScannedKeys(setMemberStart, setMemberEnd); err != nil { - return err - } - zsetMemberStart := store.ZSetMemberScanPrefix(start) - zsetMemberEnd := prefixScanEnd([]byte(store.ZSetMemberPrefix)) - if err := mergeScannedKeys(zsetMemberStart, zsetMemberEnd); err != nil { - return err - } - // Post-migration streams live under !stream|meta|. - // The meta record is enough to expose the logical key via KEYS; - // entry rows are filtered out by redisVisibleUserKey / collectUserKeys - // so the result stays one-line-per-stream regardless of entry count. - streamMetaStart := streamMetaScanStart(start) - streamMetaEnd := prefixScanEnd([]byte(store.StreamMetaPrefix)) - return mergeScannedKeys(streamMetaStart, streamMetaEnd) -} - -// streamMetaScanStart returns the lower bound for scanning stream meta -// keys that begin with the given user-key prefix. The store helper -// already returns StreamMetaPrefix + len(userKey) + userKey, so callers -// only need to supply the bounded pattern prefix. 
-func streamMetaScanStart(userPrefix []byte) []byte { - if len(userPrefix) == 0 { - return []byte(store.StreamMetaPrefix) - } - return store.StreamMetaKey(userPrefix) -} - -func (r *RedisServer) localKeysPattern(pattern []byte) ([][]byte, error) { - start, end := patternScanBounds(pattern) - keyset := map[string][]byte{} - readTS := r.readTS() - - mergeScannedKeys := func(scanStart, scanEnd []byte) error { - keys, err := r.store.ScanAt(context.Background(), scanStart, scanEnd, math.MaxInt, readTS) - if err != nil { - return errors.WithStack(err) - } - maps.Copy(keyset, r.collectUserKeys(keys, pattern)) - return nil - } - - if err := mergeScannedKeys(start, end); err != nil { - return nil, err - } - - // When the pattern is bounded (start != nil), user-key scans do not - // naturally include internal data namespaces, so scan those separately - // and map them back to logical user keys. For unbounded patterns - // (e.g. "*"), the full-keyspace scan already covers everything. - if start != nil { - if err := r.mergeInternalNamespaces(start, pattern, mergeScannedKeys); err != nil { - return nil, err - } - } - - out := make([][]byte, 0, len(keyset)) - for _, v := range keyset { - out = append(out, v) - } - return out, nil -} - -func patternScanBounds(pattern []byte) ([]byte, []byte) { - if bytes.Equal(pattern, []byte("*")) { - return nil, nil - } - - i := bytes.IndexByte(pattern, '*') - if i <= 0 { - return nil, nil - } - - start := bytes.Clone(pattern[:i]) - return start, prefixScanEnd(start) -} - -func listPatternScanBounds(prefix string, pattern []byte) ([]byte, []byte) { - userStart, userEnd := patternScanBounds(pattern) - prefixBytes := []byte(prefix) - - if userStart == nil && userEnd == nil { - return prefixBytes, prefixScanEnd(prefixBytes) - } - - start := append(bytes.Clone(prefixBytes), userStart...) - if userEnd == nil { - return start, prefixScanEnd(prefixBytes) - } - end := append(bytes.Clone(prefixBytes), userEnd...) 
- return start, end -} - -func matchesAsteriskPattern(pattern, key []byte) bool { - parts := bytes.Split(pattern, []byte("*")) - if len(parts) == 1 { - return bytes.Equal(pattern, key) - } - - pos := 0 - if len(parts[0]) > 0 { - if !bytes.HasPrefix(key, parts[0]) { - return false - } - pos = len(parts[0]) - } - - for i := 1; i < len(parts)-1; i++ { - part := parts[i] - if len(part) == 0 { - continue - } - idx := bytes.Index(key[pos:], part) - if idx < 0 { - return false - } - pos += idx + len(part) - } - - last := parts[len(parts)-1] - if len(last) > 0 && !bytes.HasSuffix(key, last) { - return false - } - - return true -} - -func (r *RedisServer) collectUserKeys(kvs []*store.KVPair, pattern []byte) map[string][]byte { - keyset := map[string][]byte{} - for _, kvPair := range kvs { - userKey := redisVisibleUserKey(kvPair.Key) - if userKey == nil || !matchesAsteriskPattern(pattern, userKey) { - continue - } - keyset[string(userKey)] = userKey - } - return keyset -} - -// zsetWideColumnVisibleUserKey handles the ZSet-specific part of wide-column key mapping. -// Returns (nil, true) for internal-only keys and (userKey, true) for visible keys. -func zsetWideColumnVisibleUserKey(key []byte) (userKey []byte, isWide bool) { - if store.IsZSetMetaDeltaKey(key) || store.IsZSetMetaKey(key) { - return nil, true - } - if store.IsZSetMemberKey(key) { - return store.ExtractZSetUserKeyFromMember(key), true - } - if store.IsZSetScoreKey(key) { - return store.ExtractZSetUserKeyFromScore(key), true - } - return nil, false -} - -// wideColumnVisibleUserKey maps a wide-column internal key to its visible user -// key, or returns (nil, true) for internal-only keys (meta/delta), and -// (nil, false) if the key is not a wide-column key at all. -func wideColumnVisibleUserKey(key []byte) (userKey []byte, isWide bool) { - // Check delta prefixes before meta prefixes (delta starts with meta prefix). 
- if store.IsHashMetaDeltaKey(key) || store.IsHashMetaKey(key) { - return nil, true - } - if store.IsHashFieldKey(key) { - return store.ExtractHashUserKeyFromField(key), true - } - if store.IsSetMetaDeltaKey(key) || store.IsSetMetaKey(key) { - return nil, true - } - if store.IsSetMemberKey(key) { - return store.ExtractSetUserKeyFromMember(key), true - } - if userKey, ok := streamWideColumnVisibleUserKey(key); ok { - return userKey, true - } - return zsetWideColumnVisibleUserKey(key) -} - -// streamWideColumnVisibleUserKey maps a wide-column stream key to its -// visible user key. Meta keys expose the stream exactly once; entry keys -// are internal-only so KEYS / SCAN don't leak one result per entry. -func streamWideColumnVisibleUserKey(key []byte) ([]byte, bool) { - if store.IsStreamMetaKey(key) { - return store.ExtractStreamUserKeyFromMeta(key), true - } - if store.IsStreamEntryKey(key) { - return nil, true - } - return nil, false -} - -func redisVisibleUserKey(key []byte) []byte { - if bytes.HasPrefix(key, redisTxnKeyPrefix) || isRedisTTLKey(key) { - return nil - } - // List item keys are visible; meta, delta, and claim keys are internal-only. 
- if store.IsListItemKey(key) { - return store.ExtractListUserKey(key) - } - if store.IsListMetaKey(key) || store.IsListMetaDeltaKey(key) || store.IsListClaimKey(key) { - return nil - } - if userKey, isWide := wideColumnVisibleUserKey(key); isWide { - return userKey - } - if userKey := extractRedisInternalUserKey(key); userKey != nil { - return userKey - } - return key -} - -func (r *RedisServer) proxyKeys(pattern []byte) ([]string, error) { - leader := r.coordinator.RaftLeader() - if leader == "" { - return nil, ErrLeaderNotFound - } - - leaderAddr, ok := r.leaderRedis[leader] - if !ok || leaderAddr == "" { - return nil, errors.WithStack(errors.Newf("ERR leader redis address unknown for %s", leader)) - } - - cli := r.getOrCreateLeaderClient(leaderAddr) - - ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - - keys, err := cli.Keys(ctx, string(pattern)).Result() - return keys, errors.WithStack(err) -} - -// MULTI/EXEC/DISCARD handling -func (r *RedisServer) multi(conn redcon.Conn, _ redcon.Command) { - state := getConnState(conn) - if state.inTxn { - conn.WriteError("ERR MULTI calls can not be nested") - return - } - state.inTxn = true - state.queue = nil - conn.WriteString("OK") -} - -func (r *RedisServer) discard(conn redcon.Conn, _ redcon.Command) { - state := getConnState(conn) - if !state.inTxn { - conn.WriteError("ERR DISCARD without MULTI") - return - } - state.inTxn = false - state.queue = nil - conn.WriteString("OK") -} - -func (r *RedisServer) exec(conn redcon.Conn, _ redcon.Command) { - state := getConnState(conn) - if !state.inTxn { - conn.WriteError("ERR EXEC without MULTI") - return - } - - queue := state.queue - state.inTxn = false - state.queue = nil - - // Always execute MULTI/EXEC on the leader so that reads and writes within - // the transaction see consistent, up-to-date data. Serving transactions - // on followers risks reading stale MVCC state and producing write cycles. 
- if !r.coordinator.IsLeader() { - r.proxyTransactionToLeader(conn, queue) - return - } - - results, err := r.runTransaction(queue) - if err != nil { - writeRedisError(conn, err) - return - } - - r.writeResults(conn, results) -} - -// proxyTransactionToLeader forwards a MULTI/EXEC transaction to the leader -// node and writes the EXEC response array back to conn. -// -//nolint:cyclop // inherent complexity of MULTI/EXEC proxy; refactoring would obscure the protocol flow -func (r *RedisServer) proxyTransactionToLeader(conn redcon.Conn, queue []redcon.Command) { - leaderAddr, ok := r.resolveLeaderRedisAddr(conn) - if !ok { - return - } - cli := r.getOrCreateLeaderClient(leaderAddr) - - ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - - cmds, err := r.execTxPipeline(ctx, cli, queue) - if handleProxyTxnError(conn, err) { - return - } - writeProxyCmdsResult(conn, cmds) -} - -// resolveLeaderRedisAddr looks up the Redis address of the current Raft leader, -// writes an error reply to conn on failure and returns ("", false). -func (r *RedisServer) resolveLeaderRedisAddr(conn redcon.Conn) (string, bool) { - leader := r.coordinator.RaftLeader() - if leader == "" { - writeRedisError(conn, ErrLeaderNotFound) - return "", false - } - leaderAddr, ok := r.leaderRedis[leader] - if !ok || leaderAddr == "" { - conn.WriteError(fmt.Sprintf("ERR leader redis address unknown for raft address %s", leader)) - return "", false - } - return leaderAddr, true -} - -// execTxPipeline sends queue as a single TxPipelined batch and returns the -// per-command result handles together with any pipeline-level error. 
-func (r *RedisServer) execTxPipeline(ctx context.Context, cli *redis.Client, queue []redcon.Command) ([]*redis.Cmd, error) { - cmds := make([]*redis.Cmd, 0, len(queue)) - _, err := cli.TxPipelined(ctx, func(pipe redis.Pipeliner) error { - for _, cmd := range queue { - args := make([]interface{}, len(cmd.Args)) - for i, a := range cmd.Args { - args[i] = a - } - cmds = append(cmds, pipe.Do(ctx, args...)) - } - return nil - }) - return cmds, errors.WithStack(err) -} - -// handleProxyTxnError writes the appropriate reply for terminal pipeline errors -// and returns true when the caller should return early without writing results. -func handleProxyTxnError(conn redcon.Conn, err error) bool { - // Transaction aborted (WATCH conflict): Redis protocol requires a Null - // array reply (*-1\r\n), not a null bulk string or an error. - // redis.Nil is a per-command nil response and must NOT be treated as an - // EXEC abort — only redis.TxFailedErr signals that. - if errors.Is(err, redis.TxFailedErr) { - conn.WriteArray(-1) - return true - } - // Fatal transport / context error: per-command results are unreliable. - if err != nil { - var netErr net.Error - if errors.Is(err, context.DeadlineExceeded) || - errors.Is(err, context.Canceled) || - errors.Is(err, io.EOF) || - errors.Is(err, io.ErrUnexpectedEOF) || - errors.As(err, &netErr) { - writeRedisError(conn, err) - return true - } - } - return false -} - -// writeProxyCmdsResult writes an EXEC-style array reply for the given pipeline -// command handles. For any other non-nil per-command errors, each cmd carries -// its own result, which is the correct Redis EXEC semantics. 
-func writeProxyCmdsResult(conn redcon.Conn, cmds []*redis.Cmd) { - conn.WriteArray(len(cmds)) - for _, cmd := range cmds { - writeGoRedisResult(conn, cmd) - } -} - -type txnValue struct { - raw []byte - ttl *time.Time - deleted bool - dirty bool - loaded bool -} - -type txnContext struct { - server *RedisServer - // ctx is the per-EXEC dispatch context (redisDispatchTimeout-bounded - // at the call site in runTransaction). Plumbed through so reads - // inside the EXEC such as load() → readValueAt() respect the - // caller's deadline rather than falling back to handlerContext + - // the verifyLeaderEngineCtx safety net. - ctx context.Context //nolint:containedctx // EXEC is a long-lived value type that wraps a single client command, ctx must travel with it. - working map[string]*txnValue - listStates map[string]*listTxnState - zsetStates map[string]*zsetTxnState - ttlStates map[string]*ttlTxnState - readKeys map[string][]byte - // streamDeletions tracks user keys whose stream wide-column layout must - // be tombstoned on commit: the !stream|meta| record plus every - // !stream|entry| row. stageKeyDeletion seeds this (MULTI/EXEC - // DEL / EXPIRE 0) so migrated streams are properly removed rather than - // leaking entry keys past the DEL's apparent success. 
- streamDeletions map[string][]byte - startTS uint64 -} - -type listTxnState struct { - meta store.ListMeta - metaExists bool - appends [][]byte - deleted bool - purge bool - purgeMeta store.ListMeta - existingDeltas [][]byte // delta key bytes present at load time; deleted on purge/delete -} - -type zsetTxnState struct { - members map[string]float64 // current (potentially modified) state - origMembers map[string]float64 // original state at load time (for wide-column diff) - isWide bool // true if loaded from wide-column !zs|mem| storage - exists bool - dirty bool -} - -type ttlTxnState struct { - value *time.Time - dirty bool -} - -func stageListDelete(st *listTxnState) { - if st == nil { - return - } - if st.metaExists { - st.purge = true - st.purgeMeta = st.meta - } - st.deleted = true - st.appends = nil -} - -func (t *txnContext) trackReadKey(key []byte) { - if len(key) == 0 { - return - } - k := string(key) - if _, ok := t.readKeys[k]; ok { - return - } - t.readKeys[k] = bytes.Clone(key) -} - -func (t *txnContext) trackTypeReadKeys(key []byte) { - for _, readKey := range [][]byte{ - listMetaKey(key), - redisHashKey(key), - redisSetKey(key), - redisZSetKey(key), - redisStreamKey(key), // legacy single-blob stream key - store.StreamMetaKey(key), // post-migration wide-column stream meta - redisHLLKey(key), - redisStrKey(key), - key, // legacy bare key for fallback reads - } { - t.trackReadKey(readKey) - } -} - -func (t *txnContext) load(key []byte) (*txnValue, error) { - // If the key is already an internal key (e.g., !redis|hash|..., - // !lst|..., !txn|..., !ddb|..., !s3|..., !dist|...), use it as-is. - // Otherwise, it's a bare user key for a string value — prefix it. 
- storageKey := key - if !isKnownInternalKey(key) { - storageKey = redisStrKey(key) - } - k := string(storageKey) - if tv, ok := t.working[k]; ok { - return tv, nil - } - t.trackReadKey(storageKey) - if !isKnownInternalKey(key) { - // Track the bare key too for conflict detection on legacy fallback reads. - t.trackReadKey(key) - } - tv := &txnValue{} - var val []byte - if !isKnownInternalKey(key) { - // For bare user string keys, use the fallback-aware reader. - var ( - err error - ttl *time.Time - ) - val, ttl, err = t.server.readRedisStringAt(key, t.startTS) - if err != nil && !errors.Is(err, store.ErrKeyNotFound) { - return nil, errors.WithStack(err) - } - tv.ttl = ttl - } else { - var err error - // Some redis_txn_test.go fixtures build a minimal txnContext - // literal without setting ctx; fall back to Background so - // readValueAt's coordinator.VerifyLeaderForKey does not panic - // when wrapped via context.WithTimeout(nil, …). Same defensive - // pattern as streamDeletions / loadListState. - ctx := t.ctx - if ctx == nil { - ctx = context.Background() - } - val, err = t.server.readValueAt(ctx, storageKey, t.startTS) - if err != nil && !errors.Is(err, store.ErrKeyNotFound) { - return nil, errors.WithStack(err) - } - } - tv.raw = val - tv.loaded = true - t.working[k] = tv - return tv, nil -} - -func (t *txnContext) loadListState(key []byte) (*listTxnState, error) { - k := string(key) - if st, ok := t.listStates[k]; ok { - return st, nil - } - ctx := context.Background() - meta, exists, err := t.server.resolveListMeta(ctx, key, t.startTS) - if err != nil { - return nil, err - } - - // Capture existing delta keys so they can be deleted if the list is later - // purged or deleted within this transaction. Scan one extra item to detect - // truncation: if >MaxDeltaScanLimit deltas exist the transaction cannot - // safely enumerate all of them for deletion, so we return ErrDeltaScanTruncated - // and let the caller retry after the background compactor has caught up. 
- deltaPrefix := store.ListMetaDeltaScanPrefix(key) - deltaEnd := store.PrefixScanEnd(deltaPrefix) - deltaKVs, err := t.server.store.ScanAt(ctx, deltaPrefix, deltaEnd, store.MaxDeltaScanLimit+1, t.startTS) - if err != nil { - return nil, errors.WithStack(err) - } - if len(deltaKVs) > store.MaxDeltaScanLimit { - return nil, ErrDeltaScanTruncated - } - existingDeltas := make([][]byte, 0, len(deltaKVs)) - for _, kv := range deltaKVs { - existingDeltas = append(existingDeltas, kv.Key) - } - - st := &listTxnState{ - meta: meta, - metaExists: exists, - appends: [][]byte{}, - existingDeltas: existingDeltas, - } - t.listStates[k] = st - - // Track the list-item key at the current tail (and the position before the - // head) so that concurrent RPUSH/LPUSH operations—which write to exactly - // these positions—trigger a read-write conflict and force a retry. - // Without this, a MULTI transaction that reads a list via LRANGE can commit - // with a stale snapshot while a concurrent RPUSH commits a new item, - // forming an anti-dependency (G2-item) cycle. - // The base meta key (listMetaKey) is intentionally NOT tracked here: the - // Delta scheme allows the DeltaCompactor to rewrite it without conflicting - // with ongoing push/read transactions (see TestRedisTxnValidateReadSet_ListMetaUpdateNoConflict). - t.trackReadKey(listItemKey(key, meta.Head+meta.Len)) // next RPUSH target - if meta.Head > math.MinInt64 { - t.trackReadKey(listItemKey(key, meta.Head-1)) // next LPUSH target - } - - return st, nil -} - -func (t *txnContext) listLength(st *listTxnState) int64 { - return st.meta.Len + int64(len(st.appends)) -} - -func (t *txnContext) loadZSetState(key []byte) (*zsetTxnState, error) { - k := string(key) - if st, ok := t.zsetStates[k]; ok { - return st, nil - } - t.trackReadKey(redisZSetKey(key)) - // Check TTL: treat expired keys as non-existent. 
- ttlSt, err := t.loadTTLState(key) - if err != nil { - return nil, err - } - if ttlSt.value != nil && !ttlSt.value.After(time.Now()) { - st := &zsetTxnState{ - members: map[string]float64{}, - origMembers: map[string]float64{}, - exists: false, - } - t.zsetStates[k] = st - return st, nil - } - - // Detect wide-column storage by probing the !zs|mem| prefix. - memberPrefix := store.ZSetMemberScanPrefix(key) - memberEnd := store.PrefixScanEnd(memberPrefix) - probeKVs, probeErr := t.server.store.ScanAt(context.Background(), memberPrefix, memberEnd, 1, t.startTS) - if probeErr != nil { - return nil, errors.WithStack(probeErr) - } - isWide := len(probeKVs) > 0 - - value, exists, err := t.server.loadZSetAt(context.Background(), key, t.startTS) - if err != nil { - return nil, err - } - members := zsetEntriesToMap(value.Entries) - // Snapshot the original members for wide-column diff at commit time. - origMembers := make(map[string]float64, len(members)) - for m, s := range members { - origMembers[m] = s - } - st := &zsetTxnState{ - members: members, - origMembers: origMembers, - isWide: isWide, - exists: exists, - } - t.zsetStates[k] = st - return st, nil -} - -func (t *txnContext) loadTTLState(key []byte) (*ttlTxnState, error) { - k := string(key) - if st, ok := t.ttlStates[k]; ok { - return st, nil - } - value, err := t.server.ttlAt(context.Background(), key, t.startTS) - if err != nil { - return nil, err - } - st := &ttlTxnState{value: value} - t.ttlStates[k] = st - return st, nil -} - -func (t *txnContext) stagedKeyType(key []byte) (redisValueType, error) { - k := string(key) - if typ, ok := t.stagedZSetType(k); ok { - return typ, nil - } - if typ, ok := t.stagedListType(k); ok { - return typ, nil - } - if typ, ok := t.stagedStringType(k); ok { - return typ, nil - } - t.trackTypeReadKeys(key) - return t.server.keyTypeAt(context.Background(), key, t.startTS) -} - -func (t *txnContext) stagedZSetType(key string) (redisValueType, bool) { - st, ok := t.zsetStates[key] - 
if !ok || (!st.dirty && !st.exists) { - return redisTypeNone, false - } - if len(st.members) == 0 { - return redisTypeNone, true - } - return redisTypeZSet, true -} - -func (t *txnContext) stagedListType(key string) (redisValueType, bool) { - st, ok := t.listStates[key] - if !ok { - return redisTypeNone, false - } - if st.deleted { - return redisTypeNone, true - } - if st.metaExists || len(st.appends) > 0 { - return redisTypeList, true - } - return redisTypeNone, false -} - -func (t *txnContext) stagedStringType(key string) (redisValueType, bool) { - tv, ok := t.working[string(redisStrKey([]byte(key)))] - if !ok { - return redisTypeNone, false - } - if tv.deleted || tv.raw == nil { - return redisTypeNone, true - } - return redisTypeString, true -} - -func (t *txnContext) apply(cmd redcon.Command) (redisResult, error) { - handler, ok := txnApplyHandlers[strings.ToUpper(string(cmd.Args[0]))] - if !ok { - return redisResult{}, errors.WithStack(errors.Newf("ERR unsupported command '%s'", cmd.Args[0])) - } - return handler(t, cmd) -} - -func (t *txnContext) applyExpireSeconds(cmd redcon.Command) (redisResult, error) { - return t.applyExpire(cmd, time.Second) -} - -func (t *txnContext) applyExpireMilliseconds(cmd redcon.Command) (redisResult, error) { - return t.applyExpire(cmd, time.Millisecond) -} - -func (t *txnContext) applySet(cmd redcon.Command) (redisResult, error) { - if isList, err := t.server.isListKeyAt(context.Background(), cmd.Args[1], t.startTS); err != nil { - return redisResult{}, err - } else if isList { - return redisResult{typ: resultError, err: errors.New("WRONGTYPE Operation against a key holding the wrong kind of value")}, nil - } - - opts, err := parseRedisSetOptions(cmd.Args[3:], time.Now()) - if err != nil { - return redisResult{}, err - } - - // NX/XX: skip the write if the key-existence condition is not met. 
- blocked, res, err := t.applySetCondition(cmd.Args[1], opts) - if err != nil { - return redisResult{}, err - } - if blocked { - return res, nil - } - - tv, err := t.load(cmd.Args[1]) - if err != nil { - return redisResult{}, err - } - - var oldValue []byte - if opts.returnOld && !tv.deleted { - oldValue = tv.raw - } - - tv.raw = cmd.Args[2] - tv.deleted = false - tv.dirty = true - - // Always update TTL state: EX/PX sets a new expiry; a plain SET clears it - // (opts.ttl == nil → nil stored → PERSIST semantics, matching Redis behaviour). - if err := t.applySetTTL(cmd.Args[1], opts.ttl); err != nil { - return redisResult{}, err - } - - return applySetResult(opts, oldValue), nil -} - -// applySetCondition checks NX/XX conditions. Returns (blocked, result, err). -// blocked=true means the condition prevented the write; callers should return result. -// Returns (false, _, nil) immediately when no condition is set. -func (t *txnContext) applySetCondition(key []byte, opts redisSetOptions) (bool, redisResult, error) { - if !opts.existsCond && !opts.missingCond { - return false, redisResult{}, nil - } - typ, err := t.stagedKeyType(key) - if err != nil { - return false, redisResult{}, err - } - exists := typ != redisTypeNone - if (opts.missingCond && exists) || (opts.existsCond && !exists) { - return true, redisResult{typ: resultNil}, nil - } - return false, redisResult{}, nil -} - -// applySetTTL stores the expiry in ttlStates so flushTTLToBuffer sends it to -// the TTLBuffer after a successful commit. -func (t *txnContext) applySetTTL(key []byte, expireAt *time.Time) error { - ttlSt, err := t.loadTTLState(key) - if err != nil { - return err - } - ttlSt.value = expireAt - ttlSt.dirty = true - return nil -} - -// applySetResult returns the appropriate redisResult for a completed SET. 
-func applySetResult(opts redisSetOptions, oldValue []byte) redisResult { - if !opts.returnOld { - return redisResult{typ: resultString, str: "OK"} - } - if oldValue == nil { - return redisResult{typ: resultNil} - } - return redisResult{typ: resultBulk, bulk: oldValue} -} - -func (t *txnContext) applyDel(cmd redcon.Command) (redisResult, error) { - var deleted int64 - for _, key := range cmd.Args[1:] { - typ, err := t.stagedKeyType(key) - if err != nil { - return redisResult{}, err - } - if typ == redisTypeNone { - continue - } - if _, err := t.stageKeyDeletion(key); err != nil { - return redisResult{}, err - } - deleted++ - } - return redisResult{typ: resultInt, integer: deleted}, nil -} - -func (t *txnContext) applyGet(cmd redcon.Command) (redisResult, error) { - typ, err := t.stagedKeyType(cmd.Args[1]) - if err != nil { - return redisResult{}, err - } - if isNonStringCollectionType(typ) { - return redisResult{typ: resultError, err: wrongTypeError()}, nil - } - - tv, err := t.load(cmd.Args[1]) - if err != nil { - return redisResult{}, err - } - if tv.deleted || tv.raw == nil { - return redisResult{typ: resultNil}, nil - } - return redisResult{typ: resultBulk, bulk: tv.raw}, nil -} - -func (t *txnContext) applyExists(cmd redcon.Command) (redisResult, error) { - var count int64 - for _, key := range cmd.Args[1:] { - typ, err := t.stagedKeyType(key) - if err != nil { - return redisResult{}, err - } - if typ != redisTypeNone { - count++ - } - } - return redisResult{typ: resultInt, integer: count}, nil -} - -func (t *txnContext) applyRPush(cmd redcon.Command) (redisResult, error) { - st, err := t.loadListState(cmd.Args[1]) - if err != nil { - return redisResult{}, err - } - if st.deleted { - if st.metaExists { - st.purge = true - st.purgeMeta = st.meta - } - // DEL followed by RPUSH in the same transaction recreates the list. 
- st.deleted = false - st.metaExists = false - st.meta = store.ListMeta{} - st.appends = nil - } - - for _, v := range cmd.Args[2:] { - st.appends = append(st.appends, bytes.Clone(v)) - } - - return redisResult{typ: resultInt, integer: t.listLength(st)}, nil -} - -func (t *txnContext) applyLRange(cmd redcon.Command) (redisResult, error) { - st, err := t.loadListState(cmd.Args[1]) - if err != nil { - return redisResult{}, err - } - - s, e, err := parseRangeBounds(cmd.Args[2], cmd.Args[3], int(t.listLength(st))) - if err != nil { - return redisResult{}, err - } - if e < s { - return redisResult{typ: resultArray, arr: []string{}}, nil - } - - out, err := t.listRangeValues(cmd.Args[1], st, s, e) - if err != nil { - return redisResult{}, err - } - - return redisResult{typ: resultArray, arr: out}, nil -} - -func (t *txnContext) applyZIncrBy(cmd redcon.Command) (redisResult, error) { - typ, err := t.stagedKeyType(cmd.Args[1]) - if err != nil { - return redisResult{}, err - } - if typ != redisTypeNone && typ != redisTypeZSet { - return redisResult{typ: resultError, err: wrongTypeError()}, nil - } - - inc, err := strconv.ParseFloat(string(cmd.Args[2]), 64) - if err != nil { - return redisResult{}, errors.WithStack(err) - } - st, err := t.loadZSetState(cmd.Args[1]) - if err != nil { - return redisResult{}, err - } - member := string(cmd.Args[3]) - st.members[member] += inc - st.dirty = true - return redisResult{typ: resultBulk, bulk: []byte(formatRedisFloat(st.members[member]))}, nil -} - -func (t *txnContext) applyExpire(cmd redcon.Command, unit time.Duration) (redisResult, error) { - typ, err := t.stagedKeyType(cmd.Args[1]) - if err != nil { - return redisResult{}, err - } - if typ == redisTypeNone { - return redisResult{typ: resultInt, integer: 0}, nil - } - - ttl, err := strconv.ParseInt(string(cmd.Args[2]), 10, 64) - if err != nil { - return redisResult{}, errors.WithStack(err) - } - nxOnly, err := parseExpireNXOnly(cmd.Args[3:]) - if err != nil { - return 
redisResult{}, err - } - - state, err := t.loadTTLState(cmd.Args[1]) - if err != nil { - return redisResult{}, err - } - if nxOnly && hasActiveTTL(state.value, time.Now()) { - return redisResult{typ: resultInt, integer: 0}, nil - } - - if ttl <= 0 { - return t.stageKeyDeletion(cmd.Args[1]) - } - return t.applyPositiveExpire(cmd.Args[1], ttl, unit, typ, state) -} - -func (t *txnContext) applyPositiveExpire(key []byte, ttl int64, unit time.Duration, typ redisValueType, state *ttlTxnState) (redisResult, error) { - if ttl > math.MaxInt64/int64(unit) { - return redisResult{}, errors.New("ERR invalid expire time in command") - } - expireAt := time.Now().Add(time.Duration(ttl) * unit) - state.value = &expireAt - state.dirty = true - if typ == redisTypeString { - plain, err := t.server.isPlainRedisString(context.Background(), key, t.startTS) - if err != nil { - return redisResult{}, err - } - if plain { - return t.markStringDirty(key) - } - // HLL is reported as redisTypeString but stores its payload under - // !redis|hll|; keep TTL in the legacy scan index via buildTTLElems. - } - return redisResult{typ: resultInt, integer: 1}, nil -} - -// markStringDirty loads the string value into the working set so that -// buildKeyElems will re-encode it with the updated embedded TTL. -func (t *txnContext) markStringDirty(key []byte) (redisResult, error) { - tv, err := t.load(key) - if err != nil { - return redisResult{}, err - } - tv.dirty = true - return redisResult{typ: resultInt, integer: 1}, nil -} - -func (t *txnContext) stageKeyDeletion(key []byte) (redisResult, error) { - // Mark the list for deletion. - st, err := t.loadListState(key) - if err != nil { - return redisResult{}, err - } - stageListDelete(st) - // Mark the string/main value for deletion. - tv, err := t.load(key) - if err != nil { - return redisResult{}, err - } - tv.deleted = true - tv.dirty = true - // Mark TTL for deletion. 
- ttlState, err := t.loadTTLState(key) - if err != nil { - return redisResult{}, err - } - ttlState.value = nil - ttlState.dirty = true - // Mark zset for deletion. Use empty map (not nil) so that subsequent - // writes (e.g. ZINCRBY) in the same transaction can safely insert. - zs, err := t.loadZSetState(key) - if err != nil { - return redisResult{}, err - } - zs.members = map[string]float64{} - zs.exists = false - zs.dirty = true - // Mark hash, set, stream (legacy blob), and HLL internal keys for deletion. - for _, internalKey := range [][]byte{ - redisHashKey(key), - redisSetKey(key), - redisStreamKey(key), - redisHLLKey(key), - } { - iv, err := t.load(internalKey) - if err != nil { - return redisResult{}, err - } - iv.deleted = true - iv.dirty = true - } - // Stage the wide-column stream cleanup: the !stream|meta| record and - // every !stream|entry| row must also be tombstoned when the user deletes - // a migrated stream via MULTI/EXEC DEL or EXPIRE 0. Without this step - // the command would report success but leave rows behind, and a later - // XLEN / XREAD would "resurrect" the stream. commit() expands this - // entry into concrete Del elems by scanning the entry-key prefix. - // The map is lazy-initialised so test fixtures that build a minimal - // txnContext literal without this field still work. - if t.streamDeletions == nil { - t.streamDeletions = map[string][]byte{} - } - t.streamDeletions[string(key)] = bytes.Clone(key) - t.trackReadKey(store.StreamMetaKey(key)) - // Mark legacy bare string key for deletion. We bypass load() here - // because load() auto-prefixes bare keys to !redis|str|. - // Track the bare key in the read set for conflict detection. 
- t.trackReadKey(key) - bareK := string(key) - if _, ok := t.working[bareK]; !ok { - t.working[bareK] = &txnValue{} - } - t.working[bareK].deleted = true - t.working[bareK].dirty = true - return redisResult{typ: resultInt, integer: 1}, nil -} - -func parseRangeBounds(startRaw, endRaw []byte, total int) (int, int, error) { - start, err := parseInt(startRaw) - if err != nil { - return 0, 0, err - } - end, err := parseInt(endRaw) - if err != nil { - return 0, 0, err - } - s, e := clampRange(start, end, total) - return s, e, nil -} - -func (t *txnContext) listRangeValues(key []byte, st *listTxnState, s, e int) ([]string, error) { - persistedLen := int(st.meta.Len) - - switch { - case e < persistedLen: - return t.server.fetchListRange(context.Background(), key, st.meta, int64(s), int64(e), t.startTS) - case s >= persistedLen: - return appendValues(st.appends, s-persistedLen, e-persistedLen), nil - default: - head, err := t.server.fetchListRange(context.Background(), key, st.meta, int64(s), int64(persistedLen-1), t.startTS) - if err != nil { - return nil, err - } - tail := appendValues(st.appends, 0, e-persistedLen) - return append(head, tail...), nil - } -} - -func appendValues(buf [][]byte, start, end int) []string { - out := make([]string, 0, end-start+1) - for i := start; i <= end; i++ { - out = append(out, string(buf[i])) - } - return out -} - -func (t *txnContext) validateReadSet(ctx context.Context) error { - for _, key := range t.readKeys { - latestTS, exists, err := t.server.store.LatestCommitTS(ctx, key) - if err != nil { - return errors.WithStack(err) - } - if exists && latestTS > t.startTS { - return errors.WithStack(store.NewWriteConflictError(key)) - } - } - return nil -} - -// preparedTxnDispatch is the fully-assembled write set + read set + commit -// timestamp for a MULTI/EXEC transaction, ready to be passed to -// coordinator.Dispatch. 
Split out from commit() so the option-2 dedup -// path (runTransactionWithDedup) can intercept between prepare and -// dispatch — it needs to capture (elems, commitTS, readKeys) for a -// possible retry under PrevCommitTS without otherwise duplicating the -// commit-building logic. The owned ctx is the redisDispatchTimeout- -// bounded context the caller must run Dispatch under and Cancel after. -type preparedTxnDispatch struct { - elems []*kv.Elem[kv.OP] - commitTS uint64 - readKeys [][]byte - ctx context.Context - cancel context.CancelFunc -} - -// prepareDispatch builds everything Dispatch needs (elems, commitTS, -// readKeys, ctx) without actually calling Dispatch. Callers must always -// invoke `cancel()` on the returned prepared value once the dispatch -// attempt finishes (commit() does this via defer; the dedup path does it -// per retry iteration). When the transaction has no writes this returns -// a prepared value with empty `elems` and a no-op cancel — callers can -// check len(prepared.elems)==0 and skip the dispatch. -func (t *txnContext) prepareDispatch() (preparedTxnDispatch, error) { - elems := t.buildKeyElems() - - // Pre-allocate commitTS so Delta keys can embed it in their bytes before - // the coordinator assigns it during Dispatch. - commitTS, err := t.server.coordinator.Clock().NextFenced() - if err != nil { - return preparedTxnDispatch{cancel: func() {}}, errors.Wrap(err, "redis txn commit: allocate commitTS") - } - listElems := t.buildListElems(commitTS) - zsetElems, err := t.buildZSetElems(commitTS) - if err != nil { - return preparedTxnDispatch{cancel: func() {}}, err - } - // TTL elements: string keys have TTL embedded in value (buildKeyElems handles that), - // non-string keys get a !redis|ttl| element written in the same transaction. 
- ttlElems := t.buildTTLElems() - - // Derive a single redisDispatchTimeout-bounded context covering both - // the stream-deletion scans (paginated ScanAt/ExistsAt over - // StreamEntryScanPrefix) and the final Dispatch. The parent is the - // txnContext's own ctx (the caller's dispatchCtx), not the server- - // lifetime handlerContext, so an outer cancellation (client - // disconnect, retryRedisWrite timeout) interrupts the prepare+dispatch - // promptly instead of waiting the full redisDispatchTimeout. Symmetric - // with the reuseCtx threading in runTransactionWithDedup. The nil-guard - // falls back to handlerContext for callers that construct a txnContext - // without setting ctx (test fixtures). - parentCtx := t.ctx - if parentCtx == nil { - parentCtx = t.server.handlerContext() - } - ctx, cancel := context.WithTimeout(parentCtx, redisDispatchTimeout) - - streamElems, err := t.buildStreamDeletionElems(ctx) - if err != nil { - cancel() - return preparedTxnDispatch{cancel: func() {}}, err - } - - elems = append(elems, listElems...) - elems = append(elems, zsetElems...) - elems = append(elems, ttlElems...) - elems = append(elems, streamElems...) 
- - readKeys := make([][]byte, 0, len(t.readKeys)) - for _, k := range t.readKeys { - readKeys = append(readKeys, k) - } - return preparedTxnDispatch{ - elems: elems, - commitTS: commitTS, - readKeys: readKeys, - ctx: ctx, - cancel: cancel, - }, nil -} - -func (t *txnContext) commit() error { - prepared, err := t.prepareDispatch() - if err != nil { - return err - } - defer prepared.cancel() - if len(prepared.elems) == 0 { - return nil - } - group := &kv.OperationGroup[kv.OP]{ - IsTxn: true, - Elems: prepared.elems, - StartTS: t.startTS, - CommitTS: prepared.commitTS, - ReadKeys: prepared.readKeys, - } - if _, err := t.server.coordinator.Dispatch(prepared.ctx, group); err != nil { - return errors.WithStack(err) - } - return nil -} - -// stringValueAndTTLElem returns the encoded string value and an optional -// !redis|ttl| scan-index mutation for a string write. Dirty EXPIRE/PERSIST -// state takes priority; otherwise the TTL loaded with the value is preserved -// so commands like INCR or SETBIT inside MULTI/EXEC don't clear it. A dirty -// PERSIST emits a Del so the sweeper cannot later expire a persistent key. -func (t *txnContext) stringValueAndTTLElem(userKey []byte, tv *txnValue) ([]byte, *kv.Elem[kv.OP]) { - ttl := tv.ttl - ttlSt := t.ttlStates[string(userKey)] - if ttlSt != nil && ttlSt.dirty { - ttl = ttlSt.value - } - value := encodeRedisStr(tv.raw, ttl) - if ttl != nil { - return value, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisTTLKey(userKey), Value: encodeRedisTTL(*ttl)} - } - // ttl is nil: emit Del when there was a prior TTL (loaded or dirty-cleared) - // so the sweeper cannot later expire a now-persistent key or hit a stale index. 
- if tv.ttl != nil || (ttlSt != nil && ttlSt.dirty) { - return value, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisTTLKey(userKey)} - } - return value, nil -} - -func (t *txnContext) buildKeyElems() []*kv.Elem[kv.OP] { - keys := make([]string, 0, len(t.working)) - for k := range t.working { - keys = append(keys, k) - } - sort.Strings(keys) - - var elems []*kv.Elem[kv.OP] - for _, k := range keys { - tv := t.working[k] - if !tv.dirty { - continue - } - storageKey := []byte(k) - if tv.deleted { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: storageKey}) - // Deleting a string anchor must also drop any stale !redis|ttl| - // scan-index entry; buildTTLElems skips strings because it assumes - // the inline-TTL path owns them. - if bytes.HasPrefix(storageKey, []byte(redisStrPrefix)) { - userKey := storageKey[len(redisStrPrefix):] - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisTTLKey(userKey)}) - } - continue - } - value := tv.raw - if bytes.HasPrefix(storageKey, []byte(redisStrPrefix)) { - userKey := storageKey[len(redisStrPrefix):] - var extra *kv.Elem[kv.OP] - value, extra = t.stringValueAndTTLElem(userKey, tv) - if extra != nil { - elems = append(elems, extra) - } - } - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: storageKey, Value: value}) - } - return elems -} - -func listDeleteMeta(st *listTxnState) (store.ListMeta, bool) { - switch { - case st.metaExists: - return st.meta, true - case st.purge: - return st.purgeMeta, true - default: - return store.ListMeta{}, false - } -} - -func appendListDeleteOps(elems []*kv.Elem[kv.OP], userKey []byte, meta store.ListMeta) []*kv.Elem[kv.OP] { - for seq := meta.Head; seq < meta.Tail; seq++ { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: listItemKey(userKey, seq)}) - } - return append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: listMetaKey(userKey)}) -} - -func (t *txnContext) buildListElems(commitTS uint64) []*kv.Elem[kv.OP] { - listKeys := make([]string, 0, len(t.listStates)) - for k := range 
t.listStates { - listKeys = append(listKeys, k) - } - sort.Strings(listKeys) - - var elems []*kv.Elem[kv.OP] - var seqInTxn uint32 - for _, k := range listKeys { - st := t.listStates[k] - userKey := []byte(k) - - if st.deleted { - if meta, ok := listDeleteMeta(st); ok { - elems = appendListDeleteOps(elems, userKey, meta) - } - // Delete existing delta keys so they don't survive the logical delete. - for _, dk := range st.existingDeltas { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: dk}) - } - continue - } - if len(st.appends) == 0 { - continue - } - if st.purge { - elems = appendListDeleteOps(elems, userKey, st.purgeMeta) - // Delete existing delta keys so they don't accumulate after DEL+RPUSH. - for _, dk := range st.existingDeltas { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: dk}) - } - } - - startSeq := st.meta.Head + st.meta.Len - for i, v := range st.appends { - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: listItemKey(userKey, startSeq+int64(i)), - Value: v, - }) - } - - // Emit a Delta key instead of updating the base metadata key. - // Each list key in this transaction gets a unique seqInTxn. - n := int64(len(st.appends)) - deltaVal := store.MarshalListMetaDelta(store.ListMetaDelta{HeadDelta: 0, LenDelta: n}) - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.ListMetaDeltaKey(userKey, commitTS, seqInTxn), - Value: deltaVal, - }) - seqInTxn++ - } - return elems -} - -func (t *txnContext) buildZSetElems(commitTS uint64) ([]*kv.Elem[kv.OP], error) { - keys := make([]string, 0, len(t.zsetStates)) - for k := range t.zsetStates { - keys = append(keys, k) - } - sort.Strings(keys) - - elems := make([]*kv.Elem[kv.OP], 0, len(keys)) - seqInTxn := uint32(0) - for _, k := range keys { - st := t.zsetStates[k] - if !st.dirty { - continue - } - key := []byte(k) - if st.isWide { - wideElems, lenDelta := buildZSetWideElems(key, st) - elems = append(elems, wideElems...) 
- if lenDelta != 0 { - deltaVal := store.MarshalZSetMetaDelta(store.ZSetMetaDelta{LenDelta: lenDelta}) - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.ZSetMetaDeltaKey(key, commitTS, seqInTxn), - Value: deltaVal, - }) - seqInTxn++ - } - continue - } - // Legacy blob path. - if len(st.members) == 0 { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisZSetKey(key)}) - continue - } - payload, err := marshalZSetValue(redisZSetValue{Entries: zsetMapToEntries(st.members)}) - if err != nil { - return nil, err - } - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisZSetKey(key), Value: payload}) - } - return elems, nil -} - -// buildZSetWideElems computes the minimal set of ops to transition from st.origMembers to -// st.members in wide-column format. Returns the ops and the net length delta. -func buildZSetWideElems(key []byte, st *zsetTxnState) ([]*kv.Elem[kv.OP], int64) { - elems := make([]*kv.Elem[kv.OP], 0, len(st.members)+len(st.origMembers)) - var lenDelta int64 - - // Deletions: members removed or score changed (old score index must be removed). - for member, oldScore := range st.origMembers { - newScore, inNew := st.members[member] - if !inNew { - // Fully removed. - elems = append(elems, - &kv.Elem[kv.OP]{Op: kv.Del, Key: store.ZSetMemberKey(key, []byte(member))}, - &kv.Elem[kv.OP]{Op: kv.Del, Key: store.ZSetScoreKey(key, oldScore, []byte(member))}, - ) - lenDelta-- - } else if newScore != oldScore { - // Score updated: delete old score index. - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: store.ZSetScoreKey(key, oldScore, []byte(member))}) - } - } - - // Insertions / updates. 
- for member, newScore := range st.members { - _, wasOrig := st.origMembers[member] - elems = append(elems, - &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ZSetMemberKey(key, []byte(member)), Value: store.MarshalZSetScore(newScore)}, - &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ZSetScoreKey(key, newScore, []byte(member)), Value: []byte{}}, - ) - if !wasOrig { - lenDelta++ - } - } - return elems, lenDelta -} - -// buildStreamDeletionElems expands every user key queued in streamDeletions -// into the Del operations that actually tombstone a migrated stream: -// !stream|meta| and every !stream|entry| row. Called from -// commit() so that MULTI/EXEC DEL / EXPIRE 0 on a migrated stream leaves -// the store in a consistent state instead of only dropping the legacy blob. -// Each scan runs at t.startTS so the delete honours the transaction's -// snapshot view. -// -// ctx is the redisDispatchTimeout-bounded context derived in commit(); it -// caps the paginated ExistsAt + scanAllDeltaElems inside -// deleteStreamWideColumnElems so a pathological staged-stream count cannot -// hold the EXEC handler open past the per-request budget. -func (t *txnContext) buildStreamDeletionElems(ctx context.Context) ([]*kv.Elem[kv.OP], error) { - if len(t.streamDeletions) == 0 { - return nil, nil - } - keys := make([]string, 0, len(t.streamDeletions)) - for k := range t.streamDeletions { - keys = append(keys, k) - } - sort.Strings(keys) - var elems []*kv.Elem[kv.OP] - for _, k := range keys { - userKey := t.streamDeletions[k] - streamElems, err := t.server.deleteStreamWideColumnElems(ctx, userKey, t.startTS) - if err != nil { - return nil, err - } - elems = append(elems, streamElems...) - } - return elems, nil -} - -// buildTTLElems returns !redis|ttl| Raft elements for non-string keys with dirty TTL state. -// String keys have TTL embedded in the value; they are handled by buildKeyElems. 
-func (t *txnContext) buildTTLElems() []*kv.Elem[kv.OP] { - var elems []*kv.Elem[kv.OP] - for k, st := range t.ttlStates { - if !st.dirty { - continue - } - // String keys encode TTL inside the value in buildKeyElems; skip them here. - if _, isString := t.working[string(redisStrKey([]byte(k)))]; isString { - continue - } - if st.value == nil { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisTTLKey([]byte(k))}) - } else { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisTTLKey([]byte(k)), Value: encodeRedisTTL(*st.value)}) - } - } - return elems -} - -func (r *RedisServer) runTransaction(queue []redcon.Command) ([]redisResult, error) { - if r.onePhaseTxnDedup { - return r.runTransactionWithDedup(queue) - } - - dispatchCtx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - - var results []redisResult - err := r.retryRedisWrite(dispatchCtx, func() error { - startTS := r.txnStartTS() - readPin := r.pinReadTS(startTS) - defer readPin.Release() - - txn := &txnContext{ - server: r, - ctx: dispatchCtx, - working: map[string]*txnValue{}, - listStates: map[string]*listTxnState{}, - zsetStates: map[string]*zsetTxnState{}, - ttlStates: map[string]*ttlTxnState{}, - readKeys: map[string][]byte{}, - streamDeletions: map[string][]byte{}, - startTS: startTS, - } - - nextResults := make([]redisResult, 0, len(queue)) - for _, cmd := range queue { - res, err := txn.apply(cmd) - if err != nil { - return err - } - nextResults = append(nextResults, res) - } - - if err := txn.validateReadSet(dispatchCtx); err != nil { - return err - } - if err := txn.commit(); err != nil { - return err - } - results = nextResults - return nil - }) - if err != nil { - return nil, err - } - - return results, nil -} - -// reusableExecTxn captures a dispatched MULTI/EXEC transaction so a -// subsequent retry can reuse its exact write set under a fresh -// commit_ts (carrying prev_commit_ts) and probe whether the prior -// attempt already 
landed. This is the EXEC analogue of -// reusableListPush (M3 R1 result reconstruction for MULTI/EXEC). -// -// `results` is computed once from attempt 1's startTS snapshot and is -// invariant across reuse for the same reason RPUSH/LPUSH's `length` -// is: the write set is fixed, so apply-vs-no-op is invisible to the -// client. Reads in the EXEC body returned values from attempt 1's -// snapshot — those values were what the client would have observed if -// attempt 1 hadn't returned an ambiguous error, so caching them is -// the right semantics for a confirmed-or-deduped commit. A -// genuine cross-txn conflict is caught by OCC on readKeys at the FSM -// apply (WriteConflict → drop pending → recompute), so the cached -// results are only returned when reuse actually represents the -// outcome of attempt 1's intent. -type reusableExecTxn struct { - elems []*kv.Elem[kv.OP] - startTS uint64 - commitTS uint64 - readKeys [][]byte - results []redisResult -} - -// dispatchExecReuse runs one iteration of the option-2 reuse path for -// MULTI/EXEC: dispatches the captured write set under a fresh -// commit_ts (carrying pending.commitTS as PrevCommitTS so the FSM -// probes whether the prior attempt landed) and returns the cached -// client-visible results on success. The drop return signals the -// caller to clear pending — set on a genuine WriteConflict from -// another txn (after the self-conflict probe rules out our own apply) -// so the next iteration rebuilds the txn from a fresh read snapshot. -// -// Mirrors dispatchListPushReuse; the only difference is the result -// payload (cached []redisResult vs computed list length) and the lack -// of a meta re-read fallback — for EXEC there is no post-apply "what -// is the current length" question; the client-visible result IS the -// cached results array. 
-func (r *RedisServer) dispatchExecReuse(ctx context.Context, pending *reusableExecTxn) (results []redisResult, drop bool, err error) { - // gemini PR-A HIGH: persistence-grade commit_ts allocation must honor the - // HLC-4 physical-ceiling fence (see kv/hlc.go NextFenced + the TLA proof - // at tla/hlc/MCHLC_gap.cfg). Clock().Next() bypasses the ceiling and - // could issue a timestamp that collides with a subsequent leader's - // window after renewal — the very class of bug option-2 is meant to - // rule out. - commitTS, allocErr := r.coordinator.Clock().NextFenced() - if allocErr != nil { - return nil, false, errors.Wrap(allocErr, "redis exec reuse: allocate commitTS") - } - _, dispErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: pending.startTS, - CommitTS: commitTS, - PrevCommitTS: pending.commitTS, - ReadKeys: pending.readKeys, - Elems: pending.elems, - }) - if dispErr == nil { - return pending.results, false, nil - } - if errors.Is(dispErr, store.ErrWriteConflict) { - // Self-inflicted-conflict guard (mirrors dispatchListPushReuse): - // the apply might have landed at this fresh commitTS but bubbled - // up as WriteConflict due to leadership churn. Probe whether our - // reused write set actually landed; if yes, return the cached - // results unchanged (they describe the EXEC body's outcome - // against attempt 1's snapshot, which is the outcome whether - // the bytes hit MVCC at attempt-1's commitTS or at this fresh - // commitTS — the OCC fence on readKeys guarantees no - // intervening cross-txn write slipped past). - if probeKey := firstWriteKey(pending.elems); len(probeKey) > 0 { - landed, perr := r.store.CommittedVersionAt(ctx, probeKey, commitTS) - if perr == nil && landed { - pending.commitTS = commitTS - return pending.results, false, nil - } - } - // Our attempt did not land at commitTS and a key collides with - // another txn — genuine conflict. 
Drop pending so the next - // iteration rebuilds from a fresh snapshot. - return nil, true, errors.WithStack(dispErr) - } - // Still ambiguous (lock / other retryable): the reuse may itself - // have landed, so the next retry must probe THIS commit_ts. Only - // advance pending.commitTS if retryRedisWrite will actually loop - // (non-retryable errors escape to the client; pending is then - // discarded with the goroutine). - if isRetryableRedisTxnErr(dispErr) { - pending.commitTS = commitTS - } - return nil, false, errors.WithStack(dispErr) -} - -// runTransactionWithDedup is the option-2 retry loop for MULTI/EXEC. -// The first attempt builds the txn write set + cached results from -// the user's startTS snapshot; any retryable failure makes the next -// iteration REUSE that write set under a fresh commit_ts with -// prev_commit_ts set, so the FSM no-ops if the prior attempt already -// landed. A WriteConflict on a reuse attempt (after the self-conflict -// probe rules out our own apply) means another txn touched a read or -// write key, and we drop pending → rebuild from a fresh snapshot. -// -// Mirrors listPushCoreWithDedup at the EXEC granularity. -func (r *RedisServer) runTransactionWithDedup(queue []redcon.Command) ([]redisResult, error) { - dispatchCtx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - - var results []redisResult - var pending *reusableExecTxn - err := r.retryRedisWrite(dispatchCtx, func() error { - if pending != nil { - // gemini PR-A MEDIUM: derive the per-attempt reuse ctx from the - // caller's `dispatchCtx` (not `r.handlerContext()`) so a cancelled - // caller stops the reuse promptly. Per-attempt `redisDispatchTimeout` - // still caps the dispatch the same way `commit()` does for the - // first attempt; what changes is that an outer cancellation can - // now interrupt mid-attempt instead of being ignored until the - // fresh 10 s budget elapses. 
The earlier "fresh ctx from - // handlerContext" pattern (noted in design doc §M3) was strictly - // more conservative but wasted resources on a disconnected - // client. - reuseCtx, reuseCancel := context.WithTimeout(dispatchCtx, redisDispatchTimeout) - defer reuseCancel() - res, drop, dispErr := r.dispatchExecReuse(reuseCtx, pending) - if drop { - pending = nil - } - if dispErr != nil { - return dispErr - } - results = res - return nil - } - res, next, ferr := r.firstExecAttempt(dispatchCtx, queue) - if ferr != nil { - if next != nil { - pending = next - } - return ferr - } - results = res - return nil - }) - if err != nil { - return nil, err - } - return results, nil -} - -// firstExecAttempt runs the initial (no-reuse) EXEC attempt: builds the -// txn snapshot, applies each command to capture the client-visible -// results, validates the read set, and dispatches. On success returns -// the results. On a retryable dispatch failure it returns a -// reusableExecTxn capturing what the retry loop needs to dispatch via -// PrevCommitTS on the next iteration; non-retryable failures return a -// nil reuse state (mirrors listPushCoreWithDedup's gating). Extracted -// from runTransactionWithDedup to keep that loop under the cyclop -// budget; the dedup rationale lives there. 
-func (r *RedisServer) firstExecAttempt(dispatchCtx context.Context, queue []redcon.Command) ([]redisResult, *reusableExecTxn, error) { - startTS := r.txnStartTS() - readPin := r.pinReadTS(startTS) - defer readPin.Release() - - txn := &txnContext{ - server: r, - ctx: dispatchCtx, - working: map[string]*txnValue{}, - listStates: map[string]*listTxnState{}, - zsetStates: map[string]*zsetTxnState{}, - ttlStates: map[string]*ttlTxnState{}, - readKeys: map[string][]byte{}, - streamDeletions: map[string][]byte{}, - startTS: startTS, - } - - nextResults := make([]redisResult, 0, len(queue)) - for _, cmd := range queue { - res, err := txn.apply(cmd) - if err != nil { - return nil, nil, err - } - nextResults = append(nextResults, res) - } - - if err := txn.validateReadSet(dispatchCtx); err != nil { - return nil, nil, err - } - - prepared, err := txn.prepareDispatch() - if err != nil { - return nil, nil, err - } - defer prepared.cancel() - if len(prepared.elems) == 0 { - // Read-only EXEC: nothing to dispatch, no dedup window. - return nextResults, nil, nil - } - - group := &kv.OperationGroup[kv.OP]{ - IsTxn: true, - Elems: prepared.elems, - StartTS: txn.startTS, - CommitTS: prepared.commitTS, - ReadKeys: prepared.readKeys, - } - if _, dispErr := r.coordinator.Dispatch(prepared.ctx, group); dispErr != nil { - // Only remember the attempt for reuse if retryRedisWrite will - // actually loop. Mirrors listPushCoreWithDedup's gating - // rationale — errors that escape the loop (transient-leader, - // context deadline, FSM apply error) leave pending pointing at - // state wasted with the goroutine; ambiguous errors that - // escape to the client are out of scope for this loop. 
- if isRetryableRedisTxnErr(dispErr) { - return nil, &reusableExecTxn{ - elems: prepared.elems, - startTS: txn.startTS, - commitTS: prepared.commitTS, - readKeys: prepared.readKeys, - results: nextResults, - }, errors.WithStack(dispErr) - } - return nil, nil, errors.WithStack(dispErr) - } - return nextResults, nil, nil -} - -func (r *RedisServer) txnStartTS() uint64 { - // store.LastCommitTS() is the authoritative safe-snapshot watermark: it is - // updated atomically only AFTER the corresponding Pebble batch commit, so - // every version with commitTS ≤ store.LastCommitTS() is guaranteed visible - // in the store when we read. - // - // We must NOT return clock.Next() here. clock.Next() can be AHEAD of - // store.LastCommitTS() because concurrent dispatchTxn calls advance the HLC - // before their Raft entry is applied. If startTS = clock.Next() = T, a - // concurrent transaction that already called clock.Next() to obtain - // commitTS = T-1 and is still in the Raft pipeline will satisfy - // latestTS(key) = T-1 ≤ T = startTS - // causing the FSM conflict check (latestTS > startTS) to silently pass even - // though we read stale data. This allows two concurrent RPUSHes to pick the - // same sequence number, with the second overwriting the first — a lost write. - // - // Using store.LastCommitTS() directly closes this gap: any concurrent commit - // at > maxTS triggers a WriteConflict and a retry via retryRedisWrite. - // - // The Observe call still advances the HLC so that dispatchTxn's clock.Next() - // produces a commitTS strictly greater than maxTS (leader-election safety). - // - // When maxTS is 0 (empty store) we return 1 so the coordinator treats this - // as a valid startTS and does not override it with clock.Next() — which - // could be ahead of unapplied Raft entries and reintroduce the anomaly. 
- var maxTS uint64 - if r.store != nil { - maxTS = r.store.LastCommitTS() - } - if r.coordinator != nil && r.coordinator.Clock() != nil && maxTS > 0 { - r.coordinator.Clock().Observe(maxTS) - } - if maxTS == 0 { - return 1 - } - return maxTS -} - -func (r *RedisServer) writeResults(conn redcon.Conn, results []redisResult) { - conn.WriteArray(len(results)) - for _, res := range results { - switch res.typ { - case resultNil: - conn.WriteNull() - case resultError: - writeRedisError(conn, res.err) - case resultBulk: - conn.WriteBulk(res.bulk) - case resultString: - conn.WriteString(res.str) - case resultArray: - conn.WriteArray(len(res.arr)) - for _, s := range res.arr { - conn.WriteBulkString(s) - } - case resultInt: - conn.WriteInt64(res.integer) - default: - conn.WriteNull() - } - } -} - -func listMetaKey(userKey []byte) []byte { - return store.ListMetaKey(userKey) -} - -func listItemKey(userKey []byte, seq int64) []byte { - return store.ListItemKey(userKey, seq) -} - -func clampRange(start, end, length int) (int, int) { - if start < 0 { - start = length + start - } - if end < 0 { - end = length + end - } - if start < 0 { - start = 0 - } - if end >= length { - end = length - 1 - } - if end < start { - return 0, -1 - } - return start, end -} - -func (r *RedisServer) loadListMetaAt(ctx context.Context, key []byte, readTS uint64) (store.ListMeta, bool, error) { - val, err := r.store.GetAt(ctx, store.ListMetaKey(key), readTS) - if err != nil { - if errors.Is(err, store.ErrKeyNotFound) { - return store.ListMeta{}, false, nil - } - return store.ListMeta{}, false, errors.WithStack(err) - } - meta, err := store.UnmarshalListMeta(val) - if err != nil { - return store.ListMeta{}, false, errors.WithStack(err) - } - return meta, true, nil -} - -func (r *RedisServer) isListKeyAt(ctx context.Context, key []byte, readTS uint64) (bool, error) { - _, exists, err := r.loadListMetaAt(ctx, key, readTS) - return exists, err -} - -// buildRPushOps creates operations to append values to 
the tail of a list using -// the Delta pattern. Instead of writing to the base metadata key (causing OCC -// conflicts), it emits a single ListMetaDelta key with LenDelta = len(values). -// commitTS must be pre-allocated via dispatchElemsWithCommitTS; seqInTxn -// disambiguates multiple push operations in the same transaction. -func (r *RedisServer) buildRPushOps(meta store.ListMeta, key []byte, values [][]byte, commitTS uint64, seqInTxn uint32) ([]*kv.Elem[kv.OP], store.ListMeta, error) { - if len(values) == 0 { - return nil, meta, nil - } - - elems := make([]*kv.Elem[kv.OP], 0, len(values)+1) - seq := meta.Head + meta.Len - for _, v := range values { - vCopy := bytes.Clone(v) - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: listItemKey(key, seq), Value: vCopy}) - seq++ - } - - // Emit a Delta key instead of writing the base meta key. - delta := store.MarshalListMetaDelta(store.ListMetaDelta{HeadDelta: 0, LenDelta: int64(len(values))}) - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ListMetaDeltaKey(key, commitTS, seqInTxn), Value: delta}) - - meta.Len += int64(len(values)) - meta.Tail = meta.Head + meta.Len - return elems, meta, nil -} - -// listPushBuildFn is the type for functions that build list push operations. -type listPushBuildFn func(meta store.ListMeta, key []byte, values [][]byte, commitTS uint64, seqInTxn uint32) ([]*kv.Elem[kv.OP], store.ListMeta, error) - -// listPushCore is the shared retry loop for RPUSH and LPUSH. The caller supplies -// a buildFn that assembles the specific operations (RPUSH appends to tail, LPUSH -// prepends to head). When onePhaseTxnDedup is enabled it uses the write-set-reuse -// retry path (option 2); otherwise it keeps the original recompute-on-retry loop. 
-func (r *RedisServer) listPushCore(ctx context.Context, key []byte, values [][]byte, buildFn listPushBuildFn) (int64, error) { - if r.onePhaseTxnDedup { - return r.listPushCoreWithDedup(ctx, key, values, buildFn) - } - - var newLen int64 - err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - meta, _, err := r.resolveListMeta(ctx, key, readTS) - if err != nil { - return err - } - - // Pre-allocate commitTS so we can embed it in the Delta key. - commitTS, err := r.coordinator.Clock().NextFenced() - if err != nil { - return errors.Wrap(err, "listPushCore: allocate commitTS") - } - ops, updatedMeta, err := buildFn(meta, key, values, commitTS, 0) - if err != nil { - return err - } - if len(ops) == 0 { - newLen = updatedMeta.Len - return nil - } - - // Dispatch with the pre-allocated commitTS. - _, dispErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: normalizeStartTS(readTS), - CommitTS: commitTS, - Elems: ops, - }) - if dispErr != nil { - return errors.WithStack(dispErr) - } - newLen = updatedMeta.Len - return nil - }) - return newLen, err -} - -// reusableListPush captures a dispatched list-push attempt so a subsequent -// retry can reuse its exact write set (same seq, same item/delta keys) and -// probe whether it already landed, instead of recomputing seq from a fresh -// meta read. Recomputing is what duplicates the element under leadership -// churn: attempt 1 commits at T1 but returns an ambiguous error, the retry -// reads the now-larger list and appends at a NEW seq. Reuse + the FSM's -// exact-ts dedup probe close that. See option 2 in -// docs/design/2026_05_21_proposed_txn_secondary_idempotency.md. -type reusableListPush struct { - ops []*kv.Elem[kv.OP] - startTS uint64 - // commitTS is the most recent dispatched commit_ts for this write set; - // the next retry passes it as prev_commit_ts so the FSM probes exactly - // the attempt that might have landed. 
- commitTS uint64 - // length is the client-visible post-push length. It is invariant across - // reuse — the write set was built once from attempt 1's meta — so it is - // also the correct value to return when the FSM dedup no-ops the apply - // (R1 result reconstruction: no store re-read needed). - length int64 - // readKeys is the boundary read set captured at attempt 1's meta read: - // listItemKey(Head) and (when Len > 1) listItemKey(Tail-1). It is the - // load-bearing fence against the codex P1 scenario where an intervening - // pop/trim shrinks the list before the retry — without it, the reused - // seq would land past the new Tail and be unreachable to LRANGE. OCC - // validates these atomically against startTS at FSM apply, so any - // boundary-touching commit fires WriteConflict and the adapter drops - // pending → recomputes. Empty when attempt 1 read an empty list (no - // boundary to fence; the OCC on the write key suffices for that case). - readKeys [][]byte -} - -// dispatchListPushReuse runs one iteration of the option-2 reuse path: -// dispatches the captured write set under a fresh commit_ts (carrying -// pending.commitTS as PrevCommitTS so the FSM probes whether the prior -// attempt landed) and returns the post-push length on success. The drop -// return signals the caller to clear pending — set on a genuine -// WriteConflict from another txn so the next iteration recomputes from -// fresh meta. Extracted from listPushCoreWithDedup to keep that closure -// under the cyclop / gocognit / nestif limits. -func (r *RedisServer) dispatchListPushReuse(ctx context.Context, key []byte, pending *reusableListPush) (newLen int64, drop bool, err error) { - // HLC-4 parity: persistence-grade commit_ts allocation must honor - // the physical-ceiling fence so a stale-leader window cannot mint a - // timestamp that collides with the successor's. 
The error path - // returns ErrCeilingExpired which isRetryableRedisTxnErr classifies - // as non-retryable, so it exits retryRedisWrite directly to the - // client — same shape as the other persistence-grade Next call - // sites in this file. - commitTS, allocErr := r.coordinator.Clock().NextFenced() - if allocErr != nil { - return 0, false, errors.Wrap(allocErr, "redis list-push reuse: allocate commitTS") - } - _, dispErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: pending.startTS, - CommitTS: commitTS, - PrevCommitTS: pending.commitTS, - ReadKeys: pending.readKeys, - Elems: pending.ops, - }) - if dispErr == nil { - return r.resolveReuseLength(ctx, key, pending), false, nil - } - if errors.Is(dispErr, store.ErrWriteConflict) { - // Self-inflicted-conflict guard (codex P1): the apply might have - // landed at this fresh commitTS but bubbled up as WriteConflict due - // to leadership churn (the original bug class the doc's "Resolved" - // section identifies). Without this probe, dropping pending here - // would recompute and append a second copy. Ask the store: did - // our just-attempted commit_ts land? If yes, this conflict is - // against our own commit — return success and keep pending pointing - // at THIS commit_ts so any subsequent retry probes the right point. - // - // Length resolution (codex P2 round-11): pending.length was computed - // during the prior attempt and is stale w.r.t. any non-conflicting - // list-modifying writes that landed between attempt 1 and this fresh - // apply. Probing pending.commitTS would hit for the fresh apply and - // (under the old resolveReuseLength shortcut) silently return the - // prior-attempt length — understating the count. Always re-read meta - // in the self-conflict path. resolveListMeta failure falls back to - // pending.length to honor codex P2 round-10 ("avoid failing after a - // reuse apply"). 
- if probeKey := firstWriteKey(pending.ops); len(probeKey) > 0 { - landed, perr := r.store.CommittedVersionAt(ctx, probeKey, commitTS) - if perr == nil && landed { - pending.commitTS = commitTS - return r.resolveLengthAfterFreshApply(ctx, key, pending), false, nil - } - } - // Our attempt did not land at commitTS and the target seq is taken - // by another txn — a genuine conflict. Drop pending; the next - // iteration recomputes from a fresh meta read. - return 0, true, errors.WithStack(dispErr) - } - // Still ambiguous (lock / other retryable): this reuse may itself - // have landed, so the next retry must probe THIS commit_ts. Only - // advance pending.commitTS if retryRedisWrite will actually loop - // (non-retryable errors escape to the client; pending is then - // discarded with the goroutine, so the update is wasted and the - // stale value would be misleading if some future caller reads it). - if isRetryableRedisTxnErr(dispErr) { - pending.commitTS = commitTS - } - return 0, false, errors.WithStack(dispErr) -} - -// resolveReuseLength returns the client-visible post-push length after a -// successful reuse dispatch. If our prior attempt's exact commit_ts -// version exists, the FSM no-op'd (probe hit) and pending.length is the -// correct length we computed at that attempt. Otherwise the FSM applied -// the reused write set at a fresh commit_ts and we must re-read meta to -// capture any non-conflicting list-modifying writes that committed -// between attempts (codex P2) — without this, the return value would -// silently understate the count when the boundary OCC fence and -// write-key OCC both pass but the list length changed. -// -// Failure modes are converted to a degraded return (pending.length) rather -// than surfaced as an error, because the dispatch already committed. 
Per -// codex P2 round-10 ("avoid failing after a reuse apply"), reporting a -// write error after the apply landed drives the client into a retry that -// has no pending state and would re-append the element — the very anomaly -// this feature prevents. Specifically: -// - probe error of any kind: prefer pending.length over failure. -// - resolveListMeta failure (e.g. delta scan over MaxDeltaScanLimit -// under churn): fall back to pending.length. -// -// Returns int64 directly (no error) so callers do not have to invent -// caller-side fallback logic; the degraded-return contract is fixed here -// (golangci unparam / nilerr fix on the prior error-returning shape). -func (r *RedisServer) resolveReuseLength(ctx context.Context, key []byte, pending *reusableListPush) int64 { - if probeKey := firstWriteKey(pending.ops); len(probeKey) > 0 { - hit, perr := r.store.CommittedVersionAt(ctx, probeKey, pending.commitTS) - if perr == nil && hit { - return pending.length - } - if perr != nil { - // Probe failed; the dispatch already committed so degrade - // gracefully rather than propagate the read error. - return pending.length - } - // perr == nil && !hit: prior attempt didn't land at this ts; the - // FSM applied fresh writes, fall through to re-read meta. - } - return r.resolveLengthAfterFreshApply(ctx, key, pending) -} - -// resolveLengthAfterFreshApply re-reads list meta to capture the post-apply -// length when we know the fresh commitTS applied (no probe shortcut), with -// the same fall-back-to-pending.length contract as resolveReuseLength. Used -// by the self-conflict path (codex P2 round-11): there pending.length is -// stale w.r.t. intervening non-conflicting writes, so the probe-hit -// shortcut would silently understate the count. 
-func (r *RedisServer) resolveLengthAfterFreshApply(ctx context.Context, key []byte, pending *reusableListPush) int64 { - currentMeta, _, mErr := r.resolveListMeta(ctx, key, r.readTS()) - if mErr != nil { - return pending.length - } - return currentMeta.Len -} - -// firstWriteKey returns the first non-empty element key from ops, or nil -// when there is none. Used after a successful reuse dispatch to probe -// whether our prior attempt's commit_ts actually landed: attempt 1 writes -// all its elem keys atomically at the same commit_ts, so any one of them -// answers the question. -func firstWriteKey(ops []*kv.Elem[kv.OP]) []byte { - for _, e := range ops { - if e != nil && len(e.Key) > 0 { - return e.Key - } - } - return nil -} - -// listPushBoundaryReadKeys returns the boundary positions of the list as -// read keys for OCC. Including these in the dispatched OperationGroup makes -// FSM apply atomically reject the retry when any pop/trim has touched the -// boundary between attempts (codex P1 fix: prevents a reused seq from -// landing past a shrunk Tail). The keys are deduped: a single-element list -// has Head == Tail-1, so we emit it once. -func listPushBoundaryReadKeys(key []byte, meta store.ListMeta) [][]byte { - if meta.Len <= 0 { - return nil - } - tailIdx := meta.Tail - 1 - if tailIdx == meta.Head { - return [][]byte{listItemKey(key, meta.Head)} - } - return [][]byte{ - listItemKey(key, meta.Head), - listItemKey(key, tailIdx), - } -} - -// listPushCoreWithDedup is the option-2 retry loop. The first attempt computes -// the write set from the current meta; any retryable failure makes the next -// iteration REUSE that write set under a fresh commit_ts with prev_commit_ts -// set, so the FSM no-ops if the prior attempt already landed. A WriteConflict -// on a reuse attempt means the probe ruled out our own prior attempt and the -// seq is genuinely taken by another txn, so we fall back to a full recompute. 
-func (r *RedisServer) listPushCoreWithDedup(ctx context.Context, key []byte, values [][]byte, buildFn listPushBuildFn) (int64, error) { - var newLen int64 - var pending *reusableListPush - err := r.retryRedisWrite(ctx, func() error { - if pending != nil { - length, drop, dispErr := r.dispatchListPushReuse(ctx, key, pending) - if drop { - pending = nil - } - if dispErr != nil { - return dispErr - } - newLen = length - return nil - } - - readTS := r.readTS() - meta, _, err := r.resolveListMeta(ctx, key, readTS) - if err != nil { - return err - } - - // HLC-4 parity with prepareDispatch / dispatchExecReuse — - // see dispatchListPushReuse above for the rationale. - commitTS, allocErr := r.coordinator.Clock().NextFenced() - if allocErr != nil { - return errors.Wrap(allocErr, "redis list-push first-attempt: allocate commitTS") - } - ops, updatedMeta, err := buildFn(meta, key, values, commitTS, 0) - if err != nil { - return err - } - if len(ops) == 0 { - newLen = updatedMeta.Len - return nil - } - - startTS := normalizeStartTS(readTS) - boundaryReads := listPushBoundaryReadKeys(key, meta) - _, dispErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: startTS, - CommitTS: commitTS, - ReadKeys: boundaryReads, - Elems: ops, - }) - if dispErr == nil { - newLen = updatedMeta.Len - return nil - } - // Only remember the attempt for reuse if retryRedisWrite will actually - // loop — i.e. the error is one of WriteConflict / TxnLocked. For - // errors that escape the loop (transient-leader, context deadline, - // FSM apply error, etc.), `pending` would be discarded with the - // goroutine, and recording it would mislead a future reader about - // what state was preserved. The dedup window is therefore bounded by - // retryRedisWrite's retry predicate; ambiguous errors that escape - // to the client are a separate problem space (cross-request - // idempotency cache) and out of scope for this design. 
- if isRetryableRedisTxnErr(dispErr) { - pending = &reusableListPush{ - ops: ops, - startTS: startTS, - commitTS: commitTS, - length: updatedMeta.Len, - readKeys: boundaryReads, - } - } - return errors.WithStack(dispErr) - }) - return newLen, err -} - -func (r *RedisServer) listRPush(ctx context.Context, key []byte, values [][]byte) (int64, error) { - return r.listPushCore(ctx, key, values, r.buildRPushOps) -} - -// buildLPushOps creates operations to prepend values to the head of a list using -// the Delta pattern. LPUSH reverses the order of arguments: -// LPUSH key a b c → [c, b, a, ...existing]. -func (r *RedisServer) buildLPushOps(meta store.ListMeta, key []byte, values [][]byte, commitTS uint64, seqInTxn uint32) ([]*kv.Elem[kv.OP], store.ListMeta, error) { - if len(values) == 0 { - return nil, meta, nil - } - - n := int64(len(values)) - if meta.Head < math.MinInt64+n { - return nil, meta, errors.WithStack(errors.New("LPUSH would underflow list Head sequence number")) - } - elems := make([]*kv.Elem[kv.OP], 0, len(values)+1) - // LPUSH reverses args, so last arg gets the lowest sequence number. - newHead := meta.Head - n - for i, v := range values { - // values[0]=a, values[1]=b, values[2]=c → seq ordering: c(newHead), b(newHead+1), a(newHead+2) - seq := newHead + n - 1 - int64(i) - vCopy := bytes.Clone(v) - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: listItemKey(key, seq), Value: vCopy}) - } - - // Emit a Delta key instead of writing the base meta key. 
- delta := store.MarshalListMetaDelta(store.ListMetaDelta{HeadDelta: -n, LenDelta: n}) - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ListMetaDeltaKey(key, commitTS, seqInTxn), Value: delta}) - - meta.Head = newHead - meta.Len += n - return elems, meta, nil -} - -func (r *RedisServer) listLPush(ctx context.Context, key []byte, values [][]byte) (int64, error) { - return r.listPushCore(ctx, key, values, r.buildLPushOps) -} - -// clampPopCount clamps count to [1, min(listLen, maxWideColumnItems)]. -// An error is returned when the effective count would exceed maxWideColumnItems, -// which guards against OOM from enormous claim-key allocations. -func clampPopCount(count int, listLen int64) (int64, error) { - n := int64(count) - if n > listLen { - n = listLen - } - if n > int64(maxWideColumnItems) { - return 0, errors.Wrapf(ErrCollectionTooLarge, "LPOP/RPOP count %d exceeds maximum %d", n, maxWideColumnItems) - } - return n, nil -} - -// listPopClaim implements LPOP (left=true) or RPOP (left=false) using the -// Claim pattern to avoid write-write conflicts on the list metadata key. -// For each item popped it emits: -// - Del(listItemKey) — removes the item value -// - Put(listClaimKey, empty) — uniqueness guard; conflicts if another txn -// claims the same sequence number concurrently -// -// A single ListMetaDelta with {HeadDelta, LenDelta} is emitted for the whole batch. -// -// Returns the popped values (len ≤ count) or nil if the list does not exist. -func (r *RedisServer) buildListPopElems(ctx context.Context, key []byte, meta store.ListMeta, n int64, left bool, readTS uint64) ([]string, []*kv.Elem[kv.OP], error) { - // Build the [start, end) scan range covering exactly the n items to pop. - // n is already clamped to meta.Len by the caller, so no overflow is possible. 
- var startKey, endKey []byte - if left { - startKey = listItemKey(key, meta.Head) - endKey = listItemKey(key, meta.Head+n) - } else { - startKey = listItemKey(key, meta.Tail-n) - endKey = listItemKey(key, meta.Tail) - } - - var kvps []*store.KVPair - var scanErr error - if left { - kvps, scanErr = r.store.ScanAt(ctx, startKey, endKey, int(n), readTS) - } else { - kvps, scanErr = r.store.ReverseScanAt(ctx, startKey, endKey, int(n), readTS) - } - if scanErr != nil { - return nil, nil, errors.WithStack(scanErr) - } - - // Emit claim keys for every sequence position in the claimed range, including - // holes. This ensures that two concurrent pops over the same hole produce a - // write conflict rather than both silently advancing HeadDelta over the same - // empty position, which would otherwise orphan later items. - var claimStart, claimEnd int64 - if left { - claimStart = meta.Head - claimEnd = meta.Head + n - } else { - claimStart = meta.Tail - n - claimEnd = meta.Tail - } - // Capacity: n claim keys + n Del(item) for found items + 1 for the delta key appended by caller. - // n is bounded by maxWideColumnItems (100_000) so the int conversion is safe. - elems := make([]*kv.Elem[kv.OP], 0, int(n)+len(kvps)+listPopDeltaOverhead) - for seq := claimStart; seq < claimEnd; seq++ { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ListClaimKey(key, seq), Value: []byte{}}) - } - - values := make([]string, 0, len(kvps)) - for _, pair := range kvps { - _, ok := store.ExtractListItemSeq(pair.Key, key) - if !ok { - continue - } - values = append(values, string(pair.Value)) - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: bytes.Clone(pair.Key)}) - } - return values, elems, nil -} - -// checkListKeyType verifies the key is a list. Returns (keyFound, error). -// Writes wrongTypeError if the key exists but is not a list. 
-func (r *RedisServer) checkListKeyType(ctx context.Context, key []byte, readTS uint64) (found bool, err error) { - typ, typErr := r.keyTypeAt(ctx, key, readTS) - if typErr != nil { - return false, typErr - } - if typ == redisTypeNone { - return false, nil - } - if typ != redisTypeList { - return false, wrongTypeError() - } - return true, nil -} - -// listPopClaimOnce executes one attempt of a pop-with-claim transaction. -// Returns (nil, nil) for a missing key or an empty list, and the popped -// values otherwise. -func (r *RedisServer) listPopClaimOnce(ctx context.Context, key []byte, count int, left bool, readTS uint64) ([]string, error) { - found, typeErr := r.checkListKeyType(ctx, key, readTS) - if typeErr != nil || !found { - return nil, typeErr - } - - meta, exists, metaErr := r.resolveListMeta(ctx, key, readTS) - if metaErr != nil { - return nil, metaErr - } - if !exists || meta.Len == 0 { - // count >= 1 on an empty list: Redis returns nil (same as missing key). - return nil, nil - } - - n, err := clampPopCount(count, meta.Len) - if err != nil { - return nil, err - } - - values, elems, buildErr := r.buildListPopElems(ctx, key, meta, n, left, readTS) - if buildErr != nil { - return nil, buildErr - } - - if err := r.commitListPop(ctx, key, elems, n, left, readTS); err != nil { - return nil, err - } - return values, nil -} - -// commitListPop allocates commitTS, appends the ListMetaDelta entry, -// and dispatches the pop transaction. Extracted from listPopClaimOnce -// so that function stays under the cyclop ceiling after the HLC-4 -// (iii) NextFenced fence added a new error branch (PR #867 Phase 2b). -func (r *RedisServer) commitListPop(ctx context.Context, key []byte, elems []*kv.Elem[kv.OP], n int64, left bool, readTS uint64) error { - // n is the number of sequence positions claimed (including any holes). - // HeadDelta and LenDelta must use n, not len(values), so that Head - // advances past holes and the metadata stays consistent with Tail. 
- commitTS, err := r.coordinator.Clock().NextFenced() - if err != nil { - return errors.Wrap(err, "commitListPop: allocate commitTS") - } - var headDelta int64 - if left { - headDelta = n // head advances by n positions for LPOP - } - delta := store.MarshalListMetaDelta(store.ListMetaDelta{HeadDelta: headDelta, LenDelta: -n}) - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.ListMetaDeltaKey(key, commitTS, 0), - Value: delta, - }) - - if _, dispErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: normalizeStartTS(readTS), - CommitTS: commitTS, - Elems: elems, - }); dispErr != nil { - return errors.WithStack(dispErr) - } - return nil -} - -func (r *RedisServer) listPopClaim(ctx context.Context, key []byte, count int, left bool) ([]string, error) { - // count=0: Redis returns an empty array if the key exists as a list, nil otherwise. - if count <= 0 { - readTS := r.readTS() - found, err := r.checkListKeyType(ctx, key, readTS) - if err != nil || !found { - return nil, err - } - return []string{}, nil - } - - var popped []string - err := r.retryRedisWrite(ctx, func() error { - result, popErr := r.listPopClaimOnce(ctx, key, count, left, r.readTS()) - if popErr != nil { - return popErr - } - popped = result - return nil - }) - return popped, err -} - -func (r *RedisServer) fetchListRange(ctx context.Context, key []byte, meta store.ListMeta, startIdx, endIdx int64, readTS uint64) ([]string, error) { - if endIdx < startIdx { - return []string{}, nil - } - - startSeq := meta.Head + startIdx - endSeq := meta.Head + endIdx - - startKey := listItemKey(key, startSeq) - endKey := listItemKey(key, endSeq+1) // exclusive - - kvs, err := r.store.ScanAt(ctx, startKey, endKey, int(endIdx-startIdx+1), readTS) - if err != nil { - return nil, errors.WithStack(err) - } - - out := make([]string, 0, len(kvs)) - for _, kvp := range kvs { - out = append(out, string(kvp.Value)) - } - return out, nil -} - -func (r *RedisServer) rangeList(ctx 
context.Context, key []byte, startRaw, endRaw []byte) ([]string, error) { - if !r.coordinator.IsLeaderForKey(key) { - return r.proxyLRange(key, startRaw, endRaw) - } - - readTS := r.readTS() - typ, err := r.keyTypeAt(context.Background(), key, readTS) - if err != nil { - return nil, err - } - if typ == redisTypeNone { - return []string{}, nil - } - if typ != redisTypeList { - return nil, wrongTypeError() - } - - // PR #749 follow-up: pass the per-call dispatch ctx so a stalled - // VerifyLeaderForKey honours the caller's deadline rather than the - // long-lived handlerContext + verifyLeaderEngineCtx fallback. Same - // shape as keys() / FLUSHDB. - if err := r.coordinator.VerifyLeaderForKey(ctx, key); err != nil { - return nil, errors.WithStack(err) - } - - meta, exists, err := r.resolveListMeta(context.Background(), key, readTS) - if err != nil { - return nil, err - } - if !exists || meta.Len == 0 { - return []string{}, nil - } - - s, e, err := parseRangeBounds(startRaw, endRaw, int(meta.Len)) - if err != nil { - return nil, err - } - - return r.fetchListRange(context.Background(), key, meta, int64(s), int64(e), readTS) -} - -func (r *RedisServer) proxyLRange(key []byte, startRaw, endRaw []byte) ([]string, error) { - leader := r.coordinator.RaftLeaderForKey(key) - if leader == "" { - return nil, ErrLeaderNotFound - } - leaderAddr, ok := r.leaderRedis[leader] - if !ok || leaderAddr == "" { - return nil, errors.WithStack(errors.Newf("ERR leader redis address unknown for %s", leader)) - } - - cli := r.getOrCreateLeaderClient(leaderAddr) - - start, err := parseInt(startRaw) - if err != nil { - return nil, err - } - end, err := parseInt(endRaw) - if err != nil { - return nil, err - } - - ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - - res, err := cli.LRange(ctx, string(key), int64(start), int64(end)).Result() - return res, errors.WithStack(err) -} - -func (r *RedisServer) proxyRPush(key []byte, values [][]byte) (int64, 
error) { - leader := r.coordinator.RaftLeaderForKey(key) - if leader == "" { - return 0, ErrLeaderNotFound - } - leaderAddr, ok := r.leaderRedis[leader] - if !ok || leaderAddr == "" { - return 0, errors.WithStack(errors.Newf("ERR leader redis address unknown for %s", leader)) - } - - cli := r.getOrCreateLeaderClient(leaderAddr) - - args := make([]any, 0, len(values)) - for _, v := range values { - args = append(args, string(v)) - } - - ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - - res, err := cli.RPush(ctx, string(key), args...).Result() - return res, errors.WithStack(err) -} - -func (r *RedisServer) proxyLPush(key []byte, values [][]byte) (int64, error) { - leader := r.coordinator.RaftLeaderForKey(key) - if leader == "" { - return 0, ErrLeaderNotFound - } - leaderAddr, ok := r.leaderRedis[leader] - if !ok || leaderAddr == "" { - return 0, errors.WithStack(errors.Newf("ERR leader redis address unknown for %s", leader)) - } - - cli := r.getOrCreateLeaderClient(leaderAddr) - - args := make([]any, 0, len(values)) - for _, v := range values { - args = append(args, string(v)) - } - - ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - - res, err := cli.LPush(ctx, string(key), args...).Result() - return res, errors.WithStack(err) -} - -// getOrCreateLeaderClient returns a cached go-redis client for the given address, -// creating one if it doesn't exist. -func (r *RedisServer) getOrCreateLeaderClient(addr string) *redis.Client { - r.leaderClientsMu.RLock() - cli, ok := r.leaderClients[addr] - r.leaderClientsMu.RUnlock() - if ok { - return cli - } - - r.leaderClientsMu.Lock() - defer r.leaderClientsMu.Unlock() - // Double-check after acquiring write lock. 
- if cli, ok = r.leaderClients[addr]; ok { - return cli - } - cli = redis.NewClient(&redis.Options{Addr: addr}) - r.leaderClients[addr] = cli - return cli -} - -// leaderClientForKey returns a cached go-redis client connected to the leader -// for the given key. -func (r *RedisServer) leaderClientForKey(key []byte) (*redis.Client, error) { - leader := r.coordinator.RaftLeaderForKey(key) - if leader == "" { - return nil, ErrLeaderNotFound - } - leaderAddr, ok := r.leaderRedis[leader] - if !ok || leaderAddr == "" { - return nil, errors.WithStack(errors.Newf("ERR leader redis address unknown for %s", leader)) - } - return r.getOrCreateLeaderClient(leaderAddr), nil -} - -// proxyToLeader forwards a Redis command to the leader and writes the -// response to conn. Returns true if the command was proxied (caller should -// return immediately), false if this node is the leader. -func (r *RedisServer) proxyToLeader(conn redcon.Conn, cmd redcon.Command, key []byte) bool { - if r.coordinator.IsLeaderForKey(key) { - return false - } - cli, err := r.leaderClientForKey(key) - if err != nil { - writeRedisError(conn, err) - return true - } - - ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - - args := make([]interface{}, len(cmd.Args)) - for i, a := range cmd.Args { - args[i] = a - } - writeGoRedisResult(conn, cli.Do(ctx, args...)) - return true -} - -func writeGoRedisResult(conn redcon.Conn, cmd *redis.Cmd) { - val, err := cmd.Result() - if err != nil { - if errors.Is(err, redis.Nil) { - conn.WriteNull() - } else { - writeRedisError(conn, err) - } - return - } - writeGoRedisValue(conn, val) -} - -func writeGoRedisValue(conn redcon.Conn, val interface{}) { - switch v := val.(type) { - case string: - conn.WriteBulkString(v) - case []byte: - conn.WriteBulk(v) - case int64: - conn.WriteInt64(v) - case bool: - conn.WriteInt(boolToInt(v)) - case float64: - conn.WriteBulkString(strconv.FormatFloat(v, 'f', -1, 64)) - case []interface{}: - 
writeGoRedisArray(conn, v) - case nil: - conn.WriteNull() - default: - conn.WriteBulkString(fmt.Sprint(v)) - } -} - -func writeGoRedisArray(conn redcon.Conn, arr []interface{}) { - conn.WriteArray(len(arr)) - for _, item := range arr { - writeGoRedisValue(conn, item) - } -} - -func boolToInt(b bool) int { - if b { - return 1 - } - return 0 -} - -func parseInt(b []byte) (int, error) { - i, err := strconv.Atoi(string(b)) - return i, errors.WithStack(err) -} - -// tryLeaderGet proxies a GET to the current Raft leader, returning the value and -// whether the proxy succeeded. -func (r *RedisServer) tryLeaderGetAt(key []byte, ts uint64) ([]byte, error) { - addr := r.coordinator.RaftLeaderForKey(key) - if addr == "" { - return nil, ErrLeaderNotFound - } - - conn, err := r.relayConnCache.ConnFor(addr) - if err != nil { - return nil, errors.WithStack(err) - } - - ctx, cancel := context.WithTimeout(r.handlerContext(), redisRelayPublishTimeout) - defer cancel() - - cli := pb.NewRawKVClient(conn) - resp, err := cli.RawGet(ctx, &pb.RawGetRequest{Key: key, Ts: ts}) - if err != nil { - return nil, errors.WithStack(err) - } - // Compatibility with older nodes that don't set RawGetResponse.exists: - // treat any non-nil payload as found even when exists=false. - if !resp.GetExists() && resp.GetValue() == nil { - return nil, errors.WithStack(store.ErrKeyNotFound) - } - return resp.Value, nil -} - -func (r *RedisServer) readValueAt(ctx context.Context, key []byte, readTS uint64) ([]byte, error) { - ttlKey := key - nonStringInternal := false - if userKey := extractRedisInternalUserKey(key); userKey != nil { - ttlKey = userKey - // Non-string internal keys (!redis|hash|, !redis|set|, …) can never - // carry an embedded-TTL payload, so we can skip the !redis|str| probe - // that ttlAt would otherwise make. 
- nonStringInternal = !bytes.HasPrefix(key, []byte(redisStrPrefix)) - } - expired, err := r.hasExpired(context.Background(), ttlKey, readTS, nonStringInternal) - if err != nil { - return nil, err - } - if expired { - return nil, errors.WithStack(store.ErrKeyNotFound) - } - - if r.coordinator.IsLeaderForKey(key) { - // PR #749 follow-up: caller-supplied ctx (with - // redisDispatchTimeout from the dispatch handler) replaces - // r.handlerContext() so VerifyLeaderForKey honours the - // per-command deadline. Same shape as keys() / FLUSHDB. - if err := r.coordinator.VerifyLeaderForKey(ctx, key); err != nil { - return nil, errors.WithStack(err) - } - v, err := r.store.GetAt(context.Background(), key, readTS) - return v, errors.WithStack(err) - } - return r.tryLeaderGetAt(key, readTS) -} - -type listPushFunc func(ctx context.Context, key []byte, values [][]byte) (int64, error) -type listProxyFunc func(key []byte, values [][]byte) (int64, error) - -func (r *RedisServer) listPushCmd(conn redcon.Conn, cmd redcon.Command, pushFn listPushFunc, proxyFn listProxyFunc) { - key := cmd.Args[1] - if !r.coordinator.IsLeaderForKey(key) { - length, err := proxyFn(key, cmd.Args[2:]) - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt64(length) - return - } - - readTS := r.readTS() - typ, err := r.keyTypeAt(context.Background(), key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if typ != redisTypeNone && typ != redisTypeList { - conn.WriteError(wrongTypeMessage) - return - } - - ctx := context.Background() - length, err := pushFn(ctx, key, cmd.Args[2:]) - - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt64(length) -} - -func (r *RedisServer) rpush(conn redcon.Conn, cmd redcon.Command) { - r.listPushCmd(conn, cmd, r.listRPush, r.proxyRPush) -} - -func (r *RedisServer) lrange(conn redcon.Conn, cmd redcon.Command) { - ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - defer cancel() - 
items, err := r.rangeList(ctx, cmd.Args[1], cmd.Args[2], cmd.Args[3]) - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteArray(len(items)) - for _, it := range items { - conn.WriteBulkString(it) - } -} diff --git a/adapter/redis_compat_commands.go b/adapter/redis_compat_commands.go index 9bf43fd9..bb05c7a7 100644 --- a/adapter/redis_compat_commands.go +++ b/adapter/redis_compat_commands.go @@ -1,25 +1,7 @@ package adapter import ( - "bytes" - "context" - "errors" - "fmt" - "log" - "math" - "slices" - "sort" - "strconv" - "strings" "time" - - "github.com/bootjp/elastickv/kv" - "github.com/bootjp/elastickv/monitoring" - "github.com/bootjp/elastickv/store" - cockerrors "github.com/cockroachdb/errors" - "github.com/tidwall/redcon" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" ) const ( @@ -53,460 +35,6 @@ const ( wideColumnBulkScanThreshold = 16 ) -type xreadRequest struct { - block time.Duration - count int - keys [][]byte - afterIDs []string -} - -type xreadOptions struct { - block time.Duration - count int - streamsIndex int -} - -type xreadResult struct { - key []byte - entries []redisStreamEntry -} - -type xaddRequest struct { - // maxLen is -1 when no MAXLEN clause was given, 0 for explicit MAXLEN 0, - // or a positive value for MAXLEN . 
- maxLen int - id string - fields []string -} - -type zrangeOptions struct { - withScores bool - reverse bool -} - -type bzpopminResult struct { - key []byte - entry redisZSetEntry -} - -func (r *RedisServer) info(conn redcon.Conn, _ redcon.Command) { - role := "slave" - if r.coordinator != nil && r.coordinator.IsLeader() { - role = "master" - } - - leaderRedis := r.raftLeaderRedisAddr() - - conn.WriteBulkString(strings.Join([]string{ - "# Server", - "redis_version:7.2.0", - "loading:0", - "role:" + role, - "", - "# Replication", - "role:" + role, - "raft_leader_redis:" + leaderRedis, - "", - }, "\r\n")) -} - -// raftLeaderRedisAddr returns the Redis-protocol address of the current Raft -// leader as known by this node. When this node is itself the leader the -// server's own listen address is returned. An empty string is returned when -// the leader is not yet known or when the leader's Redis address is not -// configured in the leaderRedis map. -func (r *RedisServer) raftLeaderRedisAddr() string { - if r.coordinator == nil { - return "" - } - if r.coordinator.IsLeader() { - return r.redisAddr - } - leader := r.coordinator.RaftLeader() - if leader == "" { - return "" - } - return r.leaderRedis[leader] -} - -// SETEX key seconds value — equivalent to SET key value EX seconds -func (r *RedisServer) setex(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - seconds, err := strconv.ParseInt(string(cmd.Args[2]), 10, 64) - if err != nil || seconds <= 0 { - conn.WriteError("ERR invalid expire time in 'setex' command") - return - } - ttl := time.Now().Add(time.Duration(seconds) * time.Second) - - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - if err := r.saveString(ctx, cmd.Args[1], cmd.Args[3], &ttl); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteString("OK") -} - -// GETDEL key — get the value and delete the key atomically -func (r *RedisServer) 
getdel(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - key := cmd.Args[1] - - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var v []byte - err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - typ, err := r.keyTypeAt(context.Background(), key, readTS) - if err != nil { - return err - } - if typ == redisTypeNone { - v = nil - return nil - } - if typ != redisTypeString { - return wrongTypeError() - } - raw, _, err := r.readRedisStringAt(key, readTS) - if err != nil { - // Key may have expired or been deleted between type check and read. - v = nil - return nil //nolint:nilerr // treat not-found/expired as nil value - } - elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) - if err != nil { - return err - } - if err := r.dispatchElems(ctx, true, readTS, elems); err != nil { - return err - } - v = raw - return nil - }) - if err != nil { - writeRedisError(conn, err) - return - } - if v == nil { - conn.WriteNull() - return - } - conn.WriteBulk(v) -} - -// SETNX key value — set if not exists, returns 1 on success, 0 on failure -func (r *RedisServer) setnx(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - - opts := redisSetOptions{missingCond: true} - result, err := r.executeSet(ctx, cmd.Args[1], cmd.Args[2], opts) - if err != nil { - writeRedisError(conn, err) - return - } - if result.wroteNull { - conn.WriteInt(0) - return - } - conn.WriteInt(1) -} - -// clientSubcommandArgCount is the total cmd.Args length (including -// CLIENT + subcommand) required by no-operand CLIENT subcommands -// like GETNAME / ID / INFO. -const clientSubcommandArgCount = 2 - -// checkClientArity verifies cmd.Args has exactly want elements and -// writes the standard Redis wrong-arity error otherwise. 
Returns -// true when the caller should stop handling (bad arity). -func checkClientArity(conn redcon.Conn, cmd redcon.Command, sub string, want int) bool { - if len(cmd.Args) == want { - return false - } - conn.WriteError("ERR wrong number of arguments for 'client|" + strings.ToLower(sub) + "' command") - return true -} - -// clientSetName handles CLIENT SETNAME. SETNAME is shared with -// HELLO's SETNAME clause; both write into the same connState.clientName -// slot so a client that uses HELLO SETNAME once and then queries -// CLIENT GETNAME gets the right answer without having to re-issue -// CLIENT SETNAME. -func clientSetName(conn redcon.Conn, cmd redcon.Command, state *connState) { - if checkClientArity(conn, cmd, "SETNAME", clientSetNameArgCount) { - return - } - state.clientName = string(cmd.Args[2]) - conn.WriteString("OK") -} - -func clientGetName(conn redcon.Conn, cmd redcon.Command, state *connState) { - if checkClientArity(conn, cmd, "GETNAME", clientSubcommandArgCount) { - return - } - if state.clientName == "" { - conn.WriteNull() - return - } - conn.WriteBulkString(state.clientName) -} - -func (r *RedisServer) clientID(conn redcon.Conn, cmd redcon.Command, state *connState) { - if checkClientArity(conn, cmd, "ID", clientSubcommandArgCount) { - return - } - conn.WriteInt64(int64(r.ensureConnID(state))) //nolint:gosec // connID monotonic counter, guaranteed <= math.MaxInt64 in practice -} - -func (r *RedisServer) clientInfo(conn redcon.Conn, cmd redcon.Command, state *connState) { - if checkClientArity(conn, cmd, "INFO", clientSubcommandArgCount) { - return - } - id := r.ensureConnID(state) - conn.WriteBulkString(fmt.Sprintf("id=%d addr=%s name=%s", id, conn.RemoteAddr(), state.clientName)) -} - -// clientSetInfo handles CLIENT SETINFO . 
elastickv does -// not persist the advertised attributes (lib-name / lib-ver, etc.), but -// it MUST still enforce exact arity — otherwise `CLIENT SETINFO` with -// no operands returns OK and masks a client bug that real Redis would -// have surfaced as a wrong-arity error. -func clientSetInfo(conn redcon.Conn, cmd redcon.Command) { - if checkClientArity(conn, cmd, "SETINFO", clientSetInfoArgCount) { - return - } - conn.WriteString("OK") -} - -func (r *RedisServer) client(conn redcon.Conn, cmd redcon.Command) { - sub := strings.ToUpper(string(cmd.Args[1])) - state := getConnState(conn) - switch sub { - case "SETINFO": - clientSetInfo(conn, cmd) - case "SETNAME": - clientSetName(conn, cmd, state) - case "GETNAME": - clientGetName(conn, cmd, state) - case "ID": - r.clientID(conn, cmd, state) - case "INFO": - r.clientInfo(conn, cmd, state) - default: - conn.WriteError("ERR unsupported CLIENT subcommand '" + sub + "'") - } -} - -// command implements the Redis `COMMAND` family used by clients for -// capability probing at connect time (go-redis, redis-py, ioredis, …). -// Subcommand matrix: -// -// COMMAND -> array of per-command info -// COMMAND COUNT -> integer -// COMMAND LIST -> array of names (FILTERBY rejected) -// COMMAND INFO [name ...] -> array of per-command info (nil per unknown) -// COMMAND DOCS [name ...] -> minimal map-shaped doc entries -// COMMAND GETKEYS cmd args -> array of extracted keys -// COMMAND GETKEYSANDFLAGS -> ERR unsupported -func (r *RedisServer) command(conn redcon.Conn, cmd redcon.Command) { - if len(cmd.Args) == 1 { - r.writeCommandInfoAll(conn) - return - } - sub := strings.ToUpper(string(cmd.Args[1])) - switch sub { - case "COUNT": - // COUNT must match the cardinality of COMMAND / COMMAND LIST — - // which iterate argsLen (= routed set). 
The table has the same - // size by invariant, but driving COUNT off argsLen keeps the - // three subcommands wire-consistent even during the brief - // window when a new route has been added but the table row is - // still pending. - conn.WriteInt(len(argsLen)) - case "LIST": - // `COMMAND LIST` takes no args (bare list) or `FILTERBY …` which we - // reject below. Anything past the subcommand slot is a filter. - const commandListArgFixed = 2 - if len(cmd.Args) > commandListArgFixed { - // We explicitly do not support FILTERBY MODULE|ACLCAT|PATTERN - // — elastickv has no modules and no ACL categories. Rejecting - // here is consistent with how real Redis would behave when a - // filter resolves to an empty universe; clients that see this - // fall back to COMMAND (no args), which we support. - conn.WriteError("ERR unsupported COMMAND LIST filter") - return - } - r.writeCommandList(conn) - case "INFO": - r.writeCommandInfo(conn, cmd.Args[2:]) - case "DOCS": - r.writeCommandDocs(conn, cmd.Args[2:]) - case "GETKEYS": - r.writeCommandGetKeys(conn, cmd.Args[2:]) - case "GETKEYSANDFLAGS": - conn.WriteError("ERR unsupported COMMAND subcommand 'GETKEYSANDFLAGS'") - default: - conn.WriteError("ERR Unknown COMMAND subcommand '" + sub + "'") - } -} - -// writeCommandInfoEntry emits the 6-element per-command info array for a -// single command. Redis 7 extends this to 10 elements; we deliberately -// stop at 6 because every client we care about parses the first 6 fields -// and ignores trailing elements. 
-func writeCommandInfoEntry(conn redcon.Conn, meta redisCommandMeta) { - const infoArity = 6 - conn.WriteArray(infoArity) - conn.WriteBulkString(meta.Name) - conn.WriteInt(meta.Arity) - conn.WriteArray(len(meta.Flags)) - for _, f := range meta.Flags { - conn.WriteBulkString(f) - } - conn.WriteInt(meta.FirstKey) - conn.WriteInt(meta.LastKey) - conn.WriteInt(meta.Step) -} - -func (r *RedisServer) writeCommandInfoAll(conn redcon.Conn) { - metas := routedRedisCommandMetas() - conn.WriteArray(len(metas)) - for _, meta := range metas { - writeCommandInfoEntry(conn, meta) - } -} - -func (r *RedisServer) writeCommandList(conn redcon.Conn) { - metas := routedRedisCommandMetas() - conn.WriteArray(len(metas)) - for _, meta := range metas { - conn.WriteBulkString(meta.Name) - } -} - -func (r *RedisServer) writeCommandInfo(conn redcon.Conn, requested [][]byte) { - // `COMMAND INFO` with no names is equivalent to `COMMAND` (no args): - // return info for every known command. This is what real Redis does - // and what go-redis relies on when it issues bare `COMMAND INFO`. - if len(requested) == 0 { - r.writeCommandInfoAll(conn) - return - } - conn.WriteArray(len(requested)) - for _, raw := range requested { - meta, ok := redisCommandTable[strings.ToUpper(string(raw))] - if !ok { - conn.WriteNull() - continue - } - writeCommandInfoEntry(conn, meta) - } -} - -// writeCommandDocs emits the RESP2 flat-map form of COMMAND DOCS: -// alternating command-name keys and 4-element doc-maps with "summary" -// and "arguments" fields. Two compliance-critical behaviours: -// -// 1. Bare `COMMAND DOCS` (no names) returns docs for ALL routed -// commands, identical to how `COMMAND INFO` and bare `COMMAND` -// behave. Clients/tools like redis-cli --docs rely on this. -// 2. Every requested entry writes BOTH the command-name key AND the -// doc map value. Clients decode the top-level array as a map of -// name -> docs, so skipping the name key makes the reply -// unparseable. 
Unknown commands emit the requested name followed -// by nil (Redis semantics). -// -// We do not maintain per-command docs, so summary is "" and arguments -// is empty. The wire-shape is what clients care about at connect time. -func (r *RedisServer) writeCommandDocs(conn redcon.Conn, requested [][]byte) { - const docEntryLen = 4 - // Bare DOCS (no command names): iterate the routed set so the - // reply mirrors `COMMAND` / `COMMAND INFO` / `COMMAND LIST`. - if len(requested) == 0 { - metas := routedRedisCommandMetas() - // Two wire slots per command (name + doc map). - conn.WriteArray(len(metas) * 2) //nolint:mnd // 2 = (name, docs) pair - for _, meta := range metas { - conn.WriteBulkString(meta.Name) - conn.WriteArray(docEntryLen) - conn.WriteBulkString("summary") - conn.WriteBulkString("") - conn.WriteBulkString("arguments") - conn.WriteArray(0) - } - return - } - // Explicit names: preserve the caller-supplied order so a client - // that expects its own request ordering back (e.g. for building a - // lookup table) is not surprised. Each pair is (name, docs) or - // (name, nil) for unknowns. - conn.WriteArray(len(requested) * 2) //nolint:mnd // 2 = (name, docs) pair - for _, raw := range requested { - name := string(raw) - meta, ok := redisCommandTable[strings.ToUpper(name)] - if !ok { - conn.WriteBulkString(name) - conn.WriteNull() - continue - } - conn.WriteBulkString(meta.Name) - conn.WriteArray(docEntryLen) - conn.WriteBulkString("summary") - conn.WriteBulkString("") - conn.WriteBulkString("arguments") - conn.WriteArray(0) - } -} - -// writeCommandGetKeys dispatches COMMAND GETKEYS for a given subcommand -// plus its arguments. Real Redis requires at least one arg after GETKEYS -// (the command name itself); we enforce that here rather than lean on -// argsLen which only validates the outer COMMAND call. 
-func (r *RedisServer) writeCommandGetKeys(conn redcon.Conn, argv [][]byte) { - if len(argv) == 0 { - conn.WriteError("ERR wrong number of arguments for 'command|getkeys' command") - return - } - meta, ok := redisCommandTable[strings.ToUpper(string(argv[0]))] - if !ok { - conn.WriteError("ERR Invalid command specified") - return - } - // validate arity of the nested command so we match Redis behaviour of - // refusing to compute keys for obviously malformed commands (a common - // source of confusion in client test suites). - switch { - case meta.Arity > 0 && len(argv) != meta.Arity: - conn.WriteError("ERR Invalid arguments specified for populating the array of keys") - return - case meta.Arity < 0 && len(argv) < -meta.Arity: - conn.WriteError("ERR Invalid arguments specified for populating the array of keys") - return - } - keys := redisCommandGetKeys(meta, argv) - if len(keys) == 0 { - // `The command has no key arguments` — real Redis returns an error - // in this case rather than an empty array, and go-redis's test - // suite expects the error form. - conn.WriteError("ERR The command has no key arguments") - return - } - conn.WriteArray(len(keys)) - for _, k := range keys { - conn.WriteBulk(k) - } -} - // HELLO reply and protocol constants. Kept as named constants so the // linter's "no magic numbers" rule accepts the wire-format values. const ( @@ -541,28 +69,6 @@ const ( clientSetInfoArgCount = 4 ) -// helloParseError is the internal signal used by parseHelloArgs to -// surface a client-facing error without forcing the top-level hello -// handler to pay for additional branches. The caller writes err to -// the wire verbatim. -type helloParseError struct{ msg string } - -func (e *helloParseError) Error() string { return e.msg } - -// parseHelloArgs walks the optional HELLO argument list and mutates -// connState for any recognized options. Returns a non-nil error -// containing the exact wire-format string to emit via WriteError. 
-// Split out of hello() so the handler's cyclomatic complexity stays -// within the linter's budget. -// parsedHelloOption is the pure-function result of a single option -// token. advance is the number of input args consumed. Exactly one -// of (advance > 0) or (err != nil) is non-zero. -type parsedHelloOption struct { - name string - hasName bool - advance int -} - const ( // helloAuthOptionArity is the total token count a HELLO AUTH // clause consumes: keyword + username + password. @@ -573,4862 +79,3 @@ const ( // sentinel used by XREAD '$' on an empty or missing stream. streamZeroID = "0-0" ) - -// parseHelloOption decodes one HELLO option starting at args[0] (the -// option keyword). Returns how many input tokens the option consumed -// and any client-side staging it wants applied. -func parseHelloOption(args [][]byte) (parsedHelloOption, error) { - opt := strings.ToUpper(string(args[0])) - switch opt { - case "AUTH": - if len(args) < helloAuthOptionArity { - return parsedHelloOption{}, &helloParseError{msg: "ERR Syntax error in HELLO AUTH"} - } - // elastickv's Redis adapter has no AUTH layer. Rejecting rather - // than silently accepting keeps operators honest. 
- return parsedHelloOption{}, &helloParseError{msg: "NOPERM HELLO AUTH is not supported"} - case "SETNAME": - if len(args) < helloSetNameOptionArity { - return parsedHelloOption{}, &helloParseError{msg: "ERR Syntax error in HELLO SETNAME"} - } - return parsedHelloOption{ - name: string(args[1]), - hasName: true, - advance: helloSetNameOptionArity, - }, nil - default: - return parsedHelloOption{}, &helloParseError{msg: "ERR Syntax error in HELLO option '" + opt + "'"} - } -} - -func parseHelloArgs(state *connState, args [][]byte) error { - if len(args) == 0 { - return nil - } - protover, err := strconv.Atoi(string(args[0])) - if err != nil || protover != helloReplyProto { - // Non-numeric, RESP3 (3), or any other requested version: - // reject with NOPROTO so well-behaved clients fall back to - // RESP2. - return &helloParseError{msg: "NOPROTO unsupported protocol version"} - } - // Buffer side effects locally so a partial parse (e.g. SETNAME - // followed by a bad option or AUTH) leaves connState untouched — - // the command must be all-or-nothing, matching real Redis. - var ( - pendingName string - pendingNameSet bool - ) - for i := 1; i < len(args); { - opt, err := parseHelloOption(args[i:]) - if err != nil { - return err - } - if opt.hasName { - pendingName = opt.name - pendingNameSet = true - } - i += opt.advance - } - if pendingNameSet { - state.clientName = pendingName - } - return nil -} - -// hello implements the Redis HELLO command. Syntax: -// -// HELLO [protover [AUTH username password] [SETNAME clientname]] -// -// elastickv speaks RESP2 only (redcon is RESP2-only and exposes no -// RESP3 map-reply API), so: -// -// - No protover, or protover == 2: succeed and return the server-info -// array. -// - protover == 3 or any other non-2 value: reply with the -// NOPROTO error the real Redis server uses when a client requests -// an unsupported protocol version. go-redis and friends fall back -// to RESP2 when they see this. 
-// - AUTH is rejected because elastickv has no auth layer wired into -// the Redis adapter; silently accepting any credentials would be a -// security footgun for operators who assume AUTH means something. -// We return a NOPERM-style error so clients surface a clear error -// rather than assuming auth succeeded. -// - SETNAME is wired into the existing connState.clientName slot, so -// a subsequent CLIENT GETNAME observes the name set here. -func (r *RedisServer) hello(conn redcon.Conn, cmd redcon.Command) { - state := getConnState(conn) - if err := parseHelloArgs(state, cmd.Args[1:]); err != nil { - writeRedisError(conn, err) - return - } - - role := "slave" - if r.coordinator != nil && r.coordinator.IsLeader() { - role = "master" - } - id := r.ensureConnID(state) - - // Reply as a flat RESP2 array of alternating key/value pairs, the - // same wire shape Redis uses when a client negotiates RESP2 via - // HELLO. Order matches real Redis so clients that parse - // positionally (jedis has done this historically) still work. - conn.WriteArray(helloReplyArrayLen) - conn.WriteBulkString("server") - conn.WriteBulkString("redis") - conn.WriteBulkString("version") - conn.WriteBulkString(helloReplyVersion) - conn.WriteBulkString("proto") - conn.WriteInt(helloReplyProto) - conn.WriteBulkString("id") - conn.WriteInt64(int64(id)) //nolint:gosec // connID monotonic counter, fits in int64 in practice. 
- conn.WriteBulkString("mode") - conn.WriteBulkString("standalone") - conn.WriteBulkString("role") - conn.WriteBulkString(role) - conn.WriteBulkString("modules") - conn.WriteArray(0) -} - -func (r *RedisServer) selectDB(conn redcon.Conn, cmd redcon.Command) { - if _, err := strconv.Atoi(string(cmd.Args[1])); err != nil { - conn.WriteError("ERR invalid DB index") - return - } - conn.WriteString("OK") -} - -func (r *RedisServer) quit(conn redcon.Conn, _ redcon.Command) { - conn.WriteString("OK") - _ = conn.Close() -} - -func (r *RedisServer) typeCmd(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - typ, err := r.keyType(context.Background(), cmd.Args[1]) - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteString(string(typ)) -} - -func (r *RedisServer) ttl(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - r.writeTTL(conn, cmd.Args[1], false) -} - -func (r *RedisServer) pttl(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - r.writeTTL(conn, cmd.Args[1], true) -} - -func (r *RedisServer) writeTTL(conn redcon.Conn, key []byte, milliseconds bool) { - readTS := r.readTS() - exists, err := r.logicalExistsAt(context.Background(), key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if !exists { - conn.WriteInt64(-2) - return - } - ttl, err := r.ttlAt(context.Background(), key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - ms := ttlMilliseconds(ttl) - if ms == -1 { - conn.WriteInt64(-1) - return - } - if !milliseconds && ms >= 0 { - ms /= 1000 - } - conn.WriteInt64(ms) -} - -func (r *RedisServer) expire(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - r.setExpire(conn, cmd, time.Second) -} - -func (r *RedisServer) pexpire(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) 
{ - return - } - r.setExpire(conn, cmd, time.Millisecond) -} - -func parseExpireNXOnly(args [][]byte) (bool, error) { - nxOnly := false - for _, arg := range args { - if !strings.EqualFold(string(arg), "NX") { - return false, errors.New("ERR syntax error") - } - nxOnly = true - } - return nxOnly, nil -} - -func hasActiveTTL(ttl *time.Time, now time.Time) bool { - return ttl != nil && ttl.After(now) -} - -func parseExpireTTL(raw []byte) (int64, error) { - ttl, err := strconv.ParseInt(string(raw), 10, 64) - if err != nil { - return 0, fmt.Errorf("parse expire ttl: %w", err) - } - return ttl, nil -} - -func (r *RedisServer) prepareExpire(key []byte, nxOnly bool) (uint64, bool, error) { - readTS := r.readTS() - exists, err := r.logicalExistsAt(context.Background(), key, readTS) - if err != nil { - return 0, false, err - } - if !exists { - return readTS, false, nil - } - - if !nxOnly { - return readTS, true, nil - } - - currentTTL, err := r.ttlAt(context.Background(), key, readTS) - if err != nil { - return 0, false, err - } - return readTS, !hasActiveTTL(currentTTL, time.Now()), nil -} - -func (r *RedisServer) setExpire(conn redcon.Conn, cmd redcon.Command, unit time.Duration) { - ttl, err := parseExpireTTL(cmd.Args[2]) - if err != nil { - writeRedisError(conn, err) - return - } - - nxOnly, err := parseExpireNXOnly(cmd.Args[3:]) - if err != nil { - writeRedisError(conn, err) - return - } - - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - - // Pin expireAt once before the retry loop so successive attempts all write - // the same wall-clock deadline (OCC retries must not push expiry forward). 
- var expireAt time.Time - if ttl > 0 { - if ttl > math.MaxInt64/int64(unit) { - conn.WriteError("ERR invalid expire time in command") - return - } - expireAt = time.Now().Add(time.Duration(ttl) * unit) - } - - var result int - if err := r.retryRedisWrite(ctx, func() error { - var retErr error - result, retErr = r.doSetExpire(ctx, cmd.Args[1], ttl, expireAt, nxOnly) - return retErr - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(result) -} - -// doSetExpire is the inner body of setExpire's retryRedisWrite loop. -// All reads (existence, type, value) use the same readTS snapshot so they form -// a consistent view. The subsequent dispatchElems calls use IsTxn=true with -// StartTS=readTS, which causes coordinator.Dispatch to reject the write with -// ErrWriteConflict if any touched key was modified after readTS. retryRedisWrite -// then re-invokes doSetExpire with a fresh readTS, providing OCC safety without -// an explicit mutex. Leadership is verified by coordinator.Dispatch itself. -func (r *RedisServer) doSetExpire(ctx context.Context, key []byte, ttl int64, expireAt time.Time, nxOnly bool) (int, error) { - readTS, eligible, err := r.prepareExpire(key, nxOnly) - if err != nil { - return 0, err - } - if !eligible { - return 0, nil - } - if ttl <= 0 { - return r.expireDeleteKey(ctx, key, readTS) - } - typ, err := r.rawKeyTypeAt(ctx, key, readTS) - if err != nil { - return 0, err - } - if typ == redisTypeString { - // rawKeyTypeAt also reports HLL as redisTypeString; HLL payloads live - // under !redis|hll| and don't carry an inline TTL, so fall back - // to the legacy scan-index path for them. 
- plain, err := r.isPlainRedisString(ctx, key, readTS) - if err != nil { - return 0, err - } - if plain { - applied, err := r.dispatchStringExpire(ctx, key, readTS, expireAt) - if err != nil || !applied { - return 0, err - } - return 1, nil - } - } - elems := []*kv.Elem[kv.OP]{{Op: kv.Put, Key: redisTTLKey(key), Value: encodeRedisTTL(expireAt)}} - return 1, r.dispatchElems(ctx, true, readTS, elems) -} - -// isPlainRedisString distinguishes a plain Redis string (stored under -// !redis|str| or, for legacy data, the bare key) from a HyperLogLog -// (stored under !redis|hll|), both of which rawKeyTypeAt reports as -// redisTypeString. -func (r *RedisServer) isPlainRedisString(ctx context.Context, key []byte, readTS uint64) (bool, error) { - exists, err := r.store.ExistsAt(ctx, redisStrKey(key), readTS) - if err != nil { - return false, cockerrors.WithStack(err) - } - if exists { - return true, nil - } - // Fall back to the bare legacy layout. - legacy, err := r.store.ExistsAt(ctx, key, readTS) - if err != nil { - return false, cockerrors.WithStack(err) - } - return legacy, nil -} - -func (r *RedisServer) expireDeleteKey(ctx context.Context, key []byte, readTS uint64) (int, error) { - elems, existed, err := r.deleteLogicalKeyElems(ctx, key, readTS) - if err != nil { - return 0, err - } - if err := r.dispatchElems(ctx, true, readTS, elems); err != nil { - return 0, err - } - if existed { - return 1, nil - } - return 0, nil -} - -// dispatchStringExpire performs a read-modify-write on the string anchor key: -// it reads the current value at readTS, re-encodes it with the new expiry, and -// writes both the updated value and the !redis|ttl| scan index in a single Raft -// entry (IsTxn=true, StartTS=readTS). The coordinator rejects the write with -// ErrWriteConflict if any key was modified after readTS, so stale-data safety is -// guaranteed by OCC — no explicit mutex is required. 
-func (r *RedisServer) dispatchStringExpire(ctx context.Context, key []byte, readTS uint64, expireAt time.Time) (bool, error) { - userValue, _, readErr := r.readRedisStringAt(key, readTS) - if readErr != nil { - if cockerrors.Is(readErr, store.ErrKeyNotFound) { - // Raced with a delete/expiry between prepareExpire and this read; - // do not resurrect the key with an empty anchor. - return false, nil - } - return false, cockerrors.WithStack(readErr) - } - encoded := encodeRedisStr(userValue, &expireAt) - elems := []*kv.Elem[kv.OP]{ - {Op: kv.Put, Key: redisStrKey(key), Value: encoded}, - {Op: kv.Put, Key: redisTTLKey(key), Value: encodeRedisTTL(expireAt)}, - } - return true, r.dispatchElems(ctx, true, readTS, elems) -} - -func parseScanArgs(args [][]byte) (int, []byte, int, error) { - cursor, err := strconv.Atoi(string(args[1])) - if err != nil || cursor < 0 { - return 0, nil, 0, errors.New("ERR invalid cursor") - } - - pattern := []byte("*") - count := 10 - for i := redisPairWidth; i < len(args); i += redisPairWidth { - if i+1 >= len(args) { - return 0, nil, 0, errors.New("ERR syntax error") - } - switch strings.ToUpper(string(args[i])) { - case "MATCH": - pattern = args[i+1] - case redisKeywordCount: - count, err = strconv.Atoi(string(args[i+1])) - if err != nil || count <= 0 { - return 0, nil, 0, errors.New("ERR syntax error") - } - default: - return 0, nil, 0, errors.New("ERR syntax error") - } - } - return cursor, pattern, count, nil -} - -func writeScanReply(conn redcon.Conn, next int, keys [][]byte) { - conn.WriteArray(redisPairWidth) - conn.WriteBulkString(strconv.Itoa(next)) - conn.WriteArray(len(keys)) - for _, key := range keys { - conn.WriteBulk(key) - } -} - -func (r *RedisServer) scan(conn redcon.Conn, cmd redcon.Command) { - cursor, pattern, count, err := parseScanArgs(cmd.Args) - if err != nil { - writeRedisError(conn, err) - return - } - - keys, err := r.visibleKeys(pattern) - if err != nil { - writeRedisError(conn, err) - return - } - if cursor >= 
len(keys) { - writeScanReply(conn, 0, nil) - return - } - - end := minRedisInt(cursor+count, len(keys)) - next := 0 - if end < len(keys) { - next = end - } - - writeScanReply(conn, next, keys[cursor:end]) -} - -func (r *RedisServer) publish(conn redcon.Conn, cmd redcon.Command) { - count := r.publishCluster(context.Background(), cmd.Args[1], cmd.Args[2]) - if r.traceCommands { - log.Printf("redis trace publish remote=%s channel=%q subscribers=%d", conn.RemoteAddr(), string(cmd.Args[1]), count) - } - conn.WriteInt64(count) -} - -func (r *RedisServer) subscribe(conn redcon.Conn, cmd redcon.Command) { - for _, channel := range cmd.Args[1:] { - r.pubsub.Subscribe(conn, string(channel)) - } -} - -func (r *RedisServer) dbsize(conn redcon.Conn, _ redcon.Command) { - if !r.coordinator.IsLeader() { - size, err := r.proxyDBSize() - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(size) - return - } - if err := r.coordinator.VerifyLeader(r.handlerContext()); err != nil { - writeRedisError(conn, err) - return - } - - keys, err := r.visibleKeys([]byte("*")) - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(len(keys)) -} - -func (r *RedisServer) flushdb(conn redcon.Conn, _ redcon.Command) { - r.flushDatabase(conn, false) -} - -func (r *RedisServer) flushall(conn redcon.Conn, _ redcon.Command) { - r.flushDatabase(conn, true) -} - -// deleteLegacyKeys scans the full keyspace and deletes keys that do not belong -// to any known internal prefix. Returns the number of user-visible legacy keys -// deleted. TTL keys are intentionally NOT deleted because the !redis|ttl| -// namespace is shared across all Redis types — deleting them could strip -// expiration from already-migrated or newly-created keys. 
-func (r *RedisServer) deleteLegacyKeys(ctx context.Context, readTS uint64) (int, error) { - const batchSize = 1000 - var totalDeleted int - cursor := make([]byte, 0, batchSize) - for { - kvs, err := r.store.ScanAt(ctx, cursor, nil, batchSize, readTS) - if err != nil { - return totalDeleted, fmt.Errorf("scan: %w", err) - } - if len(kvs) == 0 { - break - } - - elems := make([]*kv.Elem[kv.OP], 0, len(kvs)) - legacyCount := 0 - for _, pair := range kvs { - if !isKnownInternalKey(pair.Key) { - legacyCount++ - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: pair.Key}) - } - } - - if len(elems) > 0 { - if err := r.dispatchElems(ctx, false, readTS, elems); err != nil { - return totalDeleted, err - } - totalDeleted += legacyCount - } - - // Advance cursor past the last key in this batch. - lastKey := kvs[len(kvs)-1].Key - cursor = make([]byte, len(lastKey)+1) - copy(cursor, lastKey) - - // Yield briefly between batches to avoid saturating the Raft log. - time.Sleep(time.Millisecond) - } - return totalDeleted, nil -} - -// flushlegacy deletes old unprefixed Redis string keys that were written before -// the !redis|str| prefix migration. It scans all keys and deletes those that -// do not match any known internal prefix. This is a one-time migration operation. 
-func (r *RedisServer) flushlegacy(conn redcon.Conn, _ redcon.Command) { - if !r.coordinator.IsLeader() { - n, err := r.proxyFlushLegacy() - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(n) - return - } - - ctx, cancel := context.WithTimeout(context.Background(), redisFlushLegacyTimeout) - defer cancel() - - totalDeleted, err := r.deleteLegacyKeys(ctx, r.readTS()) - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(totalDeleted) -} - -func (r *RedisServer) flushDatabase(conn redcon.Conn, all bool) { - if !r.coordinator.IsLeader() { - if err := r.proxyFlushDatabase(all); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteString("OK") - return - } - - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - - if err := r.retryRedisWrite(ctx, func() error { - // Use the per-call ctx with redisDispatchTimeout, NOT - // handlerContext (the long-lived server baseCtx). FLUSHDB's - // retry budget already lives in ctx; routing it to - // VerifyLeader keeps the whole command bounded. - if err := r.coordinator.VerifyLeader(ctx); err != nil { - return fmt.Errorf("verify leader: %w", err) - } - - // Delete only Redis-related keys. Each DEL_PREFIX operation must be - // dispatched separately because the FSM processes only one DEL_PREFIX - // per request (the first mutation). - // - // Namespaces covered: - // "!redis|" — str, legacy hash/set/zset/hll/stream, ttl - // "!lst|" — list meta + items - // "!zs|" — zset wide-column - // "!hs|" — hash wide-column meta/field/delta - // "!st|" — set wide-column meta/member/delta - // - // Legacy bare keys are NOT deleted here to avoid a full keyspace - // scan. Run FLUSHLEGACY first to clean up legacy data. - // - // All prefixes are attempted even if one dispatch fails so that we - // delete as many namespaces as possible before reporting errors. 
- var combined error - for _, prefix := range [][]byte{ - []byte("!redis|"), - []byte("!lst|"), - []byte("!zs|"), - []byte("!hs|"), - []byte("!st|"), - } { - if _, err := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - Elems: []*kv.Elem[kv.OP]{ - {Op: kv.DelPrefix, Key: prefix}, - }, - }); err != nil { - combined = cockerrors.CombineErrors(combined, fmt.Errorf("dispatch del_prefix %q: %w", prefix, err)) - } - } - return cockerrors.WithStack(combined) - }); err != nil { - writeRedisError(conn, err) - return - } - - conn.WriteString("OK") -} - -func (r *RedisServer) pubsubCmd(conn redcon.Conn, cmd redcon.Command) { - switch strings.ToUpper(string(cmd.Args[1])) { - case "CHANNELS": - r.writePubSubChannels(conn, cmd.Args) - case "NUMSUB": - r.writePubSubNumSub(conn, cmd.Args) - case "NUMPAT": - conn.WriteInt(0) - default: - conn.WriteError("ERR unsupported PUBSUB subcommand '" + string(cmd.Args[1]) + "'") - } -} - -func (r *RedisServer) writePubSubChannels(conn redcon.Conn, args [][]byte) { - pattern := []byte("*") - if len(args) >= pubsubPatternArgMin { - pattern = args[pubsubFirstChannel] - } - - counts := r.pubsubChannelCounts() - channels := make([]string, 0, len(counts)) - for channel, count := range counts { - if count <= 0 || !matchesAsteriskPattern(pattern, []byte(channel)) { - continue - } - channels = append(channels, channel) - } - - sort.Strings(channels) - conn.WriteArray(len(channels)) - for _, channel := range channels { - conn.WriteBulkString(channel) - } -} - -func (r *RedisServer) writePubSubNumSub(conn redcon.Conn, args [][]byte) { - channels := args[pubsubFirstChannel:] - snapshot := r.pubsubChannelCounts() - - conn.WriteArray(len(channels) * redisPairWidth) - for _, channel := range channels { - conn.WriteBulk(channel) - conn.WriteInt(snapshot[string(channel)]) - } -} - -func (r *RedisServer) pubsubChannelCounts() map[string]int { - return r.pubsub.ChannelCounts() -} - -func (r *RedisServer) sadd(conn redcon.Conn, cmd redcon.Command) { - 
if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - r.mutateExactSet(conn, setKind, cmd.Args[1], cmd.Args[2:], true) -} - -func (r *RedisServer) srem(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - r.mutateExactSet(conn, setKind, cmd.Args[1], cmd.Args[2:], false) -} - -func (r *RedisServer) validateExactSetKind(kind string, key []byte, readTS uint64) error { - typ, err := r.keyTypeAt(context.Background(), key, readTS) - if err != nil { - return err - } - - switch kind { - case setKind: - return r.validateExactSetType(typ, key, readTS) - case hllKind: - return r.validateExactHLLType(typ, key, readTS) - default: - return errors.New("ERR unsupported exact set kind") - } -} - -func (r *RedisServer) hllExistsAt(key []byte, readTS uint64) (bool, error) { - exists, err := r.store.ExistsAt(context.Background(), redisHLLKey(key), readTS) - if err != nil { - return false, fmt.Errorf("exists hll: %w", err) - } - return exists, nil -} - -func (r *RedisServer) validateExactSetType(typ redisValueType, key []byte, readTS uint64) error { - if typ == redisTypeSet { - return nil - } - if typ != redisTypeNone { - return wrongTypeError() - } - - hllExists, err := r.hllExistsAt(key, readTS) - if err != nil { - return err - } - if hllExists { - return wrongTypeError() - } - return nil -} - -func (r *RedisServer) validateExactHLLType(typ redisValueType, key []byte, readTS uint64) error { - if typ == redisTypeNone { - return nil - } - - hllExists, err := r.hllExistsAt(key, readTS) - if err != nil { - return err - } - if !hllExists { - return wrongTypeError() - } - return nil -} - -func exactSetMembers(value redisSetValue) map[string]struct{} { - members := make(map[string]struct{}, len(value.Members)) - for _, member := range value.Members { - members[member] = struct{}{} - } - return members -} - -func applyExactSetMutation(existing map[string]struct{}, members [][]byte, add bool) int { - changed := 0 - for _, member := range 
members { - memberKey := string(member) - _, ok := existing[memberKey] - if add { - if ok { - continue - } - existing[memberKey] = struct{}{} - changed++ - continue - } - if ok { - delete(existing, memberKey) - changed++ - } - } - return changed -} - -func sortedExactSetMembers(existing map[string]struct{}) []string { - out := make([]string, 0, len(existing)) - for member := range existing { - out = append(out, member) - } - sort.Strings(out) - return out -} - -func (r *RedisServer) persistExactSetMembersTxn(ctx context.Context, kind string, key []byte, readTS uint64, members map[string]struct{}) error { - if kind != setKind { - // HLL and other non-set kinds keep using the legacy blob format. - if len(members) == 0 { - elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) - if err != nil { - return err - } - return r.dispatchElems(ctx, true, readTS, elems) - } - payload, err := marshalSetValue(redisSetValue{Members: sortedExactSetMembers(members)}) - if err != nil { - return err - } - return r.dispatchElems(ctx, true, readTS, []*kv.Elem[kv.OP]{ - {Op: kv.Put, Key: redisExactSetStorageKey(kind, key), Value: payload}, - }) - } - // Wide-column set: full rewrite (used when the whole state is available). - if len(members) == 0 { - elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) - if err != nil { - return err - } - return r.dispatchElems(ctx, true, readTS, elems) - } - elems := make([]*kv.Elem[kv.OP], 0, len(members)+setWideColOverhead) - for member := range members { - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.SetMemberKey(key, []byte(member)), - Value: []byte{}, - }) - } - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.SetMetaKey(key), - Value: store.MarshalSetMeta(store.SetMeta{Len: int64(len(members))}), - }) - // Remove legacy blob if present. 
- elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisSetKey(key)}) - return r.dispatchElems(ctx, true, readTS, elems) -} - -// applySetMemberMutation emits a Put or Del for one set member and returns the -// change count (1) and the signed length delta (+1 or -1), or (0, 0) if no change. -func applySetMemberMutation(elems []*kv.Elem[kv.OP], memberKey []byte, exists, add bool) ([]*kv.Elem[kv.OP], int, int64) { - if add && !exists { - return append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: memberKey, Value: []byte{}}), 1, 1 - } - if !add && exists { - return append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: memberKey}), 1, -1 - } - return elems, 0, 0 -} - -// mutateExactSetLegacy handles SADD/SREM for non-set kinds (e.g. HLL) via the legacy blob path. -func (r *RedisServer) mutateExactSetLegacy(conn redcon.Conn, ctx context.Context, kind string, key []byte, members [][]byte, add bool) { - var changed int - if err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - if err := r.validateExactSetKind(kind, key, readTS); err != nil { - return err - } - value, err := r.loadSetAt(context.Background(), kind, key, readTS) - if err != nil { - return err - } - existing := exactSetMembers(value) - changed = applyExactSetMutation(existing, members, add) - if changed == 0 { - return nil - } - return r.persistExactSetMembersTxn(ctx, kind, key, readTS, existing) - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(changed) -} - -// mutateExactSetWide handles SADD/SREM for the wide-column set path. 
-func (r *RedisServer) mutateExactSetWide(conn redcon.Conn, ctx context.Context, key []byte, members [][]byte, add bool) { - var changed int - if err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - if err := r.validateExactSetKind(setKind, key, readTS); err != nil { - return err - } - - commitTS, err := r.coordinator.Clock().NextFenced() - if err != nil { - return cockerrors.Wrap(err, "mutateExactSetWide: allocate commitTS") - } - - migrationElems, migErr := r.buildSetLegacyMigrationElems(ctx, key, readTS) - if migErr != nil { - return migErr - } - elems := make([]*kv.Elem[kv.OP], 0, len(migrationElems)+len(members)+setWideColOverhead) - elems = append(elems, migrationElems...) - - // Extract legacy member names from migration ops so that applySetMemberMutations - // can treat them as already-existing (they are not yet visible at readTS). - legacyMemberBase := buildLegacySetMemberBase(migrationElems, key) - - var lenDelta int64 - var mutErr error - elems, changed, lenDelta, mutErr = r.applySetMemberMutations(ctx, key, members, add, readTS, elems, legacyMemberBase) - if mutErr != nil { - return mutErr - } - - if changed == 0 && len(migrationElems) == 0 { - return nil - } - - if lenDelta != 0 { - deltaVal := store.MarshalSetMetaDelta(store.SetMetaDelta{LenDelta: lenDelta}) - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.SetMetaDeltaKey(key, commitTS, 0), - Value: deltaVal, - }) - } - - if len(elems) == 0 { - return nil - } - - _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: normalizeStartTS(readTS), - CommitTS: commitTS, - Elems: elems, - }) - return cockerrors.WithStack(dispatchErr) - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(changed) -} - -// scanSetMemberExistsMap does a paginated prefix scan of all member keys for -// the given set and returns a map from member name to struct{}{}. 
-// Using a single prefix scan eliminates the per-member ExistsAt round-trip. -func (r *RedisServer) scanSetMemberExistsMap(ctx context.Context, key []byte, readTS uint64) (map[string]struct{}, error) { - return r.scanKeyExistsMap(ctx, store.SetMemberScanPrefix(key), readTS, - func(k []byte) []byte { return store.ExtractSetMemberName(k, key) }) -} - -// scanHashFieldExistsMap does a paginated prefix scan of all field keys for -// the given hash and returns a map from field name to struct{}{}. -// Using a single prefix scan eliminates per-field ExistsAt round-trips. -func (r *RedisServer) scanHashFieldExistsMap(ctx context.Context, key []byte, readTS uint64) (map[string]struct{}, error) { - return r.scanKeyExistsMap(ctx, store.HashFieldScanPrefix(key), readTS, - func(k []byte) []byte { return store.ExtractHashFieldName(k, key) }) -} - -// mergeZSetBulkScores performs a single prefix scan of ZSet member keys and -// merges the store scores into inTxnView when pairCount >= wideColumnBulkScanThreshold. -// This avoids O(pairCount) individual GetAt round-trips inside applyZAddPair. -// Members already in inTxnView (migration elems or earlier pairs) take precedence. -// Returns inTxnView unchanged when the batch is below the threshold. -func (r *RedisServer) mergeZSetBulkScores(ctx context.Context, key []byte, readTS uint64, pairCount int, inTxnView map[string]float64) (map[string]float64, error) { - if pairCount < wideColumnBulkScanThreshold { - return inTxnView, nil - } - bulkScores, err := r.scanZSetMemberScoreMap(ctx, key, readTS) - if err != nil { - return nil, err - } - if inTxnView == nil { - return bulkScores, nil - } - for m, s := range bulkScores { - if _, alreadySeen := inTxnView[m]; !alreadySeen { - inTxnView[m] = s - } - } - return inTxnView, nil -} - -// scanZSetMemberScoreMap does a paginated prefix scan of all member keys for -// the given ZSet and returns a map from member name to its current score. 
-// Using a single prefix scan eliminates O(N) GetAt round-trips in ZADD for -// large batches (>= wideColumnBulkScanThreshold pairs). -func (r *RedisServer) scanZSetMemberScoreMap(ctx context.Context, key []byte, readTS uint64) (map[string]float64, error) { - scanPrefix := store.ZSetMemberScanPrefix(key) - scanEnd := store.PrefixScanEnd(scanPrefix) - scores := make(map[string]float64) - cursor := scanPrefix - for { - scanKVs, err := r.store.ScanAt(ctx, cursor, scanEnd, store.MaxDeltaScanLimit, readTS) - if err != nil { - return nil, cockerrors.WithStack(err) - } - for _, pair := range scanKVs { - m := store.ExtractZSetMemberName(pair.Key, key) - if m == nil { - continue - } - if s, decodeErr := store.UnmarshalZSetScore(pair.Value); decodeErr == nil { - scores[string(m)] = s - } - } - if len(scanKVs) < store.MaxDeltaScanLimit { - break - } - lastKey := scanKVs[len(scanKVs)-1].Key - next := make([]byte, len(lastKey)+1) - copy(next, lastKey) - cursor = next - } - return scores, nil -} - -// scanKeyExistsMap paginates through all keys under scanPrefix, extracts a -// name from each key using extractName, and builds a set of existing names. -// It is used by scanSetMemberExistsMap and scanHashFieldExistsMap to eliminate -// per-key ExistsAt round-trips during SADD/SREM/HDEL operations. 
-func (r *RedisServer) scanKeyExistsMap(ctx context.Context, scanPrefix []byte, readTS uint64, extractName func([]byte) []byte) (map[string]struct{}, error) { - scanEnd := store.PrefixScanEnd(scanPrefix) - existsMap := make(map[string]struct{}) - cursor := scanPrefix - for { - scanKVs, err := r.store.ScanAt(ctx, cursor, scanEnd, store.MaxDeltaScanLimit, readTS) - if err != nil { - return nil, cockerrors.WithStack(err) - } - for _, pair := range scanKVs { - if name := extractName(pair.Key); name != nil { - existsMap[string(name)] = struct{}{} - } - } - if len(scanKVs) < store.MaxDeltaScanLimit { - break - } - lastKey := scanKVs[len(scanKVs)-1].Key - next := make([]byte, len(lastKey)+1) - copy(next, lastKey) - cursor = next - } - return existsMap, nil -} - -// initSetExistsMap builds the initial existence map for a set mutation batch. -// For large batches or when legacy members are present it does a bulk prefix -// scan; otherwise it returns an empty (non-nil) map for per-member ExistsAt -// fallback. Legacy members from migration elems are merged in so that members -// already in-flight in the same transaction are treated as existing. -func (r *RedisServer) initSetExistsMap(ctx context.Context, key []byte, members [][]byte, readTS uint64, legacyBase map[string]struct{}) (map[string]struct{}, error) { - existsMap := make(map[string]struct{}) - if len(members) >= wideColumnBulkScanThreshold || len(legacyBase) > 0 { - var err error - existsMap, err = r.scanSetMemberExistsMap(ctx, key, readTS) - if err != nil { - return nil, cockerrors.WithStack(err) - } - } - for m := range legacyBase { - existsMap[m] = struct{}{} - } - return existsMap, nil -} - -// lookupSetMemberExists reports whether memberStr is present, updating -// existsMap as a cache. For small clean batches (no bulk scan, no legacy -// migration) it falls back to an ExistsAt store read; otherwise it relies -// solely on the pre-built map. 
-func (r *RedisServer) lookupSetMemberExists(ctx context.Context, memberStr string, memberKey []byte, readTS uint64, existsMap map[string]struct{}, isSmallClean bool) (bool, error) { - if _, ok := existsMap[memberStr]; ok { - return true, nil - } - if !isSmallClean { - return false, nil - } - exists, err := r.store.ExistsAt(ctx, memberKey, readTS) - if err != nil { - return false, cockerrors.WithStack(err) - } - if exists { - existsMap[memberStr] = struct{}{} - } - return exists, nil -} - -// applySetMemberMutations resolves existence for each member using either a -// pre-built bulk scan (for large batches) or individual ExistsAt calls (for -// small batches), then applies the mutation to elems. -// The bulk scan threshold is wideColumnBulkScanThreshold. -// legacyBase contains members from a legacy blob being migrated in the same -// transaction; they are not visible at readTS and must be treated as existing. -func (r *RedisServer) applySetMemberMutations(ctx context.Context, key []byte, members [][]byte, add bool, readTS uint64, elems []*kv.Elem[kv.OP], legacyBase map[string]struct{}) ([]*kv.Elem[kv.OP], int, int64, error) { - existsMap, err := r.initSetExistsMap(ctx, key, members, readTS, legacyBase) - if err != nil { - return nil, 0, 0, err - } - isSmallClean := len(members) < wideColumnBulkScanThreshold && len(legacyBase) == 0 - changed := 0 - lenDelta := int64(0) - for _, member := range members { - memberStr := string(member) - memberKey := store.SetMemberKey(key, member) - exists, lookupErr := r.lookupSetMemberExists(ctx, memberStr, memberKey, readTS, existsMap, isSmallClean) - if lookupErr != nil { - return nil, 0, 0, lookupErr - } - var cnt int - var d int64 - elems, cnt, d = applySetMemberMutation(elems, memberKey, exists, add) - changed += cnt - lenDelta += d - // Update existsMap to reflect this mutation so that subsequent - // duplicate members in this call observe the correct in-txn state. 
- if add { - existsMap[memberStr] = struct{}{} - } else { - delete(existsMap, memberStr) - } - } - return elems, changed, lenDelta, nil -} - -func (r *RedisServer) mutateExactSet(conn redcon.Conn, kind string, key []byte, members [][]byte, add bool) { - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - - if kind != setKind { - r.mutateExactSetLegacy(conn, ctx, kind, key, members, add) - return - } - r.mutateExactSetWide(conn, ctx, key, members, add) -} - -func (r *RedisServer) sismember(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - key := cmd.Args[1] - member := cmd.Args[2] - readTS := r.readTS() - ctx := context.Background() - - hit, alive, err := r.setMemberFastExists(ctx, key, member, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if hit { - if alive { - conn.WriteInt(1) - } else { - conn.WriteInt(0) - } - return - } - r.sismemberSlow(conn, ctx, key, member, readTS) -} - -func (r *RedisServer) setMemberFastExists(ctx context.Context, key, member []byte, readTS uint64) (hit, alive bool, err error) { - // Probe FIRST; guard only on hit. See hashFieldFastLookup for the - // regression rationale. 
- exists, err := r.store.ExistsAt(ctx, store.SetMemberKey(key, member), readTS) - if err != nil { - return false, false, cockerrors.WithStack(err) - } - if !exists { - return false, false, nil - } - if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { - return false, false, hErr - } else if higher { - return false, false, nil - } - expired, expErr := r.hasExpired(ctx, key, readTS, true) - if expErr != nil { - return false, false, cockerrors.WithStack(expErr) - } - return true, !expired, nil -} - -func (r *RedisServer) sismemberSlow(conn redcon.Conn, ctx context.Context, key, member []byte, readTS uint64) { - typ, err := r.keyTypeAt(ctx, key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if typ == redisTypeNone { - conn.WriteInt(0) - return - } - if typ != redisTypeSet { - conn.WriteError(wrongTypeMessage) - return - } - value, err := r.loadSetAt(ctx, setKind, key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if slices.Contains(value.Members, string(member)) { - conn.WriteInt(1) - return - } - conn.WriteInt(0) -} - -func (r *RedisServer) smembers(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - readTS := r.readTS() - typ, err := r.keyTypeAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if typ == redisTypeNone { - conn.WriteArray(0) - return - } - if typ != redisTypeSet { - conn.WriteError(wrongTypeMessage) - return - } - - value, err := r.loadSetAt(context.Background(), setKind, cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteArray(len(value.Members)) - for _, member := range value.Members { - conn.WriteBulkString(member) - } -} - -func (r *RedisServer) pfadd(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - ctx, cancel := context.WithTimeout(context.Background(), 
redisDispatchTimeout) - defer cancel() - var changed int - if err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - if err := r.validateExactSetKind(hllKind, cmd.Args[1], readTS); err != nil { - return err - } - - value, err := r.loadSetAt(context.Background(), hllKind, cmd.Args[1], readTS) - if err != nil { - return err - } - existing := exactSetMembers(value) - changed = applyExactSetMutation(existing, cmd.Args[2:], true) - if changed == 0 { - return nil - } - - return r.persistExactSetMembersTxn(ctx, hllKind, cmd.Args[1], readTS, existing) - }); err != nil { - writeRedisError(conn, err) - return - } - if changed == 0 { - conn.WriteInt(0) - } else { - conn.WriteInt(1) - } -} - -func (r *RedisServer) pfcount(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - readTS := r.readTS() - union := map[string]struct{}{} - for _, key := range cmd.Args[1:] { - typ, err := r.keyTypeAt(context.Background(), key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if typ != redisTypeNone { - hllExists, err := r.store.ExistsAt(context.Background(), redisHLLKey(key), readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if !hllExists { - conn.WriteError(wrongTypeMessage) - return - } - } - value, err := r.loadSetAt(context.Background(), hllKind, key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - for _, member := range value.Members { - union[member] = struct{}{} - } - } - conn.WriteInt(len(union)) -} - -func (r *RedisServer) hset(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - added, err := r.applyHashFieldPairs(cmd.Args[1], cmd.Args[2:]) - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(added) -} - -func (r *RedisServer) hmset(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - if _, err := r.applyHashFieldPairs(cmd.Args[1], 
cmd.Args[2:]); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteString("OK") -} - -// buildHashLegacyMigrationElems returns ops that atomically migrate a legacy -// !redis|hash| blob to wide-column !hs|fld| keys. Returns nil if no legacy -// blob exists. The base meta key is also written with the migrated count so -// that resolveHashMeta works correctly after migration. -func (r *RedisServer) buildHashLegacyMigrationElems(ctx context.Context, key []byte, readTS uint64) ([]*kv.Elem[kv.OP], error) { - raw, err := r.store.GetAt(ctx, redisHashKey(key), readTS) - if cockerrors.Is(err, store.ErrKeyNotFound) { - return nil, nil - } - if err != nil { - return nil, cockerrors.WithStack(err) - } - value, err := unmarshalHashValue(raw) - if err != nil { - return nil, err - } - elems := make([]*kv.Elem[kv.OP], 0, len(value)+setWideColOverhead) - for field, val := range value { - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.HashFieldKey(key, []byte(field)), - Value: []byte(val), - }) - } - // Delete the legacy blob. - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisHashKey(key)}) - // Write a base meta so that resolveHashMeta starts from an accurate count. - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.HashMetaKey(key), - Value: store.MarshalHashMeta(store.HashMeta{Len: int64(len(value))}), - }) - return elems, nil -} - -// buildSetLegacyMigrationElems returns ops that atomically migrate a legacy -// !redis|set| blob to wide-column !st|mem| keys. Returns nil if no legacy -// blob exists. 
-func (r *RedisServer) buildSetLegacyMigrationElems(ctx context.Context, key []byte, readTS uint64) ([]*kv.Elem[kv.OP], error) { - raw, err := r.store.GetAt(ctx, redisSetKey(key), readTS) - if cockerrors.Is(err, store.ErrKeyNotFound) { - return nil, nil - } - if err != nil { - return nil, cockerrors.WithStack(err) - } - value, err := unmarshalSetValue(raw) - if err != nil { - return nil, err - } - elems := make([]*kv.Elem[kv.OP], 0, len(value.Members)+setWideColOverhead) - for _, member := range value.Members { - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.SetMemberKey(key, []byte(member)), - Value: []byte{}, - }) - } - // Delete the legacy blob. - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisSetKey(key)}) - // Write a base meta so that resolveSetMeta starts from an accurate count. - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.SetMetaKey(key), - Value: store.MarshalSetMeta(store.SetMeta{Len: int64(len(value.Members))}), - }) - return elems, nil -} - -// buildZSetLegacyMigrationElems returns ops that atomically migrate a legacy -// !redis|zset| blob to wide-column !zs|mem| + !zs|scr| keys. Returns nil if no legacy -// blob exists. The base meta key is also written with the migrated count so -// that resolveZSetMeta works correctly after migration. -func (r *RedisServer) buildZSetLegacyMigrationElems(ctx context.Context, key []byte, readTS uint64) ([]*kv.Elem[kv.OP], error) { - raw, err := r.store.GetAt(ctx, redisZSetKey(key), readTS) - if cockerrors.Is(err, store.ErrKeyNotFound) { - return nil, nil - } - if err != nil { - return nil, cockerrors.WithStack(err) - } - value, err := unmarshalZSetValue(raw) - if err != nil { - return nil, err - } - // Each entry → member key + score index key; plus legacy blob deletion + base meta. 
- elems := make([]*kv.Elem[kv.OP], 0, len(value.Entries)*2+setWideColOverhead) //nolint:mnd // 2 ops per entry (member + score index) - for _, entry := range value.Entries { - elems = append(elems, - &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.ZSetMemberKey(key, []byte(entry.Member)), - Value: store.MarshalZSetScore(entry.Score), - }, - &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.ZSetScoreKey(key, entry.Score, []byte(entry.Member)), - Value: []byte{}, - }, - ) - } - // Delete the legacy blob. - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisZSetKey(key)}) - // Write a base meta so that resolveZSetMeta starts from an accurate count. - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.ZSetMetaKey(key), - Value: store.MarshalZSetMeta(store.ZSetMeta{Len: int64(len(value.Entries))}), - }) - return elems, nil -} - -// addLegacyHashFieldsToMap adds field names from migration Put elems (fields -// being migrated in the current transaction, not yet visible at readTS) into -// existsMap so that buildHashFieldElems does not count them as new fields. -func addLegacyHashFieldsToMap(migrationElems []*kv.Elem[kv.OP], key []byte, existsMap map[string]struct{}) { - for _, elem := range migrationElems { - if elem.Op == kv.Put { - if f := store.ExtractHashFieldName(elem.Key, key); f != nil { - existsMap[string(f)] = struct{}{} - } - } - } -} - -// buildLegacySetMemberBase extracts member names from migration Put elems -// (members being migrated in the current transaction, invisible at readTS) -// and returns them as a set. Returns nil when no migration is happening. 
-func buildLegacySetMemberBase(migrationElems []*kv.Elem[kv.OP], key []byte) map[string]struct{} { - var base map[string]struct{} - for _, elem := range migrationElems { - if elem.Op == kv.Put { - if m := store.ExtractSetMemberName(elem.Key, key); m != nil { - if base == nil { - base = make(map[string]struct{}) - } - base[string(m)] = struct{}{} - } - } - } - return base -} - -// buildHashFieldElems iterates over field-value pairs in args, checks each -// field against existsMap to determine if it is new, appends Put operations -// to elems, and returns the updated elems and new-field count. -// existsMap is built by scanHashFieldExistsMap before this call so that -// existence checks are a single bulk scan rather than N ExistsAt round-trips. -func (r *RedisServer) buildHashFieldElems(key []byte, args [][]byte, existsMap map[string]struct{}, elems []*kv.Elem[kv.OP]) ([]*kv.Elem[kv.OP], int) { - newFields := 0 - for i := 0; i < len(args); i += redisPairWidth { - field := args[i] - value := args[i+1] - fieldStr := string(field) - fieldKey := store.HashFieldKey(key, field) - if _, exists := existsMap[fieldStr]; !exists { - newFields++ - // Mark as seen so duplicate field names in one HSET call are not - // counted as additional new fields (Redis deduplication semantics). 
- existsMap[fieldStr] = struct{}{} - } - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: fieldKey, Value: value}) - } - return elems, newFields -} - -func (r *RedisServer) applyHashFieldPairs(key []byte, args [][]byte) (int, error) { - if len(args) == 0 || len(args)%redisPairWidth != 0 { - return 0, errors.New("ERR wrong number of arguments for hash command") - } - - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var added int - err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - if err := r.requireKeyTypeOrEmpty(ctx, key, readTS, redisTypeHash); err != nil { - return err - } - - commitTS, err := r.coordinator.Clock().NextFenced() - if err != nil { - return cockerrors.Wrap(err, "applyHashFieldPairs: allocate commitTS") - } - - // Atomically migrate any legacy blob on first wide-column write. - // Fetch migration elems before allocating the main elems slice so that - // the initial capacity accounts for both migration and field Put ops, - // avoiding a reallocation when a legacy blob is present. - migrationElems, err := r.buildHashLegacyMigrationElems(ctx, key, readTS) - if err != nil { - return err - } - elems := make([]*kv.Elem[kv.OP], 0, len(migrationElems)+len(args)/redisPairWidth+setWideColOverhead) - elems = append(elems, migrationElems...) - - // Bulk-scan existing fields once so buildHashFieldElems can check - // existence via a map lookup instead of per-field ExistsAt. - existsMap, err := r.scanHashFieldExistsMap(ctx, key, readTS) - if err != nil { - return err - } - // Fields from the legacy blob are being migrated in this same transaction, - // so they are not yet visible at readTS. Add them to existsMap so that - // buildHashFieldElems does not count already-existing fields as new. 
- addLegacyHashFieldsToMap(migrationElems, key, existsMap) - - var newFields int - elems, newFields = r.buildHashFieldElems(key, args, existsMap, elems) - added = newFields - - // Emit a single delta key for all newly-added fields. - if newFields != 0 { - deltaVal := store.MarshalHashMetaDelta(store.HashMetaDelta{LenDelta: int64(newFields)}) - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.HashMetaDeltaKey(key, commitTS, 0), - Value: deltaVal, - }) - } - - if len(elems) == 0 { - return nil - } - - _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: normalizeStartTS(readTS), - CommitTS: commitTS, - Elems: elems, - }) - return cockerrors.WithStack(dispatchErr) - }) - return added, err -} - -func (r *RedisServer) hget(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - key := cmd.Args[1] - field := cmd.Args[2] - readTS := r.readTS() - ctx := context.Background() - - // Fast path: look the wide-column field up directly. Live - // wide-column hashes resolve here in 1 seek + TTL probe versus - // the ~17 seeks rawKeyTypeAt issues through keyTypeAt. Legacy- - // blob hashes miss the wide-column key and fall through. - raw, hit, alive, err := r.hashFieldFastLookup(ctx, key, field, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if hit { - if !alive { - conn.WriteNull() - return - } - // WriteBulk sends the payload directly from the []byte backing - // store; WriteBulkString(string(raw)) would force a []byte → - // string copy on every fast-path hit. - conn.WriteBulk(raw) - return - } - r.hgetSlow(conn, ctx, key, field, readTS) -} - -// hashFieldFastLookup probes the wide-column field entry directly and -// reports whether it is present and TTL-alive. Returns hit=false when -// the wide-column key is absent, or when the narrow string-encoding -// guard in hasHigherPriorityStringEncoding fires, so the caller -// falls through to hgetSlow. 
-// -// Priority-alignment scope: this fast path does NOT fully mirror -// rawKeyTypeAt / keyTypeAt's priority checks. The guard only probes -// redisStrKey (the common SET-over-previous-hash corruption case); -// rarer dual-encoding corruption involving HLL, legacy bare keys, or -// list meta / delta entries is NOT caught here and will surface the -// wide-column hash answer instead of the WRONGTYPE / nil response -// keyTypeAt would produce. In normal operation at most one encoding -// exists per user key, so the guard is a guaranteed miss and the -// priority-alignment gap is invisible; pre-existing writers already -// clean up the old encoding before switching types. A full check -// would cost ~3-5 extra seeks per fast-path hit, which would negate -// most of the gain over the ~17-seek keyTypeAt slow path. -func (r *RedisServer) hashFieldFastLookup(ctx context.Context, key, field []byte, readTS uint64) (raw []byte, hit, alive bool, err error) { - // Probe the wide-column field FIRST so the priority guard only - // runs on a hit. Placing the guard before the probe made every - // miss (nonexistent key, legacy-blob hash, or wrong-type) pay an - // unnecessary ExistsAt on redisStrKey -- pure overhead for the - // common negative-lookup case and for any workload that still - // carries legacy-blob encodings. See the PR #565 independent - // review for the Medium-severity regression this addresses. - raw, err = r.store.GetAt(ctx, store.HashFieldKey(key, field), readTS) - if err != nil { - if cockerrors.Is(err, store.ErrKeyNotFound) { - return nil, false, false, nil - } - return nil, false, false, cockerrors.WithStack(err) - } - // Only pay the guard seek when we actually have a hit to defer. 
- if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { - return nil, false, false, hErr - } else if higher { - return nil, false, false, nil - } - expired, expErr := r.hasExpired(ctx, key, readTS, true) - if expErr != nil { - return nil, false, false, cockerrors.WithStack(expErr) - } - return raw, true, !expired, nil -} - -// hasHigherPriorityStringEncoding returns true iff the new-format -// string encoding (redisStrKey) exists for key. This is NARROWER -// than rawKeyTypeAt's full string-wins tiebreaker, which also covers -// HyperLogLog (redisHLLKey) and the legacy bare key: those rarer -// dual-encoding corruption cases still reach the wide-column fast -// path and may return the collection-specific answer instead of -// WRONGTYPE / nil. -// -// The narrow scope is deliberate -- expanding the guard to every -// string-priority candidate (3 ExistsAt calls + the list-meta probe) -// would cost ~4-5 extra seeks per fast-path hit, regressing the -// negative case further than the ordering tweak in -// hashFieldFastLookup / setMemberFastExists / hashFieldFastExists -// already saved. Callers that require complete priority alignment -// must take the keyTypeAt slow path explicitly. -func (r *RedisServer) hasHigherPriorityStringEncoding(ctx context.Context, key []byte, readTS uint64) (bool, error) { - exists, err := r.store.ExistsAt(ctx, redisStrKey(key), readTS) - if err != nil { - return false, cockerrors.WithStack(err) - } - return exists, nil -} - -// zsetMemberFastScore probes the wide-column score entry for (key, -// member) directly and reports whether it is present and TTL-alive. -// Priority-alignment scope mirrors hashFieldFastLookup: only the -// redisStrKey dual-encoding case is guarded (see -// hasHigherPriorityStringEncoding's narrow-scope caveats). Callers -// must fall back to the full zsetState loader on hit=false to cover -// legacy-blob zsets and nil / WRONGTYPE disambiguation. 
-// -// Probe ORDER matches hashFieldFastLookup / setMemberFastExists / -// hashFieldFastExists post-PR #565: hit the wide-column score key -// first so the negative case (missing, legacy-blob, wrong-type) does -// not pay the priority-guard seek. -func (r *RedisServer) zsetMemberFastScore(ctx context.Context, key, member []byte, readTS uint64) (score float64, hit, alive bool, err error) { - raw, err := r.store.GetAt(ctx, store.ZSetMemberKey(key, member), readTS) - if err != nil { - if cockerrors.Is(err, store.ErrKeyNotFound) { - return 0, false, false, nil - } - return 0, false, false, cockerrors.WithStack(err) - } - score, err = store.UnmarshalZSetScore(raw) - if err != nil { - return 0, false, false, cockerrors.WithStack(err) - } - if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { - return 0, false, false, hErr - } else if higher { - return 0, false, false, nil - } - expired, expErr := r.hasExpired(ctx, key, readTS, true) - if expErr != nil { - return 0, false, false, cockerrors.WithStack(expErr) - } - return score, true, !expired, nil -} - -// zsetRangeByScoreFast streams the score index for key over the -// caller-supplied [startKey, endKey) byte range, returning the -// decoded entries up to offset+limit. This replaces the -// load-the-whole-zset path used by cmdZRangeByScore / cmdZRevRangeByScore -// when the caller has no script-local mutations and the zset is in -// wide-column form. For a delay-queue poll ("next 10 jobs due by -// now") the cost goes from O(N) member GetAts to O(range_width + -// offset + limit) score-index entries. -// -// hit=false means the fast path cannot safely answer (legacy-blob -// zset present, string-encoding corruption, or empty-result case -// where we cannot distinguish "zset is empty in this range" from -// "key exists as another type / is missing"). Callers MUST take -// the slow path on hit=false so keyTypeAt disambiguation fires. 
-// reason carries the specific hit=false branch so observers can -// subdivide fallback rates for dashboarding; "" when hit=true. -// -// scoreInRange filter is applied post-scan for exclusive bound -// edge cases; the caller supplies precomputed scan bounds that -// over-approximate toward INclusive and lets this helper filter. -func (r *RedisServer) zsetRangeByScoreFast( - ctx context.Context, - key, startKey, endKey []byte, - reverse bool, - offset, limit int, - scoreFilter func(float64) bool, - readTS uint64, -) ([]redisZSetEntry, bool, monitoring.LuaFastPathFallbackReason, error) { - if eligible, err := r.zsetFastPathEligible(ctx, key, readTS); err != nil || !eligible { - return nil, false, monitoring.LuaFastPathFallbackIneligible, err - } - // Large-offset short-circuit: once offset >= maxWideScanLimit, - // the fast path can only scan maxWideScanLimit rows then skip all - // of them -- guaranteed wasted I/O. Defer to the slow path - // immediately so it can answer from the full member load without - // the redundant score-index scan. - if offset >= maxWideScanLimit { - return nil, false, monitoring.LuaFastPathFallbackLargeOffset, nil - } - scanLimit := zsetFastScanLimit(offset, limit) - if scanLimit <= 0 || bytes.Compare(startKey, endKey) >= 0 { - hit, reason, err := r.zsetRangeEmptyFastResult(ctx, key, readTS) - return nil, hit, reason, err - } - kvs, err := r.zsetScoreScan(ctx, startKey, endKey, scanLimit, reverse, readTS) - if err != nil { - return nil, false, monitoring.LuaFastPathFallbackOther, err - } - return r.finalizeZSetFastRange(ctx, key, kvs, offset, limit, scanLimit, scoreFilter, readTS) -} - -// finalizeZSetFastRange runs the post-scan priority guard, decodes -// the candidate score rows into redisZSetEntry, and applies the TTL -// filter. Factored out so zsetRangeByScoreFast stays under the -// cyclomatic-complexity cap. 
-// -// Takes scanLimit so we can detect a saturated scan: if the scanner -// returned exactly scanLimit rows AND the caller's request is not -// satisfied (unbounded limit, or collected fewer entries than limit), -// there MAY be more entries beyond the scan window. In that case we -// return hit=false so the slow path can produce the authoritative -// answer -- the fast path MUST NOT silently truncate. -func (r *RedisServer) finalizeZSetFastRange( - ctx context.Context, key []byte, kvs []*store.KVPair, - offset, limit, scanLimit int, scoreFilter func(float64) bool, readTS uint64, -) ([]redisZSetEntry, bool, monitoring.LuaFastPathFallbackReason, error) { - // Priority guard runs after a candidate hit (mirrors post-PR #565 - // ordering). Skip it on empty result -- the empty-result tail - // handles disambiguation. - if len(kvs) > 0 { - if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { - return nil, false, monitoring.LuaFastPathFallbackOther, hErr - } else if higher { - return nil, false, monitoring.LuaFastPathFallbackWrongType, nil - } - } - entries := decodeZSetScoreRange(key, kvs, offset, limit, scoreFilter) - // Truncation guard: the raw scanner hit its cap AND the caller did - // not get a satisfied result. Entries beyond the window may - // exist; defer to the slow path for correctness. 
- if zsetFastPathTruncated(len(kvs), scanLimit, len(entries), limit) { - return nil, false, monitoring.LuaFastPathFallbackTruncated, nil - } - if len(entries) == 0 { - hit, reason, err := r.zsetRangeEmptyFastResult(ctx, key, readTS) - return nil, hit, reason, err - } - expired, expErr := r.hasExpired(ctx, key, readTS, true) - if expErr != nil { - return nil, false, monitoring.LuaFastPathFallbackOther, cockerrors.WithStack(expErr) - } - if expired { - return nil, true, "", nil - } - return entries, true, "", nil -} - -// zsetFastPathTruncated reports whether the bounded score-index scan -// may have dropped entries that the caller's request would otherwise -// include. Returns true when the scanner returned the full quota -// (scannedRows == scanLimit) AND the caller's request is still -// unsatisfied (unbounded limit or collectedEntries < limit). In that -// case the caller must fall back to the slow full-load path to get -// the authoritative result. -func zsetFastPathTruncated(scannedRows, scanLimit, collectedEntries, limit int) bool { - if scannedRows < scanLimit { - return false - } - if limit < 0 { - return true - } - return collectedEntries < limit -} - -// zsetFastPathEligible returns false (without error) when a legacy- -// blob zset is present; the caller must take the slow path so -// ensureZSetLoaded / blob decoding runs. -func (r *RedisServer) zsetFastPathEligible(ctx context.Context, key []byte, readTS uint64) (bool, error) { - legacyExists, err := r.store.ExistsAt(ctx, redisZSetKey(key), readTS) - if err != nil { - return false, cockerrors.WithStack(err) - } - return !legacyExists, nil -} - -// zsetFastScanLimit clamps offset+limit to maxWideScanLimit so an -// unbounded or malicious LIMIT cannot force an O(N) scan of a large -// zset. A negative limit means "unbounded" at the Redis level; cap it -// at the collection OOM limit. -// -// Check bounds BEFORE adding to avoid signed-integer overflow on -// hostile input (e.g. 
a Lua script passing offset=limit=math.MaxInt). -// A wrap would produce a negative scanLimit and cause the caller's -// `scanLimit <= 0` branch to misroute a live zset into the -// empty-result tail. -func zsetFastScanLimit(offset, limit int) int { - // limit == 0: the caller wants zero entries regardless of offset. - // Return 0 so the caller's `scanLimit <= 0` branch routes to the - // empty-result tail (which still runs resolveZSetMeta for proper - // WRONGTYPE / existence disambiguation) instead of a pointless - // full-quota scan. - if limit == 0 { - return 0 - } - if limit < 0 { - return maxWideScanLimit - } - if offset >= maxWideScanLimit { - return maxWideScanLimit - } - if limit > maxWideScanLimit-offset { - return maxWideScanLimit - } - return offset + limit -} - -// zsetScoreScan picks Forward / Reverse ScanAt based on direction. -func (r *RedisServer) zsetScoreScan( - ctx context.Context, startKey, endKey []byte, scanLimit int, reverse bool, readTS uint64, -) ([]*store.KVPair, error) { - if reverse { - kvs, err := r.store.ReverseScanAt(ctx, startKey, endKey, scanLimit, readTS) - return kvs, cockerrors.WithStack(err) - } - kvs, err := r.store.ScanAt(ctx, startKey, endKey, scanLimit, readTS) - return kvs, cockerrors.WithStack(err) -} - -// zsetDecodeAllocSize returns a tight upper bound on the collected -// entry count for decodeZSetScoreRange: (kvLen - offset) capped by -// limit, never negative. Avoiding a make([]...len(kvs)) saves up to -// maxWideScanLimit entries of wasted slice capacity when the caller -// asked for a small window at a large offset. -func zsetDecodeAllocSize(kvLen, offset, limit int) int { - allocSize := kvLen - offset - if allocSize < 0 { - return 0 - } - if limit >= 0 && limit < allocSize { - return limit - } - return allocSize -} - -// decodeZSetScoreRange decodes score-index scan results into -// redisZSetEntry, applying the post-scan score filter (exclusive -// bound edges) and the offset / limit pagination. 
Entries that fail -// to decode are silently dropped -- they can only appear under data -// corruption. -func decodeZSetScoreRange( - key []byte, kvs []*store.KVPair, offset, limit int, scoreFilter func(float64) bool, -) []redisZSetEntry { - entries := make([]redisZSetEntry, 0, zsetDecodeAllocSize(len(kvs), offset, limit)) - skipped := 0 - for _, kv := range kvs { - score, member, ok := store.ExtractZSetScoreAndMember(kv.Key, key) - if !ok { - continue - } - if scoreFilter != nil && !scoreFilter(score) { - continue - } - // Check limit saturation BEFORE the offset skip so a small - // limit with a large offset exits immediately instead of - // burning offset iterations on the skip branch. Correct for - // any (offset, limit): once len(entries) >= limit we are done - // regardless of remaining skip budget. - if limit >= 0 && len(entries) >= limit { - break - } - if skipped < offset { - skipped++ - continue - } - entries = append(entries, redisZSetEntry{Member: string(member), Score: score}) - } - return entries -} - -// zsetRangeEmptyFastResult is the empty-result tail: either the -// score range is genuinely empty on a live zset (return empty + -// hit=true) or the zset does not exist in wide-column form (return -// hit=false so the caller takes the slow path for WRONGTYPE / missing -// disambiguation). -// -// Uses resolveZSetMeta so delta-only wide zsets (a fresh zset whose -// base meta has not been persisted yet, only delta rows) are detected -// as "exists". Using a plain ExistsAt on ZSetMetaKey would miss those -// and force the slow path unnecessarily. Also runs the string-priority -// guard so a corrupted redisStrKey + zset meta surfaces WRONGTYPE via -// the slow path rather than an empty array. -// zsetRangeEmptyFastResult returns (hit, reason, err) for the empty- -// result tail. 
hit=true means the key is a live zset whose score -// range is simply empty (callers return an empty array and no -// fallback); hit=false carries a specific fallback reason so the -// caller can route its slow-path observation accordingly. -func (r *RedisServer) zsetRangeEmptyFastResult(ctx context.Context, key []byte, readTS uint64) (bool, monitoring.LuaFastPathFallbackReason, error) { - _, zsetExists, err := r.resolveZSetMeta(ctx, key, readTS) - if err != nil { - return false, monitoring.LuaFastPathFallbackOther, cockerrors.WithStack(err) - } - if !zsetExists { - // The key has no ZSet encoding at readTS. Redis semantics: - // - key truly absent → ZRANGEBYSCORE returns empty - // - key is another type → ZRANGEBYSCORE returns WRONGTYPE - // Production metric (PR #572) showed this branch is the - // hot-path dominant outcome (~96% of ZRANGEBYSCORE calls on - // BullMQ-style workloads that poll an empty delayed queue). - // Punting every such call to the slow path repeats the same - // 3-probe member/meta/delta scan we just did and then - // re-probes all other types anyway -- pure duplicate I/O. - // - // Short-circuit: use keyTypeAt (logical type after TTL check) - // to distinguish "truly absent" from "wrong type". If None, - // return hit=true with an empty result -- that is the correct - // Redis answer and saves the slow-path round-trip. Otherwise - // fall back so the slow path can produce WRONGTYPE. 
- typ, typErr := r.keyTypeAtExpect(ctx, key, readTS, redisTypeZSet) - if typErr != nil { - return false, monitoring.LuaFastPathFallbackOther, cockerrors.WithStack(typErr) - } - if typ == redisTypeNone { - return true, "", nil - } - return false, monitoring.LuaFastPathFallbackWrongType, nil - } - if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { - return false, monitoring.LuaFastPathFallbackOther, hErr - } else if higher { - return false, monitoring.LuaFastPathFallbackWrongType, nil - } - // hasExpired is called for its error-surfacing side effect only: - // whether the zset is expired or not, a live zset with no members - // in range returns an empty hit=true result. Keep the call so - // storage errors during TTL resolution still propagate. - if _, expErr := r.hasExpired(ctx, key, readTS, true); expErr != nil { - return false, monitoring.LuaFastPathFallbackOther, cockerrors.WithStack(expErr) - } - return true, "", nil -} - -// hgetSlow falls back to the type-probing path when hashFieldFastLookup -// misses. Handles legacy-blob hashes and nil / WRONGTYPE disambiguation. 
-func (r *RedisServer) hgetSlow(conn redcon.Conn, ctx context.Context, key, field []byte, readTS uint64) { - typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeHash) - if err != nil { - writeRedisError(conn, err) - return - } - if typ == redisTypeNone { - conn.WriteNull() - return - } - if typ != redisTypeHash { - conn.WriteError(wrongTypeMessage) - return - } - value, err := r.loadHashAt(ctx, key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - fieldValue, ok := value[string(field)] - if !ok { - conn.WriteNull() - return - } - conn.WriteBulkString(fieldValue) -} - -func (r *RedisServer) hmget(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(context.Background(), cmd.Args[1], readTS, redisTypeHash) - if err != nil { - writeRedisError(conn, err) - return - } - fields := cmd.Args[redisPairWidth:] - if typ == redisTypeNone { - conn.WriteArray(len(fields)) - for range cmd.Args[2:] { - conn.WriteNull() - } - return - } - if typ != redisTypeHash { - conn.WriteError(wrongTypeMessage) - return - } - - value, err := r.loadHashAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteArray(len(fields)) - for _, field := range fields { - fieldValue, ok := value[string(field)] - if !ok { - conn.WriteNull() - continue - } - conn.WriteBulkString(fieldValue) - } -} - -func (r *RedisServer) hdel(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var removed int - if err := r.retryRedisWrite(ctx, func() error { - var err error - removed, err = r.hdelTxn(ctx, cmd.Args[1], cmd.Args[2:]) - return err - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(removed) -} - -// hdelWideColumn deletes the given fields from the 
wide-column hash and emits a negative delta. -func (r *RedisServer) hdelWideColumn(ctx context.Context, key []byte, fields [][]byte, readTS uint64) (int, error) { - delElems, removed, err := r.resolveHashFieldDelElems(ctx, key, fields, readTS) - if err != nil { - return 0, err - } - if removed == 0 { - return 0, nil - } - commitTS, err := r.coordinator.Clock().NextFenced() - if err != nil { - return 0, cockerrors.Wrap(err, "hdelWideColumn: allocate commitTS") - } - elems := delElems - deltaVal := store.MarshalHashMetaDelta(store.HashMetaDelta{LenDelta: int64(-removed)}) - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.HashMetaDeltaKey(key, commitTS, 0), - Value: deltaVal, - }) - _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: normalizeStartTS(readTS), - CommitTS: commitTS, - Elems: elems, - }) - return removed, cockerrors.WithStack(dispatchErr) -} - -// resolveHashFieldDelElems checks which fields exist using either a bulk scan -// (for large batches) or individual ExistsAt calls (for small batches), then -// returns Del elems for every field that exists and the count of deletions. 
-func (r *RedisServer) resolveHashFieldDelElems(ctx context.Context, key []byte, fields [][]byte, readTS uint64) ([]*kv.Elem[kv.OP], int, error) { - var existsMap map[string]struct{} - if len(fields) >= wideColumnBulkScanThreshold { - var err error - existsMap, err = r.scanHashFieldExistsMap(ctx, key, readTS) - if err != nil { - return nil, 0, err - } - } - elems := make([]*kv.Elem[kv.OP], 0, len(fields)+1) - removed := 0 - for _, field := range fields { - fieldKey := store.HashFieldKey(key, field) - var exists bool - if existsMap != nil { - _, exists = existsMap[string(field)] - } else { - var err error - exists, err = r.store.ExistsAt(ctx, fieldKey, readTS) - if err != nil { - return nil, 0, cockerrors.WithStack(err) - } - } - if exists { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: fieldKey}) - removed++ - } - } - return elems, removed, nil -} - -func (r *RedisServer) hdelTxn(ctx context.Context, key []byte, fields [][]byte) (int, error) { - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeHash) - if err != nil { - return 0, err - } - if typ == redisTypeNone { - return 0, nil - } - if typ != redisTypeHash { - return 0, wrongTypeError() - } - - // Wide-column path: check if any !hs|fld| keys exist for this key. - hashFieldPrefix := store.HashFieldScanPrefix(key) - hashFieldEnd := store.PrefixScanEnd(hashFieldPrefix) - wideKVs, err := r.store.ScanAt(context.Background(), hashFieldPrefix, hashFieldEnd, 1, readTS) - if err != nil { - return 0, cockerrors.WithStack(err) - } - if len(wideKVs) > 0 { - return r.hdelWideColumn(ctx, key, fields, readTS) - } - - // Legacy blob path. 
- value, err := r.loadHashAt(context.Background(), key, readTS) - if err != nil { - return 0, err - } - removed := removeHashFields(value, fields) - if removed == 0 { - return 0, nil - } - return removed, r.persistHashTxn(ctx, key, readTS, value) -} - -func removeHashFields(value redisHashValue, fields [][]byte) int { - removed := 0 - for _, field := range fields { - if _, ok := value[string(field)]; ok { - delete(value, string(field)) - removed++ - } - } - return removed -} - -func (r *RedisServer) persistHashTxn(ctx context.Context, key []byte, readTS uint64, value redisHashValue) error { - if len(value) == 0 { - elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) - if err != nil { - return err - } - return r.dispatchElems(ctx, true, readTS, elems) - } - // Wide-column rewrite: write per-field keys and a new base meta. - // deleteLogicalKeyElems (called by the caller when needed) clears old keys. - elems := make([]*kv.Elem[kv.OP], 0, len(value)+1) - for field, val := range value { - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.HashFieldKey(key, []byte(field)), - Value: []byte(val), - }) - } - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.HashMetaKey(key), - Value: store.MarshalHashMeta(store.HashMeta{Len: int64(len(value))}), - }) - // Also remove the legacy blob if it was present. - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisHashKey(key)}) - return r.dispatchElems(ctx, true, readTS, elems) -} - -func (r *RedisServer) hexists(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - key := cmd.Args[1] - field := cmd.Args[2] - readTS := r.readTS() - ctx := context.Background() - - // Fast path: direct wide-column field existence check. ExistsAt - // is cheaper than GetAt since we don't need the value payload. 
- hit, alive, err := r.hashFieldFastExists(ctx, key, field, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if hit { - if alive { - conn.WriteInt(1) - } else { - conn.WriteInt(0) - } - return - } - r.hexistsSlow(conn, ctx, key, field, readTS) -} - -func (r *RedisServer) hashFieldFastExists(ctx context.Context, key, field []byte, readTS uint64) (hit, alive bool, err error) { - // Probe FIRST; guard only on hit. See hashFieldFastLookup for the - // regression rationale. - exists, err := r.store.ExistsAt(ctx, store.HashFieldKey(key, field), readTS) - if err != nil { - return false, false, cockerrors.WithStack(err) - } - if !exists { - return false, false, nil - } - if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { - return false, false, hErr - } else if higher { - return false, false, nil - } - expired, expErr := r.hasExpired(ctx, key, readTS, true) - if expErr != nil { - return false, false, cockerrors.WithStack(expErr) - } - return true, !expired, nil -} - -func (r *RedisServer) hexistsSlow(conn redcon.Conn, ctx context.Context, key, field []byte, readTS uint64) { - typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeHash) - if err != nil { - writeRedisError(conn, err) - return - } - if typ == redisTypeNone { - conn.WriteInt(0) - return - } - if typ != redisTypeHash { - conn.WriteError(wrongTypeMessage) - return - } - value, err := r.loadHashAt(ctx, key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if _, ok := value[string(field)]; ok { - conn.WriteInt(1) - return - } - conn.WriteInt(0) -} - -func (r *RedisServer) hlen(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(context.Background(), cmd.Args[1], readTS, redisTypeHash) - if err != nil { - writeRedisError(conn, err) - return - } - if typ == redisTypeNone { - conn.WriteInt(0) - return - } - if typ != redisTypeHash { - 
conn.WriteError(wrongTypeMessage) - return - } - - // Wide-column path: use delta-aggregated metadata for O(1) count. - count, exists, err := r.resolveHashMeta(context.Background(), cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if exists { - conn.WriteInt64(count) - return - } - // Legacy blob fallback: load all fields and count. - value, err := r.loadHashAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(len(value)) -} - -func (r *RedisServer) hincrby(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - increment, err := strconv.ParseInt(string(cmd.Args[3]), 10, 64) - if err != nil { - writeRedisError(conn, err) - return - } - - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var current int64 - if err := r.retryRedisWrite(ctx, func() error { - var txnErr error - current, txnErr = r.hincrbyTxn(ctx, cmd.Args[1], cmd.Args[2], increment) - return txnErr - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt64(current) -} - -// readHashFieldInt reads the current integer value of a hash field from wide-column or legacy storage. -// Returns (current, isNewField, legacyHashValue, error). legacyHashValue is non-nil only when -// the value came from a legacy JSON blob that needs to be migrated on the next write. 
-func (r *RedisServer) readHashFieldInt(ctx context.Context, key, field []byte, readTS uint64) (int64, bool, redisHashValue, error) { - fieldKey := store.HashFieldKey(key, field) - raw, readErr := r.store.GetAt(ctx, fieldKey, readTS) - if readErr != nil && !cockerrors.Is(readErr, store.ErrKeyNotFound) { - return 0, true, nil, cockerrors.WithStack(readErr) - } - if readErr == nil { - current, parseErr := strconv.ParseInt(string(raw), 10, 64) - if parseErr != nil { - return 0, false, nil, errors.New("ERR hash value is not an integer") - } - return current, false, nil, nil - } - // Not in wide-column – check legacy blob. - legacyValue, legacyErr := r.loadHashAt(ctx, key, readTS) - if legacyErr != nil { - return 0, true, nil, legacyErr - } - if rawLegacy, ok := legacyValue[string(field)]; ok { - current, parseErr := strconv.ParseInt(rawLegacy, 10, 64) - if parseErr != nil { - return 0, false, nil, errors.New("ERR hash value is not an integer") - } - return current, false, legacyValue, nil - } - return 0, true, legacyValue, nil -} - -// hincrbyWithMigration handles the HINCRBY case where a legacy JSON blob must be migrated -// atomically with the increment operation. -func (r *RedisServer) hincrbyWithMigration(ctx context.Context, key, fieldKey []byte, readTS, commitTS uint64, current int64, isNewField bool, increment int64) (int64, error) { - migrationElems, migErr := r.buildHashLegacyMigrationElems(ctx, key, readTS) - if migErr != nil { - return 0, migErr - } - current += increment - newVal := strconv.FormatInt(current, 10) - elems := make([]*kv.Elem[kv.OP], 0, len(migrationElems)+setWideColOverhead) - elems = append(elems, migrationElems...) 
- elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: fieldKey, Value: []byte(newVal)}) - if isNewField { - deltaVal := store.MarshalHashMetaDelta(store.HashMetaDelta{LenDelta: 1}) - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.HashMetaDeltaKey(key, commitTS, 0), - Value: deltaVal, - }) - } - _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: normalizeStartTS(readTS), - CommitTS: commitTS, - Elems: elems, - }) - return current, cockerrors.WithStack(dispatchErr) -} - -func (r *RedisServer) hincrbyTxn(ctx context.Context, key, field []byte, increment int64) (int64, error) { - readTS := r.readTS() - if err := r.requireKeyTypeOrEmpty(ctx, key, readTS, redisTypeHash); err != nil { - return 0, err - } - - commitTS, err := r.coordinator.Clock().NextFenced() - if err != nil { - return 0, cockerrors.Wrap(err, "hincrbyTxn: allocate commitTS") - } - fieldKey := store.HashFieldKey(key, field) - - current, isNewField, legacyValue, err := r.readHashFieldInt(ctx, key, field, readTS) - if err != nil { - return 0, err - } - - // If a legacy blob exists, migrate it atomically with the increment. 
- if len(legacyValue) > 0 { - return r.hincrbyWithMigration(ctx, key, fieldKey, readTS, commitTS, current, isNewField, increment) - } - - current += increment - newVal := strconv.FormatInt(current, 10) - elems := make([]*kv.Elem[kv.OP], 0, setWideColOverhead) - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: fieldKey, Value: []byte(newVal)}) - if isNewField { - deltaVal := store.MarshalHashMetaDelta(store.HashMetaDelta{LenDelta: 1}) - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.HashMetaDeltaKey(key, commitTS, 0), - Value: deltaVal, - }) - } - _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: normalizeStartTS(readTS), - CommitTS: commitTS, - Elems: elems, - }) - return current, cockerrors.WithStack(dispatchErr) -} - -func (r *RedisServer) incr(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var current int64 - if err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - typ, err := r.keyTypeAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - return err - } - if typ != redisTypeNone && typ != redisTypeString { - return wrongTypeError() - } - - current = 0 - var existingTTL *time.Time - if typ == redisTypeString { - raw, ttl, err := r.readRedisStringAt(cmd.Args[1], readTS) - if err != nil { - return err - } - existingTTL = ttl - current, err = strconv.ParseInt(string(raw), 10, 64) - if err != nil { - return fmt.Errorf("ERR value is not an integer or out of range") - } - } - current++ - - // INCR preserves any existing TTL (Redis semantics). 
- encoded := encodeRedisStr([]byte(strconv.FormatInt(current, 10)), existingTTL) - elems := []*kv.Elem[kv.OP]{ - {Op: kv.Put, Key: redisStrKey(cmd.Args[1]), Value: encoded}, - } - if existingTTL != nil { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisTTLKey(cmd.Args[1]), Value: encodeRedisTTL(*existingTTL)}) - } else { - // Defensively clear any stale/legacy scan index entry so the sweeper - // cannot later expire a now-persistent key. - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisTTLKey(cmd.Args[1])}) - } - return r.dispatchElems(ctx, true, readTS, elems) - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt64(current) -} - -func (r *RedisServer) hgetall(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(context.Background(), cmd.Args[1], readTS, redisTypeHash) - if err != nil { - writeRedisError(conn, err) - return - } - if typ == redisTypeNone { - conn.WriteArray(0) - return - } - if typ != redisTypeHash { - conn.WriteError(wrongTypeMessage) - return - } - - value, err := r.loadHashAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - fields := make([]string, 0, len(value)) - for field := range value { - fields = append(fields, field) - } - sort.Strings(fields) - conn.WriteArray(len(fields) * redisPairWidth) - for _, field := range fields { - conn.WriteBulkString(field) - conn.WriteBulkString(value[field]) - } -} - -type zaddFlags struct { - nx bool // only add new elements - xx bool // only update existing elements - gt bool // only update when new score > current score - lt bool // only update when new score < current score -} - -func parseZAddFlags(args [][]byte) (zaddFlags, int, error) { - var flags zaddFlags - i := 2 - for i < len(args) { - if !flags.applyFlag(strings.ToUpper(string(args[i]))) { - break - } - i++ - } - if err := flags.validate(); 
err != nil { - return zaddFlags{}, 0, err - } - return flags, i, nil -} - -func (f *zaddFlags) applyFlag(name string) bool { - switch name { - case "NX": - f.nx = true - case "XX": - f.xx = true - case "GT": - f.gt = true - case "LT": - f.lt = true - default: - return false - } - return true -} - -func (f zaddFlags) allows(exists bool, oldScore, newScore float64) bool { - if (f.nx && exists) || (f.xx && !exists) { - return false - } - return !exists || f.scoreAllowed(oldScore, newScore) -} - -func (f zaddFlags) scoreAllowed(oldScore, newScore float64) bool { - if f.gt && newScore <= oldScore { - return false - } - if f.lt && newScore >= oldScore { - return false - } - return true -} - -func (f zaddFlags) validate() error { - if f.nx && f.xx { - return fmt.Errorf("ERR XX and NX options at the same time are not compatible") - } - if f.nx && (f.gt || f.lt) { - return fmt.Errorf("ERR GT, LT, and NX options at the same time are not compatible") - } - return nil -} - -type zaddPair struct { - score float64 - member string -} - -func parseZAddPairs(remaining [][]byte) ([]zaddPair, error) { - pairs := make([]zaddPair, 0, len(remaining)/redisPairWidth) - for i := 0; i < len(remaining); i += redisPairWidth { - score, err := strconv.ParseFloat(string(remaining[i]), 64) - if err != nil { - return nil, fmt.Errorf("parse zadd score: %w", err) - } - pairs = append(pairs, zaddPair{score: score, member: string(remaining[i+1])}) - } - return pairs, nil -} - -func (r *RedisServer) zadd(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - flags, pairStart, err := parseZAddFlags(cmd.Args) - if err != nil { - writeRedisError(conn, err) - return - } - remaining := cmd.Args[pairStart:] - if len(remaining) == 0 || len(remaining)%redisPairWidth != 0 { - conn.WriteError("ERR syntax error") - return - } - pairs, err := parseZAddPairs(remaining) - if err != nil { - writeRedisError(conn, err) - return - } - - ctx, cancel := 
context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var added int - if err := r.retryRedisWrite(ctx, func() error { - var err error - added, err = r.zaddTxn(ctx, cmd.Args[1], flags, pairs) - return err - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(added) -} - -// buildZSetMigrationView extracts member→score from ZSet migration Put elems -// so that applyZAddPair can see migrated members without a store round-trip. -// Returns a map from member name to score; absent members were not migrated. -func buildZSetMigrationView(migrationElems []*kv.Elem[kv.OP], key []byte) map[string]float64 { - view := make(map[string]float64) - for _, elem := range migrationElems { - if elem.Op != kv.Put { - continue - } - m := store.ExtractZSetMemberName(elem.Key, key) - if m == nil { - continue - } - score, err := store.UnmarshalZSetScore(elem.Value) - if err == nil { - view[string(m)] = score - } - } - return view -} - -// resolveZSetMemberScore returns the current score and existence for a ZSet -// member. It checks inTxnView first (covers migration elems and earlier pairs -// in the same ZADD call), then falls back to a store GetAt. 
-func (r *RedisServer) resolveZSetMemberScore(ctx context.Context, memberKey []byte, member string, readTS uint64, inTxnView map[string]float64) (score float64, exists bool, err error) { - if s, ok := inTxnView[member]; ok { - return s, true, nil - } - raw, getErr := r.store.GetAt(ctx, memberKey, readTS) - if getErr == nil { - s, unmarshalErr := store.UnmarshalZSetScore(raw) - if unmarshalErr != nil { - return 0, false, cockerrors.WithStack(unmarshalErr) - } - return s, true, nil - } - if !cockerrors.Is(getErr, store.ErrKeyNotFound) { - return 0, false, cockerrors.WithStack(getErr) - } - return 0, false, nil -} - -// applyZAddPair processes one ZADD pair against the wide-column store: reads the -// existing member score (if any), checks the ZADD flags, emits del-old-score / -// put-member / put-score-index ops, and returns the updated elems, the add count -// (0 or 1), and the length delta (0 or +1). -// inTxnView provides an in-transaction view of member→score for members written -// in the same transaction (migration or earlier pairs); checked before GetAt so -// migrated and duplicate members are handled correctly. 
-func (r *RedisServer) applyZAddPair(ctx context.Context, key []byte, p zaddPair, flags zaddFlags, readTS uint64, elems []*kv.Elem[kv.OP], inTxnView map[string]float64) ([]*kv.Elem[kv.OP], int, int64, error) { - memberKey := store.ZSetMemberKey(key, []byte(p.member)) - oldScore, memberExists, err := r.resolveZSetMemberScore(ctx, memberKey, p.member, readTS, inTxnView) - if err != nil { - return nil, 0, 0, err - } - if !flags.allows(memberExists, oldScore, p.score) { - return elems, 0, 0, nil - } - if memberExists { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: store.ZSetScoreKey(key, oldScore, []byte(p.member))}) - } - elems = append(elems, - &kv.Elem[kv.OP]{Op: kv.Put, Key: memberKey, Value: store.MarshalZSetScore(p.score)}, - &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ZSetScoreKey(key, p.score, []byte(p.member)), Value: []byte{}}, - ) - // Update inTxnView so subsequent pairs (duplicates) see this write. - inTxnView[p.member] = p.score - if memberExists { - return elems, 0, 0, nil - } - return elems, 1, 1, nil -} - -func (r *RedisServer) zaddTxn(ctx context.Context, key []byte, flags zaddFlags, pairs []zaddPair) (int, error) { - readTS := r.readTS() - if err := r.requireKeyTypeOrEmpty(ctx, key, readTS, redisTypeZSet); err != nil { - return 0, err - } - - commitTS, err := r.coordinator.Clock().NextFenced() - if err != nil { - return 0, cockerrors.Wrap(err, "zaddTxn: allocate commitTS") - } - - migrationElems, err := r.buildZSetLegacyMigrationElems(ctx, key, readTS) - if err != nil { - return 0, err - } - // Capacity: each pair may produce 3 ops (del old score + put member + put score index), - // plus migration elems and a delta key. - elems := make([]*kv.Elem[kv.OP], 0, len(migrationElems)+len(pairs)*3+setWideColOverhead) //nolint:mnd // 3 ops per pair - elems = append(elems, migrationElems...) - - // Seed the in-transaction view from migration elems so that migrated - // members are not incorrectly counted as new by applyZAddPair. 
- inTxnView := buildZSetMigrationView(migrationElems, key) - - // For large batches, mergeZSetBulkScores performs one prefix scan that - // eliminates O(N) GetAt calls inside applyZAddPair; it is a no-op for - // batches below wideColumnBulkScanThreshold. - inTxnView, err = r.mergeZSetBulkScores(ctx, key, readTS, len(pairs), inTxnView) - if err != nil { - return 0, err - } - - added := 0 - lenDelta := int64(0) - for _, p := range pairs { - var c int - var d int64 - elems, c, d, err = r.applyZAddPair(ctx, key, p, flags, readTS, elems, inTxnView) - if err != nil { - return 0, err - } - added += c - lenDelta += d - } - - if len(elems) == 0 { - return 0, nil - } - - if lenDelta != 0 { - deltaVal := store.MarshalZSetMetaDelta(store.ZSetMetaDelta{LenDelta: lenDelta}) - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.ZSetMetaDeltaKey(key, commitTS, 0), - Value: deltaVal, - }) - } - - return added, r.dispatchAndSignalZSet(ctx, readTS, commitTS, elems, key) -} - -// dispatchAndSignalZSet dispatches the elems through the coordinator -// and, on success, wakes any BZPOPMIN waiter on the same node. -// coordinator.Dispatch blocks until the FSM applies locally, so by -// the time Signal fires the new members are visible at the readTS -// the woken waiter will pick on its next iteration. Pulled out of -// zaddTxn / zincrbyTxn so the parents stay under the cyclop budget -// — the signal step would otherwise add an extra branch on the -// dispatch error path. -func (r *RedisServer) dispatchAndSignalZSet( - ctx context.Context, - readTS, commitTS uint64, - elems []*kv.Elem[kv.OP], - zsetKey []byte, -) error { - _, err := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: normalizeStartTS(readTS), - CommitTS: commitTS, - Elems: elems, - }) - if err != nil { - return cockerrors.WithStack(err) - } - r.zsetWaiters.Signal(zsetKey) - return nil -} - -// zincrbyTxn performs one attempt of ZINCRBY in wide-column format. 
-// Returns the new score after applying increment. -func (r *RedisServer) zincrbyTxn(ctx context.Context, key []byte, member string, increment float64) (float64, error) { - readTS := r.readTS() - if err := r.requireKeyTypeOrEmpty(ctx, key, readTS, redisTypeZSet); err != nil { - return 0, err - } - - memberKey := store.ZSetMemberKey(key, []byte(member)) - commitTS, err := r.coordinator.Clock().NextFenced() - if err != nil { - return 0, cockerrors.Wrap(err, "zincrbyTxn: allocate commitTS") - } - - migrationElems, migErr := r.buildZSetLegacyMigrationElems(ctx, key, readTS) - if migErr != nil { - return 0, migErr - } - - // Check in-txn migration view before falling back to the store - // (migrated keys are not yet visible at readTS). - inTxnView := buildZSetMigrationView(migrationElems, key) - oldScore, memberExists, err := r.resolveZSetMemberScore(ctx, memberKey, member, readTS, inTxnView) - if err != nil { - return 0, err - } - - newScore := oldScore + increment - if math.IsNaN(newScore) { - return 0, errors.New("ERR resulting score is not a number (NaN)") - } - elems := make([]*kv.Elem[kv.OP], 0, len(migrationElems)+3) //nolint:mnd // del old score + put member + put score index - elems = append(elems, migrationElems...) 
- if memberExists { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: store.ZSetScoreKey(key, oldScore, []byte(member))}) - } - elems = append(elems, - &kv.Elem[kv.OP]{Op: kv.Put, Key: memberKey, Value: store.MarshalZSetScore(newScore)}, - &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ZSetScoreKey(key, newScore, []byte(member)), Value: []byte{}}, - ) - if !memberExists { - deltaVal := store.MarshalZSetMetaDelta(store.ZSetMetaDelta{LenDelta: 1}) - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.ZSetMetaDeltaKey(key, commitTS, 0), - Value: deltaVal, - }) - } - if err := r.dispatchAndSignalZSet(ctx, readTS, commitTS, elems, key); err != nil { - return 0, err - } - return newScore, nil -} - -func (r *RedisServer) zincrby(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - increment, err := strconv.ParseFloat(string(cmd.Args[2]), 64) - if err != nil { - writeRedisError(conn, err) - return - } - - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var newScore float64 - if err := r.retryRedisWrite(ctx, func() error { - var txnErr error - newScore, txnErr = r.zincrbyTxn(ctx, cmd.Args[1], string(cmd.Args[3]), increment) - return txnErr - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteBulkString(formatRedisFloat(newScore)) -} - -func parseZRangeOptions(args [][]byte) (zrangeOptions, error) { - opts := zrangeOptions{} - for _, arg := range args { - switch strings.ToUpper(string(arg)) { - case "WITHSCORES": - opts.withScores = true - case "REV": - opts.reverse = true - default: - return zrangeOptions{}, errors.New("ERR syntax error") - } - } - return opts, nil -} - -func reverseZSetEntries(entries []redisZSetEntry) { - for i, j := 0, len(entries)-1; i < j; i, j = i+1, j-1 { - entries[i], entries[j] = entries[j], entries[i] - } -} - -func writeZRangeReply(conn redcon.Conn, entries []redisZSetEntry, withScores bool) { - if withScores { - 
conn.WriteArray(len(entries) * redisPairWidth) - for _, entry := range entries { - conn.WriteBulkString(entry.Member) - conn.WriteBulkString(formatRedisFloat(entry.Score)) - } - return - } - - conn.WriteArray(len(entries)) - for _, entry := range entries { - conn.WriteBulkString(entry.Member) - } -} - -func removeZSetMembers(members map[string]float64, rawMembers [][]byte) int { - removed := 0 - for _, member := range rawMembers { - memberKey := string(member) - if _, ok := members[memberKey]; ok { - delete(members, memberKey) - removed++ - } - } - return removed -} - -func (r *RedisServer) persistZSetEntriesTxn(ctx context.Context, key []byte, readTS uint64, entries []redisZSetEntry) error { - if len(entries) == 0 { - elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) - if err != nil { - return err - } - return r.dispatchElems(ctx, true, readTS, elems) - } - payload, err := marshalZSetValue(redisZSetValue{Entries: entries}) - if err != nil { - return err - } - return r.dispatchElems(ctx, true, readTS, []*kv.Elem[kv.OP]{ - {Op: kv.Put, Key: redisZSetKey(key), Value: payload}, - }) -} - -func (r *RedisServer) zrange(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - start, err := parseInt(cmd.Args[2]) - if err != nil { - writeRedisError(conn, err) - return - } - stop, err := parseInt(cmd.Args[3]) - if err != nil { - writeRedisError(conn, err) - return - } - - opts, err := parseZRangeOptions(cmd.Args[4:]) - if err != nil { - writeRedisError(conn, err) - return - } - - r.zrangeRead(conn, cmd.Args[1], start, stop, opts) -} - -func (r *RedisServer) zrangeRead(conn redcon.Conn, key []byte, start, stop int, opts zrangeOptions) { - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(context.Background(), key, readTS, redisTypeZSet) - if err != nil { - writeRedisError(conn, err) - return - } - if typ == redisTypeNone { - conn.WriteArray(0) - return - } - if typ != redisTypeZSet { - conn.WriteError(wrongTypeMessage) 
- return - } - - value, _, err := r.loadZSetAt(context.Background(), key, readTS) - if err != nil { - writeRedisError(conn, err) - return - } - entries := append([]redisZSetEntry(nil), value.Entries...) - if opts.reverse { - reverseZSetEntries(entries) - } - s, e := normalizeRankRange(start, stop, len(entries)) - if e < s { - conn.WriteArray(0) - return - } - writeZRangeReply(conn, entries[s:e+1], opts.withScores) -} - -func (r *RedisServer) zrem(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var removed int - if err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(ctx, cmd.Args[1], readTS, redisTypeZSet) - if err != nil { - return err - } - if typ == redisTypeNone { - removed = 0 - return nil - } - if typ != redisTypeZSet { - return wrongTypeError() - } - value, _, err := r.loadZSetAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - return err - } - members := zsetEntriesToMap(value.Entries) - removed = removeZSetMembers(members, cmd.Args[2:]) - if removed == 0 { - return nil - } - return r.persistZSetEntriesTxn(ctx, cmd.Args[1], readTS, zsetMapToEntries(members)) - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(removed) -} - -func (r *RedisServer) zremrangebyrank(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - start, err := parseInt(cmd.Args[2]) - if err != nil { - writeRedisError(conn, err) - return - } - stop, err := parseInt(cmd.Args[3]) - if err != nil { - writeRedisError(conn, err) - return - } - - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var removed int - if err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(ctx, cmd.Args[1], readTS, redisTypeZSet) - if err != 
nil { - return err - } - if typ == redisTypeNone { - removed = 0 - return nil - } - if typ != redisTypeZSet { - return wrongTypeError() - } - value, _, err := r.loadZSetAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - return err - } - s, e := normalizeRankRange(start, stop, len(value.Entries)) - if e < s { - removed = 0 - return nil - } - remaining := append([]redisZSetEntry{}, value.Entries[:s]...) - remaining = append(remaining, value.Entries[e+1:]...) - removed = e - s + 1 - return r.persistZSetEntriesTxn(ctx, cmd.Args[1], readTS, remaining) - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(removed) -} - -// tryBZPopMinWithMode runs one BZPOPMIN attempt against key. The -// fast flag selects keyTypeAtExpectFast (no slow-path fallback, no -// wrongType detection) when true; the caller MUST guarantee that the -// only mutations since the previous full check are signalling writes -// (ZADD/ZINCRBY for zsetWaiters). bzpopminWaitLoop enforces this by -// running fast=false on the first iteration and after every -// fallback-timer wake or wall-time-bounded re-arm. 
-func (r *RedisServer) tryBZPopMinWithMode(key []byte, fast bool) (*bzpopminResult, error) { - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var result *bzpopminResult - err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - var typ redisValueType - var err error - if fast { - typ, err = r.keyTypeAtExpectFast(ctx, key, readTS, redisTypeZSet) - } else { - typ, err = r.keyTypeAtExpect(ctx, key, readTS, redisTypeZSet) - } - if err != nil { - return err - } - if typ == redisTypeNone { - result = nil - return nil - } - if typ != redisTypeZSet { - return wrongTypeError() - } - value, _, err := r.loadZSetAt(context.Background(), key, readTS) - if err != nil { - return err - } - if len(value.Entries) == 0 { - result = nil - return nil - } - popped := value.Entries[0] - remaining := append([]redisZSetEntry(nil), value.Entries[1:]...) - - // Detect wide-column storage. - memberPrefix := store.ZSetMemberScanPrefix(key) - memberEnd := store.PrefixScanEnd(memberPrefix) - probeKVs, probeErr := r.store.ScanAt(ctx, memberPrefix, memberEnd, 1, readTS) - if probeErr != nil { - return cockerrors.WithStack(probeErr) - } - isWide := len(probeKVs) > 0 - - if err := r.persistBZPopMinResult(ctx, key, readTS, popped, remaining, isWide); err != nil { - return err - } - result = &bzpopminResult{key: key, entry: popped} - return nil - }) - return result, err -} - -func (r *RedisServer) persistBZPopMinResult(ctx context.Context, key []byte, readTS uint64, popped redisZSetEntry, remaining []redisZSetEntry, isWide bool) error { - if len(remaining) == 0 { - elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) - if err != nil { - return err - } - return r.dispatchElems(ctx, true, readTS, elems) - } - if isWide { - // Wide-column: delete the popped member key + score index, emit delta -1. 
- commitTS, err := r.coordinator.Clock().NextFenced() - if err != nil { - return cockerrors.Wrap(err, "persistBZPopMinResult: allocate commitTS") - } - deltaVal := store.MarshalZSetMetaDelta(store.ZSetMetaDelta{LenDelta: -1}) - elems := []*kv.Elem[kv.OP]{ - {Op: kv.Del, Key: store.ZSetMemberKey(key, []byte(popped.Member))}, - {Op: kv.Del, Key: store.ZSetScoreKey(key, popped.Score, []byte(popped.Member))}, - {Op: kv.Put, Key: store.ZSetMetaDeltaKey(key, commitTS, 0), Value: deltaVal}, - } - _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ - IsTxn: true, - StartTS: normalizeStartTS(readTS), - CommitTS: commitTS, - Elems: elems, - }) - return cockerrors.WithStack(dispatchErr) - } - // Legacy blob: write back all remaining entries. - payload, err := marshalZSetValue(redisZSetValue{Entries: remaining}) - if err != nil { - return err - } - return r.dispatchElems(ctx, true, readTS, []*kv.Elem[kv.OP]{ - {Op: kv.Put, Key: redisZSetKey(key), Value: payload}, - }) -} - -func (r *RedisServer) bzpopmin(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - timeoutSeconds, err := strconv.ParseFloat(string(cmd.Args[len(cmd.Args)-1]), 64) - if err != nil || timeoutSeconds < 0 { - conn.WriteError("ERR timeout is not a float or out of range") - return - } - - // timeout=0 means infinite wait in Redis; cap at redisDispatchTimeout to prevent goroutine leak. - if timeoutSeconds == 0 { - timeoutSeconds = redisDispatchTimeout.Seconds() - } - deadline := time.Now().Add(time.Duration(timeoutSeconds * float64(time.Second))) - - keys := cmd.Args[1 : len(cmd.Args)-1] - r.bzpopminWaitLoop(conn, keys, deadline) -} - -// bzpopminWaitLoop runs the BLOCK-window wait loop. Extracted from -// bzpopmin so the parent function stays under the cyclop budget. -// Uses an event-driven signal from the in-process ZADD / ZINCRBY -// path with a fallback timer for paths that bypass the signal. 
-// -// Registration happens BEFORE the first tryBZPopMin so a signal that -// fires between the check and the wait cannot be lost: the buffered -// channel holds it, and the next select wakes immediately. -func (r *RedisServer) bzpopminWaitLoop(conn redcon.Conn, keys [][]byte, deadline time.Time) { - handlerCtx := r.handlerContext() - w, release := r.zsetWaiters.Register(keys) - defer release() - // fast tracks whether the next iteration may skip the wrongType - // slow probe. The first iteration is always full so an existing - // wrongType key surfaces an immediate WRONGTYPE; subsequent - // iterations after a signal-driven wake skip the wrongType - // detection because zsetWaiters.Signal only fires for ZADD / - // ZINCRBY (neither of which can introduce a wrongType). - // - // lastFullCheck wall-time-bounds how long the fast mode can stay - // active under sustained signal pressure. Without this gate, a - // hot key whose zsetWaiters.Signal fires faster than each - // bzpopminTryAllKeys round finishes can keep waiterC perpetually - // full, starving the fallback timer and letting a wrongType - // write on a co-registered key (multi-key BZPOPMIN) go - // undetected for the entire BLOCK window. Demoting `fast` back - // to false after redisBlockWaitFallback elapses since the last - // full check restores the #666 ceiling: WRONGTYPE on any - // registered key surfaces within ~one fallback interval (100 ms) - // regardless of signal rate. See - // TestRedis_BZPopMinDetectsWrongTypeUnderSignalLoad for the - // regression scenario. 
- fast := false - lastFullCheck := time.Now() - for { - if handlerCtx.Err() != nil { - conn.WriteNull() - return - } - if r.bzpopminTryAllKeys(conn, keys, fast) { - return - } - if !fast { - lastFullCheck = time.Now() - } - if !time.Now().Before(deadline) { - conn.WriteNull() - return - } - signaled := waitForBlockedCommandUpdate(handlerCtx, w.C, deadline) - fast = signaled && time.Since(lastFullCheck) < redisBlockWaitFallback - } -} - -// bzpopminTryAllKeys runs one tryBZPopMinWithMode pass across keys. -// Returns true when a result was written (success or terminal error) -// and the caller should stop the loop, false to continue waiting. -// The fast flag is forwarded to tryBZPopMinWithMode: true selects -// the signal-driven-wake path (skips wrongType detection); false -// selects the full check. -func (r *RedisServer) bzpopminTryAllKeys(conn redcon.Conn, keys [][]byte, fast bool) bool { - for _, key := range keys { - result, err := r.tryBZPopMinWithMode(key, fast) - if err != nil { - writeRedisError(conn, err) - return true - } - if result == nil { - continue - } - conn.WriteArray(redisTripletWidth) - conn.WriteBulk(result.key) - conn.WriteBulkString(result.entry.Member) - conn.WriteBulkString(formatRedisFloat(result.entry.Score)) - return true - } - return false -} - -// waitForBlockedCommandUpdate blocks until one of: a write signal -// arrives, the fallback poll tick fires, the parent handlerCtx is -// cancelled, or the BLOCK deadline elapses — whichever happens first. -// The fallback bounds latency for write paths that do not signal (Lua -// flush, follower-applied entries); it cannot exceed the remaining -// BLOCK window so the deadline branch in the caller's loop top always -// gets a chance to fire when the BLOCK expires. 
Shared by every -// blocking-command wait loop (XREAD BLOCK, BZPOPMIN today; BLPOP / -// BRPOP / BLMOVE in follow-ups) — the keyWaiterRegistry that produces -// waiterC is per-domain (streamWaiters vs zsetWaiters), but the -// timer-and-select shape is identical. -// -// Returns true iff the wake came from waiterC (i.e., a producer -// Signal). False on fallback-timer fire or handlerCtx cancellation. -// Callers that have a signal-implied invariant (e.g., "only ZADD / -// ZINCRBY fires zsetWaiters.Signal") can use the return value to -// pick a faster re-check on the next iteration; fallback wakes -// always need the full check because writes that bypass Signal -// (Lua flush, follower-applied entries, wrongType-introducing -// commands) only become observable through the timer branch. -func waitForBlockedCommandUpdate(handlerCtx context.Context, waiterC <-chan struct{}, deadline time.Time) bool { - fallback := redisBlockWaitFallback - if remaining := time.Until(deadline); remaining < fallback { - fallback = remaining - } - timer := time.NewTimer(fallback) - defer func() { - if !timer.Stop() { - // The timer either fired (its case won and the channel - // was drained inline by select) or is still buffering - // the tick (waiter / handlerCtx won the race); drain - // the channel non-blocking so timer GC is clean. 
- select { - case <-timer.C: - default: - } - } - }() - select { - case <-waiterC: - return true - case <-timer.C: - return false - case <-handlerCtx.Done(): - return false - } -} - -func (r *RedisServer) lpush(conn redcon.Conn, cmd redcon.Command) { - r.listPushCmd(conn, cmd, r.listLPush, r.proxyLPush) -} - -func (r *RedisServer) ltrim(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - start, err := parseInt(cmd.Args[2]) - if err != nil { - writeRedisError(conn, err) - return - } - stop, err := parseInt(cmd.Args[3]) - if err != nil { - writeRedisError(conn, err) - return - } - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - if err := r.retryRedisWrite(ctx, func() error { - readTS := r.readTS() - typ, err := r.keyTypeAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - return err - } - if typ == redisTypeNone { - return nil - } - if typ != redisTypeList { - return wrongTypeError() - } - current, err := r.listValuesAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - return err - } - s, e := normalizeRankRange(start, stop, len(current)) - trimmed := []string{} - if e >= s { - trimmed = append(trimmed, current[s:e+1]...) 
- } - return r.rewriteListTxn(ctx, cmd.Args[1], readTS, trimmed) - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteString("OK") -} - -func (r *RedisServer) lindex(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - index, err := parseInt(cmd.Args[2]) - if err != nil { - writeRedisError(conn, err) - return - } - readTS := r.readTS() - typ, err := r.keyTypeAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if typ == redisTypeNone { - conn.WriteNull() - return - } - if typ != redisTypeList { - conn.WriteError(wrongTypeMessage) - return - } - values, err := r.listValuesAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - idx := normalizeIndex(index, len(values)) - if idx < 0 { - conn.WriteNull() - return - } - conn.WriteBulkString(values[idx]) -} - -func parseXAddMaxLen(args [][]byte) (int, int, error) { - argIndex := redisPairWidth - if len(args) < 5 || !strings.EqualFold(string(args[argIndex]), "MAXLEN") { - return -1, argIndex, nil - } - - argIndex++ - if argIndex < len(args) && string(args[argIndex]) == "~" { - argIndex++ - } - if argIndex >= len(args) { - return 0, 0, errors.New("ERR syntax error") - } - - maxLen, err := strconv.Atoi(string(args[argIndex])) - if err != nil || maxLen < 0 { - return 0, 0, errors.New("ERR syntax error") - } - return maxLen, argIndex + 1, nil -} - -func parseXAddFields(args [][]byte, argIndex int) ([]string, error) { - if argIndex >= len(args) { - return nil, errors.New("ERR syntax error") - } - if (len(args)-argIndex)%redisPairWidth != 0 { - return nil, errors.New("ERR wrong number of arguments for 'XADD' command") - } - - fields := make([]string, 0, len(args)-argIndex) - for _, arg := range args[argIndex:] { - fields = append(fields, string(arg)) - } - return fields, nil -} - -func parseXAddRequest(args [][]byte) (xaddRequest, error) { - maxLen, 
argIndex, err := parseXAddMaxLen(args) - if err != nil { - return xaddRequest{}, err - } - if argIndex >= len(args) { - return xaddRequest{}, errors.New("ERR syntax error") - } - fields, err := parseXAddFields(args, argIndex+1) - if err != nil { - return xaddRequest{}, err - } - return xaddRequest{maxLen: maxLen, id: string(args[argIndex]), fields: fields}, nil -} - -// nextXAddID computes the ID the next XADD should assign. -// -// hasLast reports whether the stream currently tracks a "last" ID (i.e. at -// least one XADD has ever succeeded). last{Ms,Seq} must be the highest ID -// the stream has ever seen — not merely the current tail — so that XADD '*' -// stays strictly monotonic even after XTRIM removes the current tail. -func nextXAddID(hasLast bool, lastMs, lastSeq uint64, requested string) (string, error) { - if requested != "*" { - requestedID, requestedValid := tryParseRedisStreamID(requested) - if !requestedValid { - return "", errors.New("ERR Invalid stream ID specified as stream command argument") - } - // Redis rejects IDs <= 0-0 unconditionally; a stream entry with - // ID "0-0" is unreachable via XREAD ... 0 (which means "after 0-0"). - if requestedID.ms == 0 && requestedID.seq == 0 { - return "", errors.New("ERR The ID specified in XADD must be greater than 0-0") - } - if hasLast && compareStreamIDs(requestedID.ms, requestedID.seq, lastMs, lastSeq) <= 0 { - return "", errors.New("ERR The ID specified in XADD is equal or smaller than the target stream top item") - } - return requested, nil - } - return autoXAddID(safeUnixMilliToUint64(time.Now().UnixMilli()), hasLast, lastMs, lastSeq) -} - -// autoXAddID resolves XADD '*' to a concrete stream ID given a wall-clock -// nowMs. Pulled out of nextXAddID so the auto-ID branch is testable -// without depending on time.Now() — the only un-injectable dependency is -// already isolated in the caller. 
-// -// Two corner cases the caller cannot rely on the wall clock to avoid: -// -// - nowMs == 0 on a fresh stream (!hasLast). A naive "-0" reply -// yields "0-0", which Redis explicitly rejects as a stream ID and -// which XREAD ... 0 would treat as the empty after-marker. Bump the -// seq to 1 so the first auto-generated entry is "0-1" — strictly -// greater than 0-0 and reachable via XREAD ... 0. (This case fires -// only when safeUnixMilliToUint64 clamped a pre-epoch clock to 0; -// under any sane clock, nowMs is well above 0.) -// -// - nowMs <= lastMs. Advance past lastMs/lastSeq via bumpStreamID so -// the stream stays strictly monotonic even across a backwards clock -// step or a corrupted meta where lastMs is far in the future. -func autoXAddID(nowMs uint64, hasLast bool, lastMs, lastSeq uint64) (string, error) { - if !hasLast || nowMs > lastMs { - seq := uint64(0) - if nowMs == 0 { - seq = 1 - } - return strconv.FormatUint(nowMs, 10) + "-" + strconv.FormatUint(seq, 10), nil - } - // Either nowMs == lastMs (same millisecond), or lastMs is in the future - // (monotonic guarantee across a backwards clock step or a corrupted - // meta). Advance past lastMs-lastSeq via bumpStreamID; if the ID space - // is exhausted, surface an error rather than wrap to 0. - ms, seq, err := bumpStreamID(lastMs, lastSeq) - if err != nil { - return "", err - } - return strconv.FormatUint(ms, 10) + "-" + strconv.FormatUint(seq, 10), nil -} - -// safeUnixMilliToUint64 returns ms as uint64, clamping any negative value -// (caused by a system clock set before the Unix epoch) to 0. Without this -// clamp, a direct uint64 cast of a negative int64 would yield a value -// near math.MaxUint64, which would then make nextXAddID's "future-ms" -// branch chase that pathological value forever — effectively wedging -// every subsequent XADD '*' on the stream until the clock recovers. -// The lastMs/lastSeq monotonic guarantee carries the stream forward -// from there via bumpStreamID. 
-func safeUnixMilliToUint64(ms int64) uint64 { - if ms < 0 { - return 0 - } - return uint64(ms) //nolint:gosec // negative values handled above -} - -// bumpStreamID returns the strictly-greater successor of (ms, seq) within -// the uint64-uint64 stream ID space. Bumps seq; on seq overflow carries -// to ms+1, seq=0; on ms overflow returns an error (no representable -// successor) instead of wrapping to 0-0, which would produce a duplicate -// or non-monotonic ID. -func bumpStreamID(ms, seq uint64) (uint64, uint64, error) { - switch { - case seq < ^uint64(0): - return ms, seq + 1, nil - case ms < ^uint64(0): - return ms + 1, 0, nil - default: - return 0, 0, errors.New("ERR The stream has exhausted the ID space") - } -} - -func compareStreamIDs(lms, lseq, rms, rseq uint64) int { - switch { - case lms < rms: - return -1 - case lms > rms: - return 1 - case lseq < rseq: - return -1 - case lseq > rseq: - return 1 - default: - return 0 - } -} - -func (r *RedisServer) xadd(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - req, err := parseXAddRequest(cmd.Args) - if err != nil { - writeRedisError(conn, err) - return - } - - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var id string - if err := r.retryRedisWrite(ctx, func() error { - id, err = r.xaddTxn(ctx, cmd.Args[1], req) - return err - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteBulkString(id) -} - -func (r *RedisServer) xaddTxn(ctx context.Context, key []byte, req xaddRequest) (string, error) { - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeStream) - if err != nil { - return "", err - } - if typ != redisTypeNone && typ != redisTypeStream { - return "", wrongTypeError() - } - - legacyCleanup, meta, metaFound, err := r.streamWriteBase(ctx, key, readTS) - if err != nil { - return "", err - } - - id, parsedID, err := resolveXAddID(meta, metaFound, req.id) - 
if err != nil { - return "", err - } - - if err := xaddEnforceMaxWideColumn(key, meta.Length, req.maxLen); err != nil { - return "", err - } - - entryValue, err := marshalStreamEntry(newRedisStreamEntry(id, req.fields)) - if err != nil { - return "", err - } - - // Capacity hint covers: optional legacy-cleanup Del + one entry Put + - // one meta Put + the trim Dels. legacyCleanup is at most one element, - // and only non-empty on the very first write against a stream whose - // pre-migration blob is still on disk. - const xaddFixedElemCount = 2 - elems := make([]*kv.Elem[kv.OP], 0, - len(legacyCleanup)+xaddFixedElemCount+estimateXAddTrimCount(req.maxLen, meta.Length)) - elems = append(elems, legacyCleanup...) - elems = append(elems, &kv.Elem[kv.OP]{ - Op: kv.Put, - Key: store.StreamEntryKey(key, parsedID.ms, parsedID.seq), - Value: entryValue, - }) - - nextLen, trim, err := r.xaddTrimIfNeeded(ctx, key, readTS, req.maxLen, meta.Length+1) - if err != nil { - return "", err - } - elems = append(elems, trim...) - elems = appendMaxLenZeroSelfDel(elems, req.maxLen, key, parsedID) - - metaBytes, err := store.MarshalStreamMeta(store.StreamMeta{ - Length: nextLen, - LastMs: parsedID.ms, - LastSeq: parsedID.seq, - }) - if err != nil { - return "", cockerrors.WithStack(err) - } - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.StreamMetaKey(key), Value: metaBytes}) - - return id, r.dispatchAndSignalStream(ctx, true, readTS, elems, key) -} - -// dispatchAndSignalStream dispatches the elems through the coordinator -// and, on success, wakes any XREAD BLOCK waiter on the same node. -// dispatchElems blocks until the FSM applies locally, so by the time -// Signal fires the new entries are visible at the readTS the woken -// waiter will pick on its next iteration. Pulled out of xaddTxn so the -// parent function stays under the cyclop budget — the signal step -// would otherwise add an extra branch on the dispatch error path. 
-func (r *RedisServer) dispatchAndSignalStream( - ctx context.Context, - isTxn bool, - startTS uint64, - elems []*kv.Elem[kv.OP], - streamKey []byte, -) error { - if err := r.dispatchElems(ctx, isTxn, startTS, elems); err != nil { - return err - } - r.streamWaiters.Signal(streamKey) - return nil -} - -// appendMaxLenZeroSelfDel handles the MAXLEN 0 edge case. The trim loop -// runs scans at readTS and therefore cannot see the entry we just queued, -// so without this follow-up Del the freshly-added entry would survive -// while meta.Length said 0. The coordinator applies elems in order at a -// single commitTS, so appending Del after the Put tombstones it cleanly. -func appendMaxLenZeroSelfDel(elems []*kv.Elem[kv.OP], maxLen int, key []byte, parsedID redisStreamID) []*kv.Elem[kv.OP] { - if maxLen != 0 { - return elems - } - return append(elems, &kv.Elem[kv.OP]{ - Op: kv.Del, - Key: store.StreamEntryKey(key, parsedID.ms, parsedID.seq), - }) -} - -// xaddEnforceMaxWideColumn rejects an XADD that would push the stream past -// maxWideColumnItems when no MAXLEN clause could rescue it. A MAXLEN >= 0 -// and <= the cap keeps the committed length bounded even when meta.Length is -// already at the ceiling, so we only reject on the ungated path. -func xaddEnforceMaxWideColumn(key []byte, currentLength int64, maxLen int) error { - if maxLen >= 0 && maxLen <= maxWideColumnItems { - return nil - } - if currentLength < int64(maxWideColumnItems) { - return nil - } - return cockerrors.Wrapf(ErrCollectionTooLarge, - "stream %q would exceed %d entries", key, maxWideColumnItems) -} - -// xaddTrimIfNeeded returns (finalLength, trimElems, err) for an XADD. -// estimateXAddTrimCount returns how many entries the XADD's MAXLEN trim -// will remove, or 0 when maxLen is unset or the current length fits under -// it. Used only as a capacity hint for the elems slice; the actual trim -// list is computed by xaddTrimIfNeeded. 
-func estimateXAddTrimCount(maxLen int, currentLength int64) int { - if maxLen < 0 { - return 0 - } - nextLen := currentLength + 1 - if nextLen <= int64(maxLen) { - return 0 - } - // Compute in int64 and clamp at maxWideColumnItems. A capacity hint - // of math.MaxInt would let make() try to allocate ~16 EiB on 64-bit - // targets and either panic or OOM; capping at the wide-column ceiling - // keeps the hint useful (saves slice growth in the common case) while - // preventing pathological allocation when meta.Length is corrupted. - // xaddTrimIfNeeded enforces the same cap on the actual trim count; - // this hint just sizes the elems slice. - diff := nextLen - int64(maxLen) - if diff <= 0 { - return 0 - } - if diff > int64(maxWideColumnItems) { - return maxWideColumnItems - } - return int(diff) -} - -// When maxLen < 0 (unset) or the new length fits under it, no trim is -// emitted and trimElems is nil; otherwise Del operations for the oldest -// entries are returned and finalLength equals maxLen. All scans use the -// caller's ctx and readTS so the trim happens at the same MVCC snapshot -// as the write. -func (r *RedisServer) xaddTrimIfNeeded( - ctx context.Context, - key []byte, - readTS uint64, - maxLen int, - candidateLen int64, -) (int64, []*kv.Elem[kv.OP], error) { - if maxLen < 0 || candidateLen <= int64(maxLen) { - return candidateLen, nil, nil - } - // int64 arithmetic + clamp at maxWideColumnItems. A single XADD must - // not emit more than maxWideColumnItems Del operations: it would risk - // exceeding the Raft message-size limit and would force a single - // commit to materialise an unbounded list of keys. The cap is loose - // enough that it never bites in normal operation (xaddEnforceMaxWideColumn - // rejects streams whose committed length is already at the ceiling), - // but defends against a corrupted meta.Length feeding the trim path. 
- diff := candidateLen - int64(maxLen) - if diff <= 0 { - return candidateLen, nil, nil - } - count := maxWideColumnItems - if diff <= int64(maxWideColumnItems) { - count = int(diff) - } - trim, err := r.buildXTrimHeadElems(ctx, key, readTS, count) - if err != nil { - return 0, nil, err - } - // Final length must reflect the trim that actually committed, not - // the requested maxLen, so that meta.Length stays consistent with - // the entries on disk when the cap kicks in or the scan returns - // fewer rows than requested. MAXLEN 0 is a special case: the - // freshly-added entry is removed by appendMaxLenZeroSelfDel in the - // caller, so the post-commit length is 0 regardless of what trim - // did to the pre-existing rows. - if maxLen == 0 { - return 0, trim, nil - } - return candidateLen - int64(len(trim)), trim, nil -} - -// streamWriteBase prepares a write to a stream. Returns the loaded meta -// (zero value when the stream has never been written) and, when a legacy -// single-blob key is still present on disk, a Del elem that the caller -// must include in the write transaction. No migration is performed: -// legacy entries are discarded, not re-materialised into the new layout. -// This matches the PR #620 operator directive that pre-migration data is -// expendable and is cleared explicitly rather than saved. -func (r *RedisServer) streamWriteBase(ctx context.Context, key []byte, readTS uint64) ([]*kv.Elem[kv.OP], store.StreamMeta, bool, error) { - meta, metaFound, err := r.loadStreamMetaAt(ctx, key, readTS) - if err != nil { - return nil, store.StreamMeta{}, false, err - } - if metaFound { - return nil, meta, true, nil - } - legacyCleanup, err := r.legacyStreamCleanupElems(ctx, key, readTS) - if err != nil { - return nil, store.StreamMeta{}, false, err - } - return legacyCleanup, store.StreamMeta{}, false, nil -} - -// legacyStreamCleanupElems returns a Del elem for the legacy single-blob -// key if one is still present on disk, or nil otherwise. 
Called by -// streamWriteBase and deleteStreamWideColumnElems so every write or delete -// that touches a stream also evicts any stale legacy data. -func (r *RedisServer) legacyStreamCleanupElems(ctx context.Context, key []byte, readTS uint64) ([]*kv.Elem[kv.OP], error) { - legacyKey := redisStreamKey(key) - exists, err := r.store.ExistsAt(ctx, legacyKey, readTS) - if err != nil { - return nil, cockerrors.WithStack(err) - } - if !exists { - return nil, nil - } - return []*kv.Elem[kv.OP]{{Op: kv.Del, Key: legacyKey}}, nil -} - -// resolveXAddID resolves the requested ID (possibly '*') against the current -// stream meta and returns the assigned string ID plus its parsed form. -func resolveXAddID(meta store.StreamMeta, hasMeta bool, requested string) (string, redisStreamID, error) { - var ( - hasLast bool - lastMs, lastSeq uint64 - ) - if hasMeta { - // LastMs/LastSeq carry the highest ID ever assigned even when the - // stream was trimmed to empty, so auto-ID generation stays - // monotonic across MAXLEN=0 / XDEL-all cycles. - hasLast = meta.Length > 0 || meta.LastMs != 0 || meta.LastSeq != 0 - lastMs, lastSeq = meta.LastMs, meta.LastSeq - } - id, err := nextXAddID(hasLast, lastMs, lastSeq, requested) - if err != nil { - return "", redisStreamID{}, err - } - parsed, ok := tryParseRedisStreamID(id) - if !ok { - return "", redisStreamID{}, errors.New("ERR Invalid stream ID specified as stream command argument") - } - return id, parsed, nil -} - -// buildXTrimHeadElems emits Del operations for the oldest `count` entries -// in the entry-per-key layout via a bounded range scan at the caller's -// MVCC snapshot (ctx, readTS). Mixing a later timestamp here would let us -// tombstone keys the caller's view never saw. 
-func (r *RedisServer) buildXTrimHeadElems( - ctx context.Context, - key []byte, - readTS uint64, - count int, -) ([]*kv.Elem[kv.OP], error) { - if count <= 0 { - return nil, nil - } - // Defense-in-depth cap on the per-trim scan so a caller that asked - // for math.MaxInt (corrupted meta upstream) cannot try to materialise - // an unbounded list of Del elems in a single transaction. Callers - // (xaddTrimIfNeeded, xtrimTxn) already cap; this is a belt-and-braces - // guard on the boundary that actually allocates. - if count > maxWideColumnItems { - count = maxWideColumnItems - } - prefix := store.StreamEntryScanPrefix(key) - end := store.PrefixScanEnd(prefix) - kvs, err := r.store.ScanAt(ctx, prefix, end, count, readTS) - if err != nil { - return nil, cockerrors.WithStack(err) - } - elems := make([]*kv.Elem[kv.OP], 0, len(kvs)) - for _, pair := range kvs { - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: append([]byte(nil), pair.Key...)}) - } - return elems, nil -} - -func parseXTrimMaxLen(args [][]byte) (int, error) { - if !strings.EqualFold(string(args[2]), "MAXLEN") { - return 0, errors.New("ERR syntax error") - } - - argIndex := 3 - if argIndex < len(args) && (string(args[argIndex]) == "~" || string(args[argIndex]) == "=") { - argIndex++ - } - if argIndex != len(args)-1 { - return 0, errors.New("ERR syntax error") - } - - maxLen, err := strconv.Atoi(string(args[argIndex])) - if err != nil || maxLen < 0 { - return 0, errors.New("ERR syntax error") - } - return maxLen, nil -} - -func (r *RedisServer) xtrim(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - maxLen, err := parseXTrimMaxLen(cmd.Args) - if err != nil { - conn.WriteError("ERR syntax error") - return - } - - ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) - defer cancel() - var removed int - if err := r.retryRedisWrite(ctx, func() error { - var err error - removed, err = r.xtrimTxn(ctx, cmd.Args[1], maxLen) - 
return err - }); err != nil { - writeRedisError(conn, err) - return - } - conn.WriteInt(removed) -} - -// streamTypeForWrite returns (true, nil) when the key is either absent -// (no-op write) or already a stream, (false, nil) when the caller should -// short-circuit with "no stream here", and (_, err) for wrong-type or -// store errors. Extracted from xtrimTxn so the outer function stays -// within the cyclop budget. -func (r *RedisServer) streamTypeForWrite(ctx context.Context, key []byte, readTS uint64) (bool, error) { - typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeStream) - if err != nil { - return false, err - } - switch typ { - case redisTypeNone: - return false, nil - case redisTypeStream: - return true, nil - case redisTypeString, redisTypeList, redisTypeHash, redisTypeSet, redisTypeZSet: - return false, wrongTypeError() - default: - return false, wrongTypeError() - } -} - -// flushLegacyCleanupOnTrimNoOp commits the legacy-blob Del + meta Put -// for an XTRIM whose length is already under maxLen. Without this -// flush a subsequent read would still find the stale legacy blob. -// Returns 0 removed entries; callers use that directly. -func (r *RedisServer) flushLegacyCleanupOnTrimNoOp( - ctx context.Context, readTS uint64, key []byte, - meta store.StreamMeta, legacyCleanup []*kv.Elem[kv.OP], -) (int, error) { - if len(legacyCleanup) == 0 { - return 0, nil - } - metaBytes, err := store.MarshalStreamMeta(meta) - if err != nil { - return 0, cockerrors.WithStack(err) - } - elems := make([]*kv.Elem[kv.OP], 0, len(legacyCleanup)+1) - elems = append(elems, legacyCleanup...) 
- elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.StreamMetaKey(key), Value: metaBytes}) - return 0, r.dispatchElems(ctx, true, readTS, elems) -} - -func (r *RedisServer) xtrimTxn(ctx context.Context, key []byte, maxLen int) (int, error) { - readTS := r.readTS() - proceed, err := r.streamTypeForWrite(ctx, key, readTS) - if err != nil || !proceed { - return 0, err - } - - legacyCleanup, meta, _, err := r.streamWriteBase(ctx, key, readTS) - if err != nil { - return 0, err - } - - if meta.Length <= int64(maxLen) { - return r.flushLegacyCleanupOnTrimNoOp(ctx, readTS, key, meta, legacyCleanup) - } - - // Cap the trim request at maxWideColumnItems so a single XTRIM cannot - // emit an unbounded list of Del operations in one Raft commit. int64 - // arithmetic upfront also keeps a corrupted meta.Length (>MaxInt) - // from wrapping into a negative scan count. - diff := meta.Length - int64(maxLen) - requestedRemoved := maxWideColumnItems - if diff <= int64(maxWideColumnItems) { - requestedRemoved = int(diff) - } - trim, err := r.buildXTrimHeadElems(ctx, key, readTS, requestedRemoved) - if err != nil { - return 0, err - } - - // Use len(trim) — the actual entries we are about to delete — for - // both the meta.Length update and the XTRIM return value. The - // requested count and the actual count can diverge when the trim - // hits the per-txn cap or the underlying scan returns fewer rows - // than requested (concurrent writes / partial consistency); using - // the actual count keeps meta.Length consistent with on-disk state - // and reports the truth back to the client. - actualRemoved := len(trim) - elems := make([]*kv.Elem[kv.OP], 0, len(legacyCleanup)+actualRemoved+1) - elems = append(elems, legacyCleanup...) - elems = append(elems, trim...) 
- meta.Length -= int64(actualRemoved) - metaBytes, err := store.MarshalStreamMeta(meta) - if err != nil { - return 0, cockerrors.WithStack(err) - } - elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.StreamMetaKey(key), Value: metaBytes}) - return actualRemoved, r.dispatchElems(ctx, true, readTS, elems) -} - -func (r *RedisServer) xrange(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - r.rangeStream(conn, cmd, false) -} - -func (r *RedisServer) xrevrange(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - r.rangeStream(conn, cmd, true) -} - -func parseXReadCountArg(args [][]byte, index int) (int, error) { - if index+1 >= len(args) { - return 0, errors.New("ERR syntax error") - } - count, err := strconv.Atoi(string(args[index+1])) - if err != nil || count <= 0 { - return 0, errors.New("ERR syntax error") - } - // Clamp client-supplied COUNT to the wide-column ceiling so a single - // XREAD cannot pre-allocate a maxInt-sized []redisStreamEntry slice or - // pull more entries than the store will return for the equivalent - // uncapped scan. Cap is silent (Redis-compatible): the client always - // sees at most maxWideColumnItems entries per stream per call. 
- if count > maxWideColumnItems { - count = maxWideColumnItems - } - return count, nil -} - -func parseXReadBlockArg(args [][]byte, index int) (time.Duration, error) { - if index+1 >= len(args) { - return 0, errors.New("ERR syntax error") - } - ms, err := strconv.Atoi(string(args[index+1])) - if err != nil || ms < 0 { - return 0, errors.New("ERR syntax error") - } - return time.Duration(ms) * time.Millisecond, nil -} - -func parseXReadOptions(args [][]byte) (xreadOptions, error) { - opts := xreadOptions{count: -1, streamsIndex: -1} - for i := 1; i < len(args); { - next, done, err := parseXReadOption(&opts, args, i) - if err != nil { - return xreadOptions{}, err - } - if done { - return opts, nil - } - i = next - } - return opts, nil -} - -func parseXReadOption(opts *xreadOptions, args [][]byte, i int) (int, bool, error) { - switch strings.ToUpper(string(args[i])) { - case redisKeywordCount: - count, err := parseXReadCountArg(args, i) - if err != nil { - return 0, false, err - } - opts.count = count - return i + redisPairWidth, false, nil - case "BLOCK": - block, err := parseXReadBlockArg(args, i) - if err != nil { - return 0, false, err - } - opts.block = block - return i + redisPairWidth, false, nil - case "STREAMS": - opts.streamsIndex = i + 1 - return len(args), true, nil - default: - return 0, false, errors.New("ERR syntax error") - } -} - -func splitXReadStreams(args [][]byte, streamsIndex int) ([][]byte, []string, error) { - if streamsIndex < 0 || streamsIndex >= len(args) { - return nil, nil, errors.New("ERR syntax error") - } - remaining := len(args) - streamsIndex - if remaining%redisPairWidth != 0 { - return nil, nil, errors.New("ERR syntax error") - } - - streamCount := remaining / redisPairWidth - keys := make([][]byte, streamCount) - afterIDs := make([]string, streamCount) - for i := range streamCount { - keys[i] = args[streamsIndex+i] - afterIDs[i] = string(args[streamsIndex+streamCount+i]) - } - return keys, afterIDs, nil -} - -func 
parseXReadRequest(args [][]byte) (xreadRequest, error) { - opts, err := parseXReadOptions(args) - if err != nil { - return xreadRequest{}, err - } - keys, afterIDs, err := splitXReadStreams(args, opts.streamsIndex) - if err != nil { - return xreadRequest{}, err - } - return xreadRequest{block: opts.block, count: opts.count, keys: keys, afterIDs: afterIDs}, nil -} - -func (r *RedisServer) resolveXReadAfterIDs(ctx context.Context, req *xreadRequest) error { - for i, afterID := range req.afterIDs { - if afterID != "$" { - continue - } - resolved, err := r.resolveXReadDollarID(ctx, req.keys[i]) - if err != nil { - return err - } - req.afterIDs[i] = resolved - } - return nil -} - -// resolveXReadDollarID resolves the "$" after-ID for a single stream by -// asking the store for the highest ID ever assigned. The new-layout meta -// answers in one read; when meta is absent the stream is treated as -// empty — legacy single-blob data is intentionally ignored under the -// "discard-on-read, delete-on-write" contract documented on -// dollarIDFromState (and matching loadStreamAt). Returns streamZeroID -// for non-existent and empty-never-written streams. ctx threads through -// the caller's cancellation/deadline so the resolve step doesn't survive -// past a BLOCK-window cancel. -func (r *RedisServer) resolveXReadDollarID(ctx context.Context, key []byte) (string, error) { - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeStream) - if err != nil { - return "", err - } - if typ == redisTypeNone { - return streamZeroID, nil - } - if typ != redisTypeStream { - return "", wrongTypeError() - } - return r.dollarIDFromState(ctx, key, readTS) -} - -// dollarIDFromState returns the highest-ever-assigned stream ID as a string. 
-// Reads the new-layout meta record (O(1)); when meta is absent the stream -// is treated as empty — legacy single-blob data is intentionally ignored -// under the "discard-on-read, delete-on-write" contract (see loadStreamAt -// and the PR #620 writeup), so $ resolves to streamZeroID for any stream -// that has never been written in the new layout. -func (r *RedisServer) dollarIDFromState(ctx context.Context, key []byte, readTS uint64) (string, error) { - meta, found, err := r.loadStreamMetaAt(ctx, key, readTS) - if err != nil { - return "", err - } - if !found { - return streamZeroID, nil - } - if meta.Length == 0 && meta.LastMs == 0 && meta.LastSeq == 0 { - return streamZeroID, nil - } - return strconv.FormatUint(meta.LastMs, 10) + "-" + strconv.FormatUint(meta.LastSeq, 10), nil -} - -func selectXReadEntries(entries []redisStreamEntry, afterID string, count int) []redisStreamEntry { - afterParsedID, afterParsedValid := tryParseRedisStreamID(afterID) - start := sort.Search(len(entries), func(i int) bool { - return entries[i].compareID(afterID, afterParsedID, afterParsedValid) > 0 - }) - if start >= len(entries) { - return nil - } - end := len(entries) - if count > 0 && start+count < end { - end = start + count - } - return entries[start:end] -} - -func (r *RedisServer) xreadOnce(ctx context.Context, req xreadRequest) ([]xreadResult, error) { - results := make([]xreadResult, 0, len(req.keys)) - for i, key := range req.keys { - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeStream) - if err != nil { - return nil, err - } - if typ == redisTypeNone { - continue - } - if typ != redisTypeStream { - return nil, wrongTypeError() - } - - entries, err := r.readStreamAfter(ctx, key, readTS, req.afterIDs[i], req.count) - if err != nil { - return nil, err - } - if len(entries) > 0 { - results = append(results, xreadResult{key: key, entries: entries}) - } - } - return results, nil -} - -// readStreamAfter returns up to `count` entries with ID 
strictly greater -// than afterID via the entry-per-key range scan. When the meta key is -// absent the stream is treated as empty; legacy single-blob data is -// intentionally ignored under the "discard-on-read, delete-on-write" -// contract documented on loadStreamAt. A subsequent XADD or XTRIM will -// delete any lingering legacy blob in the same transaction, so a stream -// whose meta is still missing here cannot have live legacy data from the -// caller's perspective. -func (r *RedisServer) readStreamAfter(ctx context.Context, key []byte, readTS uint64, afterID string, count int) ([]redisStreamEntry, error) { - _, found, err := r.loadStreamMetaAt(ctx, key, readTS) - if err != nil { - return nil, err - } - if !found { - return nil, nil - } - return r.scanStreamEntriesAfter(ctx, key, readTS, afterID, count) -} - -// scanStreamEntriesAfter runs a [strictly-after(afterID), ∞) range scan over -// entry keys, capped by count (when positive) or maxWideScanLimit otherwise. -// When count is non-positive, we mirror scanStreamEntriesAt's guard: request -// maxWideScanLimit (which is maxWideColumnItems+1) and reject if the scan -// filled, so an XREAD without COUNT cannot OOM the server on a pathological -// stream. -// -// afterID must be a parseable stream ID in either the strict "ms-seq" form or -// the shorthand "ms" form (no dash), which Redis normalises to "ms-0". -// Genuinely malformed IDs are rejected immediately so the caller never -// receives a full-stream result set for invalid input. 
-func (r *RedisServer) scanStreamEntriesAfter(ctx context.Context, key []byte, readTS uint64, afterID string, count int) ([]redisStreamEntry, error) { - afterID, ok := normalizeStreamAfterID(afterID) - if !ok { - return nil, errors.New("ERR Invalid stream ID specified as stream command argument") - } - prefix := store.StreamEntryScanPrefix(key) - end := store.PrefixScanEnd(prefix) - start := streamScanStartForAfter(prefix, afterID) - limit := count - unbounded := limit <= 0 - if unbounded { - limit = maxWideScanLimit - } - kvs, err := r.store.ScanAt(ctx, start, end, limit, readTS) - if err != nil { - return nil, cockerrors.WithStack(err) - } - if unbounded && len(kvs) > maxWideColumnItems { - return nil, cockerrors.Wrapf(ErrCollectionTooLarge, "stream %q exceeds %d entries", key, maxWideColumnItems) - } - entries := make([]redisStreamEntry, 0, len(kvs)) - for _, pair := range kvs { - entry, err := unmarshalStreamEntry(pair.Value) - if err != nil { - return nil, err - } - entries = append(entries, entry) - } - return entries, nil -} - -// streamScanStartForAfter returns the inclusive start key to use for an -// XREAD-style "after afterID" range scan. If afterID parses cleanly we -// start at ID+1 so the scan is exclusive of afterID. Callers must validate -// afterID before calling this function; if afterID is unparseable, the -// returned prefix is the entry-prefix start, which gives a full scan. -// -// Edge case: if afterID is (math.MaxUint64-math.MaxUint64), there is no -// successor ID inside the entry-prefix keyspace, so the correct start is -// one past the prefix (empty scan). Returning the afterID key itself -// would make the inclusive scan include it, which is the opposite of -// "strictly after." 
-func streamScanStartForAfter(prefix []byte, afterID string) []byte { - parsed, ok := tryParseRedisStreamID(afterID) - if !ok { - return prefix - } - ms, seq := parsed.ms, parsed.seq - switch { - case seq < ^uint64(0): - seq++ - case ms < ^uint64(0): - ms++ - seq = 0 - default: - // afterID is the largest representable stream ID. No entry can be - // strictly after it; return the scan-end sentinel so the scan is - // empty instead of silently inclusive. - return store.PrefixScanEnd(prefix) - } - start := make([]byte, 0, len(prefix)+store.StreamIDBytes) - start = append(start, prefix...) - start = append(start, store.EncodeStreamID(ms, seq)...) - return start -} - -// normalizeStreamAfterID normalises an XREAD afterID to the strict "ms-seq" -// form used by tryParseRedisStreamID. Redis accepts a shorthand "ms" form -// (no dash) as meaning "ms-0". Truly invalid IDs — those that are neither -// valid "ms-seq" strings nor parseable as a bare uint64 — return ("", false). -func normalizeStreamAfterID(id string) (string, bool) { - if strings.IndexByte(id, '-') >= 0 { - _, ok := tryParseRedisStreamID(id) - return id, ok - } - // Shorthand: bare millisecond component only. Redis treats "ms" as "ms-0" - // for XREAD after-IDs (entries strictly after ms-0). 
- if _, err := strconv.ParseUint(id, 10, 64); err != nil { - return "", false - } - return id + "-0", true -} - -func writeStreamEntry(conn redcon.Conn, entry redisStreamEntry) { - conn.WriteArray(redisPairWidth) - conn.WriteBulkString(entry.ID) - conn.WriteArray(len(entry.Fields)) - for _, field := range entry.Fields { - conn.WriteBulkString(field) - } -} - -func writeStreamEntries(conn redcon.Conn, entries []redisStreamEntry) { - conn.WriteArray(len(entries)) - for _, entry := range entries { - writeStreamEntry(conn, entry) - } -} - -func writeXReadResults(conn redcon.Conn, results []xreadResult) { - conn.WriteArray(len(results)) - for _, result := range results { - conn.WriteArray(redisPairWidth) - conn.WriteBulk(result.key) - writeStreamEntries(conn, result.entries) - } -} - -// isXReadIterCtxError reports whether err originates from the per-iteration -// XREAD context firing (BLOCK budget consumed mid-call). The check covers -// the bare context sentinels, cockroachdb/errors-wrapped variants, and -// gRPC's status.Error(codes.DeadlineExceeded / codes.Canceled, ...) which -// is what bubbles up through coordinator.Dispatch when the iter ctx fires -// during a Raft-mediated read. Hits on this path must be silently -// translated to "empty iteration" so the BLOCK-window null contract holds. 
-func isXReadIterCtxError(err error) bool { - if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) { - return true - } - if cockerrors.Is(err, context.DeadlineExceeded) || cockerrors.Is(err, context.Canceled) { - return true - } - switch status.Code(err) { //nolint:exhaustive // only the two ctx-related codes matter; the rest must propagate as real errors - case codes.DeadlineExceeded, codes.Canceled: - return true - default: - return false - } -} - -func (r *RedisServer) xread(conn redcon.Conn, cmd redcon.Command) { - req, err := parseXReadRequest(cmd.Args) - if err != nil { - writeRedisError(conn, err) - return - } - - blockDuration := req.block - // block=0 means infinite wait in Redis; cap at redisDispatchTimeout to prevent goroutine leak. - if blockDuration == 0 { - blockDuration = redisDispatchTimeout - } - deadline := time.Now().Add(blockDuration) - - // $ resolution uses a short fixed timeout rather than the BLOCK - // window: it's a single bounded read per key, not a wait. A tight - // BLOCK (e.g. `BLOCK 1`) used to turn any slow $-resolve into a - // protocol-level error on this path; use redisDispatchTimeout so - // the resolve either succeeds quickly or fails cleanly, leaving - // the BLOCK-window timeout semantics (null on expiry) to the - // busy-poll below. - // - // Parent on r.handlerContext() (not context.Background()) so an - // in-flight resolve aborts promptly when the server is shutting - // down — otherwise the per-resolve ScanAt could survive past - // graceful-shutdown's drain window. - resolveCtx, resolveCancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) - err = r.resolveXReadAfterIDs(resolveCtx, &req) - resolveCancel() - if err != nil { - writeRedisError(conn, err) - return - } - - r.xreadBusyPoll(conn, req, deadline) -} - -// xreadBusyPoll runs the BLOCK-window wait loop. Extracted from xread so -// the parent function stays under the cyclop budget. 
Uses an event-driven -// signal from the in-process XADD path with a fallback timer for paths -// that bypass the signal (Lua flush, follower-side FSM apply). -// -// Registration happens BEFORE the first xreadOnce so a signal that fires -// between the check and the wait cannot be lost: the buffered channel -// holds it, and the next select wakes immediately. -func (r *RedisServer) xreadBusyPoll(conn redcon.Conn, req xreadRequest, deadline time.Time) { - handlerCtx := r.handlerContext() - w, release := r.streamWaiters.Register(req.keys) - defer release() - for { - // Server-shutdown short-circuit: if the parent handlerContext - // has been cancelled, abandon the wait loop immediately rather - // than block until the BLOCK deadline. iterCtx below is rooted - // in handlerCtx, so it would cancel-on-call too — but routing - // through isXReadIterCtxError silently translates that into an - // empty iteration and the loop would otherwise wait at - // redisBlockWaitFallback cadence until the deadline. - if handlerCtx.Err() != nil { - conn.WriteNull() - return - } - // BLOCK-expired before the loop body: respect the Redis contract - // that a BLOCK timeout returns null, not an error. If we fell - // through here without remaining time (very small BLOCK, or - // $-resolution consumed the budget) creating an - // already-expired context.WithTimeout would make xreadOnce - // return DeadlineExceeded, which we'd then surface as an error. - iterTimeout := time.Until(deadline) - if iterTimeout <= 0 { - conn.WriteNull() - return - } - // Cap each iteration at redisDispatchTimeout to avoid holding - // storage resources longer than a single dispatch. - if iterTimeout > redisDispatchTimeout { - iterTimeout = redisDispatchTimeout - } - // iterCtx is rooted in handlerCtx so its underlying storage - // scans abort promptly on server shutdown rather than running - // until iterTimeout fires. 
The handlerCtx.Err() guard at the - // top of each iteration prevents the loop from spinning once - // the parent ctx is cancelled. - iterCtx, iterCancel := context.WithTimeout(handlerCtx, iterTimeout) - results, err := r.xreadOnce(iterCtx, req) - iterCancel() - // Per-iteration ctx hitting its deadline (or being cancelled by - // the upstream BLOCK timeout) is not a client-facing error — it - // just means this poll round did not see any new entries. Treat - // it as an empty iteration so the loop continues to the next - // round (or falls through to the null-on-deadline branch below). - // Without this, a tight BLOCK (e.g. BLOCK 10 against a busy / - // slow node) leaks the iteration ctx-deadline into a -ERR reply, - // which violates the Redis BLOCK-timeout contract (null on - // timeout). xreadOnce returns nil results on any error, so - // suppressing iter-ctx errors here is sound. - if err != nil && !isXReadIterCtxError(err) { - writeRedisError(conn, err) - return - } - if len(results) > 0 { - writeXReadResults(conn, results) - return - } - - if !time.Now().Before(deadline) { - conn.WriteNull() - return - } - waitForBlockedCommandUpdate(handlerCtx, w.C, deadline) - } -} - -func (r *RedisServer) xlen(conn redcon.Conn, cmd redcon.Command) { - if r.proxyToLeader(conn, cmd, cmd.Args[1]) { - return - } - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(context.Background(), cmd.Args[1], readTS, redisTypeStream) - if err != nil { - writeRedisError(conn, err) - return - } - if typ == redisTypeNone { - conn.WriteInt(0) - return - } - if typ != redisTypeStream { - conn.WriteError(wrongTypeMessage) - return - } - meta, found, err := r.loadStreamMetaAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if found { - conn.WriteInt64(meta.Length) - return - } - stream, err := r.loadStreamAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - 
conn.WriteInt64(int64(len(stream.Entries))) -} - -func parseRangeStreamCount(args [][]byte) (int, error) { - count := -1 - for i := 4; i < len(args); i += redisPairWidth { - // args[i] is safe: the for-loop guard `i < len(args)` ensures it. - // gosec G602 false-positives here under flow analysis. - if i+1 >= len(args) || !strings.EqualFold(string(args[i]), redisKeywordCount) { //nolint:gosec - return 0, errors.New("ERR syntax error") - } - nextCount, err := strconv.Atoi(string(args[i+1])) - if err != nil || nextCount < 0 { - return 0, errors.New("ERR syntax error") - } - count = nextCount - } - // Clamp client-supplied COUNT for XRANGE / XREVRANGE the same way XREAD - // clamps it (parseXReadCountArg). The negative sentinel -1 (no COUNT) - // is preserved unchanged so the unbounded path still trips - // maxWideColumnItems guard inside rangeStreamNewLayout. - if count > maxWideColumnItems { - count = maxWideColumnItems - } - return count, nil -} - -func streamEntryMatchesRange(entryID, startRaw, endRaw string, reverse bool) bool { - if reverse { - return streamWithinUpper(entryID, startRaw) && streamWithinLower(entryID, endRaw) - } - return streamWithinLower(entryID, startRaw) && streamWithinUpper(entryID, endRaw) -} - -func selectForwardStreamRangeEntries(entries []redisStreamEntry, startRaw, endRaw string, count int) []redisStreamEntry { - selected := make([]redisStreamEntry, 0, len(entries)) - for _, entry := range entries { - if !streamEntryMatchesRange(entry.ID, startRaw, endRaw, false) { - continue - } - selected = append(selected, entry) - if count >= 0 && len(selected) >= count { - break - } - } - return selected -} - -func selectReverseStreamRangeEntries(entries []redisStreamEntry, startRaw, endRaw string, count int) []redisStreamEntry { - selected := make([]redisStreamEntry, 0, len(entries)) - for i := len(entries) - 1; i >= 0; i-- { - if !streamEntryMatchesRange(entries[i].ID, startRaw, endRaw, true) { - continue - } - selected = append(selected, 
entries[i]) - if count >= 0 && len(selected) >= count { - break - } - } - return selected -} - -func selectStreamRangeEntries(entries []redisStreamEntry, startRaw, endRaw string, reverse bool, count int) []redisStreamEntry { - if reverse { - return selectReverseStreamRangeEntries(entries, startRaw, endRaw, count) - } - return selectForwardStreamRangeEntries(entries, startRaw, endRaw, count) -} - -func (r *RedisServer) rangeStream(conn redcon.Conn, cmd redcon.Command, reverse bool) { - count, err := parseRangeStreamCount(cmd.Args) - if err != nil { - writeRedisError(conn, err) - return - } - - readTS := r.readTS() - typ, err := r.keyTypeAtExpect(context.Background(), cmd.Args[1], readTS, redisTypeStream) - if err != nil { - writeRedisError(conn, err) - return - } - if typ == redisTypeNone { - conn.WriteArray(0) - return - } - if typ != redisTypeStream { - conn.WriteError(wrongTypeMessage) - return - } - - startRaw, endRaw := string(cmd.Args[2]), string(cmd.Args[3]) - - _, metaFound, err := r.loadStreamMetaAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - if metaFound { - selected, err := r.rangeStreamNewLayout(context.Background(), cmd.Args[1], readTS, startRaw, endRaw, reverse, count) - if err != nil { - writeRedisError(conn, err) - return - } - writeStreamEntries(conn, selected) - return - } - - stream, err := r.loadStreamAt(context.Background(), cmd.Args[1], readTS) - if err != nil { - writeRedisError(conn, err) - return - } - selected := selectStreamRangeEntries(stream.Entries, startRaw, endRaw, reverse, count) - writeStreamEntries(conn, selected) -} - -// rangeStreamNewLayout serves XRANGE / XREVRANGE from the entry-per-key -// layout via a bounded range scan. The (start, end) inputs are the raw -// command bounds — "-", "+", "(1000-0", or "1000-0" — and are converted to -// binary scan bounds so only the selected entries are unmarshaled. 
-func (r *RedisServer) rangeStreamNewLayout( - ctx context.Context, key []byte, readTS uint64, - startRaw, endRaw string, reverse bool, count int, -) ([]redisStreamEntry, error) { - prefix := store.StreamEntryScanPrefix(key) - scanStart, scanEnd, ok, err := streamScanBounds(prefix, startRaw, endRaw, reverse) - if err != nil { - return nil, err - } - if !ok { - return nil, nil - } - limit := count - unbounded := limit <= 0 - if unbounded { - limit = maxWideScanLimit - } - var kvs []*store.KVPair - if reverse { - kvs, err = r.store.ReverseScanAt(ctx, scanStart, scanEnd, limit, readTS) - } else { - kvs, err = r.store.ScanAt(ctx, scanStart, scanEnd, limit, readTS) - } - if err != nil { - return nil, cockerrors.WithStack(err) - } - // An XRANGE/XREVRANGE without COUNT on a pathological stream must - // not be able to pull maxWideScanLimit entries into a single reply. - // Mirror scanStreamEntriesAt's guard. - if unbounded && len(kvs) > maxWideColumnItems { - return nil, cockerrors.Wrapf(ErrCollectionTooLarge, "stream %q exceeds %d entries", key, maxWideColumnItems) - } - entries := make([]redisStreamEntry, 0, len(kvs)) - for _, pair := range kvs { - entry, err := unmarshalStreamEntry(pair.Value) - if err != nil { - return nil, err - } - entries = append(entries, entry) - } - return entries, nil -} - -// streamScanBounds maps the raw XRANGE / XREVRANGE bounds to half-open -// [start, end) scan bounds over the entry prefix. For reverse scans, -// the ReverseScanAt convention is still [start, end) with results in -// descending order starting from just-before(end). -// -// Returns ok=false when the bounds define an empty range (e.g. start > end), -// in which case the caller should emit an empty array. -func streamScanBounds(prefix []byte, startRaw, endRaw string, reverse bool) ([]byte, []byte, bool, error) { - var lowRaw, highRaw string - if reverse { - // XREVRANGE takes (high, low). 
- highRaw, lowRaw = startRaw, endRaw - } else { - lowRaw, highRaw = startRaw, endRaw - } - - start, err := streamBoundLow(prefix, lowRaw) - if err != nil { - return nil, nil, false, err - } - end, err := streamBoundHigh(prefix, highRaw) - if err != nil { - return nil, nil, false, err - } - if bytes.Compare(start, end) >= 0 { - return nil, nil, false, nil - } - return start, end, true, nil -} - -// streamBoundLow returns the inclusive lower bound of the scan in binary form. -// When the bound is "(ID" (exclusive) and ID is the largest representable -// stream ID, the scan-end sentinel is returned so streamScanBounds' -// start >= end check collapses the range to empty; otherwise the scan -// would silently include the exclusive bound entry. -func streamBoundLow(prefix []byte, raw string) ([]byte, error) { - if raw == "-" { - return prefix, nil - } - exclusive := strings.HasPrefix(raw, "(") - if exclusive { - raw = raw[1:] - } - ms, seq, ok := parseStreamBoundID(raw, false, exclusive) - if !ok { - return nil, errors.New("ERR Invalid stream ID specified as stream command argument") - } - if exclusive { - switch { - case seq < ^uint64(0): - seq++ - case ms < ^uint64(0): - ms++ - seq = 0 - default: - return store.PrefixScanEnd(prefix), nil - } - } - return appendStreamKey(prefix, ms, seq), nil -} - -// streamBoundHigh returns the exclusive upper bound of the scan in binary form. 
-func streamBoundHigh(prefix []byte, raw string) ([]byte, error) { - if raw == "+" { - return store.PrefixScanEnd(prefix), nil - } - exclusive := strings.HasPrefix(raw, "(") - if exclusive { - raw = raw[1:] - } - ms, seq, ok := parseStreamBoundID(raw, true, exclusive) - if !ok { - return nil, errors.New("ERR Invalid stream ID specified as stream command argument") - } - if !exclusive { - switch { - case seq < ^uint64(0): - seq++ - case ms < ^uint64(0): - ms++ - seq = 0 - default: - return store.PrefixScanEnd(prefix), nil - } - } - return appendStreamKey(prefix, ms, seq), nil -} - -// parseStreamBoundID accepts both the strict ms-seq form and the shorthand -// "ms" form that Redis XRANGE/XREVRANGE allow. Redis interprets a shorthand -// ID differently depending on position and exclusivity: -// -// - Lower bound inclusive ("5"): expand to 5-0; scan starts at 5-0. -// - Lower bound exclusive ("(5"): expand to 5-0; caller shifts +1 → 5-1. -// - Upper bound inclusive ("5"): expand to 5-MaxUint64; caller shifts +1 → 6-0 (exclusive upper). -// - Upper bound exclusive ("(5"): expand to 5-0; scan stops at 5-0 (excludes all ms=5 entries). -// -// The rule is: seq = MaxUint64 when upper && !exclusive (need to include the -// full ms row before the caller's inclusive→exclusive shift), seq = 0 -// otherwise. Full ms-seq IDs pass through unchanged. -func parseStreamBoundID(raw string, upper, exclusive bool) (uint64, uint64, bool) { - if strings.IndexByte(raw, '-') >= 0 { - parsed, ok := tryParseRedisStreamID(raw) - if !ok { - return 0, 0, false - } - return parsed.ms, parsed.seq, true - } - ms, err := strconv.ParseUint(raw, 10, 64) - if err != nil { - return 0, 0, false - } - // Upper inclusive bounds need seq=MaxUint64 so the caller's +1 shift - // produces (ms+1)-0, covering the entire ms row. 
All other - // combinations use seq=0: lower inclusive starts at ms-0, lower - // exclusive starts at ms-0 then the caller shifts to ms-1, and upper - // exclusive stops before ms-0 (excluding the whole ms). - if upper && !exclusive { - return ms, ^uint64(0), true - } - return ms, 0, true -} - -func appendStreamKey(prefix []byte, ms, seq uint64) []byte { - out := make([]byte, 0, len(prefix)+store.StreamIDBytes) - out = append(out, prefix...) - out = append(out, store.EncodeStreamID(ms, seq)...) - return out -} - -func streamWithinLower(entryID, raw string) bool { - if raw == "-" { - return true - } - exclusive := strings.HasPrefix(raw, "(") - if exclusive { - raw = raw[1:] - } - cmp := compareRedisStreamID(entryID, raw) - if exclusive { - return cmp > 0 - } - return cmp >= 0 -} - -func streamWithinUpper(entryID, raw string) bool { - if raw == "+" { - return true - } - exclusive := strings.HasPrefix(raw, "(") - if exclusive { - raw = raw[1:] - } - cmp := compareRedisStreamID(entryID, raw) - if exclusive { - return cmp < 0 - } - return cmp <= 0 -} diff --git a/adapter/redis_expire_cmds.go b/adapter/redis_expire_cmds.go new file mode 100644 index 00000000..6e1d034c --- /dev/null +++ b/adapter/redis_expire_cmds.go @@ -0,0 +1,346 @@ +package adapter + +import ( + "context" + "errors" + "fmt" + "math" + "strconv" + "strings" + "time" + + "github.com/bootjp/elastickv/kv" + "github.com/bootjp/elastickv/store" + cockerrors "github.com/cockroachdb/errors" + "github.com/tidwall/redcon" +) + +// SETEX key seconds value — equivalent to SET key value EX seconds +func (r *RedisServer) setex(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + seconds, err := strconv.ParseInt(string(cmd.Args[2]), 10, 64) + if err != nil || seconds <= 0 { + conn.WriteError("ERR invalid expire time in 'setex' command") + return + } + ttl := time.Now().Add(time.Duration(seconds) * time.Second) + + ctx, cancel := 
context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + if err := r.saveString(ctx, cmd.Args[1], cmd.Args[3], &ttl); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteString("OK") +} + +// GETDEL key — get the value and delete the key atomically +func (r *RedisServer) getdel(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + key := cmd.Args[1] + + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var v []byte + err := r.retryRedisWrite(ctx, func() error { + readTS := r.readTS() + typ, err := r.keyTypeAt(context.Background(), key, readTS) + if err != nil { + return err + } + if typ == redisTypeNone { + v = nil + return nil + } + if typ != redisTypeString { + return wrongTypeError() + } + raw, _, err := r.readRedisStringAt(key, readTS) + if err != nil { + // Key may have expired or been deleted between type check and read. + v = nil + return nil //nolint:nilerr // treat not-found/expired as nil value + } + elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) + if err != nil { + return err + } + if err := r.dispatchElems(ctx, true, readTS, elems); err != nil { + return err + } + v = raw + return nil + }) + if err != nil { + writeRedisError(conn, err) + return + } + if v == nil { + conn.WriteNull() + return + } + conn.WriteBulk(v) +} + +// SETNX key value — set if not exists, returns 1 on success, 0 on failure +func (r *RedisServer) setnx(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + + opts := redisSetOptions{missingCond: true} + result, err := r.executeSet(ctx, cmd.Args[1], cmd.Args[2], opts) + if err != nil { + writeRedisError(conn, err) + return + } + if result.wroteNull { + conn.WriteInt(0) + return + } + conn.WriteInt(1) +} + +func (r *RedisServer) ttl(conn 
redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + r.writeTTL(conn, cmd.Args[1], false) +} + +func (r *RedisServer) pttl(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + r.writeTTL(conn, cmd.Args[1], true) +} + +func (r *RedisServer) writeTTL(conn redcon.Conn, key []byte, milliseconds bool) { + readTS := r.readTS() + exists, err := r.logicalExistsAt(context.Background(), key, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if !exists { + conn.WriteInt64(-2) + return + } + ttl, err := r.ttlAt(context.Background(), key, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + ms := ttlMilliseconds(ttl) + if ms == -1 { + conn.WriteInt64(-1) + return + } + if !milliseconds && ms >= 0 { + ms /= 1000 + } + conn.WriteInt64(ms) +} + +func (r *RedisServer) expire(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + r.setExpire(conn, cmd, time.Second) +} + +func (r *RedisServer) pexpire(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + r.setExpire(conn, cmd, time.Millisecond) +} + +func parseExpireNXOnly(args [][]byte) (bool, error) { + nxOnly := false + for _, arg := range args { + if !strings.EqualFold(string(arg), "NX") { + return false, errors.New("ERR syntax error") + } + nxOnly = true + } + return nxOnly, nil +} + +func hasActiveTTL(ttl *time.Time, now time.Time) bool { + return ttl != nil && ttl.After(now) +} + +func parseExpireTTL(raw []byte) (int64, error) { + ttl, err := strconv.ParseInt(string(raw), 10, 64) + if err != nil { + return 0, fmt.Errorf("parse expire ttl: %w", err) + } + return ttl, nil +} + +func (r *RedisServer) prepareExpire(key []byte, nxOnly bool) (uint64, bool, error) { + readTS := r.readTS() + exists, err := r.logicalExistsAt(context.Background(), key, readTS) + if err != nil { + return 0, false, err + } + if 
!exists { + return readTS, false, nil + } + + if !nxOnly { + return readTS, true, nil + } + + currentTTL, err := r.ttlAt(context.Background(), key, readTS) + if err != nil { + return 0, false, err + } + return readTS, !hasActiveTTL(currentTTL, time.Now()), nil +} + +func (r *RedisServer) setExpire(conn redcon.Conn, cmd redcon.Command, unit time.Duration) { + ttl, err := parseExpireTTL(cmd.Args[2]) + if err != nil { + writeRedisError(conn, err) + return + } + + nxOnly, err := parseExpireNXOnly(cmd.Args[3:]) + if err != nil { + writeRedisError(conn, err) + return + } + + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + + // Pin expireAt once before the retry loop so successive attempts all write + // the same wall-clock deadline (OCC retries must not push expiry forward). + var expireAt time.Time + if ttl > 0 { + if ttl > math.MaxInt64/int64(unit) { + conn.WriteError("ERR invalid expire time in command") + return + } + expireAt = time.Now().Add(time.Duration(ttl) * unit) + } + + var result int + if err := r.retryRedisWrite(ctx, func() error { + var retErr error + result, retErr = r.doSetExpire(ctx, cmd.Args[1], ttl, expireAt, nxOnly) + return retErr + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(result) +} + +// doSetExpire is the inner body of setExpire's retryRedisWrite loop. +// All reads (existence, type, value) use the same readTS snapshot so they form +// a consistent view. The subsequent dispatchElems calls use IsTxn=true with +// StartTS=readTS, which causes coordinator.Dispatch to reject the write with +// ErrWriteConflict if any touched key was modified after readTS. retryRedisWrite +// then re-invokes doSetExpire with a fresh readTS, providing OCC safety without +// an explicit mutex. Leadership is verified by coordinator.Dispatch itself. 
+func (r *RedisServer) doSetExpire(ctx context.Context, key []byte, ttl int64, expireAt time.Time, nxOnly bool) (int, error) { + readTS, eligible, err := r.prepareExpire(key, nxOnly) + if err != nil { + return 0, err + } + if !eligible { + return 0, nil + } + if ttl <= 0 { + return r.expireDeleteKey(ctx, key, readTS) + } + typ, err := r.rawKeyTypeAt(ctx, key, readTS) + if err != nil { + return 0, err + } + if typ == redisTypeString { + // rawKeyTypeAt also reports HLL as redisTypeString; HLL payloads live + // under !redis|hll| and don't carry an inline TTL, so fall back + // to the legacy scan-index path for them. + plain, err := r.isPlainRedisString(ctx, key, readTS) + if err != nil { + return 0, err + } + if plain { + applied, err := r.dispatchStringExpire(ctx, key, readTS, expireAt) + if err != nil || !applied { + return 0, err + } + return 1, nil + } + } + elems := []*kv.Elem[kv.OP]{{Op: kv.Put, Key: redisTTLKey(key), Value: encodeRedisTTL(expireAt)}} + return 1, r.dispatchElems(ctx, true, readTS, elems) +} + +// isPlainRedisString distinguishes a plain Redis string (stored under +// !redis|str| or, for legacy data, the bare key) from a HyperLogLog +// (stored under !redis|hll|), both of which rawKeyTypeAt reports as +// redisTypeString. +func (r *RedisServer) isPlainRedisString(ctx context.Context, key []byte, readTS uint64) (bool, error) { + exists, err := r.store.ExistsAt(ctx, redisStrKey(key), readTS) + if err != nil { + return false, cockerrors.WithStack(err) + } + if exists { + return true, nil + } + // Fall back to the bare legacy layout. 
+ legacy, err := r.store.ExistsAt(ctx, key, readTS) + if err != nil { + return false, cockerrors.WithStack(err) + } + return legacy, nil +} + +func (r *RedisServer) expireDeleteKey(ctx context.Context, key []byte, readTS uint64) (int, error) { + elems, existed, err := r.deleteLogicalKeyElems(ctx, key, readTS) + if err != nil { + return 0, err + } + if err := r.dispatchElems(ctx, true, readTS, elems); err != nil { + return 0, err + } + if existed { + return 1, nil + } + return 0, nil +} + +// dispatchStringExpire performs a read-modify-write on the string anchor key: +// it reads the current value at readTS, re-encodes it with the new expiry, and +// writes both the updated value and the !redis|ttl| scan index in a single Raft +// entry (IsTxn=true, StartTS=readTS). The coordinator rejects the write with +// ErrWriteConflict if any key was modified after readTS, so stale-data safety is +// guaranteed by OCC — no explicit mutex is required. +func (r *RedisServer) dispatchStringExpire(ctx context.Context, key []byte, readTS uint64, expireAt time.Time) (bool, error) { + userValue, _, readErr := r.readRedisStringAt(key, readTS) + if readErr != nil { + if cockerrors.Is(readErr, store.ErrKeyNotFound) { + // Raced with a delete/expiry between prepareExpire and this read; + // do not resurrect the key with an empty anchor. 
+ return false, nil + } + return false, cockerrors.WithStack(readErr) + } + encoded := encodeRedisStr(userValue, &expireAt) + elems := []*kv.Elem[kv.OP]{ + {Op: kv.Put, Key: redisStrKey(key), Value: encoded}, + {Op: kv.Put, Key: redisTTLKey(key), Value: encodeRedisTTL(expireAt)}, + } + return true, r.dispatchElems(ctx, true, readTS, elems) +} diff --git a/adapter/redis_hash_cmds.go b/adapter/redis_hash_cmds.go new file mode 100644 index 00000000..1a54bee6 --- /dev/null +++ b/adapter/redis_hash_cmds.go @@ -0,0 +1,927 @@ +package adapter + +import ( + "context" + "errors" + "fmt" + "sort" + "strconv" + "time" + + "github.com/bootjp/elastickv/kv" + "github.com/bootjp/elastickv/store" + cockerrors "github.com/cockroachdb/errors" + "github.com/tidwall/redcon" +) + +func (r *RedisServer) hset(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + added, err := r.applyHashFieldPairs(cmd.Args[1], cmd.Args[2:]) + if err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(added) +} + +func (r *RedisServer) hmset(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + if _, err := r.applyHashFieldPairs(cmd.Args[1], cmd.Args[2:]); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteString("OK") +} + +// buildHashLegacyMigrationElems returns ops that atomically migrate a legacy +// !redis|hash| blob to wide-column !hs|fld| keys. Returns nil if no legacy +// blob exists. The base meta key is also written with the migrated count so +// that resolveHashMeta works correctly after migration. 
+func (r *RedisServer) buildHashLegacyMigrationElems(ctx context.Context, key []byte, readTS uint64) ([]*kv.Elem[kv.OP], error) { + raw, err := r.store.GetAt(ctx, redisHashKey(key), readTS) + if cockerrors.Is(err, store.ErrKeyNotFound) { + return nil, nil + } + if err != nil { + return nil, cockerrors.WithStack(err) + } + value, err := unmarshalHashValue(raw) + if err != nil { + return nil, err + } + elems := make([]*kv.Elem[kv.OP], 0, len(value)+setWideColOverhead) + for field, val := range value { + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.HashFieldKey(key, []byte(field)), + Value: []byte(val), + }) + } + // Delete the legacy blob. + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisHashKey(key)}) + // Write a base meta so that resolveHashMeta starts from an accurate count. + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.HashMetaKey(key), + Value: store.MarshalHashMeta(store.HashMeta{Len: int64(len(value))}), + }) + return elems, nil +} + +// buildSetLegacyMigrationElems returns ops that atomically migrate a legacy +// !redis|set| blob to wide-column !st|mem| keys. Returns nil if no legacy +// blob exists. +func (r *RedisServer) buildSetLegacyMigrationElems(ctx context.Context, key []byte, readTS uint64) ([]*kv.Elem[kv.OP], error) { + raw, err := r.store.GetAt(ctx, redisSetKey(key), readTS) + if cockerrors.Is(err, store.ErrKeyNotFound) { + return nil, nil + } + if err != nil { + return nil, cockerrors.WithStack(err) + } + value, err := unmarshalSetValue(raw) + if err != nil { + return nil, err + } + elems := make([]*kv.Elem[kv.OP], 0, len(value.Members)+setWideColOverhead) + for _, member := range value.Members { + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.SetMemberKey(key, []byte(member)), + Value: []byte{}, + }) + } + // Delete the legacy blob. 
+ elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisSetKey(key)}) + // Write a base meta so that resolveSetMeta starts from an accurate count. + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.SetMetaKey(key), + Value: store.MarshalSetMeta(store.SetMeta{Len: int64(len(value.Members))}), + }) + return elems, nil +} + +// buildZSetLegacyMigrationElems returns ops that atomically migrate a legacy +// !redis|zset| blob to wide-column !zs|mem| + !zs|scr| keys. Returns nil if no legacy +// blob exists. The base meta key is also written with the migrated count so +// that resolveZSetMeta works correctly after migration. +func (r *RedisServer) buildZSetLegacyMigrationElems(ctx context.Context, key []byte, readTS uint64) ([]*kv.Elem[kv.OP], error) { + raw, err := r.store.GetAt(ctx, redisZSetKey(key), readTS) + if cockerrors.Is(err, store.ErrKeyNotFound) { + return nil, nil + } + if err != nil { + return nil, cockerrors.WithStack(err) + } + value, err := unmarshalZSetValue(raw) + if err != nil { + return nil, err + } + // Each entry → member key + score index key; plus legacy blob deletion + base meta. + elems := make([]*kv.Elem[kv.OP], 0, len(value.Entries)*2+setWideColOverhead) //nolint:mnd // 2 ops per entry (member + score index) + for _, entry := range value.Entries { + elems = append(elems, + &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.ZSetMemberKey(key, []byte(entry.Member)), + Value: store.MarshalZSetScore(entry.Score), + }, + &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.ZSetScoreKey(key, entry.Score, []byte(entry.Member)), + Value: []byte{}, + }, + ) + } + // Delete the legacy blob. + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisZSetKey(key)}) + // Write a base meta so that resolveZSetMeta starts from an accurate count. 
+ elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.ZSetMetaKey(key), + Value: store.MarshalZSetMeta(store.ZSetMeta{Len: int64(len(value.Entries))}), + }) + return elems, nil +} + +// addLegacyHashFieldsToMap adds field names from migration Put elems (fields +// being migrated in the current transaction, not yet visible at readTS) into +// existsMap so that buildHashFieldElems does not count them as new fields. +func addLegacyHashFieldsToMap(migrationElems []*kv.Elem[kv.OP], key []byte, existsMap map[string]struct{}) { + for _, elem := range migrationElems { + if elem.Op == kv.Put { + if f := store.ExtractHashFieldName(elem.Key, key); f != nil { + existsMap[string(f)] = struct{}{} + } + } + } +} + +// buildLegacySetMemberBase extracts member names from migration Put elems +// (members being migrated in the current transaction, invisible at readTS) +// and returns them as a set. Returns nil when no migration is happening. +func buildLegacySetMemberBase(migrationElems []*kv.Elem[kv.OP], key []byte) map[string]struct{} { + var base map[string]struct{} + for _, elem := range migrationElems { + if elem.Op == kv.Put { + if m := store.ExtractSetMemberName(elem.Key, key); m != nil { + if base == nil { + base = make(map[string]struct{}) + } + base[string(m)] = struct{}{} + } + } + } + return base +} + +// buildHashFieldElems iterates over field-value pairs in args, checks each +// field against existsMap to determine if it is new, appends Put operations +// to elems, and returns the updated elems and new-field count. +// existsMap is built by scanHashFieldExistsMap before this call so that +// existence checks are a single bulk scan rather than N ExistsAt round-trips. 
+func (r *RedisServer) buildHashFieldElems(key []byte, args [][]byte, existsMap map[string]struct{}, elems []*kv.Elem[kv.OP]) ([]*kv.Elem[kv.OP], int) { + newFields := 0 + for i := 0; i < len(args); i += redisPairWidth { + field := args[i] + value := args[i+1] + fieldStr := string(field) + fieldKey := store.HashFieldKey(key, field) + if _, exists := existsMap[fieldStr]; !exists { + newFields++ + // Mark as seen so duplicate field names in one HSET call are not + // counted as additional new fields (Redis deduplication semantics). + existsMap[fieldStr] = struct{}{} + } + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: fieldKey, Value: value}) + } + return elems, newFields +} + +func (r *RedisServer) applyHashFieldPairs(key []byte, args [][]byte) (int, error) { + if len(args) == 0 || len(args)%redisPairWidth != 0 { + return 0, errors.New("ERR wrong number of arguments for hash command") + } + + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var added int + err := r.retryRedisWrite(ctx, func() error { + readTS := r.readTS() + if err := r.requireKeyTypeOrEmpty(ctx, key, readTS, redisTypeHash); err != nil { + return err + } + + commitTS, err := r.coordinator.Clock().NextFenced() + if err != nil { + return cockerrors.Wrap(err, "applyHashFieldPairs: allocate commitTS") + } + + // Atomically migrate any legacy blob on first wide-column write. + // Fetch migration elems before allocating the main elems slice so that + // the initial capacity accounts for both migration and field Put ops, + // avoiding a reallocation when a legacy blob is present. + migrationElems, err := r.buildHashLegacyMigrationElems(ctx, key, readTS) + if err != nil { + return err + } + elems := make([]*kv.Elem[kv.OP], 0, len(migrationElems)+len(args)/redisPairWidth+setWideColOverhead) + elems = append(elems, migrationElems...) 
+ + // Bulk-scan existing fields once so buildHashFieldElems can check + // existence via a map lookup instead of per-field ExistsAt. + existsMap, err := r.scanHashFieldExistsMap(ctx, key, readTS) + if err != nil { + return err + } + // Fields from the legacy blob are being migrated in this same transaction, + // so they are not yet visible at readTS. Add them to existsMap so that + // buildHashFieldElems does not count already-existing fields as new. + addLegacyHashFieldsToMap(migrationElems, key, existsMap) + + var newFields int + elems, newFields = r.buildHashFieldElems(key, args, existsMap, elems) + added = newFields + + // Emit a single delta key for all newly-added fields. + if newFields != 0 { + deltaVal := store.MarshalHashMetaDelta(store.HashMetaDelta{LenDelta: int64(newFields)}) + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.HashMetaDeltaKey(key, commitTS, 0), + Value: deltaVal, + }) + } + + if len(elems) == 0 { + return nil + } + + _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: normalizeStartTS(readTS), + CommitTS: commitTS, + Elems: elems, + }) + return cockerrors.WithStack(dispatchErr) + }) + return added, err +} + +func (r *RedisServer) hget(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + key := cmd.Args[1] + field := cmd.Args[2] + readTS := r.readTS() + ctx := context.Background() + + // Fast path: look the wide-column field up directly. Live + // wide-column hashes resolve here in 1 seek + TTL probe versus + // the ~17 seeks rawKeyTypeAt issues through keyTypeAt. Legacy- + // blob hashes miss the wide-column key and fall through. 
+ raw, hit, alive, err := r.hashFieldFastLookup(ctx, key, field, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if hit { + if !alive { + conn.WriteNull() + return + } + // WriteBulk sends the payload directly from the []byte backing + // store; WriteBulkString(string(raw)) would force a []byte → + // string copy on every fast-path hit. + conn.WriteBulk(raw) + return + } + r.hgetSlow(conn, ctx, key, field, readTS) +} + +// hashFieldFastLookup probes the wide-column field entry directly and +// reports whether it is present and TTL-alive. Returns hit=false when +// the wide-column key is absent, or when the narrow string-encoding +// guard in hasHigherPriorityStringEncoding fires, so the caller +// falls through to hgetSlow. +// +// Priority-alignment scope: this fast path does NOT fully mirror +// rawKeyTypeAt / keyTypeAt's priority checks. The guard only probes +// redisStrKey (the common SET-over-previous-hash corruption case); +// rarer dual-encoding corruption involving HLL, legacy bare keys, or +// list meta / delta entries is NOT caught here and will surface the +// wide-column hash answer instead of the WRONGTYPE / nil response +// keyTypeAt would produce. In normal operation at most one encoding +// exists per user key, so the guard is a guaranteed miss and the +// priority-alignment gap is invisible; pre-existing writers already +// clean up the old encoding before switching types. A full check +// would cost ~3-5 extra seeks per fast-path hit, which would negate +// most of the gain over the ~17-seek keyTypeAt slow path. +func (r *RedisServer) hashFieldFastLookup(ctx context.Context, key, field []byte, readTS uint64) (raw []byte, hit, alive bool, err error) { + // Probe the wide-column field FIRST so the priority guard only + // runs on a hit. 
Placing the guard before the probe made every + // miss (nonexistent key, legacy-blob hash, or wrong-type) pay an + // unnecessary ExistsAt on redisStrKey -- pure overhead for the + // common negative-lookup case and for any workload that still + // carries legacy-blob encodings. See the PR #565 independent + // review for the Medium-severity regression this addresses. + raw, err = r.store.GetAt(ctx, store.HashFieldKey(key, field), readTS) + if err != nil { + if cockerrors.Is(err, store.ErrKeyNotFound) { + return nil, false, false, nil + } + return nil, false, false, cockerrors.WithStack(err) + } + // Only pay the guard seek when we actually have a hit to defer. + if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { + return nil, false, false, hErr + } else if higher { + return nil, false, false, nil + } + expired, expErr := r.hasExpired(ctx, key, readTS, true) + if expErr != nil { + return nil, false, false, cockerrors.WithStack(expErr) + } + return raw, true, !expired, nil +} + +// hasHigherPriorityStringEncoding returns true iff the new-format +// string encoding (redisStrKey) exists for key. This is NARROWER +// than rawKeyTypeAt's full string-wins tiebreaker, which also covers +// HyperLogLog (redisHLLKey) and the legacy bare key: those rarer +// dual-encoding corruption cases still reach the wide-column fast +// path and may return the collection-specific answer instead of +// WRONGTYPE / nil. +// +// The narrow scope is deliberate -- expanding the guard to every +// string-priority candidate (3 ExistsAt calls + the list-meta probe) +// would cost ~4-5 extra seeks per fast-path hit, regressing the +// negative case further than the ordering tweak in +// hashFieldFastLookup / setMemberFastExists / hashFieldFastExists +// already saved. Callers that require complete priority alignment +// must take the keyTypeAt slow path explicitly. 
+func (r *RedisServer) hasHigherPriorityStringEncoding(ctx context.Context, key []byte, readTS uint64) (bool, error) { + exists, err := r.store.ExistsAt(ctx, redisStrKey(key), readTS) + if err != nil { + return false, cockerrors.WithStack(err) + } + return exists, nil +} + +// hgetSlow falls back to the type-probing path when hashFieldFastLookup +// misses. Handles legacy-blob hashes and nil / WRONGTYPE disambiguation. +func (r *RedisServer) hgetSlow(conn redcon.Conn, ctx context.Context, key, field []byte, readTS uint64) { + typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeHash) + if err != nil { + writeRedisError(conn, err) + return + } + if typ == redisTypeNone { + conn.WriteNull() + return + } + if typ != redisTypeHash { + conn.WriteError(wrongTypeMessage) + return + } + value, err := r.loadHashAt(ctx, key, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + fieldValue, ok := value[string(field)] + if !ok { + conn.WriteNull() + return + } + conn.WriteBulkString(fieldValue) +} + +func (r *RedisServer) hmget(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(context.Background(), cmd.Args[1], readTS, redisTypeHash) + if err != nil { + writeRedisError(conn, err) + return + } + fields := cmd.Args[redisPairWidth:] + if typ == redisTypeNone { + conn.WriteArray(len(fields)) + for range cmd.Args[2:] { + conn.WriteNull() + } + return + } + if typ != redisTypeHash { + conn.WriteError(wrongTypeMessage) + return + } + + value, err := r.loadHashAt(context.Background(), cmd.Args[1], readTS) + if err != nil { + writeRedisError(conn, err) + return + } + conn.WriteArray(len(fields)) + for _, field := range fields { + fieldValue, ok := value[string(field)] + if !ok { + conn.WriteNull() + continue + } + conn.WriteBulkString(fieldValue) + } +} + +func (r *RedisServer) hdel(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, 
cmd.Args[1]) { + return + } + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var removed int + if err := r.retryRedisWrite(ctx, func() error { + var err error + removed, err = r.hdelTxn(ctx, cmd.Args[1], cmd.Args[2:]) + return err + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(removed) +} + +// hdelWideColumn deletes the given fields from the wide-column hash and emits a negative delta. +func (r *RedisServer) hdelWideColumn(ctx context.Context, key []byte, fields [][]byte, readTS uint64) (int, error) { + delElems, removed, err := r.resolveHashFieldDelElems(ctx, key, fields, readTS) + if err != nil { + return 0, err + } + if removed == 0 { + return 0, nil + } + commitTS, err := r.coordinator.Clock().NextFenced() + if err != nil { + return 0, cockerrors.Wrap(err, "hdelWideColumn: allocate commitTS") + } + elems := delElems + deltaVal := store.MarshalHashMetaDelta(store.HashMetaDelta{LenDelta: int64(-removed)}) + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.HashMetaDeltaKey(key, commitTS, 0), + Value: deltaVal, + }) + _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: normalizeStartTS(readTS), + CommitTS: commitTS, + Elems: elems, + }) + return removed, cockerrors.WithStack(dispatchErr) +} + +// resolveHashFieldDelElems checks which fields exist using either a bulk scan +// (for large batches) or individual ExistsAt calls (for small batches), then +// returns Del elems for every field that exists and the count of deletions. 
+func (r *RedisServer) resolveHashFieldDelElems(ctx context.Context, key []byte, fields [][]byte, readTS uint64) ([]*kv.Elem[kv.OP], int, error) { + var existsMap map[string]struct{} + if len(fields) >= wideColumnBulkScanThreshold { + var err error + existsMap, err = r.scanHashFieldExistsMap(ctx, key, readTS) + if err != nil { + return nil, 0, err + } + } + elems := make([]*kv.Elem[kv.OP], 0, len(fields)+1) + removed := 0 + for _, field := range fields { + fieldKey := store.HashFieldKey(key, field) + var exists bool + if existsMap != nil { + _, exists = existsMap[string(field)] + } else { + var err error + exists, err = r.store.ExistsAt(ctx, fieldKey, readTS) + if err != nil { + return nil, 0, cockerrors.WithStack(err) + } + } + if exists { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: fieldKey}) + removed++ + } + } + return elems, removed, nil +} + +func (r *RedisServer) hdelTxn(ctx context.Context, key []byte, fields [][]byte) (int, error) { + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeHash) + if err != nil { + return 0, err + } + if typ == redisTypeNone { + return 0, nil + } + if typ != redisTypeHash { + return 0, wrongTypeError() + } + + // Wide-column path: check if any !hs|fld| keys exist for this key. + hashFieldPrefix := store.HashFieldScanPrefix(key) + hashFieldEnd := store.PrefixScanEnd(hashFieldPrefix) + wideKVs, err := r.store.ScanAt(context.Background(), hashFieldPrefix, hashFieldEnd, 1, readTS) + if err != nil { + return 0, cockerrors.WithStack(err) + } + if len(wideKVs) > 0 { + return r.hdelWideColumn(ctx, key, fields, readTS) + } + + // Legacy blob path. 
+ value, err := r.loadHashAt(context.Background(), key, readTS) + if err != nil { + return 0, err + } + removed := removeHashFields(value, fields) + if removed == 0 { + return 0, nil + } + return removed, r.persistHashTxn(ctx, key, readTS, value) +} + +func removeHashFields(value redisHashValue, fields [][]byte) int { + removed := 0 + for _, field := range fields { + if _, ok := value[string(field)]; ok { + delete(value, string(field)) + removed++ + } + } + return removed +} + +func (r *RedisServer) persistHashTxn(ctx context.Context, key []byte, readTS uint64, value redisHashValue) error { + if len(value) == 0 { + elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) + if err != nil { + return err + } + return r.dispatchElems(ctx, true, readTS, elems) + } + // Wide-column rewrite: write per-field keys and a new base meta. + // deleteLogicalKeyElems (called by the caller when needed) clears old keys. + elems := make([]*kv.Elem[kv.OP], 0, len(value)+1) + for field, val := range value { + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.HashFieldKey(key, []byte(field)), + Value: []byte(val), + }) + } + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.HashMetaKey(key), + Value: store.MarshalHashMeta(store.HashMeta{Len: int64(len(value))}), + }) + // Also remove the legacy blob if it was present. + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisHashKey(key)}) + return r.dispatchElems(ctx, true, readTS, elems) +} + +func (r *RedisServer) hexists(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + key := cmd.Args[1] + field := cmd.Args[2] + readTS := r.readTS() + ctx := context.Background() + + // Fast path: direct wide-column field existence check. ExistsAt + // is cheaper than GetAt since we don't need the value payload. 
+ hit, alive, err := r.hashFieldFastExists(ctx, key, field, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if hit { + if alive { + conn.WriteInt(1) + } else { + conn.WriteInt(0) + } + return + } + r.hexistsSlow(conn, ctx, key, field, readTS) +} + +func (r *RedisServer) hashFieldFastExists(ctx context.Context, key, field []byte, readTS uint64) (hit, alive bool, err error) { + // Probe FIRST; guard only on hit. See hashFieldFastLookup for the + // regression rationale. + exists, err := r.store.ExistsAt(ctx, store.HashFieldKey(key, field), readTS) + if err != nil { + return false, false, cockerrors.WithStack(err) + } + if !exists { + return false, false, nil + } + if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { + return false, false, hErr + } else if higher { + return false, false, nil + } + expired, expErr := r.hasExpired(ctx, key, readTS, true) + if expErr != nil { + return false, false, cockerrors.WithStack(expErr) + } + return true, !expired, nil +} + +func (r *RedisServer) hexistsSlow(conn redcon.Conn, ctx context.Context, key, field []byte, readTS uint64) { + typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeHash) + if err != nil { + writeRedisError(conn, err) + return + } + if typ == redisTypeNone { + conn.WriteInt(0) + return + } + if typ != redisTypeHash { + conn.WriteError(wrongTypeMessage) + return + } + value, err := r.loadHashAt(ctx, key, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if _, ok := value[string(field)]; ok { + conn.WriteInt(1) + return + } + conn.WriteInt(0) +} + +func (r *RedisServer) hlen(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(context.Background(), cmd.Args[1], readTS, redisTypeHash) + if err != nil { + writeRedisError(conn, err) + return + } + if typ == redisTypeNone { + conn.WriteInt(0) + return + } + if typ != redisTypeHash { + 
conn.WriteError(wrongTypeMessage) + return + } + + // Wide-column path: use delta-aggregated metadata for O(1) count. + count, exists, err := r.resolveHashMeta(context.Background(), cmd.Args[1], readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if exists { + conn.WriteInt64(count) + return + } + // Legacy blob fallback: load all fields and count. + value, err := r.loadHashAt(context.Background(), cmd.Args[1], readTS) + if err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(len(value)) +} + +func (r *RedisServer) hincrby(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + increment, err := strconv.ParseInt(string(cmd.Args[3]), 10, 64) + if err != nil { + writeRedisError(conn, err) + return + } + + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var current int64 + if err := r.retryRedisWrite(ctx, func() error { + var txnErr error + current, txnErr = r.hincrbyTxn(ctx, cmd.Args[1], cmd.Args[2], increment) + return txnErr + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt64(current) +} + +// readHashFieldInt reads the current integer value of a hash field from wide-column or legacy storage. +// Returns (current, isNewField, legacyHashValue, error). legacyHashValue is non-nil only when +// the value came from a legacy JSON blob that needs to be migrated on the next write. 
+func (r *RedisServer) readHashFieldInt(ctx context.Context, key, field []byte, readTS uint64) (int64, bool, redisHashValue, error) { + fieldKey := store.HashFieldKey(key, field) + raw, readErr := r.store.GetAt(ctx, fieldKey, readTS) + if readErr != nil && !cockerrors.Is(readErr, store.ErrKeyNotFound) { + return 0, true, nil, cockerrors.WithStack(readErr) + } + if readErr == nil { + current, parseErr := strconv.ParseInt(string(raw), 10, 64) + if parseErr != nil { + return 0, false, nil, errors.New("ERR hash value is not an integer") + } + return current, false, nil, nil + } + // Not in wide-column – check legacy blob. + legacyValue, legacyErr := r.loadHashAt(ctx, key, readTS) + if legacyErr != nil { + return 0, true, nil, legacyErr + } + if rawLegacy, ok := legacyValue[string(field)]; ok { + current, parseErr := strconv.ParseInt(rawLegacy, 10, 64) + if parseErr != nil { + return 0, false, nil, errors.New("ERR hash value is not an integer") + } + return current, false, legacyValue, nil + } + return 0, true, legacyValue, nil +} + +// hincrbyWithMigration handles the HINCRBY case where a legacy JSON blob must be migrated +// atomically with the increment operation. +func (r *RedisServer) hincrbyWithMigration(ctx context.Context, key, fieldKey []byte, readTS, commitTS uint64, current int64, isNewField bool, increment int64) (int64, error) { + migrationElems, migErr := r.buildHashLegacyMigrationElems(ctx, key, readTS) + if migErr != nil { + return 0, migErr + } + current += increment + newVal := strconv.FormatInt(current, 10) + elems := make([]*kv.Elem[kv.OP], 0, len(migrationElems)+setWideColOverhead) + elems = append(elems, migrationElems...) 
+ elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: fieldKey, Value: []byte(newVal)}) + if isNewField { + deltaVal := store.MarshalHashMetaDelta(store.HashMetaDelta{LenDelta: 1}) + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.HashMetaDeltaKey(key, commitTS, 0), + Value: deltaVal, + }) + } + _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: normalizeStartTS(readTS), + CommitTS: commitTS, + Elems: elems, + }) + return current, cockerrors.WithStack(dispatchErr) +} + +func (r *RedisServer) hincrbyTxn(ctx context.Context, key, field []byte, increment int64) (int64, error) { + readTS := r.readTS() + if err := r.requireKeyTypeOrEmpty(ctx, key, readTS, redisTypeHash); err != nil { + return 0, err + } + + commitTS, err := r.coordinator.Clock().NextFenced() + if err != nil { + return 0, cockerrors.Wrap(err, "hincrbyTxn: allocate commitTS") + } + fieldKey := store.HashFieldKey(key, field) + + current, isNewField, legacyValue, err := r.readHashFieldInt(ctx, key, field, readTS) + if err != nil { + return 0, err + } + + // If a legacy blob exists, migrate it atomically with the increment. 
+ if len(legacyValue) > 0 { + return r.hincrbyWithMigration(ctx, key, fieldKey, readTS, commitTS, current, isNewField, increment) + } + + current += increment + newVal := strconv.FormatInt(current, 10) + elems := make([]*kv.Elem[kv.OP], 0, setWideColOverhead) + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: fieldKey, Value: []byte(newVal)}) + if isNewField { + deltaVal := store.MarshalHashMetaDelta(store.HashMetaDelta{LenDelta: 1}) + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.HashMetaDeltaKey(key, commitTS, 0), + Value: deltaVal, + }) + } + _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: normalizeStartTS(readTS), + CommitTS: commitTS, + Elems: elems, + }) + return current, cockerrors.WithStack(dispatchErr) +} + +func (r *RedisServer) incr(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var current int64 + if err := r.retryRedisWrite(ctx, func() error { + readTS := r.readTS() + typ, err := r.keyTypeAt(context.Background(), cmd.Args[1], readTS) + if err != nil { + return err + } + if typ != redisTypeNone && typ != redisTypeString { + return wrongTypeError() + } + + current = 0 + var existingTTL *time.Time + if typ == redisTypeString { + raw, ttl, err := r.readRedisStringAt(cmd.Args[1], readTS) + if err != nil { + return err + } + existingTTL = ttl + current, err = strconv.ParseInt(string(raw), 10, 64) + if err != nil { + return fmt.Errorf("ERR value is not an integer or out of range") + } + } + current++ + + // INCR preserves any existing TTL (Redis semantics). 
+ encoded := encodeRedisStr([]byte(strconv.FormatInt(current, 10)), existingTTL) + elems := []*kv.Elem[kv.OP]{ + {Op: kv.Put, Key: redisStrKey(cmd.Args[1]), Value: encoded}, + } + if existingTTL != nil { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisTTLKey(cmd.Args[1]), Value: encodeRedisTTL(*existingTTL)}) + } else { + // Defensively clear any stale/legacy scan index entry so the sweeper + // cannot later expire a now-persistent key. + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisTTLKey(cmd.Args[1])}) + } + return r.dispatchElems(ctx, true, readTS, elems) + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt64(current) +} + +func (r *RedisServer) hgetall(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(context.Background(), cmd.Args[1], readTS, redisTypeHash) + if err != nil { + writeRedisError(conn, err) + return + } + if typ == redisTypeNone { + conn.WriteArray(0) + return + } + if typ != redisTypeHash { + conn.WriteError(wrongTypeMessage) + return + } + + value, err := r.loadHashAt(context.Background(), cmd.Args[1], readTS) + if err != nil { + writeRedisError(conn, err) + return + } + fields := make([]string, 0, len(value)) + for field := range value { + fields = append(fields, field) + } + sort.Strings(fields) + conn.WriteArray(len(fields) * redisPairWidth) + for _, field := range fields { + conn.WriteBulkString(field) + conn.WriteBulkString(value[field]) + } +} diff --git a/adapter/redis_keys.go b/adapter/redis_keys.go new file mode 100644 index 00000000..5c3ef0a5 --- /dev/null +++ b/adapter/redis_keys.go @@ -0,0 +1,320 @@ +package adapter + +import ( + "bytes" + "context" + "maps" + "math" + + "github.com/bootjp/elastickv/store" + "github.com/cockroachdb/errors" + "github.com/tidwall/redcon" +) + +func (r *RedisServer) keys(conn redcon.Conn, cmd redcon.Command) { + pattern := cmd.Args[1] + + if 
r.coordinator.IsLeader() { + // Per-call ctx with redisDispatchTimeout instead of the + // long-lived handlerContext: a stalled VerifyLeader on KEYS + // must not pin the command handler indefinitely. The same + // bound the rest of the dispatch path (sadd, set, …) uses; + // see Codex P1 review on PR #749. + ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + if err := r.coordinator.VerifyLeader(ctx); err != nil { + writeRedisError(conn, err) + return + } + keys, err := r.visibleKeys(pattern) + if err != nil { + writeRedisError(conn, err) + return + } + conn.WriteArray(len(keys)) + for _, k := range keys { + conn.WriteBulk(k) + } + return + } + + keys, err := r.proxyKeys(pattern) + if err != nil { + writeRedisError(conn, err) + return + } + + conn.WriteArray(len(keys)) + for _, k := range keys { + conn.WriteBulkString(k) + } +} + +func (r *RedisServer) localKeys(pattern []byte) ([][]byte, error) { + if !bytes.Contains(pattern, []byte("*")) { + return r.localKeysExact(pattern) + } + return r.localKeysPattern(pattern) +} + +func (r *RedisServer) localKeysExact(pattern []byte) ([][]byte, error) { + typ, err := r.keyTypeAt(context.Background(), pattern, r.readTS()) + if err != nil { + return nil, err + } + if typ != redisTypeNone { + return [][]byte{bytes.Clone(pattern)}, nil + } + return [][]byte{}, nil +} + +// mergeInternalNamespaces scans all internal key namespaces (list, hash, set, +// zset, and other internal prefixes) for keys that match pattern and merges +// them into the caller's keyset via mergeScannedKeys. Called only when the +// pattern is bounded (start != nil) because unbounded scans already cover the +// full keyspace. 
+func (r *RedisServer) mergeInternalNamespaces(start []byte, pattern []byte, mergeScannedKeys func([]byte, []byte) error) error { + metaStart, metaEnd := listPatternScanBounds(store.ListMetaPrefix, pattern) + if err := mergeScannedKeys(metaStart, metaEnd); err != nil { + return err + } + itemStart, itemEnd := listPatternScanBounds(store.ListItemPrefix, pattern) + if err := mergeScannedKeys(itemStart, itemEnd); err != nil { + return err + } + for _, prefix := range redisInternalPrefixes { + // !stream|meta| keys are length-prefixed (see store.StreamMetaKey): + // a pattern-bound scan over the raw prefix would mask out every + // migrated stream because the user-key bytes do not start at + // prefix[len(prefix):]. Delegate to the wide-column scan below, + // which uses streamMetaScanStart(start) to place the user-key + // lower bound past the length field. + if prefix == store.StreamMetaPrefix { + continue + } + internalStart, internalEnd := listPatternScanBounds(prefix, pattern) + if err := mergeScannedKeys(internalStart, internalEnd); err != nil { + return err + } + } + // Wide-column hash/set/zset keys embed the user-key as + // <4-byte-len>, so the binary length + // prefix makes straightforward bounds-based scanning non-trivial. + // Use the user-key prefix as the lower bound and scan to the end of each + // namespace; collectUserKeys filters false positives by pattern. 
+ hashFieldStart := store.HashFieldScanPrefix(start) + hashFieldEnd := prefixScanEnd([]byte(store.HashFieldPrefix)) + if err := mergeScannedKeys(hashFieldStart, hashFieldEnd); err != nil { + return err + } + setMemberStart := store.SetMemberScanPrefix(start) + setMemberEnd := prefixScanEnd([]byte(store.SetMemberPrefix)) + if err := mergeScannedKeys(setMemberStart, setMemberEnd); err != nil { + return err + } + zsetMemberStart := store.ZSetMemberScanPrefix(start) + zsetMemberEnd := prefixScanEnd([]byte(store.ZSetMemberPrefix)) + if err := mergeScannedKeys(zsetMemberStart, zsetMemberEnd); err != nil { + return err + } + // Post-migration streams live under !stream|meta|. + // The meta record is enough to expose the logical key via KEYS; + // entry rows are filtered out by redisVisibleUserKey / collectUserKeys + // so the result stays one-line-per-stream regardless of entry count. + streamMetaStart := streamMetaScanStart(start) + streamMetaEnd := prefixScanEnd([]byte(store.StreamMetaPrefix)) + return mergeScannedKeys(streamMetaStart, streamMetaEnd) +} + +// streamMetaScanStart returns the lower bound for scanning stream meta +// keys that begin with the given user-key prefix. The store helper +// already returns StreamMetaPrefix + len(userKey) + userKey, so callers +// only need to supply the bounded pattern prefix. 
+func streamMetaScanStart(userPrefix []byte) []byte { + if len(userPrefix) == 0 { + return []byte(store.StreamMetaPrefix) + } + return store.StreamMetaKey(userPrefix) +} + +func (r *RedisServer) localKeysPattern(pattern []byte) ([][]byte, error) { + start, end := patternScanBounds(pattern) + keyset := map[string][]byte{} + readTS := r.readTS() + + mergeScannedKeys := func(scanStart, scanEnd []byte) error { + keys, err := r.store.ScanAt(context.Background(), scanStart, scanEnd, math.MaxInt, readTS) + if err != nil { + return errors.WithStack(err) + } + maps.Copy(keyset, r.collectUserKeys(keys, pattern)) + return nil + } + + if err := mergeScannedKeys(start, end); err != nil { + return nil, err + } + + // When the pattern is bounded (start != nil), user-key scans do not + // naturally include internal data namespaces, so scan those separately + // and map them back to logical user keys. For unbounded patterns + // (e.g. "*"), the full-keyspace scan already covers everything. + if start != nil { + if err := r.mergeInternalNamespaces(start, pattern, mergeScannedKeys); err != nil { + return nil, err + } + } + + out := make([][]byte, 0, len(keyset)) + for _, v := range keyset { + out = append(out, v) + } + return out, nil +} + +func patternScanBounds(pattern []byte) ([]byte, []byte) { + if bytes.Equal(pattern, []byte("*")) { + return nil, nil + } + + i := bytes.IndexByte(pattern, '*') + if i <= 0 { + return nil, nil + } + + start := bytes.Clone(pattern[:i]) + return start, prefixScanEnd(start) +} + +func listPatternScanBounds(prefix string, pattern []byte) ([]byte, []byte) { + userStart, userEnd := patternScanBounds(pattern) + prefixBytes := []byte(prefix) + + if userStart == nil && userEnd == nil { + return prefixBytes, prefixScanEnd(prefixBytes) + } + + start := append(bytes.Clone(prefixBytes), userStart...) + if userEnd == nil { + return start, prefixScanEnd(prefixBytes) + } + end := append(bytes.Clone(prefixBytes), userEnd...) 
+ return start, end +} + +func matchesAsteriskPattern(pattern, key []byte) bool { + parts := bytes.Split(pattern, []byte("*")) + if len(parts) == 1 { + return bytes.Equal(pattern, key) + } + + pos := 0 + if len(parts[0]) > 0 { + if !bytes.HasPrefix(key, parts[0]) { + return false + } + pos = len(parts[0]) + } + + for i := 1; i < len(parts)-1; i++ { + part := parts[i] + if len(part) == 0 { + continue + } + idx := bytes.Index(key[pos:], part) + if idx < 0 { + return false + } + pos += idx + len(part) + } + + last := parts[len(parts)-1] + if len(last) > 0 && !bytes.HasSuffix(key, last) { + return false + } + + return true +} + +func (r *RedisServer) collectUserKeys(kvs []*store.KVPair, pattern []byte) map[string][]byte { + keyset := map[string][]byte{} + for _, kvPair := range kvs { + userKey := redisVisibleUserKey(kvPair.Key) + if userKey == nil || !matchesAsteriskPattern(pattern, userKey) { + continue + } + keyset[string(userKey)] = userKey + } + return keyset +} + +// zsetWideColumnVisibleUserKey handles the ZSet-specific part of wide-column key mapping. +// Returns (nil, true) for internal-only keys and (userKey, true) for visible keys. +func zsetWideColumnVisibleUserKey(key []byte) (userKey []byte, isWide bool) { + if store.IsZSetMetaDeltaKey(key) || store.IsZSetMetaKey(key) { + return nil, true + } + if store.IsZSetMemberKey(key) { + return store.ExtractZSetUserKeyFromMember(key), true + } + if store.IsZSetScoreKey(key) { + return store.ExtractZSetUserKeyFromScore(key), true + } + return nil, false +} + +// wideColumnVisibleUserKey maps a wide-column internal key to its visible user +// key, or returns (nil, true) for internal-only keys (meta/delta), and +// (nil, false) if the key is not a wide-column key at all. +func wideColumnVisibleUserKey(key []byte) (userKey []byte, isWide bool) { + // Check delta prefixes before meta prefixes (delta starts with meta prefix). 
+ if store.IsHashMetaDeltaKey(key) || store.IsHashMetaKey(key) { + return nil, true + } + if store.IsHashFieldKey(key) { + return store.ExtractHashUserKeyFromField(key), true + } + if store.IsSetMetaDeltaKey(key) || store.IsSetMetaKey(key) { + return nil, true + } + if store.IsSetMemberKey(key) { + return store.ExtractSetUserKeyFromMember(key), true + } + if userKey, ok := streamWideColumnVisibleUserKey(key); ok { + return userKey, true + } + return zsetWideColumnVisibleUserKey(key) +} + +// streamWideColumnVisibleUserKey maps a wide-column stream key to its +// visible user key. Meta keys expose the stream exactly once; entry keys +// are internal-only so KEYS / SCAN don't leak one result per entry. +func streamWideColumnVisibleUserKey(key []byte) ([]byte, bool) { + if store.IsStreamMetaKey(key) { + return store.ExtractStreamUserKeyFromMeta(key), true + } + if store.IsStreamEntryKey(key) { + return nil, true + } + return nil, false +} + +func redisVisibleUserKey(key []byte) []byte { + if bytes.HasPrefix(key, redisTxnKeyPrefix) || isRedisTTLKey(key) { + return nil + } + // List item keys are visible; meta, delta, and claim keys are internal-only. 
+ if store.IsListItemKey(key) { + return store.ExtractListUserKey(key) + } + if store.IsListMetaKey(key) || store.IsListMetaDeltaKey(key) || store.IsListClaimKey(key) { + return nil + } + if userKey, isWide := wideColumnVisibleUserKey(key); isWide { + return userKey + } + if userKey := extractRedisInternalUserKey(key); userKey != nil { + return userKey + } + return key +} diff --git a/adapter/redis_lists.go b/adapter/redis_lists.go new file mode 100644 index 00000000..5071641f --- /dev/null +++ b/adapter/redis_lists.go @@ -0,0 +1,836 @@ +package adapter + +import ( + "bytes" + "context" + "math" + + "github.com/bootjp/elastickv/kv" + "github.com/bootjp/elastickv/store" + "github.com/cockroachdb/errors" + "github.com/tidwall/redcon" +) + +func listMetaKey(userKey []byte) []byte { + return store.ListMetaKey(userKey) +} + +func listItemKey(userKey []byte, seq int64) []byte { + return store.ListItemKey(userKey, seq) +} + +func clampRange(start, end, length int) (int, int) { + if start < 0 { + start = length + start + } + if end < 0 { + end = length + end + } + if start < 0 { + start = 0 + } + if end >= length { + end = length - 1 + } + if end < start { + return 0, -1 + } + return start, end +} + +func (r *RedisServer) loadListMetaAt(ctx context.Context, key []byte, readTS uint64) (store.ListMeta, bool, error) { + val, err := r.store.GetAt(ctx, store.ListMetaKey(key), readTS) + if err != nil { + if errors.Is(err, store.ErrKeyNotFound) { + return store.ListMeta{}, false, nil + } + return store.ListMeta{}, false, errors.WithStack(err) + } + meta, err := store.UnmarshalListMeta(val) + if err != nil { + return store.ListMeta{}, false, errors.WithStack(err) + } + return meta, true, nil +} + +func (r *RedisServer) isListKeyAt(ctx context.Context, key []byte, readTS uint64) (bool, error) { + _, exists, err := r.loadListMetaAt(ctx, key, readTS) + return exists, err +} + +// buildRPushOps creates operations to append values to the tail of a list using +// the Delta pattern. 
Instead of writing to the base metadata key (causing OCC +// conflicts), it emits a single ListMetaDelta key with LenDelta = len(values). +// commitTS must be pre-allocated via dispatchElemsWithCommitTS; seqInTxn +// disambiguates multiple push operations in the same transaction. +func (r *RedisServer) buildRPushOps(meta store.ListMeta, key []byte, values [][]byte, commitTS uint64, seqInTxn uint32) ([]*kv.Elem[kv.OP], store.ListMeta, error) { + if len(values) == 0 { + return nil, meta, nil + } + + elems := make([]*kv.Elem[kv.OP], 0, len(values)+1) + seq := meta.Head + meta.Len + for _, v := range values { + vCopy := bytes.Clone(v) + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: listItemKey(key, seq), Value: vCopy}) + seq++ + } + + // Emit a Delta key instead of writing the base meta key. + delta := store.MarshalListMetaDelta(store.ListMetaDelta{HeadDelta: 0, LenDelta: int64(len(values))}) + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ListMetaDeltaKey(key, commitTS, seqInTxn), Value: delta}) + + meta.Len += int64(len(values)) + meta.Tail = meta.Head + meta.Len + return elems, meta, nil +} + +// listPushBuildFn is the type for functions that build list push operations. +type listPushBuildFn func(meta store.ListMeta, key []byte, values [][]byte, commitTS uint64, seqInTxn uint32) ([]*kv.Elem[kv.OP], store.ListMeta, error) + +// listPushCore is the shared retry loop for RPUSH and LPUSH. The caller supplies +// a buildFn that assembles the specific operations (RPUSH appends to tail, LPUSH +// prepends to head). When onePhaseTxnDedup is enabled it uses the write-set-reuse +// retry path (option 2); otherwise it keeps the original recompute-on-retry loop. 
+func (r *RedisServer) listPushCore(ctx context.Context, key []byte, values [][]byte, buildFn listPushBuildFn) (int64, error) { + if r.onePhaseTxnDedup { + return r.listPushCoreWithDedup(ctx, key, values, buildFn) + } + + var newLen int64 + err := r.retryRedisWrite(ctx, func() error { + readTS := r.readTS() + meta, _, err := r.resolveListMeta(ctx, key, readTS) + if err != nil { + return err + } + + // Pre-allocate commitTS so we can embed it in the Delta key. + commitTS, err := r.coordinator.Clock().NextFenced() + if err != nil { + return errors.Wrap(err, "listPushCore: allocate commitTS") + } + ops, updatedMeta, err := buildFn(meta, key, values, commitTS, 0) + if err != nil { + return err + } + if len(ops) == 0 { + newLen = updatedMeta.Len + return nil + } + + // Dispatch with the pre-allocated commitTS. + _, dispErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: normalizeStartTS(readTS), + CommitTS: commitTS, + Elems: ops, + }) + if dispErr != nil { + return errors.WithStack(dispErr) + } + newLen = updatedMeta.Len + return nil + }) + return newLen, err +} + +// reusableListPush captures a dispatched list-push attempt so a subsequent +// retry can reuse its exact write set (same seq, same item/delta keys) and +// probe whether it already landed, instead of recomputing seq from a fresh +// meta read. Recomputing is what duplicates the element under leadership +// churn: attempt 1 commits at T1 but returns an ambiguous error, the retry +// reads the now-larger list and appends at a NEW seq. Reuse + the FSM's +// exact-ts dedup probe close that. See option 2 in +// docs/design/2026_05_21_proposed_txn_secondary_idempotency.md. +type reusableListPush struct { + ops []*kv.Elem[kv.OP] + startTS uint64 + // commitTS is the most recent dispatched commit_ts for this write set; + // the next retry passes it as prev_commit_ts so the FSM probes exactly + // the attempt that might have landed. 
+ commitTS uint64 + // length is the client-visible post-push length. It is invariant across + // reuse — the write set was built once from attempt 1's meta — so it is + // also the correct value to return when the FSM dedup no-ops the apply + // (R1 result reconstruction: no store re-read needed). + length int64 + // readKeys is the boundary read set captured at attempt 1's meta read: + // listItemKey(Head) and (when Len > 1) listItemKey(Tail-1). It is the + // load-bearing fence against the codex P1 scenario where an intervening + // pop/trim shrinks the list before the retry — without it, the reused + // seq would land past the new Tail and be unreachable to LRANGE. OCC + // validates these atomically against startTS at FSM apply, so any + // boundary-touching commit fires WriteConflict and the adapter drops + // pending → recomputes. Empty when attempt 1 read an empty list (no + // boundary to fence; the OCC on the write key suffices for that case). + readKeys [][]byte +} + +// dispatchListPushReuse runs one iteration of the option-2 reuse path: +// dispatches the captured write set under a fresh commit_ts (carrying +// pending.commitTS as PrevCommitTS so the FSM probes whether the prior +// attempt landed) and returns the post-push length on success. The drop +// return signals the caller to clear pending — set on a genuine +// WriteConflict from another txn so the next iteration recomputes from +// fresh meta. Extracted from listPushCoreWithDedup to keep that closure +// under the cyclop / gocognit / nestif limits. +func (r *RedisServer) dispatchListPushReuse(ctx context.Context, key []byte, pending *reusableListPush) (newLen int64, drop bool, err error) { + // HLC-4 parity: persistence-grade commit_ts allocation must honor + // the physical-ceiling fence so a stale-leader window cannot mint a + // timestamp that collides with the successor's. 
The error path + // returns ErrCeilingExpired which isRetryableRedisTxnErr classifies + // as non-retryable, so it exits retryRedisWrite directly to the + // client — same shape as the other persistence-grade Next call + // sites in this file. + commitTS, allocErr := r.coordinator.Clock().NextFenced() + if allocErr != nil { + return 0, false, errors.Wrap(allocErr, "redis list-push reuse: allocate commitTS") + } + _, dispErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: pending.startTS, + CommitTS: commitTS, + PrevCommitTS: pending.commitTS, + ReadKeys: pending.readKeys, + Elems: pending.ops, + }) + if dispErr == nil { + return r.resolveReuseLength(ctx, key, pending), false, nil + } + if errors.Is(dispErr, store.ErrWriteConflict) { + // Self-inflicted-conflict guard (codex P1): the apply might have + // landed at this fresh commitTS but bubbled up as WriteConflict due + // to leadership churn (the original bug class the doc's "Resolved" + // section identifies). Without this probe, dropping pending here + // would recompute and append a second copy. Ask the store: did + // our just-attempted commit_ts land? If yes, this conflict is + // against our own commit — return success and keep pending pointing + // at THIS commit_ts so any subsequent retry probes the right point. + // + // Length resolution (codex P2 round-11): pending.length was computed + // during the prior attempt and is stale w.r.t. any non-conflicting + // list-modifying writes that landed between attempt 1 and this fresh + // apply. Probing pending.commitTS would hit for the fresh apply and + // (under the old resolveReuseLength shortcut) silently return the + // prior-attempt length — understating the count. Always re-read meta + // in the self-conflict path. resolveListMeta failure falls back to + // pending.length to honor codex P2 round-10 ("avoid failing after a + // reuse apply"). 
+ if probeKey := firstWriteKey(pending.ops); len(probeKey) > 0 { + landed, perr := r.store.CommittedVersionAt(ctx, probeKey, commitTS) + if perr == nil && landed { + pending.commitTS = commitTS + return r.resolveLengthAfterFreshApply(ctx, key, pending), false, nil + } + } + // Our attempt did not land at commitTS and the target seq is taken + // by another txn — a genuine conflict. Drop pending; the next + // iteration recomputes from a fresh meta read. + return 0, true, errors.WithStack(dispErr) + } + // Still ambiguous (lock / other retryable): this reuse may itself + // have landed, so the next retry must probe THIS commit_ts. Only + // advance pending.commitTS if retryRedisWrite will actually loop + // (non-retryable errors escape to the client; pending is then + // discarded with the goroutine, so the update is wasted and the + // stale value would be misleading if some future caller reads it). + if isRetryableRedisTxnErr(dispErr) { + pending.commitTS = commitTS + } + return 0, false, errors.WithStack(dispErr) +} + +// resolveReuseLength returns the client-visible post-push length after a +// successful reuse dispatch. If our prior attempt's exact commit_ts +// version exists, the FSM no-op'd (probe hit) and pending.length is the +// correct length we computed at that attempt. Otherwise the FSM applied +// the reused write set at a fresh commit_ts and we must re-read meta to +// capture any non-conflicting list-modifying writes that committed +// between attempts (codex P2) — without this, the return value would +// silently understate the count when the boundary OCC fence and +// write-key OCC both pass but the list length changed. +// +// Failure modes are converted to a degraded return (pending.length) rather +// than surfaced as an error, because the dispatch already committed. 
Per +// codex P2 round-10 ("avoid failing after a reuse apply"), reporting a +// write error after the apply landed drives the client into a retry that +// has no pending state and would re-append the element — the very anomaly +// this feature prevents. Specifically: +// - probe error of any kind: prefer pending.length over failure. +// - resolveListMeta failure (e.g. delta scan over MaxDeltaScanLimit +// under churn): fall back to pending.length. +// +// Returns int64 directly (no error) so callers do not have to invent +// caller-side fallback logic; the degraded-return contract is fixed here +// (golangci unparam / nilerr fix on the prior error-returning shape). +func (r *RedisServer) resolveReuseLength(ctx context.Context, key []byte, pending *reusableListPush) int64 { + if probeKey := firstWriteKey(pending.ops); len(probeKey) > 0 { + hit, perr := r.store.CommittedVersionAt(ctx, probeKey, pending.commitTS) + if perr == nil && hit { + return pending.length + } + if perr != nil { + // Probe failed; the dispatch already committed so degrade + // gracefully rather than propagate the read error. + return pending.length + } + // perr == nil && !hit: prior attempt didn't land at this ts; the + // FSM applied fresh writes, fall through to re-read meta. + } + return r.resolveLengthAfterFreshApply(ctx, key, pending) +} + +// resolveLengthAfterFreshApply re-reads list meta to capture the post-apply +// length when we know the fresh commitTS applied (no probe shortcut), with +// the same fall-back-to-pending.length contract as resolveReuseLength. Used +// by the self-conflict path (codex P2 round-11): there pending.length is +// stale w.r.t. intervening non-conflicting writes, so the probe-hit +// shortcut would silently understate the count. 
+func (r *RedisServer) resolveLengthAfterFreshApply(ctx context.Context, key []byte, pending *reusableListPush) int64 { + currentMeta, _, mErr := r.resolveListMeta(ctx, key, r.readTS()) + if mErr != nil { + return pending.length + } + return currentMeta.Len +} + +// firstWriteKey returns the first non-empty element key from ops, or nil +// when there is none. Used after a successful reuse dispatch to probe +// whether our prior attempt's commit_ts actually landed: attempt 1 writes +// all its elem keys atomically at the same commit_ts, so any one of them +// answers the question. +func firstWriteKey(ops []*kv.Elem[kv.OP]) []byte { + for _, e := range ops { + if e != nil && len(e.Key) > 0 { + return e.Key + } + } + return nil +} + +// listPushBoundaryReadKeys returns the boundary positions of the list as +// read keys for OCC. Including these in the dispatched OperationGroup makes +// FSM apply atomically reject the retry when any pop/trim has touched the +// boundary between attempts (codex P1 fix: prevents a reused seq from +// landing past a shrunk Tail). The keys are deduped: a single-element list +// has Head == Tail-1, so we emit it once. +func listPushBoundaryReadKeys(key []byte, meta store.ListMeta) [][]byte { + if meta.Len <= 0 { + return nil + } + tailIdx := meta.Tail - 1 + if tailIdx == meta.Head { + return [][]byte{listItemKey(key, meta.Head)} + } + return [][]byte{ + listItemKey(key, meta.Head), + listItemKey(key, tailIdx), + } +} + +// listPushCoreWithDedup is the option-2 retry loop. The first attempt computes +// the write set from the current meta; any retryable failure makes the next +// iteration REUSE that write set under a fresh commit_ts with prev_commit_ts +// set, so the FSM no-ops if the prior attempt already landed. A WriteConflict +// on a reuse attempt means the probe ruled out our own prior attempt and the +// seq is genuinely taken by another txn, so we fall back to a full recompute. 
func (r *RedisServer) listPushCoreWithDedup(ctx context.Context, key []byte, values [][]byte, buildFn listPushBuildFn) (int64, error) {
	// newLen is the value reported to the caller; pending carries the write
	// set of a failed-but-retryable attempt into the next loop iteration.
	var newLen int64
	var pending *reusableListPush
	err := r.retryRedisWrite(ctx, func() error {
		// Reuse path: a previous iteration recorded its write set.
		if pending != nil {
			length, drop, dispErr := r.dispatchListPushReuse(ctx, key, pending)
			if drop {
				// The reuse helper asked us to forget the prior attempt, so
				// the next iteration rebuilds from a fresh meta read.
				pending = nil
			}
			if dispErr != nil {
				return dispErr
			}
			newLen = length
			return nil
		}

		// First-attempt path: build the write set from the current meta.
		readTS := r.readTS()
		meta, _, err := r.resolveListMeta(ctx, key, readTS)
		if err != nil {
			return err
		}

		// HLC-4 parity with prepareDispatch / dispatchExecReuse —
		// see dispatchListPushReuse above for the rationale.
		commitTS, allocErr := r.coordinator.Clock().NextFenced()
		if allocErr != nil {
			return errors.Wrap(allocErr, "redis list-push first-attempt: allocate commitTS")
		}
		ops, updatedMeta, err := buildFn(meta, key, values, commitTS, 0)
		if err != nil {
			return err
		}
		if len(ops) == 0 {
			// Nothing to write: report the length buildFn returned
			// without dispatching.
			newLen = updatedMeta.Len
			return nil
		}

		startTS := normalizeStartTS(readTS)
		boundaryReads := listPushBoundaryReadKeys(key, meta)
		_, dispErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{
			IsTxn:    true,
			StartTS:  startTS,
			CommitTS: commitTS,
			ReadKeys: boundaryReads,
			Elems:    ops,
		})
		if dispErr == nil {
			newLen = updatedMeta.Len
			return nil
		}
		// Only remember the attempt for reuse if retryRedisWrite will actually
		// loop — i.e. the error is one of WriteConflict / TxnLocked. For
		// errors that escape the loop (transient-leader, context deadline,
		// FSM apply error, etc.), `pending` would be discarded with the
		// goroutine, and recording it would mislead a future reader about
		// what state was preserved. The dedup window is therefore bounded by
		// retryRedisWrite's retry predicate; ambiguous errors that escape
		// to the client are a separate problem space (cross-request
		// idempotency cache) and out of scope for this design.
		if isRetryableRedisTxnErr(dispErr) {
			pending = &reusableListPush{
				ops:      ops,
				startTS:  startTS,
				commitTS: commitTS,
				length:   updatedMeta.Len,
				readKeys: boundaryReads,
			}
		}
		return errors.WithStack(dispErr)
	})
	return newLen, err
}

// listRPush pushes values through listPushCore using buildRPushOps as the
// write-set builder and returns whatever length listPushCore reports.
func (r *RedisServer) listRPush(ctx context.Context, key []byte, values [][]byte) (int64, error) {
	return r.listPushCore(ctx, key, values, r.buildRPushOps)
}

// buildLPushOps creates operations to prepend values to the head of a list using
// the Delta pattern. LPUSH reverses the order of arguments:
// LPUSH key a b c → [c, b, a, ...existing].
//
// It returns the Put elems (one per value plus one ListMetaDelta entry keyed
// by commitTS/seqInTxn) and a copy of meta with Head and Len adjusted. With
// no values it returns (nil, meta, nil); it errors when subtracting
// len(values) from meta.Head would underflow int64.
func (r *RedisServer) buildLPushOps(meta store.ListMeta, key []byte, values [][]byte, commitTS uint64, seqInTxn uint32) ([]*kv.Elem[kv.OP], store.ListMeta, error) {
	if len(values) == 0 {
		return nil, meta, nil
	}

	n := int64(len(values))
	// Guard the meta.Head - n subtraction below against int64 underflow.
	if meta.Head < math.MinInt64+n {
		return nil, meta, errors.WithStack(errors.New("LPUSH would underflow list Head sequence number"))
	}
	elems := make([]*kv.Elem[kv.OP], 0, len(values)+1)
	// LPUSH reverses args, so last arg gets the lowest sequence number.
	newHead := meta.Head - n
	for i, v := range values {
		// values[0]=a, values[1]=b, values[2]=c → seq ordering: c(newHead), b(newHead+1), a(newHead+2)
		seq := newHead + n - 1 - int64(i)
		// Clone so the stored value does not alias the caller's buffer.
		vCopy := bytes.Clone(v)
		elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: listItemKey(key, seq), Value: vCopy})
	}

	// Emit a Delta key instead of writing the base meta key.
	delta := store.MarshalListMetaDelta(store.ListMetaDelta{HeadDelta: -n, LenDelta: n})
	elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ListMetaDeltaKey(key, commitTS, seqInTxn), Value: delta})

	meta.Head = newHead
	meta.Len += n
	return elems, meta, nil
}

// listLPush pushes values through listPushCore using buildLPushOps as the
// write-set builder and returns whatever length listPushCore reports.
func (r *RedisServer) listLPush(ctx context.Context, key []byte, values [][]byte) (int64, error) {
	return r.listPushCore(ctx, key, values, r.buildLPushOps)
}

// clampPopCount caps count at listLen and returns the capped value.
// An error wrapping ErrCollectionTooLarge is returned when the capped count
// still exceeds maxWideColumnItems, which guards against OOM from enormous
// claim-key allocations.
//
// No lower bound is applied here: a non-positive count is returned as-is.
// listPopClaim handles count <= 0 before reaching the pop path.
func clampPopCount(count int, listLen int64) (int64, error) {
	n := int64(count)
	if n > listLen {
		n = listLen
	}
	if n > int64(maxWideColumnItems) {
		return 0, errors.Wrapf(ErrCollectionTooLarge, "LPOP/RPOP count %d exceeds maximum %d", n, maxWideColumnItems)
	}
	return n, nil
}

// buildListPopElems reads the n boundary positions for LPOP (left=true) or
// RPOP (left=false) at readTS and builds the write set for popping them with
// the Claim pattern, which avoids write-write conflicts on the list metadata
// key. It emits:
//   - Put(listClaimKey, empty) for every sequence position in the range —
//     uniqueness guard; conflicts if another txn claims the same sequence
//     number concurrently
//   - Del(listItemKey) for every item the scan actually found
//
// The single ListMetaDelta for the batch is NOT added here; the caller
// appends it (see the capacity note below).
//
// Returns the scanned values in scan order (ReverseScanAt is used for RPOP,
// presumably yielding tail-first order — confirm against the store) and the
// elems built so far, or the wrapped scan error.
func (r *RedisServer) buildListPopElems(ctx context.Context, key []byte, meta store.ListMeta, n int64, left bool, readTS uint64) ([]string, []*kv.Elem[kv.OP], error) {
	// Build the [start, end) scan range covering exactly the n items to pop.
	// n is already clamped to meta.Len by the caller, so no overflow is possible.
	var startKey, endKey []byte
	if left {
		startKey = listItemKey(key, meta.Head)
		endKey = listItemKey(key, meta.Head+n)
	} else {
		startKey = listItemKey(key, meta.Tail-n)
		endKey = listItemKey(key, meta.Tail)
	}

	var kvps []*store.KVPair
	var scanErr error
	if left {
		kvps, scanErr = r.store.ScanAt(ctx, startKey, endKey, int(n), readTS)
	} else {
		kvps, scanErr = r.store.ReverseScanAt(ctx, startKey, endKey, int(n), readTS)
	}
	if scanErr != nil {
		return nil, nil, errors.WithStack(scanErr)
	}

	// Emit claim keys for every sequence position in the claimed range, including
	// holes. This ensures that two concurrent pops over the same hole produce a
	// write conflict rather than both silently advancing HeadDelta over the same
	// empty position, which would otherwise orphan later items.
	var claimStart, claimEnd int64
	if left {
		claimStart = meta.Head
		claimEnd = meta.Head + n
	} else {
		claimStart = meta.Tail - n
		claimEnd = meta.Tail
	}
	// Capacity: n claim keys + n Del(item) for found items + 1 for the delta key appended by caller.
	// n is bounded by maxWideColumnItems (100_000) so the int conversion is safe.
	elems := make([]*kv.Elem[kv.OP], 0, int(n)+len(kvps)+listPopDeltaOverhead)
	for seq := claimStart; seq < claimEnd; seq++ {
		elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ListClaimKey(key, seq), Value: []byte{}})
	}

	values := make([]string, 0, len(kvps))
	for _, pair := range kvps {
		// Skip scan results whose key does not parse as an item key of this list.
		_, ok := store.ExtractListItemSeq(pair.Key, key)
		if !ok {
			continue
		}
		values = append(values, string(pair.Value))
		elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: bytes.Clone(pair.Key)})
	}
	return values, elems, nil
}

// checkListKeyType verifies the key is a list. Returns (keyFound, error).
// Returns wrongTypeError if the key exists but is not a list.
func (r *RedisServer) checkListKeyType(ctx context.Context, key []byte, readTS uint64) (found bool, err error) {
	typ, typErr := r.keyTypeAt(ctx, key, readTS)
	if typErr != nil {
		return false, typErr
	}
	// A missing key is not an error: report found=false.
	if typ == redisTypeNone {
		return false, nil
	}
	if typ != redisTypeList {
		return false, wrongTypeError()
	}
	return true, nil
}

// listPopClaimOnce executes one attempt of a pop-with-claim transaction.
// Returns (nil, nil) for a missing key or an empty list, and the popped
// values otherwise.
func (r *RedisServer) listPopClaimOnce(ctx context.Context, key []byte, count int, left bool, readTS uint64) ([]string, error) {
	// typeErr is nil when the key is simply missing, so this single return
	// covers both the error case and the (nil, nil) missing-key case.
	found, typeErr := r.checkListKeyType(ctx, key, readTS)
	if typeErr != nil || !found {
		return nil, typeErr
	}

	meta, exists, metaErr := r.resolveListMeta(ctx, key, readTS)
	if metaErr != nil {
		return nil, metaErr
	}
	if !exists || meta.Len == 0 {
		// count >= 1 on an empty list: Redis returns nil (same as missing key).
		return nil, nil
	}

	n, err := clampPopCount(count, meta.Len)
	if err != nil {
		return nil, err
	}

	values, elems, buildErr := r.buildListPopElems(ctx, key, meta, n, left, readTS)
	if buildErr != nil {
		return nil, buildErr
	}

	if err := r.commitListPop(ctx, key, elems, n, left, readTS); err != nil {
		return nil, err
	}
	return values, nil
}

// commitListPop allocates commitTS, appends the ListMetaDelta entry,
// and dispatches the pop transaction. Extracted from listPopClaimOnce
// so that function stays under the cyclop ceiling after the HLC-4
// (iii) NextFenced fence added a new error branch (PR #867 Phase 2b).
func (r *RedisServer) commitListPop(ctx context.Context, key []byte, elems []*kv.Elem[kv.OP], n int64, left bool, readTS uint64) error {
	// n is the number of sequence positions claimed (including any holes).
	// HeadDelta and LenDelta must use n, not len(values), so that Head
	// advances past holes and the metadata stays consistent with Tail.
	commitTS, err := r.coordinator.Clock().NextFenced()
	if err != nil {
		return errors.Wrap(err, "commitListPop: allocate commitTS")
	}
	// For RPOP headDelta stays 0: only the length shrinks.
	var headDelta int64
	if left {
		headDelta = n // head advances by n positions for LPOP
	}
	delta := store.MarshalListMetaDelta(store.ListMetaDelta{HeadDelta: headDelta, LenDelta: -n})
	elems = append(elems, &kv.Elem[kv.OP]{
		Op:    kv.Put,
		Key:   store.ListMetaDeltaKey(key, commitTS, 0),
		Value: delta,
	})

	if _, dispErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{
		IsTxn:    true,
		StartTS:  normalizeStartTS(readTS),
		CommitTS: commitTS,
		Elems:    elems,
	}); dispErr != nil {
		return errors.WithStack(dispErr)
	}
	return nil
}

// listPopClaim pops up to count items from the head (left=true) or tail
// (left=false) of the list at key, retrying listPopClaimOnce through
// retryRedisWrite with a fresh read timestamp on every attempt.
func (r *RedisServer) listPopClaim(ctx context.Context, key []byte, count int, left bool) ([]string, error) {
	// count=0: Redis returns an empty array if the key exists as a list, nil otherwise.
	if count <= 0 {
		readTS := r.readTS()
		found, err := r.checkListKeyType(ctx, key, readTS)
		if err != nil || !found {
			return nil, err
		}
		return []string{}, nil
	}

	var popped []string
	err := r.retryRedisWrite(ctx, func() error {
		result, popErr := r.listPopClaimOnce(ctx, key, count, left, r.readTS())
		if popErr != nil {
			return popErr
		}
		popped = result
		return nil
	})
	return popped, err
}

// fetchListRange scans the item keys for list indexes [startIdx, endIdx]
// (both inclusive, relative to meta.Head) at readTS and returns their values
// as strings. An inverted range yields an empty, non-nil slice.
func (r *RedisServer) fetchListRange(ctx context.Context, key []byte, meta store.ListMeta, startIdx, endIdx int64, readTS uint64) ([]string, error) {
	if endIdx < startIdx {
		return []string{}, nil
	}

	// Translate list indexes into absolute sequence numbers.
	startSeq := meta.Head + startIdx
	endSeq := meta.Head + endIdx

	startKey := listItemKey(key, startSeq)
	endKey := listItemKey(key, endSeq+1) // exclusive

	kvs, err := r.store.ScanAt(ctx, startKey, endKey, int(endIdx-startIdx+1), readTS)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	out := make([]string, 0, len(kvs))
	for _, kvp := range kvs {
		out = append(out, string(kvp.Value))
	}
	return out, nil
}

// rangeList serves LRANGE for key. When this node is not the leader for key
// the request is forwarded via proxyLRange; otherwise the list is read
// locally at a single read timestamp. A missing key or empty list yields an
// empty, non-nil slice; a non-list key yields wrongTypeError.
//
// NOTE(review): keyTypeAt, resolveListMeta and fetchListRange are called with
// context.Background() rather than ctx, so only VerifyLeaderForKey observes
// the caller's deadline — confirm whether that is intentional.
func (r *RedisServer) rangeList(ctx context.Context, key []byte, startRaw, endRaw []byte) ([]string, error) {
	if !r.coordinator.IsLeaderForKey(key) {
		return r.proxyLRange(key, startRaw, endRaw)
	}

	readTS := r.readTS()
	typ, err := r.keyTypeAt(context.Background(), key, readTS)
	if err != nil {
		return nil, err
	}
	if typ == redisTypeNone {
		return []string{}, nil
	}
	if typ != redisTypeList {
		return nil, wrongTypeError()
	}

	// PR #749 follow-up: pass the per-call dispatch ctx so a stalled
	// VerifyLeaderForKey honours the caller's deadline rather than the
	// long-lived handlerContext + verifyLeaderEngineCtx fallback. Same
	// shape as keys() / FLUSHDB.
	if err := r.coordinator.VerifyLeaderForKey(ctx, key); err != nil {
		return nil, errors.WithStack(err)
	}

	meta, exists, err := r.resolveListMeta(context.Background(), key, readTS)
	if err != nil {
		return nil, err
	}
	if !exists || meta.Len == 0 {
		return []string{}, nil
	}

	s, e, err := parseRangeBounds(startRaw, endRaw, int(meta.Len))
	if err != nil {
		return nil, err
	}

	return r.fetchListRange(context.Background(), key, meta, int64(s), int64(e), readTS)
}

// listPushFunc is the local push implementation used by listPushCmd
// (listRPush / listLPush); it returns the resulting list length.
type listPushFunc func(ctx context.Context, key []byte, values [][]byte) (int64, error)

// listProxyFunc is the forward-to-leader counterpart used by listPushCmd
// (proxyRPush / proxyLPush).
type listProxyFunc func(key []byte, values [][]byte) (int64, error)

// listPushCmd is the shared RPUSH/LPUSH handler. cmd.Args[1] is the key and
// cmd.Args[2:] are the values. Non-leaders forward through proxyFn; the
// leader rejects non-list keys with wrongTypeMessage and otherwise calls
// pushFn, replying with the returned length.
//
// NOTE(review): the local path runs on context.Background() with no timeout,
// unlike lrange/ltrim which bound the call with redisDispatchTimeout —
// confirm whether the retry loop inside pushFn is bounded elsewhere.
func (r *RedisServer) listPushCmd(conn redcon.Conn, cmd redcon.Command, pushFn listPushFunc, proxyFn listProxyFunc) {
	key := cmd.Args[1]
	if !r.coordinator.IsLeaderForKey(key) {
		length, err := proxyFn(key, cmd.Args[2:])
		if err != nil {
			writeRedisError(conn, err)
			return
		}
		conn.WriteInt64(length)
		return
	}

	readTS := r.readTS()
	typ, err := r.keyTypeAt(context.Background(), key, readTS)
	if err != nil {
		writeRedisError(conn, err)
		return
	}
	// A missing key is fine (the push creates the list); any other non-list
	// type is a WRONGTYPE error.
	if typ != redisTypeNone && typ != redisTypeList {
		conn.WriteError(wrongTypeMessage)
		return
	}

	ctx := context.Background()
	length, err := pushFn(ctx, key, cmd.Args[2:])

	if err != nil {
		writeRedisError(conn, err)
		return
	}
	conn.WriteInt64(length)
}

// rpush handles RPUSH via listPushCmd.
func (r *RedisServer) rpush(conn redcon.Conn, cmd redcon.Command) {
	r.listPushCmd(conn, cmd, r.listRPush, r.proxyRPush)
}

// lrange handles LRANGE key start stop: it runs rangeList under a
// redisDispatchTimeout-bounded context derived from the handler context and
// writes the items as an array of bulk strings.
func (r *RedisServer) lrange(conn redcon.Conn, cmd redcon.Command) {
	ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout)
	defer cancel()
	items, err := r.rangeList(ctx, cmd.Args[1], cmd.Args[2], cmd.Args[3])
	if err != nil {
		writeRedisError(conn, err)
		return
	}
	conn.WriteArray(len(items))
	for _, it := range items {
		conn.WriteBulkString(it)
	}
}

// lpush handles LPUSH via listPushCmd.
func (r *RedisServer) lpush(conn redcon.Conn, cmd redcon.Command) {
	r.listPushCmd(conn, cmd, r.listLPush, r.proxyLPush)
}

// ltrim handles LTRIM key start stop. Non-leaders proxy the command. The
// leader reads the whole list at readTS, keeps the normalized [start, stop]
// slice, and rewrites the list in one transaction inside retryRedisWrite.
// A missing key is a no-op that still replies OK.
//
// NOTE(review): the timeout context derives from context.Background() rather
// than r.handlerContext(), and the reads inside the closure use
// context.Background() directly — confirm against the other handlers.
func (r *RedisServer) ltrim(conn redcon.Conn, cmd redcon.Command) {
	if r.proxyToLeader(conn, cmd, cmd.Args[1]) {
		return
	}
	start, err := parseInt(cmd.Args[2])
	if err != nil {
		writeRedisError(conn, err)
		return
	}
	stop, err := parseInt(cmd.Args[3])
	if err != nil {
		writeRedisError(conn, err)
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout)
	defer cancel()
	if err := r.retryRedisWrite(ctx, func() error {
		// Each attempt re-reads at a fresh timestamp.
		readTS := r.readTS()
		typ, err := r.keyTypeAt(context.Background(), cmd.Args[1], readTS)
		if err != nil {
			return err
		}
		if typ == redisTypeNone {
			return nil
		}
		if typ != redisTypeList {
			return wrongTypeError()
		}
		current, err := r.listValuesAt(context.Background(), cmd.Args[1], readTS)
		if err != nil {
			return err
		}
		s, e := normalizeRankRange(start, stop, len(current))
		// An inverted range (e < s) leaves trimmed empty.
		trimmed := []string{}
		if e >= s {
			trimmed = append(trimmed, current[s:e+1]...)
		}
		return r.rewriteListTxn(ctx, cmd.Args[1], readTS, trimmed)
	}); err != nil {
		writeRedisError(conn, err)
		return
	}
	conn.WriteString("OK")
}

// lindex handles LINDEX key index. Non-leaders proxy the command. The leader
// loads the whole list at readTS and replies with the element at the
// normalized index, or null when the key is missing or normalizeIndex
// returns a negative value.
func (r *RedisServer) lindex(conn redcon.Conn, cmd redcon.Command) {
	if r.proxyToLeader(conn, cmd, cmd.Args[1]) {
		return
	}
	index, err := parseInt(cmd.Args[2])
	if err != nil {
		writeRedisError(conn, err)
		return
	}
	readTS := r.readTS()
	typ, err := r.keyTypeAt(context.Background(), cmd.Args[1], readTS)
	if err != nil {
		writeRedisError(conn, err)
		return
	}
	if typ == redisTypeNone {
		conn.WriteNull()
		return
	}
	if typ != redisTypeList {
		conn.WriteError(wrongTypeMessage)
		return
	}
	values, err := r.listValuesAt(context.Background(), cmd.Args[1], readTS)
	if err != nil {
		writeRedisError(conn, err)
		return
	}
	// presumably normalizeIndex returns a negative value for out-of-range
	// indexes — verify against its definition.
	idx := normalizeIndex(index, len(values))
	if idx < 0 {
		conn.WriteNull()
		return
	}
	conn.WriteBulkString(values[idx])
}
diff --git a/adapter/redis_proxy_leader.go b/adapter/redis_proxy_leader.go
new file mode 100644
index 00000000..f239729c
--- /dev/null
+++ b/adapter/redis_proxy_leader.go
@@ -0,0 +1,375 @@
package adapter

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"net"
	"strconv"

	pb "github.com/bootjp/elastickv/proto"
	"github.com/bootjp/elastickv/store"
	"github.com/cockroachdb/errors"
	"github.com/redis/go-redis/v9"
	"github.com/tidwall/redcon"
)

// proxyKeys forwards KEYS pattern to the Redis endpoint of the current Raft
// leader and returns the matching keys. It fails with ErrLeaderNotFound when
// no leader is known, and with an "ERR leader redis address unknown" error
// when the leader has no entry in leaderRedis.
func (r *RedisServer) proxyKeys(pattern []byte) ([]string, error) {
	leader := r.coordinator.RaftLeader()
	if leader == "" {
		return nil, ErrLeaderNotFound
	}

	leaderAddr, ok := r.leaderRedis[leader]
	if !ok || leaderAddr == "" {
		return nil, errors.WithStack(errors.Newf("ERR leader redis address unknown for %s", leader))
	}

	cli := r.getOrCreateLeaderClient(leaderAddr)

	ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout)
	defer cancel()

	keys, err := cli.Keys(ctx, string(pattern)).Result()
	return keys, errors.WithStack(err)
}

//
proxyTransactionToLeader forwards a MULTI/EXEC transaction to the leader +// node and writes the EXEC response array back to conn. +// +//nolint:cyclop // inherent complexity of MULTI/EXEC proxy; refactoring would obscure the protocol flow +func (r *RedisServer) proxyTransactionToLeader(conn redcon.Conn, queue []redcon.Command) { + leaderAddr, ok := r.resolveLeaderRedisAddr(conn) + if !ok { + return + } + cli := r.getOrCreateLeaderClient(leaderAddr) + + ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + + cmds, err := r.execTxPipeline(ctx, cli, queue) + if handleProxyTxnError(conn, err) { + return + } + writeProxyCmdsResult(conn, cmds) +} + +// resolveLeaderRedisAddr looks up the Redis address of the current Raft leader, +// writes an error reply to conn on failure and returns ("", false). +func (r *RedisServer) resolveLeaderRedisAddr(conn redcon.Conn) (string, bool) { + leader := r.coordinator.RaftLeader() + if leader == "" { + writeRedisError(conn, ErrLeaderNotFound) + return "", false + } + leaderAddr, ok := r.leaderRedis[leader] + if !ok || leaderAddr == "" { + conn.WriteError(fmt.Sprintf("ERR leader redis address unknown for raft address %s", leader)) + return "", false + } + return leaderAddr, true +} + +// execTxPipeline sends queue as a single TxPipelined batch and returns the +// per-command result handles together with any pipeline-level error. 
+func (r *RedisServer) execTxPipeline(ctx context.Context, cli *redis.Client, queue []redcon.Command) ([]*redis.Cmd, error) { + cmds := make([]*redis.Cmd, 0, len(queue)) + _, err := cli.TxPipelined(ctx, func(pipe redis.Pipeliner) error { + for _, cmd := range queue { + args := make([]interface{}, len(cmd.Args)) + for i, a := range cmd.Args { + args[i] = a + } + cmds = append(cmds, pipe.Do(ctx, args...)) + } + return nil + }) + return cmds, errors.WithStack(err) +} + +// handleProxyTxnError writes the appropriate reply for terminal pipeline errors +// and returns true when the caller should return early without writing results. +func handleProxyTxnError(conn redcon.Conn, err error) bool { + // Transaction aborted (WATCH conflict): Redis protocol requires a Null + // array reply (*-1\r\n), not a null bulk string or an error. + // redis.Nil is a per-command nil response and must NOT be treated as an + // EXEC abort — only redis.TxFailedErr signals that. + if errors.Is(err, redis.TxFailedErr) { + conn.WriteArray(-1) + return true + } + // Fatal transport / context error: per-command results are unreliable. + if err != nil { + var netErr net.Error + if errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled) || + errors.Is(err, io.EOF) || + errors.Is(err, io.ErrUnexpectedEOF) || + errors.As(err, &netErr) { + writeRedisError(conn, err) + return true + } + } + return false +} + +// writeProxyCmdsResult writes an EXEC-style array reply for the given pipeline +// command handles. For any other non-nil per-command errors, each cmd carries +// its own result, which is the correct Redis EXEC semantics. 
+func writeProxyCmdsResult(conn redcon.Conn, cmds []*redis.Cmd) { + conn.WriteArray(len(cmds)) + for _, cmd := range cmds { + writeGoRedisResult(conn, cmd) + } +} + +func (r *RedisServer) proxyLRange(key []byte, startRaw, endRaw []byte) ([]string, error) { + leader := r.coordinator.RaftLeaderForKey(key) + if leader == "" { + return nil, ErrLeaderNotFound + } + leaderAddr, ok := r.leaderRedis[leader] + if !ok || leaderAddr == "" { + return nil, errors.WithStack(errors.Newf("ERR leader redis address unknown for %s", leader)) + } + + cli := r.getOrCreateLeaderClient(leaderAddr) + + start, err := parseInt(startRaw) + if err != nil { + return nil, err + } + end, err := parseInt(endRaw) + if err != nil { + return nil, err + } + + ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + + res, err := cli.LRange(ctx, string(key), int64(start), int64(end)).Result() + return res, errors.WithStack(err) +} + +func (r *RedisServer) proxyRPush(key []byte, values [][]byte) (int64, error) { + leader := r.coordinator.RaftLeaderForKey(key) + if leader == "" { + return 0, ErrLeaderNotFound + } + leaderAddr, ok := r.leaderRedis[leader] + if !ok || leaderAddr == "" { + return 0, errors.WithStack(errors.Newf("ERR leader redis address unknown for %s", leader)) + } + + cli := r.getOrCreateLeaderClient(leaderAddr) + + args := make([]any, 0, len(values)) + for _, v := range values { + args = append(args, string(v)) + } + + ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + + res, err := cli.RPush(ctx, string(key), args...).Result() + return res, errors.WithStack(err) +} + +func (r *RedisServer) proxyLPush(key []byte, values [][]byte) (int64, error) { + leader := r.coordinator.RaftLeaderForKey(key) + if leader == "" { + return 0, ErrLeaderNotFound + } + leaderAddr, ok := r.leaderRedis[leader] + if !ok || leaderAddr == "" { + return 0, errors.WithStack(errors.Newf("ERR leader redis address unknown for %s", 
leader)) + } + + cli := r.getOrCreateLeaderClient(leaderAddr) + + args := make([]any, 0, len(values)) + for _, v := range values { + args = append(args, string(v)) + } + + ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + + res, err := cli.LPush(ctx, string(key), args...).Result() + return res, errors.WithStack(err) +} + +// getOrCreateLeaderClient returns a cached go-redis client for the given address, +// creating one if it doesn't exist. +func (r *RedisServer) getOrCreateLeaderClient(addr string) *redis.Client { + r.leaderClientsMu.RLock() + cli, ok := r.leaderClients[addr] + r.leaderClientsMu.RUnlock() + if ok { + return cli + } + + r.leaderClientsMu.Lock() + defer r.leaderClientsMu.Unlock() + // Double-check after acquiring write lock. + if cli, ok = r.leaderClients[addr]; ok { + return cli + } + cli = redis.NewClient(&redis.Options{Addr: addr}) + r.leaderClients[addr] = cli + return cli +} + +// leaderClientForKey returns a cached go-redis client connected to the leader +// for the given key. +func (r *RedisServer) leaderClientForKey(key []byte) (*redis.Client, error) { + leader := r.coordinator.RaftLeaderForKey(key) + if leader == "" { + return nil, ErrLeaderNotFound + } + leaderAddr, ok := r.leaderRedis[leader] + if !ok || leaderAddr == "" { + return nil, errors.WithStack(errors.Newf("ERR leader redis address unknown for %s", leader)) + } + return r.getOrCreateLeaderClient(leaderAddr), nil +} + +// proxyToLeader forwards a Redis command to the leader and writes the +// response to conn. Returns true if the command was proxied (caller should +// return immediately), false if this node is the leader. 
+func (r *RedisServer) proxyToLeader(conn redcon.Conn, cmd redcon.Command, key []byte) bool { + if r.coordinator.IsLeaderForKey(key) { + return false + } + cli, err := r.leaderClientForKey(key) + if err != nil { + writeRedisError(conn, err) + return true + } + + ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + + args := make([]interface{}, len(cmd.Args)) + for i, a := range cmd.Args { + args[i] = a + } + writeGoRedisResult(conn, cli.Do(ctx, args...)) + return true +} + +func writeGoRedisResult(conn redcon.Conn, cmd *redis.Cmd) { + val, err := cmd.Result() + if err != nil { + if errors.Is(err, redis.Nil) { + conn.WriteNull() + } else { + writeRedisError(conn, err) + } + return + } + writeGoRedisValue(conn, val) +} + +func writeGoRedisValue(conn redcon.Conn, val interface{}) { + switch v := val.(type) { + case string: + conn.WriteBulkString(v) + case []byte: + conn.WriteBulk(v) + case int64: + conn.WriteInt64(v) + case bool: + conn.WriteInt(boolToInt(v)) + case float64: + conn.WriteBulkString(strconv.FormatFloat(v, 'f', -1, 64)) + case []interface{}: + writeGoRedisArray(conn, v) + case nil: + conn.WriteNull() + default: + conn.WriteBulkString(fmt.Sprint(v)) + } +} + +func writeGoRedisArray(conn redcon.Conn, arr []interface{}) { + conn.WriteArray(len(arr)) + for _, item := range arr { + writeGoRedisValue(conn, item) + } +} + +func boolToInt(b bool) int { + if b { + return 1 + } + return 0 +} + +func parseInt(b []byte) (int, error) { + i, err := strconv.Atoi(string(b)) + return i, errors.WithStack(err) +} + +// tryLeaderGet proxies a GET to the current Raft leader, returning the value and +// whether the proxy succeeded. 
func (r *RedisServer) tryLeaderGetAt(key []byte, ts uint64) ([]byte, error) {
	// Resolve the leader for this key; an empty address means none is known.
	addr := r.coordinator.RaftLeaderForKey(key)
	if addr == "" {
		return nil, ErrLeaderNotFound
	}

	conn, err := r.relayConnCache.ConnFor(addr)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	// The RPC is bounded by redisRelayPublishTimeout on top of the handler
	// context.
	ctx, cancel := context.WithTimeout(r.handlerContext(), redisRelayPublishTimeout)
	defer cancel()

	cli := pb.NewRawKVClient(conn)
	resp, err := cli.RawGet(ctx, &pb.RawGetRequest{Key: key, Ts: ts})
	if err != nil {
		return nil, errors.WithStack(err)
	}
	// Compatibility with older nodes that don't set RawGetResponse.exists:
	// treat any non-nil payload as found even when exists=false.
	if !resp.GetExists() && resp.GetValue() == nil {
		return nil, errors.WithStack(store.ErrKeyNotFound)
	}
	return resp.Value, nil
}

// readValueAt returns the value stored under key at readTS. The key's
// TTL is checked first and an expired key reports store.ErrKeyNotFound; the
// value is then read from the local store when this node is the verified
// leader for key, or fetched from the leader via tryLeaderGetAt otherwise.
//
// NOTE(review): hasExpired and GetAt run on context.Background(), so only
// VerifyLeaderForKey observes ctx's deadline — confirm this is intentional.
func (r *RedisServer) readValueAt(ctx context.Context, key []byte, readTS uint64) ([]byte, error) {
	// For internal keys the TTL is looked up under the user-visible key.
	ttlKey := key
	nonStringInternal := false
	if userKey := extractRedisInternalUserKey(key); userKey != nil {
		ttlKey = userKey
		// Non-string internal keys (!redis|hash|, !redis|set|, …) can never
		// carry an embedded-TTL payload, so we can skip the !redis|str| probe
		// that ttlAt would otherwise make.
		nonStringInternal = !bytes.HasPrefix(key, []byte(redisStrPrefix))
	}
	expired, err := r.hasExpired(context.Background(), ttlKey, readTS, nonStringInternal)
	if err != nil {
		return nil, err
	}
	if expired {
		return nil, errors.WithStack(store.ErrKeyNotFound)
	}

	if r.coordinator.IsLeaderForKey(key) {
		// PR #749 follow-up: caller-supplied ctx (with
		// redisDispatchTimeout from the dispatch handler) replaces
		// r.handlerContext() so VerifyLeaderForKey honours the
		// per-command deadline. Same shape as keys() / FLUSHDB.
		if err := r.coordinator.VerifyLeaderForKey(ctx, key); err != nil {
			return nil, errors.WithStack(err)
		}
		v, err := r.store.GetAt(context.Background(), key, readTS)
		return v, errors.WithStack(err)
	}
	return r.tryLeaderGetAt(key, readTS)
}
diff --git a/adapter/redis_server_cmds.go b/adapter/redis_server_cmds.go
new file mode 100644
index 00000000..fe560331
--- /dev/null
+++ b/adapter/redis_server_cmds.go
@@ -0,0 +1,795 @@
package adapter

import (
	"context"
	"errors"
	"fmt"
	"log"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/bootjp/elastickv/kv"
	cockerrors "github.com/cockroachdb/errors"
	"github.com/tidwall/redcon"
)

// info handles INFO. It replies with a single bulk string containing a
// "# Server" and a "# Replication" section joined by CRLF. role is "master"
// when a coordinator is present and reports leadership, "slave" otherwise;
// raft_leader_redis carries the result of raftLeaderRedisAddr. The command
// arguments are ignored.
func (r *RedisServer) info(conn redcon.Conn, _ redcon.Command) {
	role := "slave"
	if r.coordinator != nil && r.coordinator.IsLeader() {
		role = "master"
	}

	leaderRedis := r.raftLeaderRedisAddr()

	conn.WriteBulkString(strings.Join([]string{
		"# Server",
		"redis_version:7.2.0",
		"loading:0",
		"role:" + role,
		"",
		"# Replication",
		"role:" + role,
		"raft_leader_redis:" + leaderRedis,
		"",
	}, "\r\n"))
}

// raftLeaderRedisAddr returns the Redis-protocol address of the current Raft
// leader as known by this node. When this node is itself the leader the
// server's own listen address is returned. An empty string is returned when
// the leader is not yet known or when the leader's Redis address is not
// configured in the leaderRedis map.
func (r *RedisServer) raftLeaderRedisAddr() string {
	if r.coordinator == nil {
		return ""
	}
	if r.coordinator.IsLeader() {
		return r.redisAddr
	}
	leader := r.coordinator.RaftLeader()
	if leader == "" {
		return ""
	}
	// A missing map entry yields the zero value "".
	return r.leaderRedis[leader]
}

// clientSubcommandArgCount is the total cmd.Args length (including
// CLIENT + subcommand) required by no-operand CLIENT subcommands
// like GETNAME / ID / INFO.
+const clientSubcommandArgCount = 2 + +// checkClientArity verifies cmd.Args has exactly want elements and +// writes the standard Redis wrong-arity error otherwise. Returns +// true when the caller should stop handling (bad arity). +func checkClientArity(conn redcon.Conn, cmd redcon.Command, sub string, want int) bool { + if len(cmd.Args) == want { + return false + } + conn.WriteError("ERR wrong number of arguments for 'client|" + strings.ToLower(sub) + "' command") + return true +} + +// clientSetName handles CLIENT SETNAME. SETNAME is shared with +// HELLO's SETNAME clause; both write into the same connState.clientName +// slot so a client that uses HELLO SETNAME once and then queries +// CLIENT GETNAME gets the right answer without having to re-issue +// CLIENT SETNAME. +func clientSetName(conn redcon.Conn, cmd redcon.Command, state *connState) { + if checkClientArity(conn, cmd, "SETNAME", clientSetNameArgCount) { + return + } + state.clientName = string(cmd.Args[2]) + conn.WriteString("OK") +} + +func clientGetName(conn redcon.Conn, cmd redcon.Command, state *connState) { + if checkClientArity(conn, cmd, "GETNAME", clientSubcommandArgCount) { + return + } + if state.clientName == "" { + conn.WriteNull() + return + } + conn.WriteBulkString(state.clientName) +} + +func (r *RedisServer) clientID(conn redcon.Conn, cmd redcon.Command, state *connState) { + if checkClientArity(conn, cmd, "ID", clientSubcommandArgCount) { + return + } + conn.WriteInt64(int64(r.ensureConnID(state))) //nolint:gosec // connID monotonic counter, guaranteed <= math.MaxInt64 in practice +} + +func (r *RedisServer) clientInfo(conn redcon.Conn, cmd redcon.Command, state *connState) { + if checkClientArity(conn, cmd, "INFO", clientSubcommandArgCount) { + return + } + id := r.ensureConnID(state) + conn.WriteBulkString(fmt.Sprintf("id=%d addr=%s name=%s", id, conn.RemoteAddr(), state.clientName)) +} + +// clientSetInfo handles CLIENT SETINFO . 
elastickv does +// not persist the advertised attributes (lib-name / lib-ver, etc.), but +// it MUST still enforce exact arity — otherwise `CLIENT SETINFO` with +// no operands returns OK and masks a client bug that real Redis would +// have surfaced as a wrong-arity error. +func clientSetInfo(conn redcon.Conn, cmd redcon.Command) { + if checkClientArity(conn, cmd, "SETINFO", clientSetInfoArgCount) { + return + } + conn.WriteString("OK") +} + +func (r *RedisServer) client(conn redcon.Conn, cmd redcon.Command) { + sub := strings.ToUpper(string(cmd.Args[1])) + state := getConnState(conn) + switch sub { + case "SETINFO": + clientSetInfo(conn, cmd) + case "SETNAME": + clientSetName(conn, cmd, state) + case "GETNAME": + clientGetName(conn, cmd, state) + case "ID": + r.clientID(conn, cmd, state) + case "INFO": + r.clientInfo(conn, cmd, state) + default: + conn.WriteError("ERR unsupported CLIENT subcommand '" + sub + "'") + } +} + +// command implements the Redis `COMMAND` family used by clients for +// capability probing at connect time (go-redis, redis-py, ioredis, …). +// Subcommand matrix: +// +// COMMAND -> array of per-command info +// COMMAND COUNT -> integer +// COMMAND LIST -> array of names (FILTERBY rejected) +// COMMAND INFO [name ...] -> array of per-command info (nil per unknown) +// COMMAND DOCS [name ...] -> minimal map-shaped doc entries +// COMMAND GETKEYS cmd args -> array of extracted keys +// COMMAND GETKEYSANDFLAGS -> ERR unsupported +func (r *RedisServer) command(conn redcon.Conn, cmd redcon.Command) { + if len(cmd.Args) == 1 { + r.writeCommandInfoAll(conn) + return + } + sub := strings.ToUpper(string(cmd.Args[1])) + switch sub { + case "COUNT": + // COUNT must match the cardinality of COMMAND / COMMAND LIST — + // which iterate argsLen (= routed set). 
The table has the same + // size by invariant, but driving COUNT off argsLen keeps the + // three subcommands wire-consistent even during the brief + // window when a new route has been added but the table row is + // still pending. + conn.WriteInt(len(argsLen)) + case "LIST": + // `COMMAND LIST` takes no args (bare list) or `FILTERBY …` which we + // reject below. Anything past the subcommand slot is a filter. + const commandListArgFixed = 2 + if len(cmd.Args) > commandListArgFixed { + // We explicitly do not support FILTERBY MODULE|ACLCAT|PATTERN + // — elastickv has no modules and no ACL categories. Rejecting + // here is consistent with how real Redis would behave when a + // filter resolves to an empty universe; clients that see this + // fall back to COMMAND (no args), which we support. + conn.WriteError("ERR unsupported COMMAND LIST filter") + return + } + r.writeCommandList(conn) + case "INFO": + r.writeCommandInfo(conn, cmd.Args[2:]) + case "DOCS": + r.writeCommandDocs(conn, cmd.Args[2:]) + case "GETKEYS": + r.writeCommandGetKeys(conn, cmd.Args[2:]) + case "GETKEYSANDFLAGS": + conn.WriteError("ERR unsupported COMMAND subcommand 'GETKEYSANDFLAGS'") + default: + conn.WriteError("ERR Unknown COMMAND subcommand '" + sub + "'") + } +} + +// writeCommandInfoEntry emits the 6-element per-command info array for a +// single command. Redis 7 extends this to 10 elements; we deliberately +// stop at 6 because every client we care about parses the first 6 fields +// and ignores trailing elements. 
+func writeCommandInfoEntry(conn redcon.Conn, meta redisCommandMeta) { + const infoArity = 6 + conn.WriteArray(infoArity) + conn.WriteBulkString(meta.Name) + conn.WriteInt(meta.Arity) + conn.WriteArray(len(meta.Flags)) + for _, f := range meta.Flags { + conn.WriteBulkString(f) + } + conn.WriteInt(meta.FirstKey) + conn.WriteInt(meta.LastKey) + conn.WriteInt(meta.Step) +} + +func (r *RedisServer) writeCommandInfoAll(conn redcon.Conn) { + metas := routedRedisCommandMetas() + conn.WriteArray(len(metas)) + for _, meta := range metas { + writeCommandInfoEntry(conn, meta) + } +} + +func (r *RedisServer) writeCommandList(conn redcon.Conn) { + metas := routedRedisCommandMetas() + conn.WriteArray(len(metas)) + for _, meta := range metas { + conn.WriteBulkString(meta.Name) + } +} + +func (r *RedisServer) writeCommandInfo(conn redcon.Conn, requested [][]byte) { + // `COMMAND INFO` with no names is equivalent to `COMMAND` (no args): + // return info for every known command. This is what real Redis does + // and what go-redis relies on when it issues bare `COMMAND INFO`. + if len(requested) == 0 { + r.writeCommandInfoAll(conn) + return + } + conn.WriteArray(len(requested)) + for _, raw := range requested { + meta, ok := redisCommandTable[strings.ToUpper(string(raw))] + if !ok { + conn.WriteNull() + continue + } + writeCommandInfoEntry(conn, meta) + } +} + +// writeCommandDocs emits the RESP2 flat-map form of COMMAND DOCS: +// alternating command-name keys and 4-element doc-maps with "summary" +// and "arguments" fields. Two compliance-critical behaviours: +// +// 1. Bare `COMMAND DOCS` (no names) returns docs for ALL routed +// commands, identical to how `COMMAND INFO` and bare `COMMAND` +// behave. Clients/tools like redis-cli --docs rely on this. +// 2. Every requested entry writes BOTH the command-name key AND the +// doc map value. Clients decode the top-level array as a map of +// name -> docs, so skipping the name key makes the reply +// unparseable. 
Unknown commands emit the requested name followed +// by nil (Redis semantics). +// +// We do not maintain per-command docs, so summary is "" and arguments +// is empty. The wire-shape is what clients care about at connect time. +func (r *RedisServer) writeCommandDocs(conn redcon.Conn, requested [][]byte) { + const docEntryLen = 4 + // Bare DOCS (no command names): iterate the routed set so the + // reply mirrors `COMMAND` / `COMMAND INFO` / `COMMAND LIST`. + if len(requested) == 0 { + metas := routedRedisCommandMetas() + // Two wire slots per command (name + doc map). + conn.WriteArray(len(metas) * 2) //nolint:mnd // 2 = (name, docs) pair + for _, meta := range metas { + conn.WriteBulkString(meta.Name) + conn.WriteArray(docEntryLen) + conn.WriteBulkString("summary") + conn.WriteBulkString("") + conn.WriteBulkString("arguments") + conn.WriteArray(0) + } + return + } + // Explicit names: preserve the caller-supplied order so a client + // that expects its own request ordering back (e.g. for building a + // lookup table) is not surprised. Each pair is (name, docs) or + // (name, nil) for unknowns. + conn.WriteArray(len(requested) * 2) //nolint:mnd // 2 = (name, docs) pair + for _, raw := range requested { + name := string(raw) + meta, ok := redisCommandTable[strings.ToUpper(name)] + if !ok { + conn.WriteBulkString(name) + conn.WriteNull() + continue + } + conn.WriteBulkString(meta.Name) + conn.WriteArray(docEntryLen) + conn.WriteBulkString("summary") + conn.WriteBulkString("") + conn.WriteBulkString("arguments") + conn.WriteArray(0) + } +} + +// writeCommandGetKeys dispatches COMMAND GETKEYS for a given subcommand +// plus its arguments. Real Redis requires at least one arg after GETKEYS +// (the command name itself); we enforce that here rather than lean on +// argsLen which only validates the outer COMMAND call. 
+func (r *RedisServer) writeCommandGetKeys(conn redcon.Conn, argv [][]byte) { + if len(argv) == 0 { + conn.WriteError("ERR wrong number of arguments for 'command|getkeys' command") + return + } + meta, ok := redisCommandTable[strings.ToUpper(string(argv[0]))] + if !ok { + conn.WriteError("ERR Invalid command specified") + return + } + // validate arity of the nested command so we match Redis behaviour of + // refusing to compute keys for obviously malformed commands (a common + // source of confusion in client test suites). + switch { + case meta.Arity > 0 && len(argv) != meta.Arity: + conn.WriteError("ERR Invalid arguments specified for populating the array of keys") + return + case meta.Arity < 0 && len(argv) < -meta.Arity: + conn.WriteError("ERR Invalid arguments specified for populating the array of keys") + return + } + keys := redisCommandGetKeys(meta, argv) + if len(keys) == 0 { + // `The command has no key arguments` — real Redis returns an error + // in this case rather than an empty array, and go-redis's test + // suite expects the error form. + conn.WriteError("ERR The command has no key arguments") + return + } + conn.WriteArray(len(keys)) + for _, k := range keys { + conn.WriteBulk(k) + } +} + +// helloParseError is the internal signal used by parseHelloArgs to +// surface a client-facing error without forcing the top-level hello +// handler to pay for additional branches. The caller writes err to +// the wire verbatim. +type helloParseError struct{ msg string } + +func (e *helloParseError) Error() string { return e.msg } + +// parseHelloArgs walks the optional HELLO argument list and mutates +// connState for any recognized options. Returns a non-nil error +// containing the exact wire-format string to emit via WriteError. +// Split out of hello() so the handler's cyclomatic complexity stays +// within the linter's budget. +// parsedHelloOption is the pure-function result of a single option +// token. advance is the number of input args consumed. 
Exactly one +// of (advance > 0) or (err != nil) is non-zero. +type parsedHelloOption struct { + name string + hasName bool + advance int +} + +// parseHelloOption decodes one HELLO option starting at args[0] (the +// option keyword). Returns how many input tokens the option consumed +// and any client-side staging it wants applied. +func parseHelloOption(args [][]byte) (parsedHelloOption, error) { + opt := strings.ToUpper(string(args[0])) + switch opt { + case "AUTH": + if len(args) < helloAuthOptionArity { + return parsedHelloOption{}, &helloParseError{msg: "ERR Syntax error in HELLO AUTH"} + } + // elastickv's Redis adapter has no AUTH layer. Rejecting rather + // than silently accepting keeps operators honest. + return parsedHelloOption{}, &helloParseError{msg: "NOPERM HELLO AUTH is not supported"} + case "SETNAME": + if len(args) < helloSetNameOptionArity { + return parsedHelloOption{}, &helloParseError{msg: "ERR Syntax error in HELLO SETNAME"} + } + return parsedHelloOption{ + name: string(args[1]), + hasName: true, + advance: helloSetNameOptionArity, + }, nil + default: + return parsedHelloOption{}, &helloParseError{msg: "ERR Syntax error in HELLO option '" + opt + "'"} + } +} + +func parseHelloArgs(state *connState, args [][]byte) error { + if len(args) == 0 { + return nil + } + protover, err := strconv.Atoi(string(args[0])) + if err != nil || protover != helloReplyProto { + // Non-numeric, RESP3 (3), or any other requested version: + // reject with NOPROTO so well-behaved clients fall back to + // RESP2. + return &helloParseError{msg: "NOPROTO unsupported protocol version"} + } + // Buffer side effects locally so a partial parse (e.g. SETNAME + // followed by a bad option or AUTH) leaves connState untouched — + // the command must be all-or-nothing, matching real Redis. 
+ var ( + pendingName string + pendingNameSet bool + ) + for i := 1; i < len(args); { + opt, err := parseHelloOption(args[i:]) + if err != nil { + return err + } + if opt.hasName { + pendingName = opt.name + pendingNameSet = true + } + i += opt.advance + } + if pendingNameSet { + state.clientName = pendingName + } + return nil +} + +// hello implements the Redis HELLO command. Syntax: +// +// HELLO [protover [AUTH username password] [SETNAME clientname]] +// +// elastickv speaks RESP2 only (redcon is RESP2-only and exposes no +// RESP3 map-reply API), so: +// +// - No protover, or protover == 2: succeed and return the server-info +// array. +// - protover == 3 or any other non-2 value: reply with the +// NOPROTO error the real Redis server uses when a client requests +// an unsupported protocol version. go-redis and friends fall back +// to RESP2 when they see this. +// - AUTH is rejected because elastickv has no auth layer wired into +// the Redis adapter; silently accepting any credentials would be a +// security footgun for operators who assume AUTH means something. +// We return a NOPERM-style error so clients surface a clear error +// rather than assuming auth succeeded. +// - SETNAME is wired into the existing connState.clientName slot, so +// a subsequent CLIENT GETNAME observes the name set here. +func (r *RedisServer) hello(conn redcon.Conn, cmd redcon.Command) { + state := getConnState(conn) + if err := parseHelloArgs(state, cmd.Args[1:]); err != nil { + writeRedisError(conn, err) + return + } + + role := "slave" + if r.coordinator != nil && r.coordinator.IsLeader() { + role = "master" + } + id := r.ensureConnID(state) + + // Reply as a flat RESP2 array of alternating key/value pairs, the + // same wire shape Redis uses when a client negotiates RESP2 via + // HELLO. Order matches real Redis so clients that parse + // positionally (jedis has done this historically) still work. 
+ conn.WriteArray(helloReplyArrayLen) + conn.WriteBulkString("server") + conn.WriteBulkString("redis") + conn.WriteBulkString("version") + conn.WriteBulkString(helloReplyVersion) + conn.WriteBulkString("proto") + conn.WriteInt(helloReplyProto) + conn.WriteBulkString("id") + conn.WriteInt64(int64(id)) //nolint:gosec // connID monotonic counter, fits in int64 in practice. + conn.WriteBulkString("mode") + conn.WriteBulkString("standalone") + conn.WriteBulkString("role") + conn.WriteBulkString(role) + conn.WriteBulkString("modules") + conn.WriteArray(0) +} + +func (r *RedisServer) selectDB(conn redcon.Conn, cmd redcon.Command) { + if _, err := strconv.Atoi(string(cmd.Args[1])); err != nil { + conn.WriteError("ERR invalid DB index") + return + } + conn.WriteString("OK") +} + +func (r *RedisServer) quit(conn redcon.Conn, _ redcon.Command) { + conn.WriteString("OK") + _ = conn.Close() +} + +func (r *RedisServer) typeCmd(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + typ, err := r.keyType(context.Background(), cmd.Args[1]) + if err != nil { + writeRedisError(conn, err) + return + } + conn.WriteString(string(typ)) +} + +func parseScanArgs(args [][]byte) (int, []byte, int, error) { + cursor, err := strconv.Atoi(string(args[1])) + if err != nil || cursor < 0 { + return 0, nil, 0, errors.New("ERR invalid cursor") + } + + pattern := []byte("*") + count := 10 + for i := redisPairWidth; i < len(args); i += redisPairWidth { + if i+1 >= len(args) { + return 0, nil, 0, errors.New("ERR syntax error") + } + switch strings.ToUpper(string(args[i])) { + case "MATCH": + pattern = args[i+1] + case redisKeywordCount: + count, err = strconv.Atoi(string(args[i+1])) + if err != nil || count <= 0 { + return 0, nil, 0, errors.New("ERR syntax error") + } + default: + return 0, nil, 0, errors.New("ERR syntax error") + } + } + return cursor, pattern, count, nil +} + +func writeScanReply(conn redcon.Conn, next int, keys [][]byte) { + 
conn.WriteArray(redisPairWidth) + conn.WriteBulkString(strconv.Itoa(next)) + conn.WriteArray(len(keys)) + for _, key := range keys { + conn.WriteBulk(key) + } +} + +func (r *RedisServer) scan(conn redcon.Conn, cmd redcon.Command) { + cursor, pattern, count, err := parseScanArgs(cmd.Args) + if err != nil { + writeRedisError(conn, err) + return + } + + keys, err := r.visibleKeys(pattern) + if err != nil { + writeRedisError(conn, err) + return + } + if cursor >= len(keys) { + writeScanReply(conn, 0, nil) + return + } + + end := minRedisInt(cursor+count, len(keys)) + next := 0 + if end < len(keys) { + next = end + } + + writeScanReply(conn, next, keys[cursor:end]) +} + +func (r *RedisServer) publish(conn redcon.Conn, cmd redcon.Command) { + count := r.publishCluster(context.Background(), cmd.Args[1], cmd.Args[2]) + if r.traceCommands { + log.Printf("redis trace publish remote=%s channel=%q subscribers=%d", conn.RemoteAddr(), string(cmd.Args[1]), count) + } + conn.WriteInt64(count) +} + +func (r *RedisServer) subscribe(conn redcon.Conn, cmd redcon.Command) { + for _, channel := range cmd.Args[1:] { + r.pubsub.Subscribe(conn, string(channel)) + } +} + +func (r *RedisServer) dbsize(conn redcon.Conn, _ redcon.Command) { + if !r.coordinator.IsLeader() { + size, err := r.proxyDBSize() + if err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(size) + return + } + if err := r.coordinator.VerifyLeader(r.handlerContext()); err != nil { + writeRedisError(conn, err) + return + } + + keys, err := r.visibleKeys([]byte("*")) + if err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(len(keys)) +} + +func (r *RedisServer) flushdb(conn redcon.Conn, _ redcon.Command) { + r.flushDatabase(conn, false) +} + +func (r *RedisServer) flushall(conn redcon.Conn, _ redcon.Command) { + r.flushDatabase(conn, true) +} + +// deleteLegacyKeys scans the full keyspace and deletes keys that do not belong +// to any known internal prefix. 
Returns the number of user-visible legacy keys +// deleted. TTL keys are intentionally NOT deleted because the !redis|ttl| +// namespace is shared across all Redis types — deleting them could strip +// expiration from already-migrated or newly-created keys. +func (r *RedisServer) deleteLegacyKeys(ctx context.Context, readTS uint64) (int, error) { + const batchSize = 1000 + var totalDeleted int + cursor := make([]byte, 0, batchSize) + for { + kvs, err := r.store.ScanAt(ctx, cursor, nil, batchSize, readTS) + if err != nil { + return totalDeleted, fmt.Errorf("scan: %w", err) + } + if len(kvs) == 0 { + break + } + + elems := make([]*kv.Elem[kv.OP], 0, len(kvs)) + legacyCount := 0 + for _, pair := range kvs { + if !isKnownInternalKey(pair.Key) { + legacyCount++ + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: pair.Key}) + } + } + + if len(elems) > 0 { + if err := r.dispatchElems(ctx, false, readTS, elems); err != nil { + return totalDeleted, err + } + totalDeleted += legacyCount + } + + // Advance cursor past the last key in this batch. + lastKey := kvs[len(kvs)-1].Key + cursor = make([]byte, len(lastKey)+1) + copy(cursor, lastKey) + + // Yield briefly between batches to avoid saturating the Raft log. + time.Sleep(time.Millisecond) + } + return totalDeleted, nil +} + +// flushlegacy deletes old unprefixed Redis string keys that were written before +// the !redis|str| prefix migration. It scans all keys and deletes those that +// do not match any known internal prefix. This is a one-time migration operation. 
+func (r *RedisServer) flushlegacy(conn redcon.Conn, _ redcon.Command) { + if !r.coordinator.IsLeader() { + n, err := r.proxyFlushLegacy() + if err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(n) + return + } + + ctx, cancel := context.WithTimeout(context.Background(), redisFlushLegacyTimeout) + defer cancel() + + totalDeleted, err := r.deleteLegacyKeys(ctx, r.readTS()) + if err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(totalDeleted) +} + +func (r *RedisServer) flushDatabase(conn redcon.Conn, all bool) { + if !r.coordinator.IsLeader() { + if err := r.proxyFlushDatabase(all); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteString("OK") + return + } + + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + + if err := r.retryRedisWrite(ctx, func() error { + // Use the per-call ctx with redisDispatchTimeout, NOT + // handlerContext (the long-lived server baseCtx). FLUSHDB's + // retry budget already lives in ctx; routing it to + // VerifyLeader keeps the whole command bounded. + if err := r.coordinator.VerifyLeader(ctx); err != nil { + return fmt.Errorf("verify leader: %w", err) + } + + // Delete only Redis-related keys. Each DEL_PREFIX operation must be + // dispatched separately because the FSM processes only one DEL_PREFIX + // per request (the first mutation). + // + // Namespaces covered: + // "!redis|" — str, legacy hash/set/zset/hll/stream, ttl + // "!lst|" — list meta + items + // "!zs|" — zset wide-column + // "!hs|" — hash wide-column meta/field/delta + // "!st|" — set wide-column meta/member/delta + // + // Legacy bare keys are NOT deleted here to avoid a full keyspace + // scan. Run FLUSHLEGACY first to clean up legacy data. + // + // All prefixes are attempted even if one dispatch fails so that we + // delete as many namespaces as possible before reporting errors. 
+ var combined error + for _, prefix := range [][]byte{ + []byte("!redis|"), + []byte("!lst|"), + []byte("!zs|"), + []byte("!hs|"), + []byte("!st|"), + } { + if _, err := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ + Elems: []*kv.Elem[kv.OP]{ + {Op: kv.DelPrefix, Key: prefix}, + }, + }); err != nil { + combined = cockerrors.CombineErrors(combined, fmt.Errorf("dispatch del_prefix %q: %w", prefix, err)) + } + } + return cockerrors.WithStack(combined) + }); err != nil { + writeRedisError(conn, err) + return + } + + conn.WriteString("OK") +} + +func (r *RedisServer) pubsubCmd(conn redcon.Conn, cmd redcon.Command) { + switch strings.ToUpper(string(cmd.Args[1])) { + case "CHANNELS": + r.writePubSubChannels(conn, cmd.Args) + case "NUMSUB": + r.writePubSubNumSub(conn, cmd.Args) + case "NUMPAT": + conn.WriteInt(0) + default: + conn.WriteError("ERR unsupported PUBSUB subcommand '" + string(cmd.Args[1]) + "'") + } +} + +func (r *RedisServer) writePubSubChannels(conn redcon.Conn, args [][]byte) { + pattern := []byte("*") + if len(args) >= pubsubPatternArgMin { + pattern = args[pubsubFirstChannel] + } + + counts := r.pubsubChannelCounts() + channels := make([]string, 0, len(counts)) + for channel, count := range counts { + if count <= 0 || !matchesAsteriskPattern(pattern, []byte(channel)) { + continue + } + channels = append(channels, channel) + } + + sort.Strings(channels) + conn.WriteArray(len(channels)) + for _, channel := range channels { + conn.WriteBulkString(channel) + } +} + +func (r *RedisServer) writePubSubNumSub(conn redcon.Conn, args [][]byte) { + channels := args[pubsubFirstChannel:] + snapshot := r.pubsubChannelCounts() + + conn.WriteArray(len(channels) * redisPairWidth) + for _, channel := range channels { + conn.WriteBulk(channel) + conn.WriteInt(snapshot[string(channel)]) + } +} + +func (r *RedisServer) pubsubChannelCounts() map[string]int { + return r.pubsub.ChannelCounts() +} diff --git a/adapter/redis_set_cmds.go b/adapter/redis_set_cmds.go new 
file mode 100644 index 00000000..ec671dac --- /dev/null +++ b/adapter/redis_set_cmds.go @@ -0,0 +1,632 @@ +package adapter + +import ( + "context" + "errors" + "fmt" + "slices" + "sort" + + "github.com/bootjp/elastickv/kv" + "github.com/bootjp/elastickv/store" + cockerrors "github.com/cockroachdb/errors" + "github.com/tidwall/redcon" +) + +func (r *RedisServer) sadd(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + r.mutateExactSet(conn, setKind, cmd.Args[1], cmd.Args[2:], true) +} + +func (r *RedisServer) srem(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + r.mutateExactSet(conn, setKind, cmd.Args[1], cmd.Args[2:], false) +} + +func (r *RedisServer) validateExactSetKind(kind string, key []byte, readTS uint64) error { + typ, err := r.keyTypeAt(context.Background(), key, readTS) + if err != nil { + return err + } + + switch kind { + case setKind: + return r.validateExactSetType(typ, key, readTS) + case hllKind: + return r.validateExactHLLType(typ, key, readTS) + default: + return errors.New("ERR unsupported exact set kind") + } +} + +func (r *RedisServer) hllExistsAt(key []byte, readTS uint64) (bool, error) { + exists, err := r.store.ExistsAt(context.Background(), redisHLLKey(key), readTS) + if err != nil { + return false, fmt.Errorf("exists hll: %w", err) + } + return exists, nil +} + +func (r *RedisServer) validateExactSetType(typ redisValueType, key []byte, readTS uint64) error { + if typ == redisTypeSet { + return nil + } + if typ != redisTypeNone { + return wrongTypeError() + } + + hllExists, err := r.hllExistsAt(key, readTS) + if err != nil { + return err + } + if hllExists { + return wrongTypeError() + } + return nil +} + +func (r *RedisServer) validateExactHLLType(typ redisValueType, key []byte, readTS uint64) error { + if typ == redisTypeNone { + return nil + } + + hllExists, err := r.hllExistsAt(key, readTS) + if err != nil { + return err + } + if 
!hllExists { + return wrongTypeError() + } + return nil +} + +func exactSetMembers(value redisSetValue) map[string]struct{} { + members := make(map[string]struct{}, len(value.Members)) + for _, member := range value.Members { + members[member] = struct{}{} + } + return members +} + +func applyExactSetMutation(existing map[string]struct{}, members [][]byte, add bool) int { + changed := 0 + for _, member := range members { + memberKey := string(member) + _, ok := existing[memberKey] + if add { + if ok { + continue + } + existing[memberKey] = struct{}{} + changed++ + continue + } + if ok { + delete(existing, memberKey) + changed++ + } + } + return changed +} + +func sortedExactSetMembers(existing map[string]struct{}) []string { + out := make([]string, 0, len(existing)) + for member := range existing { + out = append(out, member) + } + sort.Strings(out) + return out +} + +func (r *RedisServer) persistExactSetMembersTxn(ctx context.Context, kind string, key []byte, readTS uint64, members map[string]struct{}) error { + if kind != setKind { + // HLL and other non-set kinds keep using the legacy blob format. + if len(members) == 0 { + elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) + if err != nil { + return err + } + return r.dispatchElems(ctx, true, readTS, elems) + } + payload, err := marshalSetValue(redisSetValue{Members: sortedExactSetMembers(members)}) + if err != nil { + return err + } + return r.dispatchElems(ctx, true, readTS, []*kv.Elem[kv.OP]{ + {Op: kv.Put, Key: redisExactSetStorageKey(kind, key), Value: payload}, + }) + } + // Wide-column set: full rewrite (used when the whole state is available). 
+ if len(members) == 0 { + elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) + if err != nil { + return err + } + return r.dispatchElems(ctx, true, readTS, elems) + } + elems := make([]*kv.Elem[kv.OP], 0, len(members)+setWideColOverhead) + for member := range members { + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.SetMemberKey(key, []byte(member)), + Value: []byte{}, + }) + } + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.SetMetaKey(key), + Value: store.MarshalSetMeta(store.SetMeta{Len: int64(len(members))}), + }) + // Remove legacy blob if present. + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisSetKey(key)}) + return r.dispatchElems(ctx, true, readTS, elems) +} + +// applySetMemberMutation emits a Put or Del for one set member and returns the +// change count (1) and the signed length delta (+1 or -1), or (0, 0) if no change. +func applySetMemberMutation(elems []*kv.Elem[kv.OP], memberKey []byte, exists, add bool) ([]*kv.Elem[kv.OP], int, int64) { + if add && !exists { + return append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: memberKey, Value: []byte{}}), 1, 1 + } + if !add && exists { + return append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: memberKey}), 1, -1 + } + return elems, 0, 0 +} + +// mutateExactSetLegacy handles SADD/SREM for non-set kinds (e.g. HLL) via the legacy blob path. 
+func (r *RedisServer) mutateExactSetLegacy(conn redcon.Conn, ctx context.Context, kind string, key []byte, members [][]byte, add bool) { + var changed int + if err := r.retryRedisWrite(ctx, func() error { + readTS := r.readTS() + if err := r.validateExactSetKind(kind, key, readTS); err != nil { + return err + } + value, err := r.loadSetAt(context.Background(), kind, key, readTS) + if err != nil { + return err + } + existing := exactSetMembers(value) + changed = applyExactSetMutation(existing, members, add) + if changed == 0 { + return nil + } + return r.persistExactSetMembersTxn(ctx, kind, key, readTS, existing) + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(changed) +} + +// mutateExactSetWide handles SADD/SREM for the wide-column set path. +func (r *RedisServer) mutateExactSetWide(conn redcon.Conn, ctx context.Context, key []byte, members [][]byte, add bool) { + var changed int + if err := r.retryRedisWrite(ctx, func() error { + readTS := r.readTS() + if err := r.validateExactSetKind(setKind, key, readTS); err != nil { + return err + } + + commitTS, err := r.coordinator.Clock().NextFenced() + if err != nil { + return cockerrors.Wrap(err, "mutateExactSetWide: allocate commitTS") + } + + migrationElems, migErr := r.buildSetLegacyMigrationElems(ctx, key, readTS) + if migErr != nil { + return migErr + } + elems := make([]*kv.Elem[kv.OP], 0, len(migrationElems)+len(members)+setWideColOverhead) + elems = append(elems, migrationElems...) + + // Extract legacy member names from migration ops so that applySetMemberMutations + // can treat them as already-existing (they are not yet visible at readTS). 
+ legacyMemberBase := buildLegacySetMemberBase(migrationElems, key) + + var lenDelta int64 + var mutErr error + elems, changed, lenDelta, mutErr = r.applySetMemberMutations(ctx, key, members, add, readTS, elems, legacyMemberBase) + if mutErr != nil { + return mutErr + } + + if changed == 0 && len(migrationElems) == 0 { + return nil + } + + if lenDelta != 0 { + deltaVal := store.MarshalSetMetaDelta(store.SetMetaDelta{LenDelta: lenDelta}) + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.SetMetaDeltaKey(key, commitTS, 0), + Value: deltaVal, + }) + } + + if len(elems) == 0 { + return nil + } + + _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: normalizeStartTS(readTS), + CommitTS: commitTS, + Elems: elems, + }) + return cockerrors.WithStack(dispatchErr) + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(changed) +} + +// scanSetMemberExistsMap does a paginated prefix scan of all member keys for +// the given set and returns a map from member name to struct{}{}. +// Using a single prefix scan eliminates the per-member ExistsAt round-trip. +func (r *RedisServer) scanSetMemberExistsMap(ctx context.Context, key []byte, readTS uint64) (map[string]struct{}, error) { + return r.scanKeyExistsMap(ctx, store.SetMemberScanPrefix(key), readTS, + func(k []byte) []byte { return store.ExtractSetMemberName(k, key) }) +} + +// scanHashFieldExistsMap does a paginated prefix scan of all field keys for +// the given hash and returns a map from field name to struct{}{}. +// Using a single prefix scan eliminates per-field ExistsAt round-trips. 
+func (r *RedisServer) scanHashFieldExistsMap(ctx context.Context, key []byte, readTS uint64) (map[string]struct{}, error) { + return r.scanKeyExistsMap(ctx, store.HashFieldScanPrefix(key), readTS, + func(k []byte) []byte { return store.ExtractHashFieldName(k, key) }) +} + +// mergeZSetBulkScores performs a single prefix scan of ZSet member keys and +// merges the store scores into inTxnView when pairCount >= wideColumnBulkScanThreshold. +// This avoids O(pairCount) individual GetAt round-trips inside applyZAddPair. +// Members already in inTxnView (migration elems or earlier pairs) take precedence. +// Returns inTxnView unchanged when the batch is below the threshold. +func (r *RedisServer) mergeZSetBulkScores(ctx context.Context, key []byte, readTS uint64, pairCount int, inTxnView map[string]float64) (map[string]float64, error) { + if pairCount < wideColumnBulkScanThreshold { + return inTxnView, nil + } + bulkScores, err := r.scanZSetMemberScoreMap(ctx, key, readTS) + if err != nil { + return nil, err + } + if inTxnView == nil { + return bulkScores, nil + } + for m, s := range bulkScores { + if _, alreadySeen := inTxnView[m]; !alreadySeen { + inTxnView[m] = s + } + } + return inTxnView, nil +} + +// scanZSetMemberScoreMap does a paginated prefix scan of all member keys for +// the given ZSet and returns a map from member name to its current score. +// Using a single prefix scan eliminates O(N) GetAt round-trips in ZADD for +// large batches (>= wideColumnBulkScanThreshold pairs). 
+func (r *RedisServer) scanZSetMemberScoreMap(ctx context.Context, key []byte, readTS uint64) (map[string]float64, error) { + scanPrefix := store.ZSetMemberScanPrefix(key) + scanEnd := store.PrefixScanEnd(scanPrefix) + scores := make(map[string]float64) + cursor := scanPrefix + for { + scanKVs, err := r.store.ScanAt(ctx, cursor, scanEnd, store.MaxDeltaScanLimit, readTS) + if err != nil { + return nil, cockerrors.WithStack(err) + } + for _, pair := range scanKVs { + m := store.ExtractZSetMemberName(pair.Key, key) + if m == nil { + continue + } + if s, decodeErr := store.UnmarshalZSetScore(pair.Value); decodeErr == nil { + scores[string(m)] = s + } + } + if len(scanKVs) < store.MaxDeltaScanLimit { + break + } + lastKey := scanKVs[len(scanKVs)-1].Key + next := make([]byte, len(lastKey)+1) + copy(next, lastKey) + cursor = next + } + return scores, nil +} + +// scanKeyExistsMap paginates through all keys under scanPrefix, extracts a +// name from each key using extractName, and builds a set of existing names. +// It is used by scanSetMemberExistsMap and scanHashFieldExistsMap to eliminate +// per-key ExistsAt round-trips during SADD/SREM/HDEL operations. 
+func (r *RedisServer) scanKeyExistsMap(ctx context.Context, scanPrefix []byte, readTS uint64, extractName func([]byte) []byte) (map[string]struct{}, error) { + scanEnd := store.PrefixScanEnd(scanPrefix) + existsMap := make(map[string]struct{}) + cursor := scanPrefix + for { + scanKVs, err := r.store.ScanAt(ctx, cursor, scanEnd, store.MaxDeltaScanLimit, readTS) + if err != nil { + return nil, cockerrors.WithStack(err) + } + for _, pair := range scanKVs { + if name := extractName(pair.Key); name != nil { + existsMap[string(name)] = struct{}{} + } + } + if len(scanKVs) < store.MaxDeltaScanLimit { + break + } + lastKey := scanKVs[len(scanKVs)-1].Key + next := make([]byte, len(lastKey)+1) + copy(next, lastKey) + cursor = next + } + return existsMap, nil +} + +// initSetExistsMap builds the initial existence map for a set mutation batch. +// For large batches or when legacy members are present it does a bulk prefix +// scan; otherwise it returns an empty (non-nil) map for per-member ExistsAt +// fallback. Legacy members from migration elems are merged in so that members +// already in-flight in the same transaction are treated as existing. +func (r *RedisServer) initSetExistsMap(ctx context.Context, key []byte, members [][]byte, readTS uint64, legacyBase map[string]struct{}) (map[string]struct{}, error) { + existsMap := make(map[string]struct{}) + if len(members) >= wideColumnBulkScanThreshold || len(legacyBase) > 0 { + var err error + existsMap, err = r.scanSetMemberExistsMap(ctx, key, readTS) + if err != nil { + return nil, cockerrors.WithStack(err) + } + } + for m := range legacyBase { + existsMap[m] = struct{}{} + } + return existsMap, nil +} + +// lookupSetMemberExists reports whether memberStr is present, updating +// existsMap as a cache. For small clean batches (no bulk scan, no legacy +// migration) it falls back to an ExistsAt store read; otherwise it relies +// solely on the pre-built map. 
+func (r *RedisServer) lookupSetMemberExists(ctx context.Context, memberStr string, memberKey []byte, readTS uint64, existsMap map[string]struct{}, isSmallClean bool) (bool, error) { + if _, ok := existsMap[memberStr]; ok { + return true, nil + } + if !isSmallClean { + return false, nil + } + exists, err := r.store.ExistsAt(ctx, memberKey, readTS) + if err != nil { + return false, cockerrors.WithStack(err) + } + if exists { + existsMap[memberStr] = struct{}{} + } + return exists, nil +} + +// applySetMemberMutations resolves existence for each member using either a +// pre-built bulk scan (for large batches) or individual ExistsAt calls (for +// small batches), then applies the mutation to elems. +// The bulk scan threshold is wideColumnBulkScanThreshold. +// legacyBase contains members from a legacy blob being migrated in the same +// transaction; they are not visible at readTS and must be treated as existing. +func (r *RedisServer) applySetMemberMutations(ctx context.Context, key []byte, members [][]byte, add bool, readTS uint64, elems []*kv.Elem[kv.OP], legacyBase map[string]struct{}) ([]*kv.Elem[kv.OP], int, int64, error) { + existsMap, err := r.initSetExistsMap(ctx, key, members, readTS, legacyBase) + if err != nil { + return nil, 0, 0, err + } + isSmallClean := len(members) < wideColumnBulkScanThreshold && len(legacyBase) == 0 + changed := 0 + lenDelta := int64(0) + for _, member := range members { + memberStr := string(member) + memberKey := store.SetMemberKey(key, member) + exists, lookupErr := r.lookupSetMemberExists(ctx, memberStr, memberKey, readTS, existsMap, isSmallClean) + if lookupErr != nil { + return nil, 0, 0, lookupErr + } + var cnt int + var d int64 + elems, cnt, d = applySetMemberMutation(elems, memberKey, exists, add) + changed += cnt + lenDelta += d + // Update existsMap to reflect this mutation so that subsequent + // duplicate members in this call observe the correct in-txn state. 
+ if add { + existsMap[memberStr] = struct{}{} + } else { + delete(existsMap, memberStr) + } + } + return elems, changed, lenDelta, nil +} + +func (r *RedisServer) mutateExactSet(conn redcon.Conn, kind string, key []byte, members [][]byte, add bool) { + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + + if kind != setKind { + r.mutateExactSetLegacy(conn, ctx, kind, key, members, add) + return + } + r.mutateExactSetWide(conn, ctx, key, members, add) +} + +func (r *RedisServer) sismember(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + key := cmd.Args[1] + member := cmd.Args[2] + readTS := r.readTS() + ctx := context.Background() + + hit, alive, err := r.setMemberFastExists(ctx, key, member, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if hit { + if alive { + conn.WriteInt(1) + } else { + conn.WriteInt(0) + } + return + } + r.sismemberSlow(conn, ctx, key, member, readTS) +} + +func (r *RedisServer) setMemberFastExists(ctx context.Context, key, member []byte, readTS uint64) (hit, alive bool, err error) { + // Probe FIRST; guard only on hit. See hashFieldFastLookup for the + // regression rationale. 
+ exists, err := r.store.ExistsAt(ctx, store.SetMemberKey(key, member), readTS) + if err != nil { + return false, false, cockerrors.WithStack(err) + } + if !exists { + return false, false, nil + } + if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { + return false, false, hErr + } else if higher { + return false, false, nil + } + expired, expErr := r.hasExpired(ctx, key, readTS, true) + if expErr != nil { + return false, false, cockerrors.WithStack(expErr) + } + return true, !expired, nil +} + +func (r *RedisServer) sismemberSlow(conn redcon.Conn, ctx context.Context, key, member []byte, readTS uint64) { + typ, err := r.keyTypeAt(ctx, key, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if typ == redisTypeNone { + conn.WriteInt(0) + return + } + if typ != redisTypeSet { + conn.WriteError(wrongTypeMessage) + return + } + value, err := r.loadSetAt(ctx, setKind, key, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if slices.Contains(value.Members, string(member)) { + conn.WriteInt(1) + return + } + conn.WriteInt(0) +} + +func (r *RedisServer) smembers(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + readTS := r.readTS() + typ, err := r.keyTypeAt(context.Background(), cmd.Args[1], readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if typ == redisTypeNone { + conn.WriteArray(0) + return + } + if typ != redisTypeSet { + conn.WriteError(wrongTypeMessage) + return + } + + value, err := r.loadSetAt(context.Background(), setKind, cmd.Args[1], readTS) + if err != nil { + writeRedisError(conn, err) + return + } + conn.WriteArray(len(value.Members)) + for _, member := range value.Members { + conn.WriteBulkString(member) + } +} + +func (r *RedisServer) pfadd(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + ctx, cancel := context.WithTimeout(context.Background(), 
redisDispatchTimeout) + defer cancel() + var changed int + if err := r.retryRedisWrite(ctx, func() error { + readTS := r.readTS() + if err := r.validateExactSetKind(hllKind, cmd.Args[1], readTS); err != nil { + return err + } + + value, err := r.loadSetAt(context.Background(), hllKind, cmd.Args[1], readTS) + if err != nil { + return err + } + existing := exactSetMembers(value) + changed = applyExactSetMutation(existing, cmd.Args[2:], true) + if changed == 0 { + return nil + } + + return r.persistExactSetMembersTxn(ctx, hllKind, cmd.Args[1], readTS, existing) + }); err != nil { + writeRedisError(conn, err) + return + } + if changed == 0 { + conn.WriteInt(0) + } else { + conn.WriteInt(1) + } +} + +func (r *RedisServer) pfcount(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + readTS := r.readTS() + union := map[string]struct{}{} + for _, key := range cmd.Args[1:] { + typ, err := r.keyTypeAt(context.Background(), key, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if typ != redisTypeNone { + hllExists, err := r.store.ExistsAt(context.Background(), redisHLLKey(key), readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if !hllExists { + conn.WriteError(wrongTypeMessage) + return + } + } + value, err := r.loadSetAt(context.Background(), hllKind, key, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + for _, member := range value.Members { + union[member] = struct{}{} + } + } + conn.WriteInt(len(union)) +} diff --git a/adapter/redis_stream_cmds.go b/adapter/redis_stream_cmds.go new file mode 100644 index 00000000..9275d024 --- /dev/null +++ b/adapter/redis_stream_cmds.go @@ -0,0 +1,1492 @@ +package adapter + +import ( + "bytes" + "context" + "errors" + "sort" + "strconv" + "strings" + "time" + + "github.com/bootjp/elastickv/kv" + "github.com/bootjp/elastickv/store" + cockerrors "github.com/cockroachdb/errors" + "github.com/tidwall/redcon" + 
"google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +type xreadRequest struct { + block time.Duration + count int + keys [][]byte + afterIDs []string +} + +type xreadOptions struct { + block time.Duration + count int + streamsIndex int +} + +type xreadResult struct { + key []byte + entries []redisStreamEntry +} + +type xaddRequest struct { + // maxLen is -1 when no MAXLEN clause was given, 0 for explicit MAXLEN 0, + // or a positive value for MAXLEN . + maxLen int + id string + fields []string +} + +func parseXAddMaxLen(args [][]byte) (int, int, error) { + argIndex := redisPairWidth + if len(args) < 5 || !strings.EqualFold(string(args[argIndex]), "MAXLEN") { + return -1, argIndex, nil + } + + argIndex++ + if argIndex < len(args) && string(args[argIndex]) == "~" { + argIndex++ + } + if argIndex >= len(args) { + return 0, 0, errors.New("ERR syntax error") + } + + maxLen, err := strconv.Atoi(string(args[argIndex])) + if err != nil || maxLen < 0 { + return 0, 0, errors.New("ERR syntax error") + } + return maxLen, argIndex + 1, nil +} + +func parseXAddFields(args [][]byte, argIndex int) ([]string, error) { + if argIndex >= len(args) { + return nil, errors.New("ERR syntax error") + } + if (len(args)-argIndex)%redisPairWidth != 0 { + return nil, errors.New("ERR wrong number of arguments for 'XADD' command") + } + + fields := make([]string, 0, len(args)-argIndex) + for _, arg := range args[argIndex:] { + fields = append(fields, string(arg)) + } + return fields, nil +} + +func parseXAddRequest(args [][]byte) (xaddRequest, error) { + maxLen, argIndex, err := parseXAddMaxLen(args) + if err != nil { + return xaddRequest{}, err + } + if argIndex >= len(args) { + return xaddRequest{}, errors.New("ERR syntax error") + } + fields, err := parseXAddFields(args, argIndex+1) + if err != nil { + return xaddRequest{}, err + } + return xaddRequest{maxLen: maxLen, id: string(args[argIndex]), fields: fields}, nil +} + +// nextXAddID computes the ID the next XADD should 
assign. +// +// hasLast reports whether the stream currently tracks a "last" ID (i.e. at +// least one XADD has ever succeeded). last{Ms,Seq} must be the highest ID +// the stream has ever seen — not merely the current tail — so that XADD '*' +// stays strictly monotonic even after XTRIM removes the current tail. +func nextXAddID(hasLast bool, lastMs, lastSeq uint64, requested string) (string, error) { + if requested != "*" { + requestedID, requestedValid := tryParseRedisStreamID(requested) + if !requestedValid { + return "", errors.New("ERR Invalid stream ID specified as stream command argument") + } + // Redis rejects IDs <= 0-0 unconditionally; a stream entry with + // ID "0-0" is unreachable via XREAD ... 0 (which means "after 0-0"). + if requestedID.ms == 0 && requestedID.seq == 0 { + return "", errors.New("ERR The ID specified in XADD must be greater than 0-0") + } + if hasLast && compareStreamIDs(requestedID.ms, requestedID.seq, lastMs, lastSeq) <= 0 { + return "", errors.New("ERR The ID specified in XADD is equal or smaller than the target stream top item") + } + return requested, nil + } + return autoXAddID(safeUnixMilliToUint64(time.Now().UnixMilli()), hasLast, lastMs, lastSeq) +} + +// autoXAddID resolves XADD '*' to a concrete stream ID given a wall-clock +// nowMs. Pulled out of nextXAddID so the auto-ID branch is testable +// without depending on time.Now() — the only un-injectable dependency is +// already isolated in the caller. +// +// Two corner cases the caller cannot rely on the wall clock to avoid: +// +// - nowMs == 0 on a fresh stream (!hasLast). A naive "-0" reply +// yields "0-0", which Redis explicitly rejects as a stream ID and +// which XREAD ... 0 would treat as the empty after-marker. Bump the +// seq to 1 so the first auto-generated entry is "0-1" — strictly +// greater than 0-0 and reachable via XREAD ... 0. 
(This case fires +// only when safeUnixMilliToUint64 clamped a pre-epoch clock to 0; +// under any sane clock, nowMs is well above 0.) +// +// - nowMs <= lastMs. Advance past lastMs/lastSeq via bumpStreamID so +// the stream stays strictly monotonic even across a backwards clock +// step or a corrupted meta where lastMs is far in the future. +func autoXAddID(nowMs uint64, hasLast bool, lastMs, lastSeq uint64) (string, error) { + if !hasLast || nowMs > lastMs { + seq := uint64(0) + if nowMs == 0 { + seq = 1 + } + return strconv.FormatUint(nowMs, 10) + "-" + strconv.FormatUint(seq, 10), nil + } + // Either nowMs == lastMs (same millisecond), or lastMs is in the future + // (monotonic guarantee across a backwards clock step or a corrupted + // meta). Advance past lastMs-lastSeq via bumpStreamID; if the ID space + // is exhausted, surface an error rather than wrap to 0. + ms, seq, err := bumpStreamID(lastMs, lastSeq) + if err != nil { + return "", err + } + return strconv.FormatUint(ms, 10) + "-" + strconv.FormatUint(seq, 10), nil +} + +// safeUnixMilliToUint64 returns ms as uint64, clamping any negative value +// (caused by a system clock set before the Unix epoch) to 0. Without this +// clamp, a direct uint64 cast of a negative int64 would yield a value +// near math.MaxUint64, which would then make nextXAddID's "future-ms" +// branch chase that pathological value forever — effectively wedging +// every subsequent XADD '*' on the stream until the clock recovers. +// The lastMs/lastSeq monotonic guarantee carries the stream forward +// from there via bumpStreamID. +func safeUnixMilliToUint64(ms int64) uint64 { + if ms < 0 { + return 0 + } + return uint64(ms) //nolint:gosec // negative values handled above +} + +// bumpStreamID returns the strictly-greater successor of (ms, seq) within +// the uint64-uint64 stream ID space. 
Bumps seq; on seq overflow carries +// to ms+1, seq=0; on ms overflow returns an error (no representable +// successor) instead of wrapping to 0-0, which would produce a duplicate +// or non-monotonic ID. +func bumpStreamID(ms, seq uint64) (uint64, uint64, error) { + switch { + case seq < ^uint64(0): + return ms, seq + 1, nil + case ms < ^uint64(0): + return ms + 1, 0, nil + default: + return 0, 0, errors.New("ERR The stream has exhausted the ID space") + } +} + +func compareStreamIDs(lms, lseq, rms, rseq uint64) int { + switch { + case lms < rms: + return -1 + case lms > rms: + return 1 + case lseq < rseq: + return -1 + case lseq > rseq: + return 1 + default: + return 0 + } +} + +func (r *RedisServer) xadd(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + req, err := parseXAddRequest(cmd.Args) + if err != nil { + writeRedisError(conn, err) + return + } + + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var id string + if err := r.retryRedisWrite(ctx, func() error { + id, err = r.xaddTxn(ctx, cmd.Args[1], req) + return err + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteBulkString(id) +} + +func (r *RedisServer) xaddTxn(ctx context.Context, key []byte, req xaddRequest) (string, error) { + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeStream) + if err != nil { + return "", err + } + if typ != redisTypeNone && typ != redisTypeStream { + return "", wrongTypeError() + } + + legacyCleanup, meta, metaFound, err := r.streamWriteBase(ctx, key, readTS) + if err != nil { + return "", err + } + + id, parsedID, err := resolveXAddID(meta, metaFound, req.id) + if err != nil { + return "", err + } + + if err := xaddEnforceMaxWideColumn(key, meta.Length, req.maxLen); err != nil { + return "", err + } + + entryValue, err := marshalStreamEntry(newRedisStreamEntry(id, req.fields)) + if err != nil { + return "", err + } + + 
// Capacity hint covers: optional legacy-cleanup Del + one entry Put + + // one meta Put + the trim Dels. legacyCleanup is at most one element, + // and only non-empty on the very first write against a stream whose + // pre-migration blob is still on disk. + const xaddFixedElemCount = 2 + elems := make([]*kv.Elem[kv.OP], 0, + len(legacyCleanup)+xaddFixedElemCount+estimateXAddTrimCount(req.maxLen, meta.Length)) + elems = append(elems, legacyCleanup...) + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.StreamEntryKey(key, parsedID.ms, parsedID.seq), + Value: entryValue, + }) + + nextLen, trim, err := r.xaddTrimIfNeeded(ctx, key, readTS, req.maxLen, meta.Length+1) + if err != nil { + return "", err + } + elems = append(elems, trim...) + elems = appendMaxLenZeroSelfDel(elems, req.maxLen, key, parsedID) + + metaBytes, err := store.MarshalStreamMeta(store.StreamMeta{ + Length: nextLen, + LastMs: parsedID.ms, + LastSeq: parsedID.seq, + }) + if err != nil { + return "", cockerrors.WithStack(err) + } + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.StreamMetaKey(key), Value: metaBytes}) + + return id, r.dispatchAndSignalStream(ctx, true, readTS, elems, key) +} + +// dispatchAndSignalStream dispatches the elems through the coordinator +// and, on success, wakes any XREAD BLOCK waiter on the same node. +// dispatchElems blocks until the FSM applies locally, so by the time +// Signal fires the new entries are visible at the readTS the woken +// waiter will pick on its next iteration. Pulled out of xaddTxn so the +// parent function stays under the cyclop budget — the signal step +// would otherwise add an extra branch on the dispatch error path. 
+func (r *RedisServer) dispatchAndSignalStream( + ctx context.Context, + isTxn bool, + startTS uint64, + elems []*kv.Elem[kv.OP], + streamKey []byte, +) error { + if err := r.dispatchElems(ctx, isTxn, startTS, elems); err != nil { + return err + } + r.streamWaiters.Signal(streamKey) + return nil +} + +// appendMaxLenZeroSelfDel handles the MAXLEN 0 edge case. The trim loop +// runs scans at readTS and therefore cannot see the entry we just queued, +// so without this follow-up Del the freshly-added entry would survive +// while meta.Length said 0. The coordinator applies elems in order at a +// single commitTS, so appending Del after the Put tombstones it cleanly. +func appendMaxLenZeroSelfDel(elems []*kv.Elem[kv.OP], maxLen int, key []byte, parsedID redisStreamID) []*kv.Elem[kv.OP] { + if maxLen != 0 { + return elems + } + return append(elems, &kv.Elem[kv.OP]{ + Op: kv.Del, + Key: store.StreamEntryKey(key, parsedID.ms, parsedID.seq), + }) +} + +// xaddEnforceMaxWideColumn rejects an XADD that would push the stream past +// maxWideColumnItems when no MAXLEN clause could rescue it. A MAXLEN >= 0 +// and <= the cap keeps the committed length bounded even when meta.Length is +// already at the ceiling, so we only reject on the ungated path. +func xaddEnforceMaxWideColumn(key []byte, currentLength int64, maxLen int) error { + if maxLen >= 0 && maxLen <= maxWideColumnItems { + return nil + } + if currentLength < int64(maxWideColumnItems) { + return nil + } + return cockerrors.Wrapf(ErrCollectionTooLarge, + "stream %q would exceed %d entries", key, maxWideColumnItems) +} + +// xaddTrimIfNeeded returns (finalLength, trimElems, err) for an XADD. +// estimateXAddTrimCount returns how many entries the XADD's MAXLEN trim +// will remove, or 0 when maxLen is unset or the current length fits under +// it. Used only as a capacity hint for the elems slice; the actual trim +// list is computed by xaddTrimIfNeeded. 
+func estimateXAddTrimCount(maxLen int, currentLength int64) int { + if maxLen < 0 { + return 0 + } + nextLen := currentLength + 1 + if nextLen <= int64(maxLen) { + return 0 + } + // Compute in int64 and clamp at maxWideColumnItems. A capacity hint + // of math.MaxInt would let make() try to allocate ~16 EiB on 64-bit + // targets and either panic or OOM; capping at the wide-column ceiling + // keeps the hint useful (saves slice growth in the common case) while + // preventing pathological allocation when meta.Length is corrupted. + // xaddTrimIfNeeded enforces the same cap on the actual trim count; + // this hint just sizes the elems slice. + diff := nextLen - int64(maxLen) + if diff <= 0 { + return 0 + } + if diff > int64(maxWideColumnItems) { + return maxWideColumnItems + } + return int(diff) +} + +// When maxLen < 0 (unset) or the new length fits under it, no trim is +// emitted and trimElems is nil; otherwise Del operations for the oldest +// entries are returned and finalLength equals maxLen. All scans use the +// caller's ctx and readTS so the trim happens at the same MVCC snapshot +// as the write. +func (r *RedisServer) xaddTrimIfNeeded( + ctx context.Context, + key []byte, + readTS uint64, + maxLen int, + candidateLen int64, +) (int64, []*kv.Elem[kv.OP], error) { + if maxLen < 0 || candidateLen <= int64(maxLen) { + return candidateLen, nil, nil + } + // int64 arithmetic + clamp at maxWideColumnItems. A single XADD must + // not emit more than maxWideColumnItems Del operations: it would risk + // exceeding the Raft message-size limit and would force a single + // commit to materialise an unbounded list of keys. The cap is loose + // enough that it never bites in normal operation (xaddEnforceMaxWideColumn + // rejects streams whose committed length is already at the ceiling), + // but defends against a corrupted meta.Length feeding the trim path. 
+ diff := candidateLen - int64(maxLen) + if diff <= 0 { + return candidateLen, nil, nil + } + count := maxWideColumnItems + if diff <= int64(maxWideColumnItems) { + count = int(diff) + } + trim, err := r.buildXTrimHeadElems(ctx, key, readTS, count) + if err != nil { + return 0, nil, err + } + // Final length must reflect the trim that actually committed, not + // the requested maxLen, so that meta.Length stays consistent with + // the entries on disk when the cap kicks in or the scan returns + // fewer rows than requested. MAXLEN 0 is a special case: the + // freshly-added entry is removed by appendMaxLenZeroSelfDel in the + // caller, so the post-commit length is 0 regardless of what trim + // did to the pre-existing rows. + if maxLen == 0 { + return 0, trim, nil + } + return candidateLen - int64(len(trim)), trim, nil +} + +// streamWriteBase prepares a write to a stream. Returns the loaded meta +// (zero value when the stream has never been written) and, when a legacy +// single-blob key is still present on disk, a Del elem that the caller +// must include in the write transaction. No migration is performed: +// legacy entries are discarded, not re-materialised into the new layout. +// This matches the PR #620 operator directive that pre-migration data is +// expendable and is cleared explicitly rather than saved. +func (r *RedisServer) streamWriteBase(ctx context.Context, key []byte, readTS uint64) ([]*kv.Elem[kv.OP], store.StreamMeta, bool, error) { + meta, metaFound, err := r.loadStreamMetaAt(ctx, key, readTS) + if err != nil { + return nil, store.StreamMeta{}, false, err + } + if metaFound { + return nil, meta, true, nil + } + legacyCleanup, err := r.legacyStreamCleanupElems(ctx, key, readTS) + if err != nil { + return nil, store.StreamMeta{}, false, err + } + return legacyCleanup, store.StreamMeta{}, false, nil +} + +// legacyStreamCleanupElems returns a Del elem for the legacy single-blob +// key if one is still present on disk, or nil otherwise. 
Called by +// streamWriteBase and deleteStreamWideColumnElems so every write or delete +// that touches a stream also evicts any stale legacy data. +func (r *RedisServer) legacyStreamCleanupElems(ctx context.Context, key []byte, readTS uint64) ([]*kv.Elem[kv.OP], error) { + legacyKey := redisStreamKey(key) + exists, err := r.store.ExistsAt(ctx, legacyKey, readTS) + if err != nil { + return nil, cockerrors.WithStack(err) + } + if !exists { + return nil, nil + } + return []*kv.Elem[kv.OP]{{Op: kv.Del, Key: legacyKey}}, nil +} + +// resolveXAddID resolves the requested ID (possibly '*') against the current +// stream meta and returns the assigned string ID plus its parsed form. +func resolveXAddID(meta store.StreamMeta, hasMeta bool, requested string) (string, redisStreamID, error) { + var ( + hasLast bool + lastMs, lastSeq uint64 + ) + if hasMeta { + // LastMs/LastSeq carry the highest ID ever assigned even when the + // stream was trimmed to empty, so auto-ID generation stays + // monotonic across MAXLEN=0 / XDEL-all cycles. + hasLast = meta.Length > 0 || meta.LastMs != 0 || meta.LastSeq != 0 + lastMs, lastSeq = meta.LastMs, meta.LastSeq + } + id, err := nextXAddID(hasLast, lastMs, lastSeq, requested) + if err != nil { + return "", redisStreamID{}, err + } + parsed, ok := tryParseRedisStreamID(id) + if !ok { + return "", redisStreamID{}, errors.New("ERR Invalid stream ID specified as stream command argument") + } + return id, parsed, nil +} + +// buildXTrimHeadElems emits Del operations for the oldest `count` entries +// in the entry-per-key layout via a bounded range scan at the caller's +// MVCC snapshot (ctx, readTS). Mixing a later timestamp here would let us +// tombstone keys the caller's view never saw. 
+func (r *RedisServer) buildXTrimHeadElems( + ctx context.Context, + key []byte, + readTS uint64, + count int, +) ([]*kv.Elem[kv.OP], error) { + if count <= 0 { + return nil, nil + } + // Defense-in-depth cap on the per-trim scan so a caller that asked + // for math.MaxInt (corrupted meta upstream) cannot try to materialise + // an unbounded list of Del elems in a single transaction. Callers + // (xaddTrimIfNeeded, xtrimTxn) already cap; this is a belt-and-braces + // guard on the boundary that actually allocates. + if count > maxWideColumnItems { + count = maxWideColumnItems + } + prefix := store.StreamEntryScanPrefix(key) + end := store.PrefixScanEnd(prefix) + kvs, err := r.store.ScanAt(ctx, prefix, end, count, readTS) + if err != nil { + return nil, cockerrors.WithStack(err) + } + elems := make([]*kv.Elem[kv.OP], 0, len(kvs)) + for _, pair := range kvs { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: append([]byte(nil), pair.Key...)}) + } + return elems, nil +} + +func parseXTrimMaxLen(args [][]byte) (int, error) { + if !strings.EqualFold(string(args[2]), "MAXLEN") { + return 0, errors.New("ERR syntax error") + } + + argIndex := 3 + if argIndex < len(args) && (string(args[argIndex]) == "~" || string(args[argIndex]) == "=") { + argIndex++ + } + if argIndex != len(args)-1 { + return 0, errors.New("ERR syntax error") + } + + maxLen, err := strconv.Atoi(string(args[argIndex])) + if err != nil || maxLen < 0 { + return 0, errors.New("ERR syntax error") + } + return maxLen, nil +} + +func (r *RedisServer) xtrim(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + maxLen, err := parseXTrimMaxLen(cmd.Args) + if err != nil { + conn.WriteError("ERR syntax error") + return + } + + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var removed int + if err := r.retryRedisWrite(ctx, func() error { + var err error + removed, err = r.xtrimTxn(ctx, cmd.Args[1], maxLen) + 
return err + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(removed) +} + +// streamTypeForWrite returns (true, nil) when the key is either absent +// (no-op write) or already a stream, (false, nil) when the caller should +// short-circuit with "no stream here", and (_, err) for wrong-type or +// store errors. Extracted from xtrimTxn so the outer function stays +// within the cyclop budget. +func (r *RedisServer) streamTypeForWrite(ctx context.Context, key []byte, readTS uint64) (bool, error) { + typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeStream) + if err != nil { + return false, err + } + switch typ { + case redisTypeNone: + return false, nil + case redisTypeStream: + return true, nil + case redisTypeString, redisTypeList, redisTypeHash, redisTypeSet, redisTypeZSet: + return false, wrongTypeError() + default: + return false, wrongTypeError() + } +} + +// flushLegacyCleanupOnTrimNoOp commits the legacy-blob Del + meta Put +// for an XTRIM whose length is already under maxLen. Without this +// flush a subsequent read would still find the stale legacy blob. +// Returns 0 removed entries; callers use that directly. +func (r *RedisServer) flushLegacyCleanupOnTrimNoOp( + ctx context.Context, readTS uint64, key []byte, + meta store.StreamMeta, legacyCleanup []*kv.Elem[kv.OP], +) (int, error) { + if len(legacyCleanup) == 0 { + return 0, nil + } + metaBytes, err := store.MarshalStreamMeta(meta) + if err != nil { + return 0, cockerrors.WithStack(err) + } + elems := make([]*kv.Elem[kv.OP], 0, len(legacyCleanup)+1) + elems = append(elems, legacyCleanup...) 
+ elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.StreamMetaKey(key), Value: metaBytes}) + return 0, r.dispatchElems(ctx, true, readTS, elems) +} + +func (r *RedisServer) xtrimTxn(ctx context.Context, key []byte, maxLen int) (int, error) { + readTS := r.readTS() + proceed, err := r.streamTypeForWrite(ctx, key, readTS) + if err != nil || !proceed { + return 0, err + } + + legacyCleanup, meta, _, err := r.streamWriteBase(ctx, key, readTS) + if err != nil { + return 0, err + } + + if meta.Length <= int64(maxLen) { + return r.flushLegacyCleanupOnTrimNoOp(ctx, readTS, key, meta, legacyCleanup) + } + + // Cap the trim request at maxWideColumnItems so a single XTRIM cannot + // emit an unbounded list of Del operations in one Raft commit. int64 + // arithmetic upfront also keeps a corrupted meta.Length (>MaxInt) + // from wrapping into a negative scan count. + diff := meta.Length - int64(maxLen) + requestedRemoved := maxWideColumnItems + if diff <= int64(maxWideColumnItems) { + requestedRemoved = int(diff) + } + trim, err := r.buildXTrimHeadElems(ctx, key, readTS, requestedRemoved) + if err != nil { + return 0, err + } + + // Use len(trim) — the actual entries we are about to delete — for + // both the meta.Length update and the XTRIM return value. The + // requested count and the actual count can diverge when the trim + // hits the per-txn cap or the underlying scan returns fewer rows + // than requested (concurrent writes / partial consistency); using + // the actual count keeps meta.Length consistent with on-disk state + // and reports the truth back to the client. + actualRemoved := len(trim) + elems := make([]*kv.Elem[kv.OP], 0, len(legacyCleanup)+actualRemoved+1) + elems = append(elems, legacyCleanup...) + elems = append(elems, trim...) 
+ meta.Length -= int64(actualRemoved) + metaBytes, err := store.MarshalStreamMeta(meta) + if err != nil { + return 0, cockerrors.WithStack(err) + } + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: store.StreamMetaKey(key), Value: metaBytes}) + return actualRemoved, r.dispatchElems(ctx, true, readTS, elems) +} + +func (r *RedisServer) xrange(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + r.rangeStream(conn, cmd, false) +} + +func (r *RedisServer) xrevrange(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + r.rangeStream(conn, cmd, true) +} + +func parseXReadCountArg(args [][]byte, index int) (int, error) { + if index+1 >= len(args) { + return 0, errors.New("ERR syntax error") + } + count, err := strconv.Atoi(string(args[index+1])) + if err != nil || count <= 0 { + return 0, errors.New("ERR syntax error") + } + // Clamp client-supplied COUNT to the wide-column ceiling so a single + // XREAD cannot pre-allocate a maxInt-sized []redisStreamEntry slice or + // pull more entries than the store will return for the equivalent + // uncapped scan. Cap is silent (Redis-compatible): the client always + // sees at most maxWideColumnItems entries per stream per call. 
+ if count > maxWideColumnItems { + count = maxWideColumnItems + } + return count, nil +} + +func parseXReadBlockArg(args [][]byte, index int) (time.Duration, error) { + if index+1 >= len(args) { + return 0, errors.New("ERR syntax error") + } + ms, err := strconv.Atoi(string(args[index+1])) + if err != nil || ms < 0 { + return 0, errors.New("ERR syntax error") + } + return time.Duration(ms) * time.Millisecond, nil +} + +func parseXReadOptions(args [][]byte) (xreadOptions, error) { + opts := xreadOptions{count: -1, streamsIndex: -1} + for i := 1; i < len(args); { + next, done, err := parseXReadOption(&opts, args, i) + if err != nil { + return xreadOptions{}, err + } + if done { + return opts, nil + } + i = next + } + return opts, nil +} + +func parseXReadOption(opts *xreadOptions, args [][]byte, i int) (int, bool, error) { + switch strings.ToUpper(string(args[i])) { + case redisKeywordCount: + count, err := parseXReadCountArg(args, i) + if err != nil { + return 0, false, err + } + opts.count = count + return i + redisPairWidth, false, nil + case "BLOCK": + block, err := parseXReadBlockArg(args, i) + if err != nil { + return 0, false, err + } + opts.block = block + return i + redisPairWidth, false, nil + case "STREAMS": + opts.streamsIndex = i + 1 + return len(args), true, nil + default: + return 0, false, errors.New("ERR syntax error") + } +} + +func splitXReadStreams(args [][]byte, streamsIndex int) ([][]byte, []string, error) { + if streamsIndex < 0 || streamsIndex >= len(args) { + return nil, nil, errors.New("ERR syntax error") + } + remaining := len(args) - streamsIndex + if remaining%redisPairWidth != 0 { + return nil, nil, errors.New("ERR syntax error") + } + + streamCount := remaining / redisPairWidth + keys := make([][]byte, streamCount) + afterIDs := make([]string, streamCount) + for i := range streamCount { + keys[i] = args[streamsIndex+i] + afterIDs[i] = string(args[streamsIndex+streamCount+i]) + } + return keys, afterIDs, nil +} + +func 
parseXReadRequest(args [][]byte) (xreadRequest, error) { + opts, err := parseXReadOptions(args) + if err != nil { + return xreadRequest{}, err + } + keys, afterIDs, err := splitXReadStreams(args, opts.streamsIndex) + if err != nil { + return xreadRequest{}, err + } + return xreadRequest{block: opts.block, count: opts.count, keys: keys, afterIDs: afterIDs}, nil +} + +func (r *RedisServer) resolveXReadAfterIDs(ctx context.Context, req *xreadRequest) error { + for i, afterID := range req.afterIDs { + if afterID != "$" { + continue + } + resolved, err := r.resolveXReadDollarID(ctx, req.keys[i]) + if err != nil { + return err + } + req.afterIDs[i] = resolved + } + return nil +} + +// resolveXReadDollarID resolves the "$" after-ID for a single stream by +// asking the store for the highest ID ever assigned. The new-layout meta +// answers in one read; when meta is absent the stream is treated as +// empty — legacy single-blob data is intentionally ignored under the +// "discard-on-read, delete-on-write" contract documented on +// dollarIDFromState (and matching loadStreamAt). Returns streamZeroID +// for non-existent and empty-never-written streams. ctx threads through +// the caller's cancellation/deadline so the resolve step doesn't survive +// past a BLOCK-window cancel. +func (r *RedisServer) resolveXReadDollarID(ctx context.Context, key []byte) (string, error) { + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeStream) + if err != nil { + return "", err + } + if typ == redisTypeNone { + return streamZeroID, nil + } + if typ != redisTypeStream { + return "", wrongTypeError() + } + return r.dollarIDFromState(ctx, key, readTS) +} + +// dollarIDFromState returns the highest-ever-assigned stream ID as a string. 
+// Reads the new-layout meta record (O(1)); when meta is absent the stream +// is treated as empty — legacy single-blob data is intentionally ignored +// under the "discard-on-read, delete-on-write" contract (see loadStreamAt +// and the PR #620 writeup), so $ resolves to streamZeroID for any stream +// that has never been written in the new layout. +func (r *RedisServer) dollarIDFromState(ctx context.Context, key []byte, readTS uint64) (string, error) { + meta, found, err := r.loadStreamMetaAt(ctx, key, readTS) + if err != nil { + return "", err + } + if !found { + return streamZeroID, nil + } + if meta.Length == 0 && meta.LastMs == 0 && meta.LastSeq == 0 { + return streamZeroID, nil + } + return strconv.FormatUint(meta.LastMs, 10) + "-" + strconv.FormatUint(meta.LastSeq, 10), nil +} + +func selectXReadEntries(entries []redisStreamEntry, afterID string, count int) []redisStreamEntry { + afterParsedID, afterParsedValid := tryParseRedisStreamID(afterID) + start := sort.Search(len(entries), func(i int) bool { + return entries[i].compareID(afterID, afterParsedID, afterParsedValid) > 0 + }) + if start >= len(entries) { + return nil + } + end := len(entries) + if count > 0 && start+count < end { + end = start + count + } + return entries[start:end] +} + +func (r *RedisServer) xreadOnce(ctx context.Context, req xreadRequest) ([]xreadResult, error) { + results := make([]xreadResult, 0, len(req.keys)) + for i, key := range req.keys { + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(ctx, key, readTS, redisTypeStream) + if err != nil { + return nil, err + } + if typ == redisTypeNone { + continue + } + if typ != redisTypeStream { + return nil, wrongTypeError() + } + + entries, err := r.readStreamAfter(ctx, key, readTS, req.afterIDs[i], req.count) + if err != nil { + return nil, err + } + if len(entries) > 0 { + results = append(results, xreadResult{key: key, entries: entries}) + } + } + return results, nil +} + +// readStreamAfter returns up to `count` entries with ID 
strictly greater +// than afterID via the entry-per-key range scan. When the meta key is +// absent the stream is treated as empty; legacy single-blob data is +// intentionally ignored under the "discard-on-read, delete-on-write" +// contract documented on loadStreamAt. A subsequent XADD or XTRIM will +// delete any lingering legacy blob in the same transaction, so a stream +// whose meta is still missing here cannot have live legacy data from the +// caller's perspective. +func (r *RedisServer) readStreamAfter(ctx context.Context, key []byte, readTS uint64, afterID string, count int) ([]redisStreamEntry, error) { + _, found, err := r.loadStreamMetaAt(ctx, key, readTS) + if err != nil { + return nil, err + } + if !found { + return nil, nil + } + return r.scanStreamEntriesAfter(ctx, key, readTS, afterID, count) +} + +// scanStreamEntriesAfter runs a [strictly-after(afterID), ∞) range scan over +// entry keys, capped by count (when positive) or maxWideScanLimit otherwise. +// When count is non-positive, we mirror scanStreamEntriesAt's guard: request +// maxWideScanLimit (which is maxWideColumnItems+1) and reject if the scan +// filled, so an XREAD without COUNT cannot OOM the server on a pathological +// stream. +// +// afterID must be a parseable stream ID in either the strict "ms-seq" form or +// the shorthand "ms" form (no dash), which Redis normalises to "ms-0". +// Genuinely malformed IDs are rejected immediately so the caller never +// receives a full-stream result set for invalid input. 
+func (r *RedisServer) scanStreamEntriesAfter(ctx context.Context, key []byte, readTS uint64, afterID string, count int) ([]redisStreamEntry, error) { + afterID, ok := normalizeStreamAfterID(afterID) + if !ok { + return nil, errors.New("ERR Invalid stream ID specified as stream command argument") + } + prefix := store.StreamEntryScanPrefix(key) + end := store.PrefixScanEnd(prefix) + start := streamScanStartForAfter(prefix, afterID) + limit := count + unbounded := limit <= 0 + if unbounded { + limit = maxWideScanLimit + } + kvs, err := r.store.ScanAt(ctx, start, end, limit, readTS) + if err != nil { + return nil, cockerrors.WithStack(err) + } + if unbounded && len(kvs) > maxWideColumnItems { + return nil, cockerrors.Wrapf(ErrCollectionTooLarge, "stream %q exceeds %d entries", key, maxWideColumnItems) + } + entries := make([]redisStreamEntry, 0, len(kvs)) + for _, pair := range kvs { + entry, err := unmarshalStreamEntry(pair.Value) + if err != nil { + return nil, err + } + entries = append(entries, entry) + } + return entries, nil +} + +// streamScanStartForAfter returns the inclusive start key to use for an +// XREAD-style "after afterID" range scan. If afterID parses cleanly we +// start at ID+1 so the scan is exclusive of afterID. Callers must validate +// afterID before calling this function; if afterID is unparseable, the +// returned prefix is the entry-prefix start, which gives a full scan. +// +// Edge case: if afterID is (math.MaxUint64-math.MaxUint64), there is no +// successor ID inside the entry-prefix keyspace, so the correct start is +// one past the prefix (empty scan). Returning the afterID key itself +// would make the inclusive scan include it, which is the opposite of +// "strictly after." 
+func streamScanStartForAfter(prefix []byte, afterID string) []byte { + parsed, ok := tryParseRedisStreamID(afterID) + if !ok { + return prefix + } + ms, seq := parsed.ms, parsed.seq + switch { + case seq < ^uint64(0): + seq++ + case ms < ^uint64(0): + ms++ + seq = 0 + default: + // afterID is the largest representable stream ID. No entry can be + // strictly after it; return the scan-end sentinel so the scan is + // empty instead of silently inclusive. + return store.PrefixScanEnd(prefix) + } + start := make([]byte, 0, len(prefix)+store.StreamIDBytes) + start = append(start, prefix...) + start = append(start, store.EncodeStreamID(ms, seq)...) + return start +} + +// normalizeStreamAfterID normalises an XREAD afterID to the strict "ms-seq" +// form used by tryParseRedisStreamID. Redis accepts a shorthand "ms" form +// (no dash) as meaning "ms-0". Truly invalid IDs — those that are neither +// valid "ms-seq" strings nor parseable as a bare uint64 — return ("", false). +func normalizeStreamAfterID(id string) (string, bool) { + if strings.IndexByte(id, '-') >= 0 { + _, ok := tryParseRedisStreamID(id) + return id, ok + } + // Shorthand: bare millisecond component only. Redis treats "ms" as "ms-0" + // for XREAD after-IDs (entries strictly after ms-0). 
+ if _, err := strconv.ParseUint(id, 10, 64); err != nil { + return "", false + } + return id + "-0", true +} + +func writeStreamEntry(conn redcon.Conn, entry redisStreamEntry) { + conn.WriteArray(redisPairWidth) + conn.WriteBulkString(entry.ID) + conn.WriteArray(len(entry.Fields)) + for _, field := range entry.Fields { + conn.WriteBulkString(field) + } +} + +func writeStreamEntries(conn redcon.Conn, entries []redisStreamEntry) { + conn.WriteArray(len(entries)) + for _, entry := range entries { + writeStreamEntry(conn, entry) + } +} + +func writeXReadResults(conn redcon.Conn, results []xreadResult) { + conn.WriteArray(len(results)) + for _, result := range results { + conn.WriteArray(redisPairWidth) + conn.WriteBulk(result.key) + writeStreamEntries(conn, result.entries) + } +} + +// isXReadIterCtxError reports whether err originates from the per-iteration +// XREAD context firing (BLOCK budget consumed mid-call). The check covers +// the bare context sentinels, cockroachdb/errors-wrapped variants, and +// gRPC's status.Error(codes.DeadlineExceeded / codes.Canceled, ...) which +// is what bubbles up through coordinator.Dispatch when the iter ctx fires +// during a Raft-mediated read. Hits on this path must be silently +// translated to "empty iteration" so the BLOCK-window null contract holds. 
+func isXReadIterCtxError(err error) bool { + if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) { + return true + } + if cockerrors.Is(err, context.DeadlineExceeded) || cockerrors.Is(err, context.Canceled) { + return true + } + switch status.Code(err) { //nolint:exhaustive // only the two ctx-related codes matter; the rest must propagate as real errors + case codes.DeadlineExceeded, codes.Canceled: + return true + default: + return false + } +} + +func (r *RedisServer) xread(conn redcon.Conn, cmd redcon.Command) { + req, err := parseXReadRequest(cmd.Args) + if err != nil { + writeRedisError(conn, err) + return + } + + blockDuration := req.block + // block=0 means infinite wait in Redis; cap at redisDispatchTimeout to prevent goroutine leak. + if blockDuration == 0 { + blockDuration = redisDispatchTimeout + } + deadline := time.Now().Add(blockDuration) + + // $ resolution uses a short fixed timeout rather than the BLOCK + // window: it's a single bounded read per key, not a wait. A tight + // BLOCK (e.g. `BLOCK 1`) used to turn any slow $-resolve into a + // protocol-level error on this path; use redisDispatchTimeout so + // the resolve either succeeds quickly or fails cleanly, leaving + // the BLOCK-window timeout semantics (null on expiry) to the + // busy-poll below. + // + // Parent on r.handlerContext() (not context.Background()) so an + // in-flight resolve aborts promptly when the server is shutting + // down — otherwise the per-resolve ScanAt could survive past + // graceful-shutdown's drain window. + resolveCtx, resolveCancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + err = r.resolveXReadAfterIDs(resolveCtx, &req) + resolveCancel() + if err != nil { + writeRedisError(conn, err) + return + } + + r.xreadBusyPoll(conn, req, deadline) +} + +// xreadBusyPoll runs the BLOCK-window wait loop. Extracted from xread so +// the parent function stays under the cyclop budget. 
Uses an event-driven +// signal from the in-process XADD path with a fallback timer for paths +// that bypass the signal (Lua flush, follower-side FSM apply). +// +// Registration happens BEFORE the first xreadOnce so a signal that fires +// between the check and the wait cannot be lost: the buffered channel +// holds it, and the next select wakes immediately. +func (r *RedisServer) xreadBusyPoll(conn redcon.Conn, req xreadRequest, deadline time.Time) { + handlerCtx := r.handlerContext() + w, release := r.streamWaiters.Register(req.keys) + defer release() + for { + // Server-shutdown short-circuit: if the parent handlerContext + // has been cancelled, abandon the wait loop immediately rather + // than block until the BLOCK deadline. iterCtx below is rooted + // in handlerCtx, so it would cancel-on-call too — but routing + // through isXReadIterCtxError silently translates that into an + // empty iteration and the loop would otherwise wait at + // redisBlockWaitFallback cadence until the deadline. + if handlerCtx.Err() != nil { + conn.WriteNull() + return + } + // BLOCK-expired before the loop body: respect the Redis contract + // that a BLOCK timeout returns null, not an error. If we fell + // through here without remaining time (very small BLOCK, or + // $-resolution consumed the budget) creating an + // already-expired context.WithTimeout would make xreadOnce + // return DeadlineExceeded, which we'd then surface as an error. + iterTimeout := time.Until(deadline) + if iterTimeout <= 0 { + conn.WriteNull() + return + } + // Cap each iteration at redisDispatchTimeout to avoid holding + // storage resources longer than a single dispatch. + if iterTimeout > redisDispatchTimeout { + iterTimeout = redisDispatchTimeout + } + // iterCtx is rooted in handlerCtx so its underlying storage + // scans abort promptly on server shutdown rather than running + // until iterTimeout fires. 
The handlerCtx.Err() guard at the + // top of each iteration prevents the loop from spinning once + // the parent ctx is cancelled. + iterCtx, iterCancel := context.WithTimeout(handlerCtx, iterTimeout) + results, err := r.xreadOnce(iterCtx, req) + iterCancel() + // Per-iteration ctx hitting its deadline (or being cancelled by + // the upstream BLOCK timeout) is not a client-facing error — it + // just means this poll round did not see any new entries. Treat + // it as an empty iteration so the loop continues to the next + // round (or falls through to the null-on-deadline branch below). + // Without this, a tight BLOCK (e.g. BLOCK 10 against a busy / + // slow node) leaks the iteration ctx-deadline into a -ERR reply, + // which violates the Redis BLOCK-timeout contract (null on + // timeout). xreadOnce returns nil results on any error, so + // suppressing iter-ctx errors here is sound. + if err != nil && !isXReadIterCtxError(err) { + writeRedisError(conn, err) + return + } + if len(results) > 0 { + writeXReadResults(conn, results) + return + } + + if !time.Now().Before(deadline) { + conn.WriteNull() + return + } + waitForBlockedCommandUpdate(handlerCtx, w.C, deadline) + } +} + +func (r *RedisServer) xlen(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(context.Background(), cmd.Args[1], readTS, redisTypeStream) + if err != nil { + writeRedisError(conn, err) + return + } + if typ == redisTypeNone { + conn.WriteInt(0) + return + } + if typ != redisTypeStream { + conn.WriteError(wrongTypeMessage) + return + } + meta, found, err := r.loadStreamMetaAt(context.Background(), cmd.Args[1], readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if found { + conn.WriteInt64(meta.Length) + return + } + stream, err := r.loadStreamAt(context.Background(), cmd.Args[1], readTS) + if err != nil { + writeRedisError(conn, err) + return + } + 
conn.WriteInt64(int64(len(stream.Entries))) +} + +func parseRangeStreamCount(args [][]byte) (int, error) { + count := -1 + for i := 4; i < len(args); i += redisPairWidth { + // args[i] is safe: the for-loop guard `i < len(args)` ensures it. + // gosec G602 false-positives here under flow analysis. + if i+1 >= len(args) || !strings.EqualFold(string(args[i]), redisKeywordCount) { //nolint:gosec + return 0, errors.New("ERR syntax error") + } + nextCount, err := strconv.Atoi(string(args[i+1])) + if err != nil || nextCount < 0 { + return 0, errors.New("ERR syntax error") + } + count = nextCount + } + // Clamp client-supplied COUNT for XRANGE / XREVRANGE the same way XREAD + // clamps it (parseXReadCountArg). The negative sentinel -1 (no COUNT) + // is preserved unchanged so the unbounded path still trips + // maxWideColumnItems guard inside rangeStreamNewLayout. + if count > maxWideColumnItems { + count = maxWideColumnItems + } + return count, nil +} + +func streamEntryMatchesRange(entryID, startRaw, endRaw string, reverse bool) bool { + if reverse { + return streamWithinUpper(entryID, startRaw) && streamWithinLower(entryID, endRaw) + } + return streamWithinLower(entryID, startRaw) && streamWithinUpper(entryID, endRaw) +} + +func selectForwardStreamRangeEntries(entries []redisStreamEntry, startRaw, endRaw string, count int) []redisStreamEntry { + selected := make([]redisStreamEntry, 0, len(entries)) + for _, entry := range entries { + if !streamEntryMatchesRange(entry.ID, startRaw, endRaw, false) { + continue + } + selected = append(selected, entry) + if count >= 0 && len(selected) >= count { + break + } + } + return selected +} + +func selectReverseStreamRangeEntries(entries []redisStreamEntry, startRaw, endRaw string, count int) []redisStreamEntry { + selected := make([]redisStreamEntry, 0, len(entries)) + for i := len(entries) - 1; i >= 0; i-- { + if !streamEntryMatchesRange(entries[i].ID, startRaw, endRaw, true) { + continue + } + selected = append(selected, 
entries[i]) + if count >= 0 && len(selected) >= count { + break + } + } + return selected +} + +func selectStreamRangeEntries(entries []redisStreamEntry, startRaw, endRaw string, reverse bool, count int) []redisStreamEntry { + if reverse { + return selectReverseStreamRangeEntries(entries, startRaw, endRaw, count) + } + return selectForwardStreamRangeEntries(entries, startRaw, endRaw, count) +} + +func (r *RedisServer) rangeStream(conn redcon.Conn, cmd redcon.Command, reverse bool) { + count, err := parseRangeStreamCount(cmd.Args) + if err != nil { + writeRedisError(conn, err) + return + } + + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(context.Background(), cmd.Args[1], readTS, redisTypeStream) + if err != nil { + writeRedisError(conn, err) + return + } + if typ == redisTypeNone { + conn.WriteArray(0) + return + } + if typ != redisTypeStream { + conn.WriteError(wrongTypeMessage) + return + } + + startRaw, endRaw := string(cmd.Args[2]), string(cmd.Args[3]) + + _, metaFound, err := r.loadStreamMetaAt(context.Background(), cmd.Args[1], readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if metaFound { + selected, err := r.rangeStreamNewLayout(context.Background(), cmd.Args[1], readTS, startRaw, endRaw, reverse, count) + if err != nil { + writeRedisError(conn, err) + return + } + writeStreamEntries(conn, selected) + return + } + + stream, err := r.loadStreamAt(context.Background(), cmd.Args[1], readTS) + if err != nil { + writeRedisError(conn, err) + return + } + selected := selectStreamRangeEntries(stream.Entries, startRaw, endRaw, reverse, count) + writeStreamEntries(conn, selected) +} + +// rangeStreamNewLayout serves XRANGE / XREVRANGE from the entry-per-key +// layout via a bounded range scan. The (start, end) inputs are the raw +// command bounds — "-", "+", "(1000-0", or "1000-0" — and are converted to +// binary scan bounds so only the selected entries are unmarshaled. 
+func (r *RedisServer) rangeStreamNewLayout( + ctx context.Context, key []byte, readTS uint64, + startRaw, endRaw string, reverse bool, count int, +) ([]redisStreamEntry, error) { + prefix := store.StreamEntryScanPrefix(key) + scanStart, scanEnd, ok, err := streamScanBounds(prefix, startRaw, endRaw, reverse) + if err != nil { + return nil, err + } + if !ok { + return nil, nil + } + limit := count + unbounded := limit <= 0 + if unbounded { + limit = maxWideScanLimit + } + var kvs []*store.KVPair + if reverse { + kvs, err = r.store.ReverseScanAt(ctx, scanStart, scanEnd, limit, readTS) + } else { + kvs, err = r.store.ScanAt(ctx, scanStart, scanEnd, limit, readTS) + } + if err != nil { + return nil, cockerrors.WithStack(err) + } + // An XRANGE/XREVRANGE without COUNT on a pathological stream must + // not be able to pull maxWideScanLimit entries into a single reply. + // Mirror scanStreamEntriesAt's guard. + if unbounded && len(kvs) > maxWideColumnItems { + return nil, cockerrors.Wrapf(ErrCollectionTooLarge, "stream %q exceeds %d entries", key, maxWideColumnItems) + } + entries := make([]redisStreamEntry, 0, len(kvs)) + for _, pair := range kvs { + entry, err := unmarshalStreamEntry(pair.Value) + if err != nil { + return nil, err + } + entries = append(entries, entry) + } + return entries, nil +} + +// streamScanBounds maps the raw XRANGE / XREVRANGE bounds to half-open +// [start, end) scan bounds over the entry prefix. For reverse scans, +// the ReverseScanAt convention is still [start, end) with results in +// descending order starting from just-before(end). +// +// Returns ok=false when the bounds define an empty range (e.g. start > end), +// in which case the caller should emit an empty array. +func streamScanBounds(prefix []byte, startRaw, endRaw string, reverse bool) ([]byte, []byte, bool, error) { + var lowRaw, highRaw string + if reverse { + // XREVRANGE takes (high, low). 
+ highRaw, lowRaw = startRaw, endRaw + } else { + lowRaw, highRaw = startRaw, endRaw + } + + start, err := streamBoundLow(prefix, lowRaw) + if err != nil { + return nil, nil, false, err + } + end, err := streamBoundHigh(prefix, highRaw) + if err != nil { + return nil, nil, false, err + } + if bytes.Compare(start, end) >= 0 { + return nil, nil, false, nil + } + return start, end, true, nil +} + +// streamBoundLow returns the inclusive lower bound of the scan in binary form. +// When the bound is "(ID" (exclusive) and ID is the largest representable +// stream ID, the scan-end sentinel is returned so streamScanBounds' +// start >= end check collapses the range to empty; otherwise the scan +// would silently include the exclusive bound entry. +func streamBoundLow(prefix []byte, raw string) ([]byte, error) { + if raw == "-" { + return prefix, nil + } + exclusive := strings.HasPrefix(raw, "(") + if exclusive { + raw = raw[1:] + } + ms, seq, ok := parseStreamBoundID(raw, false, exclusive) + if !ok { + return nil, errors.New("ERR Invalid stream ID specified as stream command argument") + } + if exclusive { + switch { + case seq < ^uint64(0): + seq++ + case ms < ^uint64(0): + ms++ + seq = 0 + default: + return store.PrefixScanEnd(prefix), nil + } + } + return appendStreamKey(prefix, ms, seq), nil +} + +// streamBoundHigh returns the exclusive upper bound of the scan in binary form. 
+func streamBoundHigh(prefix []byte, raw string) ([]byte, error) { + if raw == "+" { + return store.PrefixScanEnd(prefix), nil + } + exclusive := strings.HasPrefix(raw, "(") + if exclusive { + raw = raw[1:] + } + ms, seq, ok := parseStreamBoundID(raw, true, exclusive) + if !ok { + return nil, errors.New("ERR Invalid stream ID specified as stream command argument") + } + if !exclusive { + switch { + case seq < ^uint64(0): + seq++ + case ms < ^uint64(0): + ms++ + seq = 0 + default: + return store.PrefixScanEnd(prefix), nil + } + } + return appendStreamKey(prefix, ms, seq), nil +} + +// parseStreamBoundID accepts both the strict ms-seq form and the shorthand +// "ms" form that Redis XRANGE/XREVRANGE allow. Redis interprets a shorthand +// ID differently depending on position and exclusivity: +// +// - Lower bound inclusive ("5"): expand to 5-0; scan starts at 5-0. +// - Lower bound exclusive ("(5"): expand to 5-0; caller shifts +1 → 5-1. +// - Upper bound inclusive ("5"): expand to 5-MaxUint64; caller shifts +1 → 6-0 (exclusive upper). +// - Upper bound exclusive ("(5"): expand to 5-0; scan stops at 5-0 (excludes all ms=5 entries). +// +// The rule is: seq = MaxUint64 when upper && !exclusive (need to include the +// full ms row before the caller's inclusive→exclusive shift), seq = 0 +// otherwise. Full ms-seq IDs pass through unchanged. +func parseStreamBoundID(raw string, upper, exclusive bool) (uint64, uint64, bool) { + if strings.IndexByte(raw, '-') >= 0 { + parsed, ok := tryParseRedisStreamID(raw) + if !ok { + return 0, 0, false + } + return parsed.ms, parsed.seq, true + } + ms, err := strconv.ParseUint(raw, 10, 64) + if err != nil { + return 0, 0, false + } + // Upper inclusive bounds need seq=MaxUint64 so the caller's +1 shift + // produces (ms+1)-0, covering the entire ms row. 
All other + // combinations use seq=0: lower inclusive starts at ms-0, lower + // exclusive starts at ms-0 then the caller shifts to ms-1, and upper + // exclusive stops before ms-0 (excluding the whole ms). + if upper && !exclusive { + return ms, ^uint64(0), true + } + return ms, 0, true +} + +func appendStreamKey(prefix []byte, ms, seq uint64) []byte { + out := make([]byte, 0, len(prefix)+store.StreamIDBytes) + out = append(out, prefix...) + out = append(out, store.EncodeStreamID(ms, seq)...) + return out +} + +func streamWithinLower(entryID, raw string) bool { + if raw == "-" { + return true + } + exclusive := strings.HasPrefix(raw, "(") + if exclusive { + raw = raw[1:] + } + cmp := compareRedisStreamID(entryID, raw) + if exclusive { + return cmp > 0 + } + return cmp >= 0 +} + +func streamWithinUpper(entryID, raw string) bool { + if raw == "+" { + return true + } + exclusive := strings.HasPrefix(raw, "(") + if exclusive { + raw = raw[1:] + } + cmp := compareRedisStreamID(entryID, raw) + if exclusive { + return cmp < 0 + } + return cmp <= 0 +} diff --git a/adapter/redis_strings.go b/adapter/redis_strings.go new file mode 100644 index 00000000..a871470e --- /dev/null +++ b/adapter/redis_strings.go @@ -0,0 +1,644 @@ +package adapter + +import ( + "bytes" + "context" + "math" + "strconv" + "strings" + "time" + + "github.com/bootjp/elastickv/kv" + "github.com/bootjp/elastickv/store" + "github.com/cockroachdb/errors" + "github.com/tidwall/redcon" +) + +type redisSetOptions struct { + existsCond bool + missingCond bool + returnOld bool + ttl *time.Time +} + +type redisSetState struct { + rawTyp redisValueType // TTL-unaware type, for internal-key cleanup + typ redisValueType // TTL-aware type, for NX/XX/GET semantics + oldValue []byte +} + +type redisSetExecution struct { + state redisSetState + wroteNull bool + wroteOldBulk bool +} + +func parseRedisSetOptions(args [][]byte, now time.Time) (redisSetOptions, error) { + opts := redisSetOptions{} + for i := 0; i < 
len(args); i++ { + opt := strings.ToUpper(string(args[i])) + switch opt { + case "EX", "PX": + ttl, nextIndex, err := parseRedisSetTTL(args, i, opt, now) + if err != nil { + return redisSetOptions{}, err + } + opts.ttl = ttl + i = nextIndex + case "NX": + opts.missingCond = true + case "XX": + opts.existsCond = true + case "GET": + opts.returnOld = true + default: + return redisSetOptions{}, errors.New("ERR syntax error") + } + } + if opts.existsCond && opts.missingCond { + return redisSetOptions{}, errors.New("ERR syntax error") + } + return opts, nil +} + +func parseRedisSetTTL(args [][]byte, index int, opt string, now time.Time) (*time.Time, int, error) { + if index+1 >= len(args) { + return nil, index, errors.New("ERR syntax error") + } + n, err := strconv.ParseInt(string(args[index+1]), 10, 64) + if err != nil { + // Match Redis behavior: invalid numeric TTL value should not expose + // internal parsing errors, but return a stable protocol error. + return nil, index, errors.New("ERR value is not an integer or out of range") + } + if n <= 0 { + return nil, index, errors.New("ERR invalid expire time in 'set' command") + } + + unit := time.Millisecond + if opt == "EX" { + unit = time.Second + } + if n > math.MaxInt64/int64(unit) { + return nil, index, errors.New("ERR invalid expire time in 'set' command") + } + + expireAt := now.Add(time.Duration(n) * unit) + return &expireAt, index + 1, nil +} + +func (o redisSetOptions) isFastPath() bool { + return !o.returnOld && !o.existsCond && !o.missingCond +} + +func (o redisSetOptions) allows(exists bool) bool { + if o.existsCond && !exists { + return false + } + if o.missingCond && exists { + return false + } + return true +} + +func (r *RedisServer) loadRedisSetState(ctx context.Context, key []byte, readTS uint64, returnOld bool) (redisSetState, error) { + // Probe type ONCE (rawKeyTypeAt issues up to ~17 pebble seeks), + // then derive both the raw and TTL-filtered views from it. 
The + // previous implementation called rawKeyTypeAt + keyTypeAt, which + // called rawKeyTypeAt again inside -- doubling every SET to ~34 + // seeks for purely redundant work. + rawTyp, err := r.rawKeyTypeAt(ctx, key, readTS) + if err != nil { + return redisSetState{}, err + } + // typ (TTL-aware) drives NX/XX/GET Redis semantics: expired keys are "gone". + typ, err := r.applyTTLFilter(ctx, key, readTS, rawTyp) + if err != nil { + return redisSetState{}, err + } + + state := redisSetState{rawTyp: rawTyp, typ: typ} + if !returnOld || typ != redisTypeString { + return state, nil + } + + oldValue, _, err := r.readRedisStringAt(key, readTS) + if err != nil && !errors.Is(err, store.ErrKeyNotFound) { + return redisSetState{}, err + } + state.oldValue = oldValue + return state, nil +} + +func (r *RedisServer) replaceWithStringTxn(ctx context.Context, key, value []byte, ttl *time.Time, typ redisValueType, readTS uint64) error { + var elems []*kv.Elem[kv.OP] + if isNonStringCollectionType(typ) { + delElems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) + if err != nil { + return err + } + elems = append(elems, delElems...) + } + // Embed TTL in the string value; write !redis|ttl| as a secondary scan index. + encoded := encodeRedisStr(bytes.Clone(value), ttl) + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisStrKey(key), Value: encoded}) + if ttl != nil { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisTTLKey(key), Value: encodeRedisTTL(*ttl)}) + } else { + // Clear any prior scan index so a persistent string is not later expired. 
+ elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisTTLKey(key)}) + } + return r.dispatchElems(ctx, true, readTS, elems) +} + +func (r *RedisServer) executeSet(ctx context.Context, key, value []byte, opts redisSetOptions) (redisSetExecution, error) { + var result redisSetExecution + err := r.retryRedisWrite(ctx, func() error { + readTS := r.readTS() + state, err := r.loadRedisSetState(ctx, key, readTS, opts.returnOld) + if err != nil { + return err + } + + exists := state.typ != redisTypeNone + if !opts.allows(exists) { + result = redisSetExecution{wroteNull: true} + return nil + } + if opts.returnOld && exists && state.typ != redisTypeString { + return wrongTypeError() + } + // Use rawTyp for cleanup so expired-but-lingering internal keys are deleted. + if err := r.replaceWithStringTxn(ctx, key, value, opts.ttl, state.rawTyp, readTS); err != nil { + return err + } + result = redisSetExecution{state: state, wroteOldBulk: opts.returnOld} + return nil + }) + return result, err +} + +// trySetFastPath attempts the fast-path for SET (no NX/XX/GET flags) when the +// key is a string or absent. Returns true if the fast-path handled the command. +// When the key holds a non-string type, returns false so the caller can fall +// through to executeSet which cleans up internal keys before overwriting. +func (r *RedisServer) trySetFastPath(conn redcon.Conn, ctx context.Context, key, value []byte, ttl *time.Time) bool { + // Only use the fast path when we are the leader for this key so the local + // type check is authoritative. On followers, stale MVCC state could miss a + // non-string type, leaving orphaned internal keys after overwrite. + if !r.coordinator.IsLeaderForKey(key) { + return false + } + readTS := r.readTS() + // Use rawKeyTypeAt (TTL-unaware) so that expired keys whose internal data + // still exists are detected and routed through the full cleanup path. 
+ typ, err := r.rawKeyTypeAt(context.Background(), key, readTS) + if err != nil { + writeRedisError(conn, err) + return true + } + if isNonStringCollectionType(typ) { + return false + } + if err := r.saveString(ctx, key, value, ttl); err != nil { + writeRedisError(conn, err) + return true + } + conn.WriteString("OK") + return true +} + +func (r *RedisServer) set(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + // Option-2 dedup for standalone SET: route through runTransactionWithDedup + // as a single-mop EXEC body when the gate is on. SET inside MULTI/EXEC + // already has full dedup coverage via applySet (§M3 in the design doc), + // so we just reuse that machinery instead of building a per-handler + // reusableSetTxn + dispatchSetReuse shape. The fast-path optimization is + // intentionally bypassed under the gate — dedup is opt-in, and a + // non-dedup'd fast path under a dedup-on cluster would split the + // idempotency contract. + // + // Result translation: runTransactionWithDedup returns []redisResult; for + // SET there is exactly one element with the same redisResult shape as + // the standalone reply (resultString OK / resultNil for NX/XX miss / + // resultBulk for GET). + // Both gates must be on to route standalone SET through the dedup path. + // onePhaseTxnDedup covers the MULTI/EXEC and list-push retries that the + // parent design's M4 validated; standaloneSetDedup is a separate sub-gate + // (default off) because applySet diverges from executeSet on SET-over- + // collection — flipping onePhaseTxnDedup default-on without this guard + // would change normal Redis overwrite behaviour (PR #943 round-1 codex P1). + if r.onePhaseTxnDedup && r.standaloneSetDedup { + // Call runTransactionWithDedup directly instead of going through + // runTransaction. 
runTransaction re-checks the same + // r.onePhaseTxnDedup gate and routes here anyway; the indirection + // would make the call chain misleading ("dispatches via + // runTransactionWithDedup" being true only by indirection). + // Direct call makes the intent explicit and removes the double + // gate check. + results, err := r.runTransactionWithDedup([]redcon.Command{cmd}) + if err != nil { + writeRedisError(conn, err) + return + } + writeRedisStandaloneResult(conn, results) + return + } + r.setLegacy(conn, cmd) +} + +// setLegacy is the pre-dedup standalone SET path. Extracted from set() so +// the gate-on routing through runTransactionWithDedup keeps set() under the +// cyclop budget (the gate-off branch's parse + fast-path + executeSet +// shape carries its own decision points). Behaviour is byte-identical to +// the pre-PR set() body. +func (r *RedisServer) setLegacy(conn redcon.Conn, cmd redcon.Command) { + opts, err := parseRedisSetOptions(cmd.Args[3:], time.Now()) + if err != nil { + writeRedisError(conn, err) + return + } + + ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + + if opts.isFastPath() && r.trySetFastPath(conn, ctx, cmd.Args[1], cmd.Args[2], opts.ttl) { + return + } + + result, err := r.executeSet(ctx, cmd.Args[1], cmd.Args[2], opts) + if err != nil { + writeRedisError(conn, err) + return + } + if result.wroteNull { + conn.WriteNull() + return + } + if result.wroteOldBulk { + if result.state.oldValue == nil { + conn.WriteNull() + return + } + conn.WriteBulk(result.state.oldValue) + return + } + conn.WriteString("OK") +} + +// writeRedisStandaloneResult translates a single-element results array from +// runTransactionWithDedup into a redcon response, mirroring the shape a +// standalone handler would write directly. Used by SET / future standalone +// commands routed through the dedup loop. 
Differs from writeResults in NOT +// wrapping the response in conn.WriteArray — the standalone protocol returns +// the bare element. +// +// Empty or multi-element input is degenerate for standalone callers; we +// default to nil so a misuse never leaks a malformed reply to the wire. +// +// Array-element constraint: the resultArray arm writes each element via +// WriteBulkString, which is correct for flat arrays of strings (the +// shape applySet / future SET-pattern callers produce). It does NOT +// recurse into nested arrays. A future caller whose applyXxx emits +// resultArray with non-string elements (e.g. HGETALL-like nested +// responses) must either pre-flatten its result or extend this switch +// with a recursive arm; reusing it as-is would silently mangle the +// wire reply. +func writeRedisStandaloneResult(conn redcon.Conn, results []redisResult) { + if len(results) != 1 { + conn.WriteNull() + return + } + res := results[0] + switch res.typ { + case resultNil: + conn.WriteNull() + case resultError: + writeRedisError(conn, res.err) + case resultBulk: + conn.WriteBulk(res.bulk) + case resultString: + conn.WriteString(res.str) + case resultArray: + conn.WriteArray(len(res.arr)) + for _, s := range res.arr { + conn.WriteBulkString(s) + } + case resultInt: + conn.WriteInt64(res.integer) + default: + conn.WriteNull() + } +} + +func (r *RedisServer) get(conn redcon.Conn, cmd redcon.Command) { + key := cmd.Args[1] + if r.proxyToLeader(conn, cmd, key) { + return + } + + // Single bounded context for the slow paths in this handler, + // derived from the server's base context so Close() cancels any + // in-flight handler instead of leaving it running on a detached + // context.Background(). Only LeaseReadForKey and keyTypeAt accept + // a context; readRedisStringAt is a local-store read that does + // not take one. The shared deadline bounds the only branches + // that can actually block on quorum / I/O. 
+ ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + if _, err := kv.LeaseReadForKeyThrough(r.coordinator, ctx, key); err != nil { + writeRedisError(conn, err) + return + } + readTS := r.readTS() + + // Fast path: attempt the string read directly instead of probing + // every possible Redis encoding first. rawKeyTypeAt issues up to + // ~17 pebble seeks (list meta + list delta + 3×wide-column probes + // each doing 3 seeks + hash/set/zset/stream/HLL/str/bare); that + // overhead dominated every GET on a hot cluster (see + // docs/design/2026_04_20_implemented_lease_read.md). A live string key resolves in 1-2 + // seeks here, and we only fall back to keyTypeAt when the string + // path returns ErrKeyNotFound (meaning either missing, expired, + // or a non-string type is present under this user-key). + // + // Use the snapshot variant: LeaseReadForKeyThrough above already + // established the ReadIndex fence, so a per-call VerifyLeaderForKey + // (inside leaderAwareGetAt) would duplicate the quorum work. + v, _, err := r.readRedisStringAtSnapshot(key, readTS) + if err == nil { + conn.WriteBulk(v) + return + } + if !errors.Is(err, store.ErrKeyNotFound) { + writeRedisError(conn, err) + return + } + + // Slow path: disambiguate "missing / expired" from WRONGTYPE. + // keyTypeAt applies the TTL filter, so an expired string reports + // as redisTypeNone here and we return nil -- matching the + // pre-optimisation behaviour. + typ, err := r.keyTypeAt(ctx, key, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if typ == redisTypeNone { + conn.WriteNull() + return + } + // If keyTypeAt disagrees with the fast path and classifies the key + // as a live string (e.g. a rare TTL-filter discrepancy between + // decodePrefixedStringWith/readBareLegacyStringWith and + // hasExpiredTTLAt), match the pre-optimisation behaviour and + // return nil rather than WRONGTYPE. 
+ if typ == redisTypeString { + conn.WriteNull() + return + } + conn.WriteError(wrongTypeMessage) +} + +// leaderEmbeddedTTLExpired looks at !redis|str| on the leader and, if the +// payload is in new format, returns the embedded-TTL expiry verdict. The bool +// indicates whether the caller should use this verdict (true) or fall through +// to the legacy !redis|ttl| index (false). +func (r *RedisServer) leaderEmbeddedTTLExpired(key []byte) (bool, bool) { + raw, err := r.tryLeaderGetAt(redisStrKey(key), 0) + if err != nil || !isNewRedisStrFormat(raw) { + return false, false + } + _, expireAt, decErr := decodeRedisStr(raw) + if decErr != nil { + // Malformed new-format payload: treat as expired rather than silently alive. + return true, true + } + if expireAt == nil { + return false, true + } + return !expireAt.After(time.Now()), true +} + +// isLeaderKeyExpired checks whether the key has an expired TTL on the leader. +func (r *RedisServer) isLeaderKeyExpired(key []byte) bool { + // For string keys with new encoding: check embedded TTL. + if expired, ok := r.leaderEmbeddedTTLExpired(key); ok { + return expired + } + raw, err := r.tryLeaderGetAt(redisTTLKey(key), 0) + if err != nil { + return false + } + ttl, err := decodeRedisTTL(raw) + if err != nil { + return false + } + return !ttl.After(time.Now()) +} + +// tryLeaderNonStringExists checks whether the key exists as a non-string type +// (hash, set, zset, stream, HLL, or list) on the leader. Returns false if the +// key has an expired TTL. +func (r *RedisServer) tryLeaderNonStringExists(key []byte) bool { + // Check TTL first: if expired, the key is logically gone. 
+ if raw, err := r.tryLeaderGetAt(redisTTLKey(key), 0); err == nil { + if ttl, decErr := decodeRedisTTL(raw); decErr == nil && !ttl.After(time.Now()) { + return false + } + } + for _, internalKey := range [][]byte{ + redisHashKey(key), + redisSetKey(key), + redisHLLKey(key), + redisZSetKey(key), + redisStreamKey(key), + } { + if _, err := r.tryLeaderGetAt(internalKey, 0); err == nil { + return true + } + } + if _, err := r.tryLeaderGetAt(listMetaKey(key), 0); err == nil { + return true + } + return false +} + +// tryLeaderLogicalExists checks whether the key exists as any type on the leader. +func (r *RedisServer) tryLeaderLogicalExists(key []byte) bool { + // Prefer asking the leader's Redis command path directly: it evaluates + // existence with ttlAt() semantics (including the in-memory TTL buffer). + // If this path is unavailable we fall back to raw-KV probing, which is + // best-effort and may lag unflushed buffer-only TTL updates. + if cli, err := r.leaderClientForKey(key); err == nil { + ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + if count, existsErr := cli.Exists(ctx, string(key)).Result(); existsErr == nil { + return count > 0 + } + } + + // Fallback to raw KV probing if Redis command proxying is unavailable. + if r.isLeaderKeyExpired(key) { + return false + } + // String type (raw user key). + if _, err := r.tryLeaderGetAt(key, 0); err == nil { + return true + } + return r.tryLeaderNonStringExists(key) +} + +func (r *RedisServer) del(conn redcon.Conn, cmd redcon.Command) { + // DEL discovers internal keys via local MVCC state. On followers this state + // may lag, producing incomplete deletes. Check per-key leadership and proxy + // non-local keys to the correct leader for accurate internal-key discovery. 
+ localKeys := make([][]byte, 0, len(cmd.Args)-1) + proxyKeys := make([][]byte, 0) + for _, key := range cmd.Args[1:] { + if r.coordinator.IsLeaderForKey(key) { + localKeys = append(localKeys, key) + } else { + proxyKeys = append(proxyKeys, key) + } + } + + var removed int64 + + // Proxy non-local keys to the appropriate leader. + if len(proxyKeys) > 0 { + proxied, err := r.proxyDel(proxyKeys) + if err != nil { + writeRedisError(conn, err) + return + } + removed += proxied + } + + // Delete local keys directly. + if len(localKeys) > 0 { + localRemoved, err := r.delLocal(localKeys) + if err != nil { + writeRedisError(conn, err) + return + } + removed += int64(localRemoved) + } + + conn.WriteInt64(removed) +} + +func (r *RedisServer) delLocal(keys [][]byte) (int, error) { + ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + var removed int + err := r.retryRedisWrite(ctx, func() error { + elems := []*kv.Elem[kv.OP]{} + nextRemoved := 0 + readTS := r.readTS() + for _, key := range keys { + keyElems, existed, err := r.deleteLogicalKeyElems(ctx, key, readTS) + if err != nil { + return err + } + if existed { + nextRemoved++ + } + elems = append(elems, keyElems...) + } + if err := r.dispatchElems(ctx, true, readTS, elems); err != nil { + return err + } + removed = nextRemoved + return nil + }) + return removed, err +} + +func (r *RedisServer) exists(conn redcon.Conn, cmd redcon.Command) { + readTS := r.readTS() + // Derive ctx from the server's base context so work in this handler + // that honors context deadlines is bounded and cancels on shutdown. + // Local Pebble reads (store.GetAt / ExistsAt / ScanAt) currently + // ignore the context parameter, so cancellation does not interrupt + // an in-flight local probe. The negative-result follower fallback + // currently calls tryLeaderLogicalExists(), which manages its own + // timeout/context rather than using this ctx. 
+ ctx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + count := 0 + for _, key := range cmd.Args[1:] { + ok, err := r.existsAtFast(ctx, key, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + if ok { + count++ + } else if !r.coordinator.IsLeaderForKey(key) { + // Local MVCC may be stale on a follower; proxy to the leader. + if r.tryLeaderLogicalExists(key) { + count++ + } + } + } + conn.WriteInt(count) +} + +// existsAtFast is a string-first fast path for EXISTS-style liveness +// checks. Strings dominate real workloads, and a live string key +// resolves here in 1-2 seeks against redisStrKey (with TTL filtering +// applied inline) versus the ~17 seeks of a full logicalExistsAt +// probe. When the redisStrKey probe misses we fall back to the full +// type-probe. +// +// The probe goes directly to the local store. EXISTS tolerates stale- +// positive reads on followers by design -- the pre-optimisation flow +// (logicalExistsAt → keyTypeAt → local store.ExistsAt) never proxied +// to the leader for the probe itself; proxying is reserved for the +// negative-result fallback (tryLeaderLogicalExists in the caller). +// Routing through readRedisStringAt here would instead issue a Raft +// round-trip per key on every follower, regressing EXISTS latency on +// workloads that were previously all-local. +func (r *RedisServer) existsAtFast(ctx context.Context, key []byte, readTS uint64) (bool, error) { + raw, err := r.store.GetAt(ctx, redisStrKey(key), readTS) + if err == nil { + alive, decErr := r.stringPayloadIsLive(ctx, key, raw, readTS) + if decErr != nil { + return false, errors.WithStack(decErr) + } + if alive { + return true, nil + } + // Expired: fall through so other encodings still get their + // chance. Undecodable payloads are already propagated as an + // error by stringPayloadIsLive above -- they're a corruption + // signal, not a "try something else" case. 
+ } else if !errors.Is(err, store.ErrKeyNotFound) { + return false, errors.WithStack(err) + } + return r.logicalExistsAt(ctx, key, readTS) +} + +// stringPayloadIsLive reports whether a redisStrKey payload is still +// TTL-alive. New-format payloads carry their expiry inline; legacy- +// format payloads need the !redis|ttl| index consulted for the TTL. +// Both paths use the LOCAL store, matching existsAtFast's no-proxy +// contract. +func (r *RedisServer) stringPayloadIsLive(ctx context.Context, key, raw []byte, readTS uint64) (bool, error) { + if isNewRedisStrFormat(raw) { + _, expireAt, err := decodeRedisStr(raw) + if err != nil { + return false, err + } + return expireAt == nil || expireAt.After(time.Now()), nil + } + ttl, err := r.legacyIndexTTLAt(ctx, key, readTS) + if err != nil { + return false, err + } + return ttl == nil || ttl.After(time.Now()), nil +} diff --git a/adapter/redis_txn.go b/adapter/redis_txn.go new file mode 100644 index 00000000..b271fdc9 --- /dev/null +++ b/adapter/redis_txn.go @@ -0,0 +1,1526 @@ +package adapter + +import ( + "bytes" + "context" + "math" + "sort" + "strconv" + "strings" + "time" + + "github.com/bootjp/elastickv/kv" + "github.com/bootjp/elastickv/store" + "github.com/cockroachdb/errors" + "github.com/tidwall/redcon" +) + +var redisTxnKeyPrefix = []byte("!txn|") + +type txnCommandHandler func(*txnContext, redcon.Command) (redisResult, error) + +var txnApplyHandlers = map[string]txnCommandHandler{ + cmdSet: (*txnContext).applySet, + cmdDel: (*txnContext).applyDel, + cmdGet: (*txnContext).applyGet, + cmdExists: (*txnContext).applyExists, + cmdRPush: (*txnContext).applyRPush, + cmdLRange: (*txnContext).applyLRange, + cmdZIncrBy: (*txnContext).applyZIncrBy, + cmdExpire: (*txnContext).applyExpireSeconds, + cmdPExpire: (*txnContext).applyExpireMilliseconds, +} + +// argsLen is derived from redisCommandSpecs in adapter/redis_command_specs.go. 
+// See that file for the canonical row list and the rationale for the +// single source of truth. + +// MULTI/EXEC/DISCARD handling +func (r *RedisServer) multi(conn redcon.Conn, _ redcon.Command) { + state := getConnState(conn) + if state.inTxn { + conn.WriteError("ERR MULTI calls can not be nested") + return + } + state.inTxn = true + state.queue = nil + conn.WriteString("OK") +} + +func (r *RedisServer) discard(conn redcon.Conn, _ redcon.Command) { + state := getConnState(conn) + if !state.inTxn { + conn.WriteError("ERR DISCARD without MULTI") + return + } + state.inTxn = false + state.queue = nil + conn.WriteString("OK") +} + +func (r *RedisServer) exec(conn redcon.Conn, _ redcon.Command) { + state := getConnState(conn) + if !state.inTxn { + conn.WriteError("ERR EXEC without MULTI") + return + } + + queue := state.queue + state.inTxn = false + state.queue = nil + + // Always execute MULTI/EXEC on the leader so that reads and writes within + // the transaction see consistent, up-to-date data. Serving transactions + // on followers risks reading stale MVCC state and producing write cycles. + if !r.coordinator.IsLeader() { + r.proxyTransactionToLeader(conn, queue) + return + } + + results, err := r.runTransaction(queue) + if err != nil { + writeRedisError(conn, err) + return + } + + r.writeResults(conn, results) +} + +type txnValue struct { + raw []byte + ttl *time.Time + deleted bool + dirty bool + loaded bool +} + +type txnContext struct { + server *RedisServer + // ctx is the per-EXEC dispatch context (redisDispatchTimeout-bounded + // at the call site in runTransaction). Plumbed through so reads + // inside the EXEC such as load() → readValueAt() respect the + // caller's deadline rather than falling back to handlerContext + + // the verifyLeaderEngineCtx safety net. + ctx context.Context //nolint:containedctx // EXEC is a long-lived value type that wraps a single client command, ctx must travel with it. 
+ working map[string]*txnValue + listStates map[string]*listTxnState + zsetStates map[string]*zsetTxnState + ttlStates map[string]*ttlTxnState + readKeys map[string][]byte + // streamDeletions tracks user keys whose stream wide-column layout must + // be tombstoned on commit: the !stream|meta| record plus every + // !stream|entry| row. stageKeyDeletion seeds this (MULTI/EXEC + // DEL / EXPIRE 0) so migrated streams are properly removed rather than + // leaking entry keys past the DEL's apparent success. + streamDeletions map[string][]byte + startTS uint64 +} + +type listTxnState struct { + meta store.ListMeta + metaExists bool + appends [][]byte + deleted bool + purge bool + purgeMeta store.ListMeta + existingDeltas [][]byte // delta key bytes present at load time; deleted on purge/delete +} + +type zsetTxnState struct { + members map[string]float64 // current (potentially modified) state + origMembers map[string]float64 // original state at load time (for wide-column diff) + isWide bool // true if loaded from wide-column !zs|mem| storage + exists bool + dirty bool +} + +type ttlTxnState struct { + value *time.Time + dirty bool +} + +func stageListDelete(st *listTxnState) { + if st == nil { + return + } + if st.metaExists { + st.purge = true + st.purgeMeta = st.meta + } + st.deleted = true + st.appends = nil +} + +func (t *txnContext) trackReadKey(key []byte) { + if len(key) == 0 { + return + } + k := string(key) + if _, ok := t.readKeys[k]; ok { + return + } + t.readKeys[k] = bytes.Clone(key) +} + +func (t *txnContext) trackTypeReadKeys(key []byte) { + for _, readKey := range [][]byte{ + listMetaKey(key), + redisHashKey(key), + redisSetKey(key), + redisZSetKey(key), + redisStreamKey(key), // legacy single-blob stream key + store.StreamMetaKey(key), // post-migration wide-column stream meta + redisHLLKey(key), + redisStrKey(key), + key, // legacy bare key for fallback reads + } { + t.trackReadKey(readKey) + } +} + +func (t *txnContext) load(key []byte) (*txnValue, 
error) { + // If the key is already an internal key (e.g., !redis|hash|..., + // !lst|..., !txn|..., !ddb|..., !s3|..., !dist|...), use it as-is. + // Otherwise, it's a bare user key for a string value — prefix it. + storageKey := key + if !isKnownInternalKey(key) { + storageKey = redisStrKey(key) + } + k := string(storageKey) + if tv, ok := t.working[k]; ok { + return tv, nil + } + t.trackReadKey(storageKey) + if !isKnownInternalKey(key) { + // Track the bare key too for conflict detection on legacy fallback reads. + t.trackReadKey(key) + } + tv := &txnValue{} + var val []byte + if !isKnownInternalKey(key) { + // For bare user string keys, use the fallback-aware reader. + var ( + err error + ttl *time.Time + ) + val, ttl, err = t.server.readRedisStringAt(key, t.startTS) + if err != nil && !errors.Is(err, store.ErrKeyNotFound) { + return nil, errors.WithStack(err) + } + tv.ttl = ttl + } else { + var err error + // Some redis_txn_test.go fixtures build a minimal txnContext + // literal without setting ctx; fall back to Background so + // readValueAt's coordinator.VerifyLeaderForKey does not panic + // when wrapped via context.WithTimeout(nil, …). Same defensive + // pattern as streamDeletions / loadListState. + ctx := t.ctx + if ctx == nil { + ctx = context.Background() + } + val, err = t.server.readValueAt(ctx, storageKey, t.startTS) + if err != nil && !errors.Is(err, store.ErrKeyNotFound) { + return nil, errors.WithStack(err) + } + } + tv.raw = val + tv.loaded = true + t.working[k] = tv + return tv, nil +} + +func (t *txnContext) loadListState(key []byte) (*listTxnState, error) { + k := string(key) + if st, ok := t.listStates[k]; ok { + return st, nil + } + ctx := context.Background() + meta, exists, err := t.server.resolveListMeta(ctx, key, t.startTS) + if err != nil { + return nil, err + } + + // Capture existing delta keys so they can be deleted if the list is later + // purged or deleted within this transaction. 
Scan one extra item to detect + // truncation: if >MaxDeltaScanLimit deltas exist the transaction cannot + // safely enumerate all of them for deletion, so we return ErrDeltaScanTruncated + // and let the caller retry after the background compactor has caught up. + deltaPrefix := store.ListMetaDeltaScanPrefix(key) + deltaEnd := store.PrefixScanEnd(deltaPrefix) + deltaKVs, err := t.server.store.ScanAt(ctx, deltaPrefix, deltaEnd, store.MaxDeltaScanLimit+1, t.startTS) + if err != nil { + return nil, errors.WithStack(err) + } + if len(deltaKVs) > store.MaxDeltaScanLimit { + return nil, ErrDeltaScanTruncated + } + existingDeltas := make([][]byte, 0, len(deltaKVs)) + for _, kv := range deltaKVs { + existingDeltas = append(existingDeltas, kv.Key) + } + + st := &listTxnState{ + meta: meta, + metaExists: exists, + appends: [][]byte{}, + existingDeltas: existingDeltas, + } + t.listStates[k] = st + + // Track the list-item key at the current tail (and the position before the + // head) so that concurrent RPUSH/LPUSH operations—which write to exactly + // these positions—trigger a read-write conflict and force a retry. + // Without this, a MULTI transaction that reads a list via LRANGE can commit + // with a stale snapshot while a concurrent RPUSH commits a new item, + // forming an anti-dependency (G2-item) cycle. + // The base meta key (listMetaKey) is intentionally NOT tracked here: the + // Delta scheme allows the DeltaCompactor to rewrite it without conflicting + // with ongoing push/read transactions (see TestRedisTxnValidateReadSet_ListMetaUpdateNoConflict). 
+ t.trackReadKey(listItemKey(key, meta.Head+meta.Len)) // next RPUSH target + if meta.Head > math.MinInt64 { + t.trackReadKey(listItemKey(key, meta.Head-1)) // next LPUSH target + } + + return st, nil +} + +func (t *txnContext) listLength(st *listTxnState) int64 { + return st.meta.Len + int64(len(st.appends)) +} + +func (t *txnContext) loadZSetState(key []byte) (*zsetTxnState, error) { + k := string(key) + if st, ok := t.zsetStates[k]; ok { + return st, nil + } + t.trackReadKey(redisZSetKey(key)) + // Check TTL: treat expired keys as non-existent. + ttlSt, err := t.loadTTLState(key) + if err != nil { + return nil, err + } + if ttlSt.value != nil && !ttlSt.value.After(time.Now()) { + st := &zsetTxnState{ + members: map[string]float64{}, + origMembers: map[string]float64{}, + exists: false, + } + t.zsetStates[k] = st + return st, nil + } + + // Detect wide-column storage by probing the !zs|mem| prefix. + memberPrefix := store.ZSetMemberScanPrefix(key) + memberEnd := store.PrefixScanEnd(memberPrefix) + probeKVs, probeErr := t.server.store.ScanAt(context.Background(), memberPrefix, memberEnd, 1, t.startTS) + if probeErr != nil { + return nil, errors.WithStack(probeErr) + } + isWide := len(probeKVs) > 0 + + value, exists, err := t.server.loadZSetAt(context.Background(), key, t.startTS) + if err != nil { + return nil, err + } + members := zsetEntriesToMap(value.Entries) + // Snapshot the original members for wide-column diff at commit time. 
+ origMembers := make(map[string]float64, len(members)) + for m, s := range members { + origMembers[m] = s + } + st := &zsetTxnState{ + members: members, + origMembers: origMembers, + isWide: isWide, + exists: exists, + } + t.zsetStates[k] = st + return st, nil +} + +func (t *txnContext) loadTTLState(key []byte) (*ttlTxnState, error) { + k := string(key) + if st, ok := t.ttlStates[k]; ok { + return st, nil + } + value, err := t.server.ttlAt(context.Background(), key, t.startTS) + if err != nil { + return nil, err + } + st := &ttlTxnState{value: value} + t.ttlStates[k] = st + return st, nil +} + +func (t *txnContext) stagedKeyType(key []byte) (redisValueType, error) { + k := string(key) + if typ, ok := t.stagedZSetType(k); ok { + return typ, nil + } + if typ, ok := t.stagedListType(k); ok { + return typ, nil + } + if typ, ok := t.stagedStringType(k); ok { + return typ, nil + } + t.trackTypeReadKeys(key) + return t.server.keyTypeAt(context.Background(), key, t.startTS) +} + +func (t *txnContext) stagedZSetType(key string) (redisValueType, bool) { + st, ok := t.zsetStates[key] + if !ok || (!st.dirty && !st.exists) { + return redisTypeNone, false + } + if len(st.members) == 0 { + return redisTypeNone, true + } + return redisTypeZSet, true +} + +func (t *txnContext) stagedListType(key string) (redisValueType, bool) { + st, ok := t.listStates[key] + if !ok { + return redisTypeNone, false + } + if st.deleted { + return redisTypeNone, true + } + if st.metaExists || len(st.appends) > 0 { + return redisTypeList, true + } + return redisTypeNone, false +} + +func (t *txnContext) stagedStringType(key string) (redisValueType, bool) { + tv, ok := t.working[string(redisStrKey([]byte(key)))] + if !ok { + return redisTypeNone, false + } + if tv.deleted || tv.raw == nil { + return redisTypeNone, true + } + return redisTypeString, true +} + +func (t *txnContext) apply(cmd redcon.Command) (redisResult, error) { + handler, ok := txnApplyHandlers[strings.ToUpper(string(cmd.Args[0]))] + 
if !ok { + return redisResult{}, errors.WithStack(errors.Newf("ERR unsupported command '%s'", cmd.Args[0])) + } + return handler(t, cmd) +} + +func (t *txnContext) applyExpireSeconds(cmd redcon.Command) (redisResult, error) { + return t.applyExpire(cmd, time.Second) +} + +func (t *txnContext) applyExpireMilliseconds(cmd redcon.Command) (redisResult, error) { + return t.applyExpire(cmd, time.Millisecond) +} + +func (t *txnContext) applySet(cmd redcon.Command) (redisResult, error) { + if isList, err := t.server.isListKeyAt(context.Background(), cmd.Args[1], t.startTS); err != nil { + return redisResult{}, err + } else if isList { + return redisResult{typ: resultError, err: errors.New("WRONGTYPE Operation against a key holding the wrong kind of value")}, nil + } + + opts, err := parseRedisSetOptions(cmd.Args[3:], time.Now()) + if err != nil { + return redisResult{}, err + } + + // NX/XX: skip the write if the key-existence condition is not met. + blocked, res, err := t.applySetCondition(cmd.Args[1], opts) + if err != nil { + return redisResult{}, err + } + if blocked { + return res, nil + } + + tv, err := t.load(cmd.Args[1]) + if err != nil { + return redisResult{}, err + } + + var oldValue []byte + if opts.returnOld && !tv.deleted { + oldValue = tv.raw + } + + tv.raw = cmd.Args[2] + tv.deleted = false + tv.dirty = true + + // Always update TTL state: EX/PX sets a new expiry; a plain SET clears it + // (opts.ttl == nil → nil stored → PERSIST semantics, matching Redis behaviour). + if err := t.applySetTTL(cmd.Args[1], opts.ttl); err != nil { + return redisResult{}, err + } + + return applySetResult(opts, oldValue), nil +} + +// applySetCondition checks NX/XX conditions. Returns (blocked, result, err). +// blocked=true means the condition prevented the write; callers should return result. +// Returns (false, _, nil) immediately when no condition is set. 
+func (t *txnContext) applySetCondition(key []byte, opts redisSetOptions) (bool, redisResult, error) { + if !opts.existsCond && !opts.missingCond { + return false, redisResult{}, nil + } + typ, err := t.stagedKeyType(key) + if err != nil { + return false, redisResult{}, err + } + exists := typ != redisTypeNone + if (opts.missingCond && exists) || (opts.existsCond && !exists) { + return true, redisResult{typ: resultNil}, nil + } + return false, redisResult{}, nil +} + +// applySetTTL stores the expiry in ttlStates so flushTTLToBuffer sends it to +// the TTLBuffer after a successful commit. +func (t *txnContext) applySetTTL(key []byte, expireAt *time.Time) error { + ttlSt, err := t.loadTTLState(key) + if err != nil { + return err + } + ttlSt.value = expireAt + ttlSt.dirty = true + return nil +} + +// applySetResult returns the appropriate redisResult for a completed SET. +func applySetResult(opts redisSetOptions, oldValue []byte) redisResult { + if !opts.returnOld { + return redisResult{typ: resultString, str: "OK"} + } + if oldValue == nil { + return redisResult{typ: resultNil} + } + return redisResult{typ: resultBulk, bulk: oldValue} +} + +func (t *txnContext) applyDel(cmd redcon.Command) (redisResult, error) { + var deleted int64 + for _, key := range cmd.Args[1:] { + typ, err := t.stagedKeyType(key) + if err != nil { + return redisResult{}, err + } + if typ == redisTypeNone { + continue + } + if _, err := t.stageKeyDeletion(key); err != nil { + return redisResult{}, err + } + deleted++ + } + return redisResult{typ: resultInt, integer: deleted}, nil +} + +func (t *txnContext) applyGet(cmd redcon.Command) (redisResult, error) { + typ, err := t.stagedKeyType(cmd.Args[1]) + if err != nil { + return redisResult{}, err + } + if isNonStringCollectionType(typ) { + return redisResult{typ: resultError, err: wrongTypeError()}, nil + } + + tv, err := t.load(cmd.Args[1]) + if err != nil { + return redisResult{}, err + } + if tv.deleted || tv.raw == nil { + return 
redisResult{typ: resultNil}, nil + } + return redisResult{typ: resultBulk, bulk: tv.raw}, nil +} + +func (t *txnContext) applyExists(cmd redcon.Command) (redisResult, error) { + var count int64 + for _, key := range cmd.Args[1:] { + typ, err := t.stagedKeyType(key) + if err != nil { + return redisResult{}, err + } + if typ != redisTypeNone { + count++ + } + } + return redisResult{typ: resultInt, integer: count}, nil +} + +func (t *txnContext) applyRPush(cmd redcon.Command) (redisResult, error) { + st, err := t.loadListState(cmd.Args[1]) + if err != nil { + return redisResult{}, err + } + if st.deleted { + if st.metaExists { + st.purge = true + st.purgeMeta = st.meta + } + // DEL followed by RPUSH in the same transaction recreates the list. + st.deleted = false + st.metaExists = false + st.meta = store.ListMeta{} + st.appends = nil + } + + for _, v := range cmd.Args[2:] { + st.appends = append(st.appends, bytes.Clone(v)) + } + + return redisResult{typ: resultInt, integer: t.listLength(st)}, nil +} + +func (t *txnContext) applyLRange(cmd redcon.Command) (redisResult, error) { + st, err := t.loadListState(cmd.Args[1]) + if err != nil { + return redisResult{}, err + } + + s, e, err := parseRangeBounds(cmd.Args[2], cmd.Args[3], int(t.listLength(st))) + if err != nil { + return redisResult{}, err + } + if e < s { + return redisResult{typ: resultArray, arr: []string{}}, nil + } + + out, err := t.listRangeValues(cmd.Args[1], st, s, e) + if err != nil { + return redisResult{}, err + } + + return redisResult{typ: resultArray, arr: out}, nil +} + +func (t *txnContext) applyZIncrBy(cmd redcon.Command) (redisResult, error) { + typ, err := t.stagedKeyType(cmd.Args[1]) + if err != nil { + return redisResult{}, err + } + if typ != redisTypeNone && typ != redisTypeZSet { + return redisResult{typ: resultError, err: wrongTypeError()}, nil + } + + inc, err := strconv.ParseFloat(string(cmd.Args[2]), 64) + if err != nil { + return redisResult{}, errors.WithStack(err) + } + st, err := 
t.loadZSetState(cmd.Args[1]) + if err != nil { + return redisResult{}, err + } + member := string(cmd.Args[3]) + st.members[member] += inc + st.dirty = true + return redisResult{typ: resultBulk, bulk: []byte(formatRedisFloat(st.members[member]))}, nil +} + +func (t *txnContext) applyExpire(cmd redcon.Command, unit time.Duration) (redisResult, error) { + typ, err := t.stagedKeyType(cmd.Args[1]) + if err != nil { + return redisResult{}, err + } + if typ == redisTypeNone { + return redisResult{typ: resultInt, integer: 0}, nil + } + + ttl, err := strconv.ParseInt(string(cmd.Args[2]), 10, 64) + if err != nil { + return redisResult{}, errors.WithStack(err) + } + nxOnly, err := parseExpireNXOnly(cmd.Args[3:]) + if err != nil { + return redisResult{}, err + } + + state, err := t.loadTTLState(cmd.Args[1]) + if err != nil { + return redisResult{}, err + } + if nxOnly && hasActiveTTL(state.value, time.Now()) { + return redisResult{typ: resultInt, integer: 0}, nil + } + + if ttl <= 0 { + return t.stageKeyDeletion(cmd.Args[1]) + } + return t.applyPositiveExpire(cmd.Args[1], ttl, unit, typ, state) +} + +func (t *txnContext) applyPositiveExpire(key []byte, ttl int64, unit time.Duration, typ redisValueType, state *ttlTxnState) (redisResult, error) { + if ttl > math.MaxInt64/int64(unit) { + return redisResult{}, errors.New("ERR invalid expire time in command") + } + expireAt := time.Now().Add(time.Duration(ttl) * unit) + state.value = &expireAt + state.dirty = true + if typ == redisTypeString { + plain, err := t.server.isPlainRedisString(context.Background(), key, t.startTS) + if err != nil { + return redisResult{}, err + } + if plain { + return t.markStringDirty(key) + } + // HLL is reported as redisTypeString but stores its payload under + // !redis|hll|; keep TTL in the legacy scan index via buildTTLElems. 
+ } + return redisResult{typ: resultInt, integer: 1}, nil +} + +// markStringDirty loads the string value into the working set so that +// buildKeyElems will re-encode it with the updated embedded TTL. +func (t *txnContext) markStringDirty(key []byte) (redisResult, error) { + tv, err := t.load(key) + if err != nil { + return redisResult{}, err + } + tv.dirty = true + return redisResult{typ: resultInt, integer: 1}, nil +} + +func (t *txnContext) stageKeyDeletion(key []byte) (redisResult, error) { + // Mark the list for deletion. + st, err := t.loadListState(key) + if err != nil { + return redisResult{}, err + } + stageListDelete(st) + // Mark the string/main value for deletion. + tv, err := t.load(key) + if err != nil { + return redisResult{}, err + } + tv.deleted = true + tv.dirty = true + // Mark TTL for deletion. + ttlState, err := t.loadTTLState(key) + if err != nil { + return redisResult{}, err + } + ttlState.value = nil + ttlState.dirty = true + // Mark zset for deletion. Use empty map (not nil) so that subsequent + // writes (e.g. ZINCRBY) in the same transaction can safely insert. + zs, err := t.loadZSetState(key) + if err != nil { + return redisResult{}, err + } + zs.members = map[string]float64{} + zs.exists = false + zs.dirty = true + // Mark hash, set, stream (legacy blob), and HLL internal keys for deletion. + for _, internalKey := range [][]byte{ + redisHashKey(key), + redisSetKey(key), + redisStreamKey(key), + redisHLLKey(key), + } { + iv, err := t.load(internalKey) + if err != nil { + return redisResult{}, err + } + iv.deleted = true + iv.dirty = true + } + // Stage the wide-column stream cleanup: the !stream|meta| record and + // every !stream|entry| row must also be tombstoned when the user deletes + // a migrated stream via MULTI/EXEC DEL or EXPIRE 0. Without this step + // the command would report success but leave rows behind, and a later + // XLEN / XREAD would "resurrect" the stream. 
commit() expands this + // entry into concrete Del elems by scanning the entry-key prefix. + // The map is lazy-initialised so test fixtures that build a minimal + // txnContext literal without this field still work. + if t.streamDeletions == nil { + t.streamDeletions = map[string][]byte{} + } + t.streamDeletions[string(key)] = bytes.Clone(key) + t.trackReadKey(store.StreamMetaKey(key)) + // Mark legacy bare string key for deletion. We bypass load() here + // because load() auto-prefixes bare keys to !redis|str|. + // Track the bare key in the read set for conflict detection. + t.trackReadKey(key) + bareK := string(key) + if _, ok := t.working[bareK]; !ok { + t.working[bareK] = &txnValue{} + } + t.working[bareK].deleted = true + t.working[bareK].dirty = true + return redisResult{typ: resultInt, integer: 1}, nil +} + +func parseRangeBounds(startRaw, endRaw []byte, total int) (int, int, error) { + start, err := parseInt(startRaw) + if err != nil { + return 0, 0, err + } + end, err := parseInt(endRaw) + if err != nil { + return 0, 0, err + } + s, e := clampRange(start, end, total) + return s, e, nil +} + +func (t *txnContext) listRangeValues(key []byte, st *listTxnState, s, e int) ([]string, error) { + persistedLen := int(st.meta.Len) + + switch { + case e < persistedLen: + return t.server.fetchListRange(context.Background(), key, st.meta, int64(s), int64(e), t.startTS) + case s >= persistedLen: + return appendValues(st.appends, s-persistedLen, e-persistedLen), nil + default: + head, err := t.server.fetchListRange(context.Background(), key, st.meta, int64(s), int64(persistedLen-1), t.startTS) + if err != nil { + return nil, err + } + tail := appendValues(st.appends, 0, e-persistedLen) + return append(head, tail...), nil + } +} + +func appendValues(buf [][]byte, start, end int) []string { + out := make([]string, 0, end-start+1) + for i := start; i <= end; i++ { + out = append(out, string(buf[i])) + } + return out +} + +func (t *txnContext) validateReadSet(ctx 
context.Context) error { + for _, key := range t.readKeys { + latestTS, exists, err := t.server.store.LatestCommitTS(ctx, key) + if err != nil { + return errors.WithStack(err) + } + if exists && latestTS > t.startTS { + return errors.WithStack(store.NewWriteConflictError(key)) + } + } + return nil +} + +// preparedTxnDispatch is the fully-assembled write set + read set + commit +// timestamp for a MULTI/EXEC transaction, ready to be passed to +// coordinator.Dispatch. Split out from commit() so the option-2 dedup +// path (runTransactionWithDedup) can intercept between prepare and +// dispatch — it needs to capture (elems, commitTS, readKeys) for a +// possible retry under PrevCommitTS without otherwise duplicating the +// commit-building logic. The owned ctx is the redisDispatchTimeout- +// bounded context the caller must run Dispatch under and Cancel after. +type preparedTxnDispatch struct { + elems []*kv.Elem[kv.OP] + commitTS uint64 + readKeys [][]byte + ctx context.Context + cancel context.CancelFunc +} + +// prepareDispatch builds everything Dispatch needs (elems, commitTS, +// readKeys, ctx) without actually calling Dispatch. Callers must always +// invoke `cancel()` on the returned prepared value once the dispatch +// attempt finishes (commit() does this via defer; the dedup path does it +// per retry iteration). When the transaction has no writes this returns +// a prepared value with empty `elems` and a no-op cancel — callers can +// check len(prepared.elems)==0 and skip the dispatch. +func (t *txnContext) prepareDispatch() (preparedTxnDispatch, error) { + elems := t.buildKeyElems() + + // Pre-allocate commitTS so Delta keys can embed it in their bytes before + // the coordinator assigns it during Dispatch. 
+ commitTS, err := t.server.coordinator.Clock().NextFenced() + if err != nil { + return preparedTxnDispatch{cancel: func() {}}, errors.Wrap(err, "redis txn commit: allocate commitTS") + } + listElems := t.buildListElems(commitTS) + zsetElems, err := t.buildZSetElems(commitTS) + if err != nil { + return preparedTxnDispatch{cancel: func() {}}, err + } + // TTL elements: string keys have TTL embedded in value (buildKeyElems handles that), + // non-string keys get a !redis|ttl| element written in the same transaction. + ttlElems := t.buildTTLElems() + + // Derive a single redisDispatchTimeout-bounded context covering both + // the stream-deletion scans (paginated ScanAt/ExistsAt over + // StreamEntryScanPrefix) and the final Dispatch. The parent is the + // txnContext's own ctx (the caller's dispatchCtx), not the server- + // lifetime handlerContext, so an outer cancellation (client + // disconnect, retryRedisWrite timeout) interrupts the prepare+dispatch + // promptly instead of waiting the full redisDispatchTimeout. Symmetric + // with the reuseCtx threading in runTransactionWithDedup. The nil-guard + // falls back to handlerContext for callers that construct a txnContext + // without setting ctx (test fixtures). + parentCtx := t.ctx + if parentCtx == nil { + parentCtx = t.server.handlerContext() + } + ctx, cancel := context.WithTimeout(parentCtx, redisDispatchTimeout) + + streamElems, err := t.buildStreamDeletionElems(ctx) + if err != nil { + cancel() + return preparedTxnDispatch{cancel: func() {}}, err + } + + elems = append(elems, listElems...) + elems = append(elems, zsetElems...) + elems = append(elems, ttlElems...) + elems = append(elems, streamElems...) 
+ + readKeys := make([][]byte, 0, len(t.readKeys)) + for _, k := range t.readKeys { + readKeys = append(readKeys, k) + } + return preparedTxnDispatch{ + elems: elems, + commitTS: commitTS, + readKeys: readKeys, + ctx: ctx, + cancel: cancel, + }, nil +} + +func (t *txnContext) commit() error { + prepared, err := t.prepareDispatch() + if err != nil { + return err + } + defer prepared.cancel() + if len(prepared.elems) == 0 { + return nil + } + group := &kv.OperationGroup[kv.OP]{ + IsTxn: true, + Elems: prepared.elems, + StartTS: t.startTS, + CommitTS: prepared.commitTS, + ReadKeys: prepared.readKeys, + } + if _, err := t.server.coordinator.Dispatch(prepared.ctx, group); err != nil { + return errors.WithStack(err) + } + return nil +} + +// stringValueAndTTLElem returns the encoded string value and an optional +// !redis|ttl| scan-index mutation for a string write. Dirty EXPIRE/PERSIST +// state takes priority; otherwise the TTL loaded with the value is preserved +// so commands like INCR or SETBIT inside MULTI/EXEC don't clear it. A dirty +// PERSIST emits a Del so the sweeper cannot later expire a persistent key. +func (t *txnContext) stringValueAndTTLElem(userKey []byte, tv *txnValue) ([]byte, *kv.Elem[kv.OP]) { + ttl := tv.ttl + ttlSt := t.ttlStates[string(userKey)] + if ttlSt != nil && ttlSt.dirty { + ttl = ttlSt.value + } + value := encodeRedisStr(tv.raw, ttl) + if ttl != nil { + return value, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisTTLKey(userKey), Value: encodeRedisTTL(*ttl)} + } + // ttl is nil: emit Del when there was a prior TTL (loaded or dirty-cleared) + // so the sweeper cannot later expire a now-persistent key or hit a stale index. 
+ if tv.ttl != nil || (ttlSt != nil && ttlSt.dirty) { + return value, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisTTLKey(userKey)} + } + return value, nil +} + +func (t *txnContext) buildKeyElems() []*kv.Elem[kv.OP] { + keys := make([]string, 0, len(t.working)) + for k := range t.working { + keys = append(keys, k) + } + sort.Strings(keys) + + var elems []*kv.Elem[kv.OP] + for _, k := range keys { + tv := t.working[k] + if !tv.dirty { + continue + } + storageKey := []byte(k) + if tv.deleted { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: storageKey}) + // Deleting a string anchor must also drop any stale !redis|ttl| + // scan-index entry; buildTTLElems skips strings because it assumes + // the inline-TTL path owns them. + if bytes.HasPrefix(storageKey, []byte(redisStrPrefix)) { + userKey := storageKey[len(redisStrPrefix):] + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisTTLKey(userKey)}) + } + continue + } + value := tv.raw + if bytes.HasPrefix(storageKey, []byte(redisStrPrefix)) { + userKey := storageKey[len(redisStrPrefix):] + var extra *kv.Elem[kv.OP] + value, extra = t.stringValueAndTTLElem(userKey, tv) + if extra != nil { + elems = append(elems, extra) + } + } + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: storageKey, Value: value}) + } + return elems +} + +func listDeleteMeta(st *listTxnState) (store.ListMeta, bool) { + switch { + case st.metaExists: + return st.meta, true + case st.purge: + return st.purgeMeta, true + default: + return store.ListMeta{}, false + } +} + +func appendListDeleteOps(elems []*kv.Elem[kv.OP], userKey []byte, meta store.ListMeta) []*kv.Elem[kv.OP] { + for seq := meta.Head; seq < meta.Tail; seq++ { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: listItemKey(userKey, seq)}) + } + return append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: listMetaKey(userKey)}) +} + +func (t *txnContext) buildListElems(commitTS uint64) []*kv.Elem[kv.OP] { + listKeys := make([]string, 0, len(t.listStates)) + for k := range 
t.listStates { + listKeys = append(listKeys, k) + } + sort.Strings(listKeys) + + var elems []*kv.Elem[kv.OP] + var seqInTxn uint32 + for _, k := range listKeys { + st := t.listStates[k] + userKey := []byte(k) + + if st.deleted { + if meta, ok := listDeleteMeta(st); ok { + elems = appendListDeleteOps(elems, userKey, meta) + } + // Delete existing delta keys so they don't survive the logical delete. + for _, dk := range st.existingDeltas { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: dk}) + } + continue + } + if len(st.appends) == 0 { + continue + } + if st.purge { + elems = appendListDeleteOps(elems, userKey, st.purgeMeta) + // Delete existing delta keys so they don't accumulate after DEL+RPUSH. + for _, dk := range st.existingDeltas { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: dk}) + } + } + + startSeq := st.meta.Head + st.meta.Len + for i, v := range st.appends { + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: listItemKey(userKey, startSeq+int64(i)), + Value: v, + }) + } + + // Emit a Delta key instead of updating the base metadata key. + // Each list key in this transaction gets a unique seqInTxn. + n := int64(len(st.appends)) + deltaVal := store.MarshalListMetaDelta(store.ListMetaDelta{HeadDelta: 0, LenDelta: n}) + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.ListMetaDeltaKey(userKey, commitTS, seqInTxn), + Value: deltaVal, + }) + seqInTxn++ + } + return elems +} + +func (t *txnContext) buildZSetElems(commitTS uint64) ([]*kv.Elem[kv.OP], error) { + keys := make([]string, 0, len(t.zsetStates)) + for k := range t.zsetStates { + keys = append(keys, k) + } + sort.Strings(keys) + + elems := make([]*kv.Elem[kv.OP], 0, len(keys)) + seqInTxn := uint32(0) + for _, k := range keys { + st := t.zsetStates[k] + if !st.dirty { + continue + } + key := []byte(k) + if st.isWide { + wideElems, lenDelta := buildZSetWideElems(key, st) + elems = append(elems, wideElems...) 
+ if lenDelta != 0 { + deltaVal := store.MarshalZSetMetaDelta(store.ZSetMetaDelta{LenDelta: lenDelta}) + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.ZSetMetaDeltaKey(key, commitTS, seqInTxn), + Value: deltaVal, + }) + seqInTxn++ + } + continue + } + // Legacy blob path. + if len(st.members) == 0 { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisZSetKey(key)}) + continue + } + payload, err := marshalZSetValue(redisZSetValue{Entries: zsetMapToEntries(st.members)}) + if err != nil { + return nil, err + } + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisZSetKey(key), Value: payload}) + } + return elems, nil +} + +// buildZSetWideElems computes the minimal set of ops to transition from st.origMembers to +// st.members in wide-column format. Returns the ops and the net length delta. +func buildZSetWideElems(key []byte, st *zsetTxnState) ([]*kv.Elem[kv.OP], int64) { + elems := make([]*kv.Elem[kv.OP], 0, len(st.members)+len(st.origMembers)) + var lenDelta int64 + + // Deletions: members removed or score changed (old score index must be removed). + for member, oldScore := range st.origMembers { + newScore, inNew := st.members[member] + if !inNew { + // Fully removed. + elems = append(elems, + &kv.Elem[kv.OP]{Op: kv.Del, Key: store.ZSetMemberKey(key, []byte(member))}, + &kv.Elem[kv.OP]{Op: kv.Del, Key: store.ZSetScoreKey(key, oldScore, []byte(member))}, + ) + lenDelta-- + } else if newScore != oldScore { + // Score updated: delete old score index. + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: store.ZSetScoreKey(key, oldScore, []byte(member))}) + } + } + + // Insertions / updates. 
+ for member, newScore := range st.members { + _, wasOrig := st.origMembers[member] + elems = append(elems, + &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ZSetMemberKey(key, []byte(member)), Value: store.MarshalZSetScore(newScore)}, + &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ZSetScoreKey(key, newScore, []byte(member)), Value: []byte{}}, + ) + if !wasOrig { + lenDelta++ + } + } + return elems, lenDelta +} + +// buildStreamDeletionElems expands every user key queued in streamDeletions +// into the Del operations that actually tombstone a migrated stream: +// !stream|meta| and every !stream|entry| row. Called from +// commit() so that MULTI/EXEC DEL / EXPIRE 0 on a migrated stream leaves +// the store in a consistent state instead of only dropping the legacy blob. +// Each scan runs at t.startTS so the delete honours the transaction's +// snapshot view. +// +// ctx is the redisDispatchTimeout-bounded context derived in commit(); it +// caps the paginated ExistsAt + scanAllDeltaElems inside +// deleteStreamWideColumnElems so a pathological staged-stream count cannot +// hold the EXEC handler open past the per-request budget. +func (t *txnContext) buildStreamDeletionElems(ctx context.Context) ([]*kv.Elem[kv.OP], error) { + if len(t.streamDeletions) == 0 { + return nil, nil + } + keys := make([]string, 0, len(t.streamDeletions)) + for k := range t.streamDeletions { + keys = append(keys, k) + } + sort.Strings(keys) + var elems []*kv.Elem[kv.OP] + for _, k := range keys { + userKey := t.streamDeletions[k] + streamElems, err := t.server.deleteStreamWideColumnElems(ctx, userKey, t.startTS) + if err != nil { + return nil, err + } + elems = append(elems, streamElems...) + } + return elems, nil +} + +// buildTTLElems returns !redis|ttl| Raft elements for non-string keys with dirty TTL state. +// String keys have TTL embedded in the value; they are handled by buildKeyElems. 
+func (t *txnContext) buildTTLElems() []*kv.Elem[kv.OP] { + var elems []*kv.Elem[kv.OP] + for k, st := range t.ttlStates { + if !st.dirty { + continue + } + // String keys encode TTL inside the value in buildKeyElems; skip them here. + if _, isString := t.working[string(redisStrKey([]byte(k)))]; isString { + continue + } + if st.value == nil { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: redisTTLKey([]byte(k))}) + } else { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Put, Key: redisTTLKey([]byte(k)), Value: encodeRedisTTL(*st.value)}) + } + } + return elems +} + +func (r *RedisServer) runTransaction(queue []redcon.Command) ([]redisResult, error) { + if r.onePhaseTxnDedup { + return r.runTransactionWithDedup(queue) + } + + dispatchCtx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout) + defer cancel() + + var results []redisResult + err := r.retryRedisWrite(dispatchCtx, func() error { + startTS := r.txnStartTS() + readPin := r.pinReadTS(startTS) + defer readPin.Release() + + txn := &txnContext{ + server: r, + ctx: dispatchCtx, + working: map[string]*txnValue{}, + listStates: map[string]*listTxnState{}, + zsetStates: map[string]*zsetTxnState{}, + ttlStates: map[string]*ttlTxnState{}, + readKeys: map[string][]byte{}, + streamDeletions: map[string][]byte{}, + startTS: startTS, + } + + nextResults := make([]redisResult, 0, len(queue)) + for _, cmd := range queue { + res, err := txn.apply(cmd) + if err != nil { + return err + } + nextResults = append(nextResults, res) + } + + if err := txn.validateReadSet(dispatchCtx); err != nil { + return err + } + if err := txn.commit(); err != nil { + return err + } + results = nextResults + return nil + }) + if err != nil { + return nil, err + } + + return results, nil +} + +// reusableExecTxn captures a dispatched MULTI/EXEC transaction so a +// subsequent retry can reuse its exact write set under a fresh +// commit_ts (carrying prev_commit_ts) and probe whether the prior +// attempt already 
landed. This is the EXEC analogue of +// reusableListPush (M3 R1 result reconstruction for MULTI/EXEC). +// +// `results` is computed once from attempt 1's startTS snapshot and is +// invariant across reuse for the same reason RPUSH/LPUSH's `length` +// is: the write set is fixed, so apply-vs-no-op is invisible to the +// client. Reads in the EXEC body returned values from attempt 1's +// snapshot — those values were what the client would have observed if +// attempt 1 hadn't returned an ambiguous error, so caching them is +// the right semantics for a confirmed-or-deduped commit. A +// genuine cross-txn conflict is caught by OCC on readKeys at the FSM +// apply (WriteConflict → drop pending → recompute), so the cached +// results are only returned when reuse actually represents the +// outcome of attempt 1's intent. +type reusableExecTxn struct { + elems []*kv.Elem[kv.OP] + startTS uint64 + commitTS uint64 + readKeys [][]byte + results []redisResult +} + +// dispatchExecReuse runs one iteration of the option-2 reuse path for +// MULTI/EXEC: dispatches the captured write set under a fresh +// commit_ts (carrying pending.commitTS as PrevCommitTS so the FSM +// probes whether the prior attempt landed) and returns the cached +// client-visible results on success. The drop return signals the +// caller to clear pending — set on a genuine WriteConflict from +// another txn (after the self-conflict probe rules out our own apply) +// so the next iteration rebuilds the txn from a fresh read snapshot. +// +// Mirrors dispatchListPushReuse; the only difference is the result +// payload (cached []redisResult vs computed list length) and the lack +// of a meta re-read fallback — for EXEC there is no post-apply "what +// is the current length" question; the client-visible result IS the +// cached results array. 
func (r *RedisServer) dispatchExecReuse(ctx context.Context, pending *reusableExecTxn) (results []redisResult, drop bool, err error) {
	// gemini PR-A HIGH: persistence-grade commit_ts allocation must honor the
	// HLC-4 physical-ceiling fence (see kv/hlc.go NextFenced + the TLA proof
	// at tla/hlc/MCHLC_gap.cfg). Clock().Next() bypasses the ceiling and
	// could issue a timestamp that collides with a subsequent leader's
	// window after renewal — the very class of bug option-2 is meant to
	// rule out.
	commitTS, allocErr := r.coordinator.Clock().NextFenced()
	if allocErr != nil {
		return nil, false, errors.Wrap(allocErr, "redis exec reuse: allocate commitTS")
	}
	// Re-dispatch the captured write set unchanged: same startTS and
	// readKeys as the first attempt, a fresh CommitTS, and the previous
	// attempt's commitTS carried as PrevCommitTS.
	_, dispErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{
		IsTxn:        true,
		StartTS:      pending.startTS,
		CommitTS:     commitTS,
		PrevCommitTS: pending.commitTS,
		ReadKeys:     pending.readKeys,
		Elems:        pending.elems,
	})
	if dispErr == nil {
		return pending.results, false, nil
	}
	if errors.Is(dispErr, store.ErrWriteConflict) {
		// Self-inflicted-conflict guard (mirrors dispatchListPushReuse):
		// the apply might have landed at this fresh commitTS but bubbled
		// up as WriteConflict due to leadership churn. Probe whether our
		// reused write set actually landed; if yes, return the cached
		// results unchanged (they describe the EXEC body's outcome
		// against attempt 1's snapshot, which is the outcome whether
		// the bytes hit MVCC at attempt-1's commitTS or at this fresh
		// commitTS — the OCC fence on readKeys guarantees no
		// intervening cross-txn write slipped past).
		if probeKey := firstWriteKey(pending.elems); len(probeKey) > 0 {
			landed, perr := r.store.CommittedVersionAt(ctx, probeKey, commitTS)
			// A probe error is deliberately not returned: it is handled
			// the same as "not landed" and falls through to the
			// genuine-conflict path below.
			if perr == nil && landed {
				pending.commitTS = commitTS
				return pending.results, false, nil
			}
		}
		// Our attempt did not land at commitTS and a key collides with
		// another txn — genuine conflict. Drop pending so the next
		// iteration rebuilds from a fresh snapshot.
		return nil, true, errors.WithStack(dispErr)
	}
	// Still ambiguous (lock / other retryable): the reuse may itself
	// have landed, so the next retry must probe THIS commit_ts. Only
	// advance pending.commitTS if retryRedisWrite will actually loop
	// (non-retryable errors escape to the client; pending is then
	// discarded with the goroutine).
	if isRetryableRedisTxnErr(dispErr) {
		pending.commitTS = commitTS
	}
	return nil, false, errors.WithStack(dispErr)
}

// runTransactionWithDedup is the option-2 retry loop for MULTI/EXEC.
// The first attempt builds the txn write set + cached results from
// the user's startTS snapshot; any retryable failure makes the next
// iteration REUSE that write set under a fresh commit_ts with
// prev_commit_ts set, so the FSM no-ops if the prior attempt already
// landed. A WriteConflict on a reuse attempt (after the self-conflict
// probe rules out our own apply) means another txn touched a read or
// write key, and we drop pending → rebuild from a fresh snapshot.
//
// Mirrors listPushCoreWithDedup at the EXEC granularity.
func (r *RedisServer) runTransactionWithDedup(queue []redcon.Command) ([]redisResult, error) {
	dispatchCtx, cancel := context.WithTimeout(r.handlerContext(), redisDispatchTimeout)
	defer cancel()

	var results []redisResult
	// pending is nil until a first attempt fails with a retryable dispatch
	// error; while non-nil, each retry takes the reuse branch below.
	var pending *reusableExecTxn
	err := r.retryRedisWrite(dispatchCtx, func() error {
		if pending != nil {
			// gemini PR-A MEDIUM: derive the per-attempt reuse ctx from the
			// caller's `dispatchCtx` (not `r.handlerContext()`) so a cancelled
			// caller stops the reuse promptly. Per-attempt `redisDispatchTimeout`
			// still caps the dispatch the same way `commit()` does for the
			// first attempt; what changes is that an outer cancellation can
			// now interrupt mid-attempt instead of being ignored until the
			// fresh 10 s budget elapses. The earlier "fresh ctx from
			// handlerContext" pattern (noted in design doc §M3) was strictly
			// more conservative but wasted resources on a disconnected
			// client.
			reuseCtx, reuseCancel := context.WithTimeout(dispatchCtx, redisDispatchTimeout)
			// The defer is scoped to this closure, so the reuse ctx is
			// released at the end of each attempt rather than at the end
			// of runTransactionWithDedup.
			defer reuseCancel()
			res, drop, dispErr := r.dispatchExecReuse(reuseCtx, pending)
			if drop {
				// Genuine conflict: forget the captured write set so the
				// next iteration goes through firstExecAttempt again.
				pending = nil
			}
			if dispErr != nil {
				return dispErr
			}
			results = res
			return nil
		}
		res, next, ferr := r.firstExecAttempt(dispatchCtx, queue)
		if ferr != nil {
			// next is non-nil only when the dispatch failed with a
			// retryable error; remember it so the retry reuses the
			// same write set.
			if next != nil {
				pending = next
			}
			return ferr
		}
		results = res
		return nil
	})
	if err != nil {
		return nil, err
	}
	return results, nil
}

// firstExecAttempt runs the initial (no-reuse) EXEC attempt: builds the
// txn snapshot, applies each command to capture the client-visible
// results, validates the read set, and dispatches. On success returns
// the results. On a retryable dispatch failure it returns a
// reusableExecTxn capturing what the retry loop needs to dispatch via
// PrevCommitTS on the next iteration; non-retryable failures return a
// nil reuse state (mirrors listPushCoreWithDedup's gating). Extracted
// from runTransactionWithDedup to keep that loop under the cyclop
// budget; the dedup rationale lives there.
+func (r *RedisServer) firstExecAttempt(dispatchCtx context.Context, queue []redcon.Command) ([]redisResult, *reusableExecTxn, error) { + startTS := r.txnStartTS() + readPin := r.pinReadTS(startTS) + defer readPin.Release() + + txn := &txnContext{ + server: r, + ctx: dispatchCtx, + working: map[string]*txnValue{}, + listStates: map[string]*listTxnState{}, + zsetStates: map[string]*zsetTxnState{}, + ttlStates: map[string]*ttlTxnState{}, + readKeys: map[string][]byte{}, + streamDeletions: map[string][]byte{}, + startTS: startTS, + } + + nextResults := make([]redisResult, 0, len(queue)) + for _, cmd := range queue { + res, err := txn.apply(cmd) + if err != nil { + return nil, nil, err + } + nextResults = append(nextResults, res) + } + + if err := txn.validateReadSet(dispatchCtx); err != nil { + return nil, nil, err + } + + prepared, err := txn.prepareDispatch() + if err != nil { + return nil, nil, err + } + defer prepared.cancel() + if len(prepared.elems) == 0 { + // Read-only EXEC: nothing to dispatch, no dedup window. + return nextResults, nil, nil + } + + group := &kv.OperationGroup[kv.OP]{ + IsTxn: true, + Elems: prepared.elems, + StartTS: txn.startTS, + CommitTS: prepared.commitTS, + ReadKeys: prepared.readKeys, + } + if _, dispErr := r.coordinator.Dispatch(prepared.ctx, group); dispErr != nil { + // Only remember the attempt for reuse if retryRedisWrite will + // actually loop. Mirrors listPushCoreWithDedup's gating + // rationale — errors that escape the loop (transient-leader, + // context deadline, FSM apply error) leave pending pointing at + // state wasted with the goroutine; ambiguous errors that + // escape to the client are out of scope for this loop. 
+ if isRetryableRedisTxnErr(dispErr) { + return nil, &reusableExecTxn{ + elems: prepared.elems, + startTS: txn.startTS, + commitTS: prepared.commitTS, + readKeys: prepared.readKeys, + results: nextResults, + }, errors.WithStack(dispErr) + } + return nil, nil, errors.WithStack(dispErr) + } + return nextResults, nil, nil +} + +func (r *RedisServer) txnStartTS() uint64 { + // store.LastCommitTS() is the authoritative safe-snapshot watermark: it is + // updated atomically only AFTER the corresponding Pebble batch commit, so + // every version with commitTS ≤ store.LastCommitTS() is guaranteed visible + // in the store when we read. + // + // We must NOT return clock.Next() here. clock.Next() can be AHEAD of + // store.LastCommitTS() because concurrent dispatchTxn calls advance the HLC + // before their Raft entry is applied. If startTS = clock.Next() = T, a + // concurrent transaction that already called clock.Next() to obtain + // commitTS = T-1 and is still in the Raft pipeline will satisfy + // latestTS(key) = T-1 ≤ T = startTS + // causing the FSM conflict check (latestTS > startTS) to silently pass even + // though we read stale data. This allows two concurrent RPUSHes to pick the + // same sequence number, with the second overwriting the first — a lost write. + // + // Using store.LastCommitTS() directly closes this gap: any concurrent commit + // at > maxTS triggers a WriteConflict and a retry via retryRedisWrite. + // + // The Observe call still advances the HLC so that dispatchTxn's clock.Next() + // produces a commitTS strictly greater than maxTS (leader-election safety). + // + // When maxTS is 0 (empty store) we return 1 so the coordinator treats this + // as a valid startTS and does not override it with clock.Next() — which + // could be ahead of unapplied Raft entries and reintroduce the anomaly. 
+ var maxTS uint64 + if r.store != nil { + maxTS = r.store.LastCommitTS() + } + if r.coordinator != nil && r.coordinator.Clock() != nil && maxTS > 0 { + r.coordinator.Clock().Observe(maxTS) + } + if maxTS == 0 { + return 1 + } + return maxTS +} + +func (r *RedisServer) writeResults(conn redcon.Conn, results []redisResult) { + conn.WriteArray(len(results)) + for _, res := range results { + switch res.typ { + case resultNil: + conn.WriteNull() + case resultError: + writeRedisError(conn, res.err) + case resultBulk: + conn.WriteBulk(res.bulk) + case resultString: + conn.WriteString(res.str) + case resultArray: + conn.WriteArray(len(res.arr)) + for _, s := range res.arr { + conn.WriteBulkString(s) + } + case resultInt: + conn.WriteInt64(res.integer) + default: + conn.WriteNull() + } + } +} diff --git a/adapter/redis_zset_cmds.go b/adapter/redis_zset_cmds.go new file mode 100644 index 00000000..082b920d --- /dev/null +++ b/adapter/redis_zset_cmds.go @@ -0,0 +1,1155 @@ +package adapter + +import ( + "bytes" + "context" + "errors" + "fmt" + "math" + "strconv" + "strings" + "time" + + "github.com/bootjp/elastickv/kv" + "github.com/bootjp/elastickv/monitoring" + "github.com/bootjp/elastickv/store" + cockerrors "github.com/cockroachdb/errors" + "github.com/tidwall/redcon" +) + +type zrangeOptions struct { + withScores bool + reverse bool +} + +type bzpopminResult struct { + key []byte + entry redisZSetEntry +} + +// zsetMemberFastScore probes the wide-column score entry for (key, +// member) directly and reports whether it is present and TTL-alive. +// Priority-alignment scope mirrors hashFieldFastLookup: only the +// redisStrKey dual-encoding case is guarded (see +// hasHigherPriorityStringEncoding's narrow-scope caveats). Callers +// must fall back to the full zsetState loader on hit=false to cover +// legacy-blob zsets and nil / WRONGTYPE disambiguation. 
+// +// Probe ORDER matches hashFieldFastLookup / setMemberFastExists / +// hashFieldFastExists post-PR #565: hit the wide-column score key +// first so the negative case (missing, legacy-blob, wrong-type) does +// not pay the priority-guard seek. +func (r *RedisServer) zsetMemberFastScore(ctx context.Context, key, member []byte, readTS uint64) (score float64, hit, alive bool, err error) { + raw, err := r.store.GetAt(ctx, store.ZSetMemberKey(key, member), readTS) + if err != nil { + if cockerrors.Is(err, store.ErrKeyNotFound) { + return 0, false, false, nil + } + return 0, false, false, cockerrors.WithStack(err) + } + score, err = store.UnmarshalZSetScore(raw) + if err != nil { + return 0, false, false, cockerrors.WithStack(err) + } + if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { + return 0, false, false, hErr + } else if higher { + return 0, false, false, nil + } + expired, expErr := r.hasExpired(ctx, key, readTS, true) + if expErr != nil { + return 0, false, false, cockerrors.WithStack(expErr) + } + return score, true, !expired, nil +} + +// zsetRangeByScoreFast streams the score index for key over the +// caller-supplied [startKey, endKey) byte range, returning the +// decoded entries up to offset+limit. This replaces the +// load-the-whole-zset path used by cmdZRangeByScore / cmdZRevRangeByScore +// when the caller has no script-local mutations and the zset is in +// wide-column form. For a delay-queue poll ("next 10 jobs due by +// now") the cost goes from O(N) member GetAts to O(range_width + +// offset + limit) score-index entries. +// +// hit=false means the fast path cannot safely answer (legacy-blob +// zset present, string-encoding corruption, or empty-result case +// where we cannot distinguish "zset is empty in this range" from +// "key exists as another type / is missing"). Callers MUST take +// the slow path on hit=false so keyTypeAt disambiguation fires. 
+// reason carries the specific hit=false branch so observers can +// subdivide fallback rates for dashboarding; "" when hit=true. +// +// scoreInRange filter is applied post-scan for exclusive bound +// edge cases; the caller supplies precomputed scan bounds that +// over-approximate toward INclusive and lets this helper filter. +func (r *RedisServer) zsetRangeByScoreFast( + ctx context.Context, + key, startKey, endKey []byte, + reverse bool, + offset, limit int, + scoreFilter func(float64) bool, + readTS uint64, +) ([]redisZSetEntry, bool, monitoring.LuaFastPathFallbackReason, error) { + if eligible, err := r.zsetFastPathEligible(ctx, key, readTS); err != nil || !eligible { + return nil, false, monitoring.LuaFastPathFallbackIneligible, err + } + // Large-offset short-circuit: once offset >= maxWideScanLimit, + // the fast path can only scan maxWideScanLimit rows then skip all + // of them -- guaranteed wasted I/O. Defer to the slow path + // immediately so it can answer from the full member load without + // the redundant score-index scan. + if offset >= maxWideScanLimit { + return nil, false, monitoring.LuaFastPathFallbackLargeOffset, nil + } + scanLimit := zsetFastScanLimit(offset, limit) + if scanLimit <= 0 || bytes.Compare(startKey, endKey) >= 0 { + hit, reason, err := r.zsetRangeEmptyFastResult(ctx, key, readTS) + return nil, hit, reason, err + } + kvs, err := r.zsetScoreScan(ctx, startKey, endKey, scanLimit, reverse, readTS) + if err != nil { + return nil, false, monitoring.LuaFastPathFallbackOther, err + } + return r.finalizeZSetFastRange(ctx, key, kvs, offset, limit, scanLimit, scoreFilter, readTS) +} + +// finalizeZSetFastRange runs the post-scan priority guard, decodes +// the candidate score rows into redisZSetEntry, and applies the TTL +// filter. Factored out so zsetRangeByScoreFast stays under the +// cyclomatic-complexity cap. 
+// +// Takes scanLimit so we can detect a saturated scan: if the scanner +// returned exactly scanLimit rows AND the caller's request is not +// satisfied (unbounded limit, or collected fewer entries than limit), +// there MAY be more entries beyond the scan window. In that case we +// return hit=false so the slow path can produce the authoritative +// answer -- the fast path MUST NOT silently truncate. +func (r *RedisServer) finalizeZSetFastRange( + ctx context.Context, key []byte, kvs []*store.KVPair, + offset, limit, scanLimit int, scoreFilter func(float64) bool, readTS uint64, +) ([]redisZSetEntry, bool, monitoring.LuaFastPathFallbackReason, error) { + // Priority guard runs after a candidate hit (mirrors post-PR #565 + // ordering). Skip it on empty result -- the empty-result tail + // handles disambiguation. + if len(kvs) > 0 { + if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { + return nil, false, monitoring.LuaFastPathFallbackOther, hErr + } else if higher { + return nil, false, monitoring.LuaFastPathFallbackWrongType, nil + } + } + entries := decodeZSetScoreRange(key, kvs, offset, limit, scoreFilter) + // Truncation guard: the raw scanner hit its cap AND the caller did + // not get a satisfied result. Entries beyond the window may + // exist; defer to the slow path for correctness. 
+ if zsetFastPathTruncated(len(kvs), scanLimit, len(entries), limit) { + return nil, false, monitoring.LuaFastPathFallbackTruncated, nil + } + if len(entries) == 0 { + hit, reason, err := r.zsetRangeEmptyFastResult(ctx, key, readTS) + return nil, hit, reason, err + } + expired, expErr := r.hasExpired(ctx, key, readTS, true) + if expErr != nil { + return nil, false, monitoring.LuaFastPathFallbackOther, cockerrors.WithStack(expErr) + } + if expired { + return nil, true, "", nil + } + return entries, true, "", nil +} + +// zsetFastPathTruncated reports whether the bounded score-index scan +// may have dropped entries that the caller's request would otherwise +// include. Returns true when the scanner returned the full quota +// (scannedRows == scanLimit) AND the caller's request is still +// unsatisfied (unbounded limit or collectedEntries < limit). In that +// case the caller must fall back to the slow full-load path to get +// the authoritative result. +func zsetFastPathTruncated(scannedRows, scanLimit, collectedEntries, limit int) bool { + if scannedRows < scanLimit { + return false + } + if limit < 0 { + return true + } + return collectedEntries < limit +} + +// zsetFastPathEligible returns false (without error) when a legacy- +// blob zset is present; the caller must take the slow path so +// ensureZSetLoaded / blob decoding runs. +func (r *RedisServer) zsetFastPathEligible(ctx context.Context, key []byte, readTS uint64) (bool, error) { + legacyExists, err := r.store.ExistsAt(ctx, redisZSetKey(key), readTS) + if err != nil { + return false, cockerrors.WithStack(err) + } + return !legacyExists, nil +} + +// zsetFastScanLimit clamps offset+limit to maxWideScanLimit so an +// unbounded or malicious LIMIT cannot force an O(N) scan of a large +// zset. A negative limit means "unbounded" at the Redis level; cap it +// at the collection OOM limit. +// +// Check bounds BEFORE adding to avoid signed-integer overflow on +// hostile input (e.g. 
a Lua script passing offset=limit=math.MaxInt). +// A wrap would produce a negative scanLimit and cause the caller's +// `scanLimit <= 0` branch to misroute a live zset into the +// empty-result tail. +func zsetFastScanLimit(offset, limit int) int { + // limit == 0: the caller wants zero entries regardless of offset. + // Return 0 so the caller's `scanLimit <= 0` branch routes to the + // empty-result tail (which still runs resolveZSetMeta for proper + // WRONGTYPE / existence disambiguation) instead of a pointless + // full-quota scan. + if limit == 0 { + return 0 + } + if limit < 0 { + return maxWideScanLimit + } + if offset >= maxWideScanLimit { + return maxWideScanLimit + } + if limit > maxWideScanLimit-offset { + return maxWideScanLimit + } + return offset + limit +} + +// zsetScoreScan picks Forward / Reverse ScanAt based on direction. +func (r *RedisServer) zsetScoreScan( + ctx context.Context, startKey, endKey []byte, scanLimit int, reverse bool, readTS uint64, +) ([]*store.KVPair, error) { + if reverse { + kvs, err := r.store.ReverseScanAt(ctx, startKey, endKey, scanLimit, readTS) + return kvs, cockerrors.WithStack(err) + } + kvs, err := r.store.ScanAt(ctx, startKey, endKey, scanLimit, readTS) + return kvs, cockerrors.WithStack(err) +} + +// zsetDecodeAllocSize returns a tight upper bound on the collected +// entry count for decodeZSetScoreRange: (kvLen - offset) capped by +// limit, never negative. Avoiding a make([]...len(kvs)) saves up to +// maxWideScanLimit entries of wasted slice capacity when the caller +// asked for a small window at a large offset. +func zsetDecodeAllocSize(kvLen, offset, limit int) int { + allocSize := kvLen - offset + if allocSize < 0 { + return 0 + } + if limit >= 0 && limit < allocSize { + return limit + } + return allocSize +} + +// decodeZSetScoreRange decodes score-index scan results into +// redisZSetEntry, applying the post-scan score filter (exclusive +// bound edges) and the offset / limit pagination. 
Entries that fail +// to decode are silently dropped -- they can only appear under data +// corruption. +func decodeZSetScoreRange( + key []byte, kvs []*store.KVPair, offset, limit int, scoreFilter func(float64) bool, +) []redisZSetEntry { + entries := make([]redisZSetEntry, 0, zsetDecodeAllocSize(len(kvs), offset, limit)) + skipped := 0 + for _, kv := range kvs { + score, member, ok := store.ExtractZSetScoreAndMember(kv.Key, key) + if !ok { + continue + } + if scoreFilter != nil && !scoreFilter(score) { + continue + } + // Check limit saturation BEFORE the offset skip so a small + // limit with a large offset exits immediately instead of + // burning offset iterations on the skip branch. Correct for + // any (offset, limit): once len(entries) >= limit we are done + // regardless of remaining skip budget. + if limit >= 0 && len(entries) >= limit { + break + } + if skipped < offset { + skipped++ + continue + } + entries = append(entries, redisZSetEntry{Member: string(member), Score: score}) + } + return entries +} + +// zsetRangeEmptyFastResult is the empty-result tail: either the +// score range is genuinely empty on a live zset (return empty + +// hit=true) or the zset does not exist in wide-column form (return +// hit=false so the caller takes the slow path for WRONGTYPE / missing +// disambiguation). +// +// Uses resolveZSetMeta so delta-only wide zsets (a fresh zset whose +// base meta has not been persisted yet, only delta rows) are detected +// as "exists". Using a plain ExistsAt on ZSetMetaKey would miss those +// and force the slow path unnecessarily. Also runs the string-priority +// guard so a corrupted redisStrKey + zset meta surfaces WRONGTYPE via +// the slow path rather than an empty array. +// zsetRangeEmptyFastResult returns (hit, reason, err) for the empty- +// result tail. 
hit=true means the key is a live zset whose score +// range is simply empty (callers return an empty array and no +// fallback); hit=false carries a specific fallback reason so the +// caller can route its slow-path observation accordingly. +func (r *RedisServer) zsetRangeEmptyFastResult(ctx context.Context, key []byte, readTS uint64) (bool, monitoring.LuaFastPathFallbackReason, error) { + _, zsetExists, err := r.resolveZSetMeta(ctx, key, readTS) + if err != nil { + return false, monitoring.LuaFastPathFallbackOther, cockerrors.WithStack(err) + } + if !zsetExists { + // The key has no ZSet encoding at readTS. Redis semantics: + // - key truly absent → ZRANGEBYSCORE returns empty + // - key is another type → ZRANGEBYSCORE returns WRONGTYPE + // Production metric (PR #572) showed this branch is the + // hot-path dominant outcome (~96% of ZRANGEBYSCORE calls on + // BullMQ-style workloads that poll an empty delayed queue). + // Punting every such call to the slow path repeats the same + // 3-probe member/meta/delta scan we just did and then + // re-probes all other types anyway -- pure duplicate I/O. + // + // Short-circuit: use keyTypeAt (logical type after TTL check) + // to distinguish "truly absent" from "wrong type". If None, + // return hit=true with an empty result -- that is the correct + // Redis answer and saves the slow-path round-trip. Otherwise + // fall back so the slow path can produce WRONGTYPE. 
+ typ, typErr := r.keyTypeAtExpect(ctx, key, readTS, redisTypeZSet) + if typErr != nil { + return false, monitoring.LuaFastPathFallbackOther, cockerrors.WithStack(typErr) + } + if typ == redisTypeNone { + return true, "", nil + } + return false, monitoring.LuaFastPathFallbackWrongType, nil + } + if higher, hErr := r.hasHigherPriorityStringEncoding(ctx, key, readTS); hErr != nil { + return false, monitoring.LuaFastPathFallbackOther, hErr + } else if higher { + return false, monitoring.LuaFastPathFallbackWrongType, nil + } + // hasExpired is called for its error-surfacing side effect only: + // whether the zset is expired or not, a live zset with no members + // in range returns an empty hit=true result. Keep the call so + // storage errors during TTL resolution still propagate. + if _, expErr := r.hasExpired(ctx, key, readTS, true); expErr != nil { + return false, monitoring.LuaFastPathFallbackOther, cockerrors.WithStack(expErr) + } + return true, "", nil +} + +type zaddFlags struct { + nx bool // only add new elements + xx bool // only update existing elements + gt bool // only update when new score > current score + lt bool // only update when new score < current score +} + +func parseZAddFlags(args [][]byte) (zaddFlags, int, error) { + var flags zaddFlags + i := 2 + for i < len(args) { + if !flags.applyFlag(strings.ToUpper(string(args[i]))) { + break + } + i++ + } + if err := flags.validate(); err != nil { + return zaddFlags{}, 0, err + } + return flags, i, nil +} + +func (f *zaddFlags) applyFlag(name string) bool { + switch name { + case "NX": + f.nx = true + case "XX": + f.xx = true + case "GT": + f.gt = true + case "LT": + f.lt = true + default: + return false + } + return true +} + +func (f zaddFlags) allows(exists bool, oldScore, newScore float64) bool { + if (f.nx && exists) || (f.xx && !exists) { + return false + } + return !exists || f.scoreAllowed(oldScore, newScore) +} + +func (f zaddFlags) scoreAllowed(oldScore, newScore float64) bool { + if f.gt && 
newScore <= oldScore { + return false + } + if f.lt && newScore >= oldScore { + return false + } + return true +} + +func (f zaddFlags) validate() error { + if f.nx && f.xx { + return fmt.Errorf("ERR XX and NX options at the same time are not compatible") + } + if f.nx && (f.gt || f.lt) { + return fmt.Errorf("ERR GT, LT, and NX options at the same time are not compatible") + } + return nil +} + +type zaddPair struct { + score float64 + member string +} + +func parseZAddPairs(remaining [][]byte) ([]zaddPair, error) { + pairs := make([]zaddPair, 0, len(remaining)/redisPairWidth) + for i := 0; i < len(remaining); i += redisPairWidth { + score, err := strconv.ParseFloat(string(remaining[i]), 64) + if err != nil { + return nil, fmt.Errorf("parse zadd score: %w", err) + } + pairs = append(pairs, zaddPair{score: score, member: string(remaining[i+1])}) + } + return pairs, nil +} + +func (r *RedisServer) zadd(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + flags, pairStart, err := parseZAddFlags(cmd.Args) + if err != nil { + writeRedisError(conn, err) + return + } + remaining := cmd.Args[pairStart:] + if len(remaining) == 0 || len(remaining)%redisPairWidth != 0 { + conn.WriteError("ERR syntax error") + return + } + pairs, err := parseZAddPairs(remaining) + if err != nil { + writeRedisError(conn, err) + return + } + + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var added int + if err := r.retryRedisWrite(ctx, func() error { + var err error + added, err = r.zaddTxn(ctx, cmd.Args[1], flags, pairs) + return err + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(added) +} + +// buildZSetMigrationView extracts member→score from ZSet migration Put elems +// so that applyZAddPair can see migrated members without a store round-trip. +// Returns a map from member name to score; absent members were not migrated. 
+func buildZSetMigrationView(migrationElems []*kv.Elem[kv.OP], key []byte) map[string]float64 { + view := make(map[string]float64) + for _, elem := range migrationElems { + if elem.Op != kv.Put { + continue + } + m := store.ExtractZSetMemberName(elem.Key, key) + if m == nil { + continue + } + score, err := store.UnmarshalZSetScore(elem.Value) + if err == nil { + view[string(m)] = score + } + } + return view +} + +// resolveZSetMemberScore returns the current score and existence for a ZSet +// member. It checks inTxnView first (covers migration elems and earlier pairs +// in the same ZADD call), then falls back to a store GetAt. +func (r *RedisServer) resolveZSetMemberScore(ctx context.Context, memberKey []byte, member string, readTS uint64, inTxnView map[string]float64) (score float64, exists bool, err error) { + if s, ok := inTxnView[member]; ok { + return s, true, nil + } + raw, getErr := r.store.GetAt(ctx, memberKey, readTS) + if getErr == nil { + s, unmarshalErr := store.UnmarshalZSetScore(raw) + if unmarshalErr != nil { + return 0, false, cockerrors.WithStack(unmarshalErr) + } + return s, true, nil + } + if !cockerrors.Is(getErr, store.ErrKeyNotFound) { + return 0, false, cockerrors.WithStack(getErr) + } + return 0, false, nil +} + +// applyZAddPair processes one ZADD pair against the wide-column store: reads the +// existing member score (if any), checks the ZADD flags, emits del-old-score / +// put-member / put-score-index ops, and returns the updated elems, the add count +// (0 or 1), and the length delta (0 or +1). +// inTxnView provides an in-transaction view of member→score for members written +// in the same transaction (migration or earlier pairs); checked before GetAt so +// migrated and duplicate members are handled correctly. 
+func (r *RedisServer) applyZAddPair(ctx context.Context, key []byte, p zaddPair, flags zaddFlags, readTS uint64, elems []*kv.Elem[kv.OP], inTxnView map[string]float64) ([]*kv.Elem[kv.OP], int, int64, error) { + memberKey := store.ZSetMemberKey(key, []byte(p.member)) + oldScore, memberExists, err := r.resolveZSetMemberScore(ctx, memberKey, p.member, readTS, inTxnView) + if err != nil { + return nil, 0, 0, err + } + if !flags.allows(memberExists, oldScore, p.score) { + return elems, 0, 0, nil + } + if memberExists { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: store.ZSetScoreKey(key, oldScore, []byte(p.member))}) + } + elems = append(elems, + &kv.Elem[kv.OP]{Op: kv.Put, Key: memberKey, Value: store.MarshalZSetScore(p.score)}, + &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ZSetScoreKey(key, p.score, []byte(p.member)), Value: []byte{}}, + ) + // Update inTxnView so subsequent pairs (duplicates) see this write. + inTxnView[p.member] = p.score + if memberExists { + return elems, 0, 0, nil + } + return elems, 1, 1, nil +} + +func (r *RedisServer) zaddTxn(ctx context.Context, key []byte, flags zaddFlags, pairs []zaddPair) (int, error) { + readTS := r.readTS() + if err := r.requireKeyTypeOrEmpty(ctx, key, readTS, redisTypeZSet); err != nil { + return 0, err + } + + commitTS, err := r.coordinator.Clock().NextFenced() + if err != nil { + return 0, cockerrors.Wrap(err, "zaddTxn: allocate commitTS") + } + + migrationElems, err := r.buildZSetLegacyMigrationElems(ctx, key, readTS) + if err != nil { + return 0, err + } + // Capacity: each pair may produce 3 ops (del old score + put member + put score index), + // plus migration elems and a delta key. + elems := make([]*kv.Elem[kv.OP], 0, len(migrationElems)+len(pairs)*3+setWideColOverhead) //nolint:mnd // 3 ops per pair + elems = append(elems, migrationElems...) + + // Seed the in-transaction view from migration elems so that migrated + // members are not incorrectly counted as new by applyZAddPair. 
+ inTxnView := buildZSetMigrationView(migrationElems, key) + + // For large batches, mergeZSetBulkScores performs one prefix scan that + // eliminates O(N) GetAt calls inside applyZAddPair; it is a no-op for + // batches below wideColumnBulkScanThreshold. + inTxnView, err = r.mergeZSetBulkScores(ctx, key, readTS, len(pairs), inTxnView) + if err != nil { + return 0, err + } + + added := 0 + lenDelta := int64(0) + for _, p := range pairs { + var c int + var d int64 + elems, c, d, err = r.applyZAddPair(ctx, key, p, flags, readTS, elems, inTxnView) + if err != nil { + return 0, err + } + added += c + lenDelta += d + } + + if len(elems) == 0 { + return 0, nil + } + + if lenDelta != 0 { + deltaVal := store.MarshalZSetMetaDelta(store.ZSetMetaDelta{LenDelta: lenDelta}) + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.ZSetMetaDeltaKey(key, commitTS, 0), + Value: deltaVal, + }) + } + + return added, r.dispatchAndSignalZSet(ctx, readTS, commitTS, elems, key) +} + +// dispatchAndSignalZSet dispatches the elems through the coordinator +// and, on success, wakes any BZPOPMIN waiter on the same node. +// coordinator.Dispatch blocks until the FSM applies locally, so by +// the time Signal fires the new members are visible at the readTS +// the woken waiter will pick on its next iteration. Pulled out of +// zaddTxn / zincrbyTxn so the parents stay under the cyclop budget +// — the signal step would otherwise add an extra branch on the +// dispatch error path. +func (r *RedisServer) dispatchAndSignalZSet( + ctx context.Context, + readTS, commitTS uint64, + elems []*kv.Elem[kv.OP], + zsetKey []byte, +) error { + _, err := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: normalizeStartTS(readTS), + CommitTS: commitTS, + Elems: elems, + }) + if err != nil { + return cockerrors.WithStack(err) + } + r.zsetWaiters.Signal(zsetKey) + return nil +} + +// zincrbyTxn performs one attempt of ZINCRBY in wide-column format. 
+// Returns the new score after applying increment. +func (r *RedisServer) zincrbyTxn(ctx context.Context, key []byte, member string, increment float64) (float64, error) { + readTS := r.readTS() + if err := r.requireKeyTypeOrEmpty(ctx, key, readTS, redisTypeZSet); err != nil { + return 0, err + } + + memberKey := store.ZSetMemberKey(key, []byte(member)) + commitTS, err := r.coordinator.Clock().NextFenced() + if err != nil { + return 0, cockerrors.Wrap(err, "zincrbyTxn: allocate commitTS") + } + + migrationElems, migErr := r.buildZSetLegacyMigrationElems(ctx, key, readTS) + if migErr != nil { + return 0, migErr + } + + // Check in-txn migration view before falling back to the store + // (migrated keys are not yet visible at readTS). + inTxnView := buildZSetMigrationView(migrationElems, key) + oldScore, memberExists, err := r.resolveZSetMemberScore(ctx, memberKey, member, readTS, inTxnView) + if err != nil { + return 0, err + } + + newScore := oldScore + increment + if math.IsNaN(newScore) { + return 0, errors.New("ERR resulting score is not a number (NaN)") + } + elems := make([]*kv.Elem[kv.OP], 0, len(migrationElems)+3) //nolint:mnd // del old score + put member + put score index + elems = append(elems, migrationElems...) 
+ if memberExists { + elems = append(elems, &kv.Elem[kv.OP]{Op: kv.Del, Key: store.ZSetScoreKey(key, oldScore, []byte(member))}) + } + elems = append(elems, + &kv.Elem[kv.OP]{Op: kv.Put, Key: memberKey, Value: store.MarshalZSetScore(newScore)}, + &kv.Elem[kv.OP]{Op: kv.Put, Key: store.ZSetScoreKey(key, newScore, []byte(member)), Value: []byte{}}, + ) + if !memberExists { + deltaVal := store.MarshalZSetMetaDelta(store.ZSetMetaDelta{LenDelta: 1}) + elems = append(elems, &kv.Elem[kv.OP]{ + Op: kv.Put, + Key: store.ZSetMetaDeltaKey(key, commitTS, 0), + Value: deltaVal, + }) + } + if err := r.dispatchAndSignalZSet(ctx, readTS, commitTS, elems, key); err != nil { + return 0, err + } + return newScore, nil +} + +func (r *RedisServer) zincrby(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + increment, err := strconv.ParseFloat(string(cmd.Args[2]), 64) + if err != nil { + writeRedisError(conn, err) + return + } + + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var newScore float64 + if err := r.retryRedisWrite(ctx, func() error { + var txnErr error + newScore, txnErr = r.zincrbyTxn(ctx, cmd.Args[1], string(cmd.Args[3]), increment) + return txnErr + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteBulkString(formatRedisFloat(newScore)) +} + +func parseZRangeOptions(args [][]byte) (zrangeOptions, error) { + opts := zrangeOptions{} + for _, arg := range args { + switch strings.ToUpper(string(arg)) { + case "WITHSCORES": + opts.withScores = true + case "REV": + opts.reverse = true + default: + return zrangeOptions{}, errors.New("ERR syntax error") + } + } + return opts, nil +} + +func reverseZSetEntries(entries []redisZSetEntry) { + for i, j := 0, len(entries)-1; i < j; i, j = i+1, j-1 { + entries[i], entries[j] = entries[j], entries[i] + } +} + +func writeZRangeReply(conn redcon.Conn, entries []redisZSetEntry, withScores bool) { + if withScores { + 
conn.WriteArray(len(entries) * redisPairWidth) + for _, entry := range entries { + conn.WriteBulkString(entry.Member) + conn.WriteBulkString(formatRedisFloat(entry.Score)) + } + return + } + + conn.WriteArray(len(entries)) + for _, entry := range entries { + conn.WriteBulkString(entry.Member) + } +} + +func removeZSetMembers(members map[string]float64, rawMembers [][]byte) int { + removed := 0 + for _, member := range rawMembers { + memberKey := string(member) + if _, ok := members[memberKey]; ok { + delete(members, memberKey) + removed++ + } + } + return removed +} + +func (r *RedisServer) persistZSetEntriesTxn(ctx context.Context, key []byte, readTS uint64, entries []redisZSetEntry) error { + if len(entries) == 0 { + elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) + if err != nil { + return err + } + return r.dispatchElems(ctx, true, readTS, elems) + } + payload, err := marshalZSetValue(redisZSetValue{Entries: entries}) + if err != nil { + return err + } + return r.dispatchElems(ctx, true, readTS, []*kv.Elem[kv.OP]{ + {Op: kv.Put, Key: redisZSetKey(key), Value: payload}, + }) +} + +func (r *RedisServer) zrange(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + start, err := parseInt(cmd.Args[2]) + if err != nil { + writeRedisError(conn, err) + return + } + stop, err := parseInt(cmd.Args[3]) + if err != nil { + writeRedisError(conn, err) + return + } + + opts, err := parseZRangeOptions(cmd.Args[4:]) + if err != nil { + writeRedisError(conn, err) + return + } + + r.zrangeRead(conn, cmd.Args[1], start, stop, opts) +} + +func (r *RedisServer) zrangeRead(conn redcon.Conn, key []byte, start, stop int, opts zrangeOptions) { + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(context.Background(), key, readTS, redisTypeZSet) + if err != nil { + writeRedisError(conn, err) + return + } + if typ == redisTypeNone { + conn.WriteArray(0) + return + } + if typ != redisTypeZSet { + conn.WriteError(wrongTypeMessage) 
+ return + } + + value, _, err := r.loadZSetAt(context.Background(), key, readTS) + if err != nil { + writeRedisError(conn, err) + return + } + entries := append([]redisZSetEntry(nil), value.Entries...) + if opts.reverse { + reverseZSetEntries(entries) + } + s, e := normalizeRankRange(start, stop, len(entries)) + if e < s { + conn.WriteArray(0) + return + } + writeZRangeReply(conn, entries[s:e+1], opts.withScores) +} + +func (r *RedisServer) zrem(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var removed int + if err := r.retryRedisWrite(ctx, func() error { + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(ctx, cmd.Args[1], readTS, redisTypeZSet) + if err != nil { + return err + } + if typ == redisTypeNone { + removed = 0 + return nil + } + if typ != redisTypeZSet { + return wrongTypeError() + } + value, _, err := r.loadZSetAt(context.Background(), cmd.Args[1], readTS) + if err != nil { + return err + } + members := zsetEntriesToMap(value.Entries) + removed = removeZSetMembers(members, cmd.Args[2:]) + if removed == 0 { + return nil + } + return r.persistZSetEntriesTxn(ctx, cmd.Args[1], readTS, zsetMapToEntries(members)) + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(removed) +} + +func (r *RedisServer) zremrangebyrank(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + start, err := parseInt(cmd.Args[2]) + if err != nil { + writeRedisError(conn, err) + return + } + stop, err := parseInt(cmd.Args[3]) + if err != nil { + writeRedisError(conn, err) + return + } + + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var removed int + if err := r.retryRedisWrite(ctx, func() error { + readTS := r.readTS() + typ, err := r.keyTypeAtExpect(ctx, cmd.Args[1], readTS, redisTypeZSet) + if err != 
nil { + return err + } + if typ == redisTypeNone { + removed = 0 + return nil + } + if typ != redisTypeZSet { + return wrongTypeError() + } + value, _, err := r.loadZSetAt(context.Background(), cmd.Args[1], readTS) + if err != nil { + return err + } + s, e := normalizeRankRange(start, stop, len(value.Entries)) + if e < s { + removed = 0 + return nil + } + remaining := append([]redisZSetEntry{}, value.Entries[:s]...) + remaining = append(remaining, value.Entries[e+1:]...) + removed = e - s + 1 + return r.persistZSetEntriesTxn(ctx, cmd.Args[1], readTS, remaining) + }); err != nil { + writeRedisError(conn, err) + return + } + conn.WriteInt(removed) +} + +// tryBZPopMinWithMode runs one BZPOPMIN attempt against key. The +// fast flag selects keyTypeAtExpectFast (no slow-path fallback, no +// wrongType detection) when true; the caller MUST guarantee that the +// only mutations since the previous full check are signalling writes +// (ZADD/ZINCRBY for zsetWaiters). bzpopminWaitLoop enforces this by +// running fast=false on the first iteration and after every +// fallback-timer wake or wall-time-bounded re-arm. 
+func (r *RedisServer) tryBZPopMinWithMode(key []byte, fast bool) (*bzpopminResult, error) { + ctx, cancel := context.WithTimeout(context.Background(), redisDispatchTimeout) + defer cancel() + var result *bzpopminResult + err := r.retryRedisWrite(ctx, func() error { + readTS := r.readTS() + var typ redisValueType + var err error + if fast { + typ, err = r.keyTypeAtExpectFast(ctx, key, readTS, redisTypeZSet) + } else { + typ, err = r.keyTypeAtExpect(ctx, key, readTS, redisTypeZSet) + } + if err != nil { + return err + } + if typ == redisTypeNone { + result = nil + return nil + } + if typ != redisTypeZSet { + return wrongTypeError() + } + value, _, err := r.loadZSetAt(context.Background(), key, readTS) + if err != nil { + return err + } + if len(value.Entries) == 0 { + result = nil + return nil + } + popped := value.Entries[0] + remaining := append([]redisZSetEntry(nil), value.Entries[1:]...) + + // Detect wide-column storage. + memberPrefix := store.ZSetMemberScanPrefix(key) + memberEnd := store.PrefixScanEnd(memberPrefix) + probeKVs, probeErr := r.store.ScanAt(ctx, memberPrefix, memberEnd, 1, readTS) + if probeErr != nil { + return cockerrors.WithStack(probeErr) + } + isWide := len(probeKVs) > 0 + + if err := r.persistBZPopMinResult(ctx, key, readTS, popped, remaining, isWide); err != nil { + return err + } + result = &bzpopminResult{key: key, entry: popped} + return nil + }) + return result, err +} + +func (r *RedisServer) persistBZPopMinResult(ctx context.Context, key []byte, readTS uint64, popped redisZSetEntry, remaining []redisZSetEntry, isWide bool) error { + if len(remaining) == 0 { + elems, _, err := r.deleteLogicalKeyElems(ctx, key, readTS) + if err != nil { + return err + } + return r.dispatchElems(ctx, true, readTS, elems) + } + if isWide { + // Wide-column: delete the popped member key + score index, emit delta -1. 
+ commitTS, err := r.coordinator.Clock().NextFenced() + if err != nil { + return cockerrors.Wrap(err, "persistBZPopMinResult: allocate commitTS") + } + deltaVal := store.MarshalZSetMetaDelta(store.ZSetMetaDelta{LenDelta: -1}) + elems := []*kv.Elem[kv.OP]{ + {Op: kv.Del, Key: store.ZSetMemberKey(key, []byte(popped.Member))}, + {Op: kv.Del, Key: store.ZSetScoreKey(key, popped.Score, []byte(popped.Member))}, + {Op: kv.Put, Key: store.ZSetMetaDeltaKey(key, commitTS, 0), Value: deltaVal}, + } + _, dispatchErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: normalizeStartTS(readTS), + CommitTS: commitTS, + Elems: elems, + }) + return cockerrors.WithStack(dispatchErr) + } + // Legacy blob: write back all remaining entries. + payload, err := marshalZSetValue(redisZSetValue{Entries: remaining}) + if err != nil { + return err + } + return r.dispatchElems(ctx, true, readTS, []*kv.Elem[kv.OP]{ + {Op: kv.Put, Key: redisZSetKey(key), Value: payload}, + }) +} + +func (r *RedisServer) bzpopmin(conn redcon.Conn, cmd redcon.Command) { + if r.proxyToLeader(conn, cmd, cmd.Args[1]) { + return + } + timeoutSeconds, err := strconv.ParseFloat(string(cmd.Args[len(cmd.Args)-1]), 64) + if err != nil || timeoutSeconds < 0 { + conn.WriteError("ERR timeout is not a float or out of range") + return + } + + // timeout=0 means infinite wait in Redis; cap at redisDispatchTimeout to prevent goroutine leak. + if timeoutSeconds == 0 { + timeoutSeconds = redisDispatchTimeout.Seconds() + } + deadline := time.Now().Add(time.Duration(timeoutSeconds * float64(time.Second))) + + keys := cmd.Args[1 : len(cmd.Args)-1] + r.bzpopminWaitLoop(conn, keys, deadline) +} + +// bzpopminWaitLoop runs the BLOCK-window wait loop. Extracted from +// bzpopmin so the parent function stays under the cyclop budget. +// Uses an event-driven signal from the in-process ZADD / ZINCRBY +// path with a fallback timer for paths that bypass the signal. 
+// +// Registration happens BEFORE the first tryBZPopMin so a signal that +// fires between the check and the wait cannot be lost: the buffered +// channel holds it, and the next select wakes immediately. +func (r *RedisServer) bzpopminWaitLoop(conn redcon.Conn, keys [][]byte, deadline time.Time) { + handlerCtx := r.handlerContext() + w, release := r.zsetWaiters.Register(keys) + defer release() + // fast tracks whether the next iteration may skip the wrongType + // slow probe. The first iteration is always full so an existing + // wrongType key surfaces an immediate WRONGTYPE; subsequent + // iterations after a signal-driven wake skip the wrongType + // detection because zsetWaiters.Signal only fires for ZADD / + // ZINCRBY (neither of which can introduce a wrongType). + // + // lastFullCheck wall-time-bounds how long the fast mode can stay + // active under sustained signal pressure. Without this gate, a + // hot key whose zsetWaiters.Signal fires faster than each + // bzpopminTryAllKeys round finishes can keep waiterC perpetually + // full, starving the fallback timer and letting a wrongType + // write on a co-registered key (multi-key BZPOPMIN) go + // undetected for the entire BLOCK window. Demoting `fast` back + // to false after redisBlockWaitFallback elapses since the last + // full check restores the #666 ceiling: WRONGTYPE on any + // registered key surfaces within ~one fallback interval (100 ms) + // regardless of signal rate. See + // TestRedis_BZPopMinDetectsWrongTypeUnderSignalLoad for the + // regression scenario. 
+ fast := false + lastFullCheck := time.Now() + for { + if handlerCtx.Err() != nil { + conn.WriteNull() + return + } + if r.bzpopminTryAllKeys(conn, keys, fast) { + return + } + if !fast { + lastFullCheck = time.Now() + } + if !time.Now().Before(deadline) { + conn.WriteNull() + return + } + signaled := waitForBlockedCommandUpdate(handlerCtx, w.C, deadline) + fast = signaled && time.Since(lastFullCheck) < redisBlockWaitFallback + } +} + +// bzpopminTryAllKeys runs one tryBZPopMinWithMode pass across keys. +// Returns true when a result was written (success or terminal error) +// and the caller should stop the loop, false to continue waiting. +// The fast flag is forwarded to tryBZPopMinWithMode: true selects +// the signal-driven-wake path (skips wrongType detection); false +// selects the full check. +func (r *RedisServer) bzpopminTryAllKeys(conn redcon.Conn, keys [][]byte, fast bool) bool { + for _, key := range keys { + result, err := r.tryBZPopMinWithMode(key, fast) + if err != nil { + writeRedisError(conn, err) + return true + } + if result == nil { + continue + } + conn.WriteArray(redisTripletWidth) + conn.WriteBulk(result.key) + conn.WriteBulkString(result.entry.Member) + conn.WriteBulkString(formatRedisFloat(result.entry.Score)) + return true + } + return false +} + +// waitForBlockedCommandUpdate blocks until one of: a write signal +// arrives, the fallback poll tick fires, the parent handlerCtx is +// cancelled, or the BLOCK deadline elapses — whichever happens first. +// The fallback bounds latency for write paths that do not signal (Lua +// flush, follower-applied entries); it cannot exceed the remaining +// BLOCK window so the deadline branch in the caller's loop top always +// gets a chance to fire when the BLOCK expires. 
Shared by every +// blocking-command wait loop (XREAD BLOCK, BZPOPMIN today; BLPOP / +// BRPOP / BLMOVE in follow-ups) — the keyWaiterRegistry that produces +// waiterC is per-domain (streamWaiters vs zsetWaiters), but the +// timer-and-select shape is identical. +// +// Returns true iff the wake came from waiterC (i.e., a producer +// Signal). False on fallback-timer fire or handlerCtx cancellation. +// Callers that have a signal-implied invariant (e.g., "only ZADD / +// ZINCRBY fires zsetWaiters.Signal") can use the return value to +// pick a faster re-check on the next iteration; fallback wakes +// always need the full check because writes that bypass Signal +// (Lua flush, follower-applied entries, wrongType-introducing +// commands) only become observable through the timer branch. +func waitForBlockedCommandUpdate(handlerCtx context.Context, waiterC <-chan struct{}, deadline time.Time) bool { + fallback := redisBlockWaitFallback + if remaining := time.Until(deadline); remaining < fallback { + fallback = remaining + } + timer := time.NewTimer(fallback) + defer func() { + if !timer.Stop() { + // The timer either fired (its case won and the channel + // was drained inline by select) or is still buffering + // the tick (waiter / handlerCtx won the race); drain + // the channel non-blocking so timer GC is clean. + select { + case <-timer.C: + default: + } + } + }() + select { + case <-waiterC: + return true + case <-timer.C: + return false + case <-handlerCtx.Done(): + return false + } +}