diff --git a/Cargo.lock b/Cargo.lock index 6d8e8c841c..f3564681e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9271,6 +9271,7 @@ dependencies = [ name = "partitions" version = "0.1.0" dependencies = [ + "ahash 0.8.12", "bytemuck", "bytes", "compio", @@ -11579,6 +11580,7 @@ version = "0.8.0" dependencies = [ "ahash 0.8.12", "argon2", + "assert_cmd", "async-channel", "async_zip", "axum", @@ -11600,9 +11602,11 @@ dependencies = [ "error_set", "figlet-rs", "fs2", + "futures", "hash32 1.0.0", "human-repr", "hwlocality", + "iggy", "iggy_binary_protocol", "iggy_common", "journal", @@ -11640,6 +11644,7 @@ dependencies = [ "sysinfo 0.39.2", "tempfile", "thiserror 2.0.18", + "tokio", "toml 1.1.2+spec-1.1.0", "tower-http", "tracing", diff --git a/core/binary_protocol/src/responses/topics/create_topic.rs b/core/binary_protocol/src/responses/topics/create_topic.rs index 74d5513603..33ea71a750 100644 --- a/core/binary_protocol/src/responses/topics/create_topic.rs +++ b/core/binary_protocol/src/responses/topics/create_topic.rs @@ -15,5 +15,10 @@ // specific language governing permissions and limitations // under the License. -/// `CreateTopic` response is empty. -pub type CreateTopicResponse = super::EmptyResponse; +/// `CreateTopic` reply ships the freshly-created topic. +/// +/// Same `[TopicHeader][PartitionResponse]*` layout as `GetTopicResponse`, +/// so the SDK reuses one decoder for both calls. Legacy server's +/// `create_topic_handler` builds this shape directly; server-ng's metadata +/// STM emits the same bytes from `apply`. +pub type CreateTopicResponse = super::GetTopicResponse; diff --git a/core/configs/src/server_config/sharding.rs b/core/configs/src/server_config/sharding.rs index acbe3a124f..7434f9167a 100644 --- a/core/configs/src/server_config/sharding.rs +++ b/core/configs/src/server_config/sharding.rs @@ -77,6 +77,21 @@ pub const SHUTDOWN_DRAIN_TIMEOUT_MAX: Duration = Duration::from_secs(600); /// at 5s so Ctrl-C latency stays bounded regardless of config. pub const SHUTDOWN_POLL_INTERVAL_MAX: Duration = Duration::from_secs(5); +/// Default safety-tick cadence for the partition reconciliation loop. +/// The reconciler also wakes on every `LifecycleFrame::MetadataCommitTick` +/// broadcast by shard 0; this fallback covers dropped wake-ups (the wake +/// channel is intentionally capacity-1) and the initial post-bootstrap +/// convergence window before shard 0's first tick. One second is +/// invisible to operators yet keeps idle clusters from burning CPU +/// re-reading the same target snapshot. +pub const DEFAULT_RECONCILE_PERIODIC_INTERVAL: Duration = Duration::from_secs(1); + +/// Hard upper bound on `reconcile_periodic_interval`. A tick longer +/// than ~30s makes post-failure recovery latency operator-visible; the +/// cap reins in pathological typos without disturbing reasonable +/// production values. +pub const RECONCILE_PERIODIC_INTERVAL_MAX: Duration = Duration::from_secs(30); + const fn default_inbox_capacity() -> usize { DEFAULT_INBOX_CAPACITY } @@ -89,6 +104,10 @@ fn default_shutdown_poll_interval() -> IggyDuration { IggyDuration::new(DEFAULT_SHUTDOWN_POLL_INTERVAL) } +fn default_reconcile_periodic_interval() -> IggyDuration { + IggyDuration::new(DEFAULT_RECONCILE_PERIODIC_INTERVAL) +} + #[serde_as] #[derive(Debug, Deserialize, Serialize, ConfigEnv)] pub struct ShardingConfig { @@ -136,6 +155,16 @@ pub struct ShardingConfig { #[serde_as(as = "DisplayFromStr")] #[config_env(leaf)] pub shutdown_poll_interval: IggyDuration, + /// Safety-tick cadence for the partition reconciliation loop; the + /// reconciler also wakes immediately on every + /// `LifecycleFrame::MetadataCommitTick` from shard 0. See + /// [`DEFAULT_RECONCILE_PERIODIC_INTERVAL`] for the rationale; values + /// above [`RECONCILE_PERIODIC_INTERVAL_MAX`] are rejected by the + /// validator. + #[serde(default = "default_reconcile_periodic_interval")] + #[serde_as(as = "DisplayFromStr")] + #[config_env(leaf)] + pub reconcile_periodic_interval: IggyDuration, } impl Default for ShardingConfig { @@ -145,6 +174,7 @@ impl Default for ShardingConfig { inbox_capacity: DEFAULT_INBOX_CAPACITY, shutdown_drain_timeout: default_shutdown_drain_timeout(), shutdown_poll_interval: default_shutdown_poll_interval(), + reconcile_periodic_interval: default_reconcile_periodic_interval(), } } } diff --git a/core/configs/src/server_config/validators.rs b/core/configs/src/server_config/validators.rs index 2889a571e3..de72d77240 100644 --- a/core/configs/src/server_config/validators.rs +++ b/core/configs/src/server_config/validators.rs @@ -22,8 +22,8 @@ use super::server::{ }; use super::server::{MemoryPoolConfig, PersonalAccessTokenConfig, ServerConfig}; use super::sharding::{ - CpuAllocation, INBOX_CAPACITY_MAX, SHUTDOWN_DRAIN_TIMEOUT_MAX, SHUTDOWN_POLL_INTERVAL_MAX, - ShardingConfig, + CpuAllocation, INBOX_CAPACITY_MAX, RECONCILE_PERIODIC_INTERVAL_MAX, SHUTDOWN_DRAIN_TIMEOUT_MAX, + SHUTDOWN_POLL_INTERVAL_MAX, ShardingConfig, }; use super::system::SegmentConfig; use super::system::{CompressionConfig, LoggingConfig, PartitionConfig}; @@ -440,6 +440,25 @@ impl Validatable for ShardingConfig { return Err(ConfigurationError::InvalidConfigurationValue); } + let reconcile = self.reconcile_periodic_interval.get_duration(); + if reconcile.is_zero() { + eprintln!( + "Invalid sharding configuration: reconcile_periodic_interval resolves to zero. \ + Note that \"0\", \"none\", \"unlimited\", and \"disabled\" all parse to zero. The \ + periodic reconcile tick is a safety net for dropped commit-wakes and cannot be \ + turned off; set a positive duration (default \"1s\", max {RECONCILE_PERIODIC_INTERVAL_MAX:?})." + ); + return Err(ConfigurationError::InvalidConfigurationValue); + } + if reconcile > RECONCILE_PERIODIC_INTERVAL_MAX { + eprintln!( + "Invalid sharding configuration: reconcile_periodic_interval {:?} exceeds the \ + {:?} cap (a long tick makes post-failure convergence latency operator-visible)", + reconcile, RECONCILE_PERIODIC_INTERVAL_MAX + ); + return Err(ConfigurationError::InvalidConfigurationValue); + } + let available_cpus = available_parallelism() .map_err(|_| { eprintln!("Failed to detect available CPU cores"); diff --git a/core/consensus/src/metadata_helpers.rs b/core/consensus/src/metadata_helpers.rs index 09a8b58729..53061aa812 100644 --- a/core/consensus/src/metadata_helpers.rs +++ b/core/consensus/src/metadata_helpers.rs @@ -387,6 +387,8 @@ mod tests { #[allow(clippy::future_not_send)] impl MessageBus for ClientSpyBus { + fn track_background(&self, _handle: message_bus::JoinHandle<()>) {} + async fn send_to_client( &self, client_id: u128, diff --git a/core/consensus/src/plane_helpers.rs b/core/consensus/src/plane_helpers.rs index 35aebd08b4..e990afd701 100644 --- a/core/consensus/src/plane_helpers.rs +++ b/core/consensus/src/plane_helpers.rs @@ -454,6 +454,8 @@ mod tests { struct NoopBus; impl MessageBus for NoopBus { + fn track_background(&self, _handle: message_bus::JoinHandle<()>) {} + async fn send_to_client( &self, _client_id: u128, @@ -661,6 +663,8 @@ mod tests { #[allow(clippy::future_not_send)] impl MessageBus for SpyBus { + fn track_background(&self, _handle: message_bus::JoinHandle<()>) {} + async fn send_to_client( &self, _client_id: u128, diff --git a/core/message_bus/src/lib.rs b/core/message_bus/src/lib.rs index 10c96ec907..a87801ed73 100644 --- a/core/message_bus/src/lib.rs +++ b/core/message_bus/src/lib.rs @@ -99,7 +99,7 @@ pub use lifecycle::{ }; pub use transports::tls::TlsServerCredentials; -use compio::runtime::JoinHandle; +pub use compio::runtime::JoinHandle; use configs::server_ng::ServerNgConfig; use iggy_binary_protocol::GenericHeader; use server_common::{MESSAGE_ALIGN, Message, iobuf::Frozen}; @@ -507,6 +507,14 @@ pub trait MessageBus { /// Panics on a second install on the same bus, same one-shot /// `OnceCell` invariant as [`Self::set_replica_forward_fn`]. fn set_client_forward_fn(&self, f: ClientForwardFn); + + /// Register a detached `compio::runtime::spawn` handle so + /// [`shutdown`](IggyMessageBus::shutdown) can await it before the + /// runtime drops. Production [`IggyMessageBus`] pushes onto a + /// `RefCell>` drained on shutdown. Required (no default) + /// so a new impl cannot silently drop detached handles by omission; a + /// stub that spawns nothing implements it as a no-op. + fn track_background(&self, handle: JoinHandle<()>); } /// Production message bus backed by real TCP connections. @@ -1024,6 +1032,10 @@ impl MessageBus for std::rc::Rc { fn set_client_forward_fn(&self, f: ClientForwardFn) { (**self).set_client_forward_fn(f); } + + fn track_background(&self, handle: JoinHandle<()>) { + (**self).track_background(handle); + } } #[allow(clippy::future_not_send)] @@ -1109,6 +1121,10 @@ impl MessageBus for IggyMessageBus { fn set_client_forward_fn(&self, f: ClientForwardFn) { Self::set_client_forward_fn(self, f); } + + fn track_background(&self, handle: JoinHandle<()>) { + Self::track_background(self, handle); + } } /// Extract the owning shard from a client id. diff --git a/core/metadata/src/impls/metadata.rs b/core/metadata/src/impls/metadata.rs index eb2248a956..14d5b3040f 100644 --- a/core/metadata/src/impls/metadata.rs +++ b/core/metadata/src/impls/metadata.rs @@ -48,6 +48,7 @@ use server_common::Message; use std::cell::RefCell; use std::mem::size_of; use std::path::Path; +use std::rc::Rc; use tracing::{debug, error, warn}; fn freeze_client_reply( @@ -335,6 +336,16 @@ fn require_shard_zero<'a, T>( slot } +/// Late-bound callback invoked after every successful +/// `mux_stm.update(prepare)` on shard 0's metadata commit path. +/// +/// Wired by server-ng bootstrap once the metadata bundle has broadcast; +/// receives the committed [`Operation`] so the recipient can filter (the +/// partition reconciliation loop only cares about partition-shaped +/// events). Wrapped in [`RefCell`] for late binding; the per-shard +/// single-thread invariant keeps access safe without [`Sync`]. +pub type CommitNotifier = std::rc::Rc; + pub struct IggyMetadata { /// `Some` on shard 0, `None` on other shards. Server-ng bootstrap /// holds the invariant: only shard 0 owns the metadata consensus @@ -359,6 +370,12 @@ pub struct IggyMetadata { pub coordinator: Option>, /// Per-client session state (sessions, dedup, eviction). Metadata-only. pub client_table: RefCell, + /// Late-bound post-commit notifier. Fires once per committed + /// operation after [`crate::stm::StateMachine::update`] succeeds in + /// both [`Plane::on_ack`] and [`Self::commit_journal`]. `None` until + /// [`Self::set_commit_notifier`] runs (server-ng bootstrap on shard + /// 0 sets it; peer shards and tests leave it `None`). + commit_notifier: RefCell>, } impl IggyMetadata @@ -388,6 +405,26 @@ where allocator, coordinator, client_table: RefCell::new(ClientTable::new(CLIENTS_TABLE_MAX)), + commit_notifier: RefCell::new(None), + } + } +} + +impl IggyMetadata { + /// Install (or replace) the post-commit notifier. Passing `None` + /// removes any previous one. Server-ng bootstrap calls this on shard 0 + /// only; peer shards never commit metadata locally. + pub fn set_commit_notifier(&self, notifier: Option) { + *self.commit_notifier.borrow_mut() = notifier; + } + + /// Fire post-commit notifier. Clones the `Rc` out under a short + /// borrow so a re-entrant `set_commit_notifier` from inside the + /// closure cannot panic on `borrow_mut`. + fn fire_commit_notifier(&self, operation: Operation) { + let notifier = self.commit_notifier.borrow().as_ref().map(Rc::clone); + if let Some(notifier) = notifier { + notifier(operation); } } } @@ -742,6 +779,10 @@ where prepare_header.op ); }); + // Post-commit notifier (e.g. partition reconciler + // wake-up). Filtering by operation is the + // recipient's responsibility. + self.fire_commit_notifier(prepare_header.operation); let reply = build_reply_message(&prepare_header, &response); // Cache only if session exists. Client evicted between // prepare and commit: skip cache (`commit_reply` no-ops), @@ -1484,6 +1525,11 @@ where let response = self.mux_stm.update(prepare).unwrap_or_else(|err| { panic!("commit_journal: committed metadata op={op} failed to apply: {err}"); }); + // Post-commit notifier (e.g. partition reconciler + // wake-up). Same hook fires on backups so reconcilers + // converge after replicated commits, not only quorum-acked + // ones reached via `on_ack` on the primary. + self.fire_commit_notifier(header.operation); let reply = build_reply_message(&header, &response); // Cache only if session still exists. WAL replay may carry a // reply for a later-evicted client; `commit_reply` no-ops. @@ -1671,3 +1717,92 @@ where prepare } + +#[cfg(test)] +mod tests { + use super::*; + use crate::stm::consumer_group::ConsumerGroups; + use crate::stm::stream::Streams; + use crate::stm::user::Users; + use iggy_common::variadic; + use std::cell::RefCell; + use std::rc::Rc; + + type TestMux = MuxStateMachine; + + /// Build a peer-shard-style `IggyMetadata` with `consensus`, + /// `journal`, and `snapshot` all `None`. Enough to test the + /// commit-notifier slot without standing up VSR / WAL infrastructure: + /// the test picks `()` for `C` / `J` / `S` since no notifier code path + /// touches their methods. + fn peer_metadata() -> IggyMetadata<(), (), (), TestMux> { + IggyMetadata::new(None, None, None, TestMux::default(), None) + } + + #[test] + fn commit_notifier_fires_with_received_operation() { + let md = peer_metadata(); + let captured: Rc>> = Rc::new(RefCell::new(Vec::new())); + + let observer = Rc::clone(&captured); + md.set_commit_notifier(Some(Rc::new(move |op| { + observer.borrow_mut().push(op); + }))); + + md.fire_commit_notifier(Operation::CreateTopicWithAssignments); + md.fire_commit_notifier(Operation::DeletePartitions); + md.fire_commit_notifier(Operation::DeleteStream); + + let seen = captured.borrow(); + assert_eq!( + seen.as_slice(), + &[ + Operation::CreateTopicWithAssignments, + Operation::DeletePartitions, + Operation::DeleteStream, + ], + "notifier must observe every fired operation in order" + ); + } + + #[test] + fn commit_notifier_is_no_op_when_unset() { + // No notifier installed: firing must not panic, must not allocate. + // Mirrors the production-side guarantee that peer shards (no + // notifier) take the same commit path as shard 0 (with notifier). + let md = peer_metadata(); + md.fire_commit_notifier(Operation::CreateStream); + } + + #[test] + fn commit_notifier_can_be_replaced_and_cleared() { + let md = peer_metadata(); + let first_count: Rc> = Rc::new(RefCell::new(0)); + let second_count: Rc> = Rc::new(RefCell::new(0)); + + let first_observer = Rc::clone(&first_count); + md.set_commit_notifier(Some(Rc::new(move |_op| { + *first_observer.borrow_mut() += 1; + }))); + md.fire_commit_notifier(Operation::CreateStream); + assert_eq!(*first_count.borrow(), 1); + + // Replace: the first closure must no longer run. + let second_observer = Rc::clone(&second_count); + md.set_commit_notifier(Some(Rc::new(move |_op| { + *second_observer.borrow_mut() += 1; + }))); + md.fire_commit_notifier(Operation::DeleteStream); + assert_eq!(*first_count.borrow(), 1, "old notifier must be detached"); + assert_eq!(*second_count.borrow(), 1, "new notifier must take over"); + + // Clear: subsequent fires must be no-ops. + md.set_commit_notifier(None); + md.fire_commit_notifier(Operation::DeleteTopic); + assert_eq!( + *second_count.borrow(), + 1, + "cleared notifier must stay quiet" + ); + } +} diff --git a/core/metadata/src/lib.rs b/core/metadata/src/lib.rs index 5a1d96c28d..d9d2da327a 100644 --- a/core/metadata/src/lib.rs +++ b/core/metadata/src/lib.rs @@ -22,7 +22,7 @@ pub mod permissioner; pub mod stm; // Re-export IggyMetadata for use in other modules -pub use impls::metadata::{IggyMetadata, RegisterSubmitError}; +pub use impls::metadata::{CommitNotifier, IggyMetadata, RegisterSubmitError}; // Re-export MuxStateMachine for use in other modules pub use stm::mux::MuxStateMachine; diff --git a/core/metadata/src/stm/snapshot.rs b/core/metadata/src/stm/snapshot.rs index 3e9681de03..ea439bffe1 100644 --- a/core/metadata/src/stm/snapshot.rs +++ b/core/metadata/src/stm/snapshot.rs @@ -318,6 +318,7 @@ mod tests { let mut snapshot = MetadataSnapshot::new(100); snapshot.streams = Some(StreamsSnapshot { + revision: 0, items: vec![( 0, StreamSnapshot { @@ -348,6 +349,7 @@ mod tests { id: 0, consensus_group_id: 33, created_at: ts, + created_revision: 0, }], round_robin_counter: 0, }, @@ -424,6 +426,7 @@ mod tests { }; let streams_snap = StreamsSnapshot { + revision: 0, items: vec![ ( 0, diff --git a/core/metadata/src/stm/stream.rs b/core/metadata/src/stm/stream.rs index a3e3b231a1..c910c4299d 100644 --- a/core/metadata/src/stm/stream.rs +++ b/core/metadata/src/stm/stream.rs @@ -19,7 +19,7 @@ use crate::stm::StateHandler; use crate::stm::snapshot::Snapshotable; use crate::{collect_handlers, define_state, impl_fill_restore}; use ahash::AHashMap; -use bytes::Bytes; +use bytes::{Bytes, BytesMut}; use iggy_binary_protocol::WireIdentifier; use iggy_binary_protocol::codec::WireEncode; use iggy_binary_protocol::requests::partitions::{ @@ -29,11 +29,12 @@ use iggy_binary_protocol::requests::streams::{ CreateStreamRequest, DeleteStreamRequest, PurgeStreamRequest, UpdateStreamRequest, }; use iggy_binary_protocol::requests::topics::{ - CreateTopicWithAssignmentsRequest, DeleteTopicRequest, PurgeTopicRequest, UpdateTopicRequest, + CreateTopicRequest, CreateTopicWithAssignmentsRequest, DeleteTopicRequest, PurgeTopicRequest, + UpdateTopicRequest, }; use iggy_binary_protocol::responses::streams::StreamResponse; use iggy_binary_protocol::responses::streams::get_stream::{GetStreamResponse, TopicHeader}; -use iggy_binary_protocol::responses::topics::get_topic::{GetTopicResponse, PartitionResponse}; +use iggy_binary_protocol::responses::topics::get_topic::PartitionResponse; use iggy_common::{ CompressionAlgorithm, IggyExpiry, IggyTimestamp, MaxTopicSize, StreamStats, TopicStats, }; @@ -49,6 +50,10 @@ pub struct PartitionSnapshot { pub id: usize, pub consensus_group_id: u64, pub created_at: IggyTimestamp, + /// `#[serde(default)]` so snapshots predating this field restore + /// with revision 0 instead of failing to decode. + #[serde(default)] + pub created_revision: u64, } #[derive(Debug, Clone)] @@ -56,15 +61,26 @@ pub struct Partition { pub id: usize, pub consensus_group_id: u64, pub created_at: IggyTimestamp, + /// `StreamsInner::revision` at creation. Reconciler compares it to the + /// epoch it stored at materialisation; a mismatch means a delete+recreate + /// reused the slab key, so the local partition is stale and must be torn + /// down before rebuild. + pub created_revision: u64, } impl Partition { #[must_use] - pub const fn new(id: usize, consensus_group_id: u64, created_at: IggyTimestamp) -> Self { + pub const fn new( + id: usize, + consensus_group_id: u64, + created_at: IggyTimestamp, + created_revision: u64, + ) -> Self { Self { id, consensus_group_id, created_at, + created_revision, } } } @@ -226,6 +242,12 @@ define_state! { Streams { index: AHashMap, usize>, items: Slab, + // Monotonic counter bumped on every partition-shaping commit + // (create/delete topic, create/delete partitions, delete stream). + // Reconciler uses it for a fast-skip when nothing changed and stamps + // it onto each new Partition::created_revision. Deterministic across + // replicas: same ops, same order. + revision: u64, } } @@ -426,12 +448,13 @@ impl StateHandler for DeleteStreamRequest { return Bytes::new(); }; - if let Some(stream) = state.items.get(stream_id) { - let name = stream.name.clone(); - - state.items.remove(stream_id); - state.index.remove(&name); - } + let Some(stream) = state.items.get(stream_id) else { + return Bytes::new(); + }; + let name = stream.name.clone(); + state.items.remove(stream_id); + state.index.remove(&name); + state.revision = state.revision.wrapping_add(1); Bytes::new() } } @@ -451,15 +474,28 @@ impl StateHandler for CreateTopicWithAssignmentsRequest { let Some(stream_id) = state.resolve_stream_id(&self.request.stream_id) else { return Bytes::new(); }; - let Some(stream) = state.items.get_mut(stream_id) else { - return Bytes::new(); - }; let name_arc: Arc = Arc::from(self.request.name.as_str()); - if stream.topic_index.contains_key(&name_arc) { - return Bytes::new(); + // Validate under a short immutable borrow that ends before the + // revision bump below takes `&mut state`. + { + let Some(stream) = state.items.get(stream_id) else { + return Bytes::new(); + }; + if stream.topic_index.contains_key(&name_arc) { + return Bytes::new(); + } } + // Past validation: this commit adds partitions, so bump the + // monotonic revision and stamp every new partition with it. + let new_revision = state.revision.wrapping_add(1); + state.revision = new_revision; + + let Some(stream) = state.items.get_mut(stream_id) else { + return Bytes::new(); + }; + let replication_factor = if self.request.replication_factor == 0 { 1 } else { @@ -491,6 +527,7 @@ impl StateHandler for CreateTopicWithAssignmentsRequest { id: partition.partition_id as usize, consensus_group_id: partition.consensus_group_id, created_at: timestamp, + created_revision: new_revision, }; topic.partitions.push(partition); } @@ -498,38 +535,71 @@ impl StateHandler for CreateTopicWithAssignmentsRequest { stream.topic_index.insert(name_arc, topic_id); - // Reply body: the SDK `create_topic` decodes a `GetTopicResponse`. A - // freshly created topic has empty/zeroed segment stats; each assigned - // partition is reported at offset 0. - let partitions = self - .partitions - .iter() - .map(|partition| PartitionResponse { - id: partition.partition_id, - created_at: timestamp.as_micros(), + let Some(topic) = stream.topics.get(topic_id) else { + return Bytes::new(); + }; + encode_create_topic_reply(&self.request, topic_id, topic) + } +} + +/// Encode the `CreateTopic` reply as `[TopicHeader][PartitionResponse]*`, +/// the `GetTopicResponse` shape the SDK already decodes, so the create +/// reply deserializes without a schema break. Returns empty bytes on a +/// `u32` overflow (same contract as a validation rejection) rather than +/// saturating to `u32::MAX`. +fn encode_create_topic_reply( + request: &CreateTopicRequest, + topic_id: usize, + topic: &Topic, +) -> Bytes { + let Ok(topic_id_u32) = u32::try_from(topic_id) else { + return Bytes::new(); + }; + let Ok(partitions_count_u32) = u32::try_from(topic.partitions.len()) else { + return Bytes::new(); + }; + let header = TopicHeader { + id: topic_id_u32, + created_at: topic.created_at.into(), + partitions_count: partitions_count_u32, + message_expiry: request.message_expiry, + compression_algorithm: request.compression_algorithm, + max_topic_size: request.max_topic_size, + replication_factor: topic.replication_factor, + size_bytes: 0, + messages_count: 0, + name: request.name.clone(), + }; + let Ok(partitions_resp) = topic + .partitions + .iter() + .map(|p| { + u32::try_from(p.id).map(|id| PartitionResponse { + id, + created_at: p.created_at.into(), segments_count: 0, current_offset: 0, size_bytes: 0, messages_count: 0, }) - .collect(); - GetTopicResponse { - topic: TopicHeader { - id: topic_id as u32, - created_at: timestamp.as_micros(), - partitions_count: self.partitions.len() as u32, - message_expiry: self.request.message_expiry, - compression_algorithm: self.request.compression_algorithm, - max_topic_size: self.request.max_topic_size, - replication_factor, - size_bytes: 0, - messages_count: 0, - name: self.request.name.clone(), - }, - partitions, - } - .to_bytes() + }) + .collect::, _>>() + else { + return Bytes::new(); + }; + + let mut buf = BytesMut::with_capacity( + header.encoded_size() + + partitions_resp + .iter() + .map(WireEncode::encoded_size) + .sum::(), + ); + header.encode(&mut buf); + for partition in &partitions_resp { + partition.encode(&mut buf); } + buf.freeze() } impl StateHandler for UpdateTopicRequest { @@ -583,11 +653,13 @@ impl StateHandler for DeleteTopicRequest { return Bytes::new(); }; - if let Some(topic) = stream.topics.get(topic_id) { - let name = topic.name.clone(); - stream.topics.remove(topic_id); - stream.topic_index.remove(&name); - } + let Some(topic) = stream.topics.get(topic_id) else { + return Bytes::new(); + }; + let name = topic.name.clone(); + stream.topics.remove(topic_id); + stream.topic_index.remove(&name); + state.revision = state.revision.wrapping_add(1); Bytes::new() } } @@ -600,7 +672,6 @@ impl StateHandler for PurgeTopicRequest { } } -// TODO(hubcio): Serialize proper reply (e.g. assigned partition IDs) instead of empty Bytes. impl StateHandler for CreatePartitionsWithAssignmentsRequest { type State = StreamsInner; fn apply(&self, state: &mut StreamsInner, timestamp: IggyTimestamp) -> Bytes { @@ -611,39 +682,64 @@ impl StateHandler for CreatePartitionsWithAssignmentsRequest { return Bytes::new(); }; + // Resolve absolute partition ids under a borrow that ends before + // the revision bump. Validate every id transition before mutating + // topic.partitions; mid-batch overflow + retry would otherwise + // re-base over a partial set and mint duplicate ids. + let resolved: Vec = { + let Some(stream) = state.items.get_mut(stream_id) else { + return Bytes::new(); + }; + let Some(topic) = stream.topics.get_mut(topic_id) else { + return Bytes::new(); + }; + + let base_partition_id = topic + .partitions + .iter() + .map(|partition| partition.id) + .max() + .and_then(|partition_id| partition_id.checked_add(1)) + .unwrap_or(0); + let Ok(base_partition_id) = u32::try_from(base_partition_id) else { + return Bytes::new(); + }; + + let mut resolved: Vec = Vec::with_capacity(self.partitions.len()); + for partition in &self.partitions { + let Some(resolved_id_u32) = partition.partition_id.checked_add(base_partition_id) + else { + return Bytes::new(); + }; + let Ok(resolved_id_usize) = usize::try_from(resolved_id_u32) else { + return Bytes::new(); + }; + resolved.push(resolved_id_usize); + } + resolved + }; + + let new_revision = state.revision.wrapping_add(1); + state.revision = new_revision; + let Some(stream) = state.items.get_mut(stream_id) else { return Bytes::new(); }; let Some(topic) = stream.topics.get_mut(topic_id) else { return Bytes::new(); }; - - let base_partition_id = topic - .partitions - .iter() - .map(|partition| partition.id) - .max() - .and_then(|partition_id| partition_id.checked_add(1)) - .unwrap_or(0); - let Ok(base_partition_id) = u32::try_from(base_partition_id) else { - return Bytes::new(); - }; - - for partition in &self.partitions { - let partition_id = partition - .partition_id - .checked_add(base_partition_id) - .and_then(|partition_id| usize::try_from(partition_id).ok()); - let Some(partition_id) = partition_id else { - return Bytes::new(); - }; - let partition = Partition { - id: partition_id, + for (resolved_id_usize, partition) in resolved.into_iter().zip(self.partitions.iter()) { + topic.partitions.push(Partition { + id: resolved_id_usize, consensus_group_id: partition.consensus_group_id, created_at: timestamp, - }; - topic.partitions.push(partition); + created_revision: new_revision, + }); } + + // Matches legacy CreatePartitions wire contract: empty-ok body on + // success. SDK discards the reply payload (resolved ids are derivable + // from the request's base + count). Bytes::new() } } @@ -666,11 +762,15 @@ impl StateHandler for DeletePartitionsRequest { }; let count_to_delete = self.partitions_count as usize; - if count_to_delete > 0 && count_to_delete <= topic.partitions.len() { + let did_delete = count_to_delete > 0 && count_to_delete <= topic.partitions.len(); + if did_delete { topic .partitions .truncate(topic.partitions.len() - count_to_delete); } + if did_delete { + state.revision = state.revision.wrapping_add(1); + } Bytes::new() } } @@ -679,6 +779,9 @@ impl StateHandler for DeletePartitionsRequest { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct StreamsSnapshot { pub items: Vec<(usize, StreamSnapshot)>, + /// `#[serde(default)]` so older snapshots restore at revision 0. + #[serde(default)] + pub revision: u64, } impl Snapshotable for Streams { @@ -719,6 +822,7 @@ impl Snapshotable for Streams { id: p.id, consensus_group_id: p.consensus_group_id, created_at: p.created_at, + created_revision: p.created_revision, }) .collect(), round_robin_counter: topic @@ -744,7 +848,10 @@ impl Snapshotable for Streams { ) }) .collect(); - StreamsSnapshot { items } + StreamsSnapshot { + items, + revision: inner.revision, + } }) } @@ -789,6 +896,7 @@ impl Snapshotable for Streams { id: p.id, consensus_group_id: p.consensus_group_id, created_at: p.created_at, + created_revision: p.created_revision, }) .collect(), round_robin_counter: Arc::new(AtomicUsize::new(topic_snap.round_robin_counter)), @@ -817,6 +925,7 @@ impl Snapshotable for Streams { let inner = StreamsInner { index, items, + revision: snapshot.revision, last_result: None, }; Ok(inner.into()) @@ -829,6 +938,7 @@ impl_fill_restore!(Streams, streams); mod tests { use super::*; use iggy_binary_protocol::WireName; + use iggy_binary_protocol::codec::WireDecode; use iggy_binary_protocol::primitives::partition_assignment::CreatedPartitionAssignment; use iggy_binary_protocol::requests::partitions::{ CreatePartitionsRequest as WireCreatePartitionsRequest, @@ -837,6 +947,7 @@ mod tests { use iggy_binary_protocol::requests::topics::{ CreateTopicRequest as WireCreateTopicRequest, CreateTopicWithAssignmentsRequest, }; + use iggy_binary_protocol::responses::topics::get_topic::GetTopicResponse; fn create_stream(inner: &mut StreamsInner, name: &str) { let request = CreateStreamRequest { @@ -938,4 +1049,125 @@ mod tests { 13 ); } + + #[test] + fn create_topic_apply_returns_get_topic_response_compatible_bytes() { + // STM apply must emit `[TopicHeader][PartitionResponse]*` so existing + // SDK decoders (`decode_response::`) parse the reply + // without a wire-schema break. + let mut inner = StreamsInner::new(); + create_stream(&mut inner, "stream"); + let create_topic = CreateTopicWithAssignmentsRequest { + request: make_topic_request(0, 2, "topic"), + partitions: vec![ + CreatedPartitionAssignment { + partition_id: 0, + consensus_group_id: 100, + }, + CreatedPartitionAssignment { + partition_id: 1, + consensus_group_id: 101, + }, + ], + }; + + let bytes = StateHandler::apply(&create_topic, &mut inner, IggyTimestamp::now()); + let (reply, consumed) = GetTopicResponse::decode(&bytes).expect("reply decodes"); + assert_eq!(consumed, bytes.len()); + assert_eq!(reply.topic.id, 0); + assert_eq!(reply.topic.partitions_count, 2); + assert_eq!(reply.topic.name.as_str(), "topic"); + assert_eq!(reply.partitions.len(), 2); + assert_eq!(reply.partitions[0].id, 0); + assert_eq!(reply.partitions[1].id, 1); + } + + #[test] + fn create_topic_apply_returns_empty_on_validation_failure() { + // Unknown stream id rejects the command; legacy contract returns + // empty body so SDK decodes failure. + let mut inner = StreamsInner::new(); + let create_topic = CreateTopicWithAssignmentsRequest { + request: make_topic_request(42, 1, "topic"), + partitions: vec![CreatedPartitionAssignment { + partition_id: 0, + consensus_group_id: 1, + }], + }; + let bytes = StateHandler::apply(&create_topic, &mut inner, IggyTimestamp::now()); + assert!(bytes.is_empty()); + } + + #[test] + fn create_partitions_apply_resolves_ids_and_returns_empty_reply() { + // STM resolves request-relative ids against the topic's current + // partition count; the wire reply is empty (matches legacy + // empty-ok response; SDK ignores the body). + let mut inner = StreamsInner::new(); + create_stream(&mut inner, "stream"); + let create_topic = CreateTopicWithAssignmentsRequest { + request: make_topic_request(0, 2, "topic"), + partitions: vec![ + CreatedPartitionAssignment { + partition_id: 0, + consensus_group_id: 50, + }, + CreatedPartitionAssignment { + partition_id: 1, + consensus_group_id: 51, + }, + ], + }; + let _ = StateHandler::apply(&create_topic, &mut inner, IggyTimestamp::now()); + + let create_partitions = CreatePartitionsWithAssignmentsRequest { + request: WireCreatePartitionsRequest { + stream_id: WireIdentifier::numeric(0), + topic_id: WireIdentifier::numeric(0), + partitions_count: 2, + }, + // request-relative offsets 0..=1; base is 2 (next after the + // two topic-creation partitions), so resolved ids are 2 and 3. + partitions: vec![ + CreatedPartitionAssignment { + partition_id: 0, + consensus_group_id: 60, + }, + CreatedPartitionAssignment { + partition_id: 1, + consensus_group_id: 61, + }, + ], + }; + + let bytes = StateHandler::apply(&create_partitions, &mut inner, IggyTimestamp::now()); + assert!(bytes.is_empty()); + + let partitions = &inner.items[0].topics[0].partitions; + assert_eq!(partitions.len(), 4); + assert_eq!(partitions[2].id, 2); + assert_eq!(partitions[2].consensus_group_id, 60); + assert_eq!(partitions[3].id, 3); + assert_eq!(partitions[3].consensus_group_id, 61); + } + + #[test] + fn create_partitions_apply_returns_empty_on_validation_failure() { + let mut inner = StreamsInner::new(); + create_stream(&mut inner, "stream"); + // Topic missing => validation failure path + let create_partitions = CreatePartitionsWithAssignmentsRequest { + request: WireCreatePartitionsRequest { + stream_id: WireIdentifier::numeric(0), + topic_id: WireIdentifier::numeric(99), + partitions_count: 1, + }, + partitions: vec![CreatedPartitionAssignment { + partition_id: 0, + consensus_group_id: 1, + }], + }; + let bytes = StateHandler::apply(&create_partitions, &mut inner, IggyTimestamp::now()); + assert!(bytes.is_empty()); + } } diff --git a/core/partitions/Cargo.toml b/core/partitions/Cargo.toml index 4562e6c2b6..940fef1462 100644 --- a/core/partitions/Cargo.toml +++ b/core/partitions/Cargo.toml @@ -29,6 +29,7 @@ readme = "../../README.md" publish = false [dependencies] +ahash = { workspace = true } bytemuck = { workspace = true } bytes = { workspace = true } compio = { workspace = true } diff --git a/core/partitions/src/iggy_index_writer.rs b/core/partitions/src/iggy_index_writer.rs index 0d9027bc52..1eb11599a1 100644 --- a/core/partitions/src/iggy_index_writer.rs +++ b/core/partitions/src/iggy_index_writer.rs @@ -120,12 +120,16 @@ impl IggyIndexWriter { /// Flushes buffered index file contents to disk. /// + /// Uses `fdatasync` (data only): index files are append-only and the + /// size change is tracked in datasync metadata on Linux, so the inode + /// metadata fsync of `sync_all` adds latency without correctness gain. + /// /// # Errors /// /// Returns an error if the file cannot be synchronized. pub async fn fsync(&self) -> Result<(), IggyError> { self.file - .sync_all() + .sync_data() .await .map_err(|_| IggyError::CannotWriteToFile)?; Ok(()) diff --git a/core/partitions/src/iggy_partitions.rs b/core/partitions/src/iggy_partitions.rs index 6846abf035..998e132837 100644 --- a/core/partitions/src/iggy_partitions.rs +++ b/core/partitions/src/iggy_partitions.rs @@ -19,13 +19,14 @@ use crate::IggyPartition; use crate::types::PartitionsConfig; +use ahash::AHashSet; use consensus::{Consensus, Plane, PlaneIdentity, VsrConsensus}; use iggy_binary_protocol::{ Command2, ConsensusHeader, PrepareHeader, PrepareOkHeader, RequestHeader, }; use message_bus::MessageBus; use server_common::sharding::{IggyNamespace, LocalIdx, ShardId}; -use std::cell::UnsafeCell; +use std::cell::{RefCell, UnsafeCell}; use std::collections::HashMap; use tracing::warn; @@ -44,13 +45,27 @@ where { shard_id: ShardId, config: PartitionsConfig, - /// Collection of partitions, the index of each partition isn't it's ID, but rather a local index (`LocalIdx`) which is used for lookups. + /// Index is `LocalIdx`, not `partition_id`. /// - /// Wrapped in `UnsafeCell` for interior mutability — matches the single-threaded - /// per-shard execution model. Consensus trait methods take `&self` but need to - /// mutate partition state (segments, offsets, journal). + /// # Safety invariant + /// + /// Container-level mutation (`Vec::push` / `swap_remove`) must run + /// only on the shard's pump task. Reconciler routes mutations + /// through `ReconcileOp` + `ReconcileApply`. Cross-task + /// access would be UB under cooperative `.await` interleaving. partitions: UnsafeCell>>, - namespace_to_local: HashMap, + /// Same single-pump invariant as `partitions`. + namespace_to_local: UnsafeCell>, + /// Tombstone gate: reconciler sets it synchronously before awaiting + /// disk delete; pump clears on `ConfirmRemove`. Pump's `Plane::on_*` + /// short-circuits frames hitting a tombstoned namespace. + /// + /// `RefCell` (not `UnsafeCell`) because both the reconciler task and + /// the pump task mutate it, so the single-pump invariant protecting + /// `partitions` / `namespace_to_local` does NOT hold here. Compio's + /// per-shard runtime is single-threaded, so runtime borrow checks + /// suffice; callers must not hold a borrow across `.await`. + tombstoned: RefCell>, } impl IggyPartitions @@ -63,7 +78,8 @@ where shard_id, config, partitions: UnsafeCell::new(Vec::new()), - namespace_to_local: HashMap::new(), + namespace_to_local: UnsafeCell::new(HashMap::new()), + tombstoned: RefCell::new(AHashSet::new()), } } @@ -73,7 +89,8 @@ where shard_id, config, partitions: UnsafeCell::new(Vec::with_capacity(capacity)), - namespace_to_local: HashMap::with_capacity(capacity), + namespace_to_local: UnsafeCell::new(HashMap::with_capacity(capacity)), + tombstoned: RefCell::new(AHashSet::new()), } } @@ -86,6 +103,17 @@ where unsafe { &*self.partitions.get() } } + fn namespace_map(&self) -> &HashMap { + // Safety: single-threaded per-shard model, no concurrent access. + unsafe { &*self.namespace_to_local.get() } + } + + #[allow(clippy::mut_from_ref)] + fn namespace_map_mut(&self) -> &mut HashMap { + // Safety: single-threaded per-shard model, no concurrent access. + unsafe { &mut *self.namespace_to_local.get() } + } + pub const fn shard_id(&self) -> ShardId { self.shard_id } @@ -112,81 +140,133 @@ where /// Lookup local index by namespace. pub fn local_idx(&self, namespace: &IggyNamespace) -> Option { - self.namespace_to_local.get(namespace).copied() + self.namespace_map().get(namespace).copied() } /// Insert a new partition and return its local index. - pub fn insert(&mut self, namespace: IggyNamespace, partition: IggyPartition) -> LocalIdx { - let partitions = self.partitions.get_mut(); + /// + /// # Safety discipline (compiler cannot enforce) + /// + /// Must only be called from the shard's pump task (i.e. inside + /// `IggyShard::apply_reconcile_ops`). `Vec::push` may reallocate + + /// invalidate any live `&mut IggyPartition` returned by + /// [`Self::get_mut_by_ns`] / [`Self::get_mut`] held by a sibling + /// task across an `.await`. New external call sites MUST route + /// through `ReconcileOp::InsertOwned` instead. + #[doc(hidden)] + pub fn insert(&self, namespace: IggyNamespace, partition: IggyPartition) -> LocalIdx { + // Safety: pump-only invariant, caller responsibility. + let partitions = unsafe { &mut *self.partitions.get() }; let local_idx = LocalIdx::new(partitions.len()); partitions.push(partition); - self.namespace_to_local.insert(namespace, local_idx); + self.namespace_map_mut().insert(namespace, local_idx); local_idx } /// Check if a namespace exists. pub fn contains(&self, namespace: &IggyNamespace) -> bool { - self.namespace_to_local.contains_key(namespace) + self.namespace_map().contains_key(namespace) } /// Get partition by namespace directly. + /// + /// Returns `None` for tombstoned namespaces so callers outside + /// [`Plane`] (view-change handlers, `tick_partitions`, loopback drain) + /// can't drive journal writes against a partition the reconciler has + /// already fenced for delete. pub fn get_by_ns(&self, namespace: &IggyNamespace) -> Option<&IggyPartition> { - let idx = self.namespace_to_local.get(namespace)?; + if self.is_tombstoned(namespace) { + return None; + } + let idx = self.namespace_map().get(namespace)?; self.partitions().get(**idx) } - /// Get mutable partition by namespace directly. + /// Get mutable partition by namespace directly. Tombstone-gated like + /// [`Self::get_by_ns`]. #[allow(clippy::mut_from_ref)] pub fn get_mut_by_ns(&self, namespace: &IggyNamespace) -> Option<&mut IggyPartition> { - let idx = self.namespace_to_local.get(namespace)?; + if self.is_tombstoned(namespace) { + return None; + } + let idx = self.namespace_map().get(namespace)?; // Safety: single-threaded per-shard model, no concurrent access. unsafe { (&mut *self.partitions.get()).get_mut(**idx) } } - /// Remove a partition by namespace. Returns the removed partition if found. - pub fn remove(&mut self, namespace: &IggyNamespace) -> Option> { - let local_idx = self.namespace_to_local.remove(namespace)?; + /// Remove a partition by namespace. + /// + /// # Safety discipline (compiler cannot enforce) + /// + /// Must only be called from the shard's pump task. `Vec::swap_remove` + /// invalidates any live `&mut IggyPartition` returned by + /// [`Self::get_mut_by_ns`] / [`Self::get_mut`] held by a sibling + /// task across an `.await`. New external call sites MUST route + /// through `ReconcileOp::ConfirmRemove` instead. + /// + /// # Panics + /// + /// Panics if the stored `LocalIdx` is past `partitions.len()`, an + /// invariant violation. Silent `None` would leave the map half-mutated + /// and prime the next `insert` for a colliding index. + #[doc(hidden)] + pub fn remove(&self, namespace: &IggyNamespace) -> Option> { + let local_idx = self.namespace_map_mut().remove(namespace)?; let idx = *local_idx; - let partitions = self.partitions.get_mut(); + let partitions = unsafe { &mut *self.partitions.get() }; - if idx >= partitions.len() { - return None; - } + assert!( + idx < partitions.len(), + "IggyPartitions invariant: LocalIdx({idx}) >= len {len}", + idx = idx, + len = partitions.len(), + ); let partition = partitions.swap_remove(idx); if idx < partitions.len() { - for lidx in self.namespace_to_local.values_mut() { - if **lidx == partitions.len() { - *lidx = LocalIdx::new(idx); - break; - } - } + // `swap_remove` moved the tail entry into `idx`. Update the map + // by the moved partition's namespace key in O(1); the previous + // linear value-scan turned bulk DeleteStream into O(K²) on the + // pump task, stalling client traffic for ~10k-partition topics. + let moved_ns = IggyNamespace::from_raw(partitions[idx].consensus().namespace()); + let entry = self.namespace_map_mut().get_mut(&moved_ns).expect( + "IggyPartitions invariant: swapped-in partition missing namespace_to_local entry", + ); + *entry = LocalIdx::new(idx); } Some(partition) } /// Remove multiple partitions at once. - pub fn remove_many(&mut self, namespaces: &[IggyNamespace]) -> Vec> { + /// + /// Same pump-only safety discipline as [`Self::remove`]. + #[doc(hidden)] + pub fn remove_many(&self, namespaces: &[IggyNamespace]) -> Vec> { namespaces.iter().filter_map(|ns| self.remove(ns)).collect() } /// Iterate over all namespaces owned by this shard. pub fn namespaces(&self) -> impl Iterator { - self.namespace_to_local.keys() + self.namespace_map().keys() } - /// Get partition by namespace, initializing if not present. - pub fn get_or_init(&mut self, namespace: IggyNamespace, init: F) -> &mut IggyPartition - where - F: FnOnce() -> IggyPartition, - { - if !self.namespace_to_local.contains_key(&namespace) { - self.insert(namespace, init()); - } - let idx = *self.namespace_to_local[&namespace]; - &mut self.partitions.get_mut()[idx] + pub fn is_tombstoned(&self, namespace: &IggyNamespace) -> bool { + self.tombstoned.borrow().contains(namespace) + } + + /// Mark a namespace as tombstoned. Callable from any task on the + /// shard's runtime (reconciler sets the fence synchronously before + /// awaiting disk delete). + pub fn tombstone(&self, namespace: IggyNamespace) { + self.tombstoned.borrow_mut().insert(namespace); + } + + /// Clear a namespace tombstone. Pump-side hook called from + /// `ReconcileOp::ConfirmRemove` after the partition is dropped. + pub fn untombstone(&self, namespace: &IggyNamespace) { + self.tombstoned.borrow_mut().remove(namespace); } } @@ -196,6 +276,14 @@ where { async fn on_request(&self, message: as Consensus>::Message) { let namespace = IggyNamespace::from_raw(message.header().namespace); + if self.is_tombstoned(&namespace) { + warn!( + target: "iggy.partitions.diag", + namespace_raw = namespace.inner(), + "dropping request: namespace tombstoned" + ); + return; + } let Some(partition) = self.get_mut_by_ns(&namespace) else { warn!( target: "iggy.partitions.diag", @@ -211,6 +299,14 @@ where async fn on_replicate(&self, message: as Consensus>::Message) { let namespace = IggyNamespace::from_raw(message.header().namespace); + if self.is_tombstoned(&namespace) { + warn!( + target: "iggy.partitions.diag", + namespace_raw = namespace.inner(), + "dropping prepare: namespace tombstoned" + ); + return; + } let Some(partition) = self.get_mut_by_ns(&namespace) else { warn!( target: "iggy.partitions.diag", @@ -228,6 +324,14 @@ where #[allow(clippy::too_many_lines)] async fn on_ack(&self, message: as Consensus>::Message) { let namespace = IggyNamespace::from_raw(message.header().namespace); + if self.is_tombstoned(&namespace) { + warn!( + target: "iggy.partitions.diag", + namespace_raw = namespace.inner(), + "dropping prepare-ok: namespace tombstoned" + ); + return; + } let config = self.config.clone(); let Some(partition) = self.get_mut_by_ns(&namespace) else { warn!( diff --git a/core/partitions/src/messages_writer.rs b/core/partitions/src/messages_writer.rs index 6f330aadbb..cbdef58a09 100644 --- a/core/partitions/src/messages_writer.rs +++ b/core/partitions/src/messages_writer.rs @@ -116,12 +116,16 @@ impl MessagesWriter { /// Flushes buffered segment file contents to disk. /// + /// Uses `fdatasync` (data only): segment files are append-only and the + /// size change is tracked in datasync metadata on Linux, so the inode + /// metadata fsync of `sync_all` adds latency without correctness gain. + /// /// # Errors /// /// Returns an error if the file cannot be synchronized. pub async fn fsync(&self) -> Result<(), IggyError> { self.file - .sync_all() + .sync_data() .await .map_err(|_| IggyError::CannotWriteToFile)?; Ok(()) diff --git a/core/server-ng/Cargo.toml b/core/server-ng/Cargo.toml index 8435b40811..bf42650ede 100644 --- a/core/server-ng/Cargo.toml +++ b/core/server-ng/Cargo.toml @@ -115,6 +115,7 @@ err_trail = { workspace = true } error_set = { workspace = true } figlet-rs = { workspace = true } fs2 = { workspace = true } +futures = { workspace = true } hash32 = { workspace = true } human-repr = { workspace = true } iggy_binary_protocol = { workspace = true } @@ -171,6 +172,12 @@ hwlocality = { workspace = true, features = ["vendored"] } [build-dependencies] vergen-git2 = { workspace = true } +[dev-dependencies] +assert_cmd = { workspace = true } +bytemuck = { workspace = true } +iggy = { workspace = true } +tokio = { workspace = true, features = ["full", "test-util"] } + [lints.clippy] enum_glob_use = "deny" pedantic = "deny" diff --git a/core/server-ng/src/bootstrap.rs b/core/server-ng/src/bootstrap.rs index ab3a4580a3..069fc47fa2 100644 --- a/core/server-ng/src/bootstrap.rs +++ b/core/server-ng/src/bootstrap.rs @@ -20,21 +20,22 @@ use crate::dispatch::{ make_client_request_handler, make_deferred_client_request_handler, make_deferred_replica_message_handler, make_list_clients_handler, make_metadata_submit_handler, }; +use crate::partition_helpers::{ + configure_consumer_offsets, ensure_initial_segment, validate_namespace_bounds, +}; use crate::server_error::{ServerNgError, ShardJoinFailure, ShardJoinFailureKind}; use crate::session_manager::SessionManager; use configs::server_ng::ServerNgConfig; use configs::sharding::{ INBOX_CAPACITY_MAX, SHUTDOWN_DRAIN_TIMEOUT_MAX, SHUTDOWN_POLL_INTERVAL_MAX, }; -use consensus::{LocalPipeline, PartitionsHandle, Sequencer, VsrConsensus}; +use consensus::{LocalPipeline, MetadataHandle, PartitionsHandle, Sequencer, VsrConsensus}; // `try_send` / `try_recv` resolve through these traits on `MAsyncTx` / // `MAsyncRx`; the metadata-handoff loops below depend on the // non-blocking variants for cancel-safe shutdown polling. use crossfire::{AsyncRxTrait, AsyncTxTrait}; -use iggy_common::{ - ConsumerGroupOffsets, ConsumerOffsets, IggyByteSize, IggyError, PartitionStats, TopicStats, - variadic, -}; +use iggy_binary_protocol::Operation; +use iggy_common::{IggyByteSize, PartitionStats, TopicStats, variadic}; use journal::Journal; use journal::prepare_journal::PrepareJournal; use message_bus::client_listener::{self, RequestHandler}; @@ -65,20 +66,19 @@ use partitions::{ }; use server_common::bootstrap::create_directories; use server_common::executor::create_shard_executor; -use server_common::sharding::{IggyNamespace, LocalIdx, PartitionLocation, ShardId}; +use server_common::sharding::{IggyNamespace, PartitionLocation, ShardId}; // TODO: decouple bootstrap/storage helpers and logging from the `server` crate. use server::log::logger::Logging; use server::shard_allocator::{ShardAllocator, ShardInfo}; -use server::streaming::partitions::storage::{load_consumer_group_offsets, load_consumer_offsets}; -use server::streaming::segments::storage::create_segment_storage; use server::streaming::users::user::User as LegacyUser; use server::{IGGY_ROOT_PASSWORD_ENV, IGGY_ROOT_USERNAME_ENV}; use shard::builder::IggyShardBuilder; -use shard::metrics::ShardMetrics; -use shard::shards_table::{PapayaShardsTable, calculate_shard_assignment}; +use shard::metrics::{ShardMetrics, frame_drop_reason, frame_drop_variant}; +use shard::shards_table::{PapayaShardsTable, ShardsTable, calculate_shard_assignment}; use shard::{ - CoordinatorConfig, IggyShard, PartitionConsensusConfig, Receiver as ShardReceiver, ShardFrame, - ShardIdentity, TaggedSender, channel, shard_mesh_channels, + CoordinatorConfig, IggyShard, LifecycleFrame, PartitionConsensusConfig, + Receiver as ShardReceiver, ShardFrame, ShardIdentity, TaggedSender, channel, + shard_mesh_channels, }; use std::cell::RefCell; use std::env; @@ -632,7 +632,12 @@ fn run_shard_thread( .map_err(|source| ServerNgError::ShardRuntimeCreateFailed { shard_id, source })?; let result = runtime.block_on(async move { - shard_main( + // `shard_main`'s future grows past clippy's `large_futures` cap + // (it ferries the metadata handoff, bus, builders, and inflight + // I/O in one state machine). Heap-pin it so the top-level + // `block_on` future stays small; one allocation per startup buys + // the stack budget back. + Box::pin(shard_main( shard_id, total_shards, replica_id, @@ -642,7 +647,7 @@ fn run_shard_thread( shutdown_flag, metadata_handoff, owner_table, - ) + )) .await }); @@ -770,6 +775,9 @@ async fn shard_main( ); let shard_metrics = ShardMetrics::for_shard(); + // Notifier install deferred until after tick handler wires below. + let senders_for_notifier = senders.clone(); + let metrics_for_notifier = shard_metrics.clone(); let (shard, sessions) = build_shard_for_thread( shard_id, total_shards, @@ -807,6 +815,21 @@ async fn shard_main( return Ok(()); } + // Tick handler must install before the notifier so early commits + // do not broadcast ticks whose handler slot is still `None`. + let (reconcile_wake_tx, reconcile_wake_rx) = channel::<()>(1); + let (reconcile_stop_tx, reconcile_stop_rx) = channel::<()>(1); + crate::partition_reconciler::install_tick_handler(&shard, reconcile_wake_tx); + + // Only shard 0 commits metadata. + if shard_id == 0 { + let notifier = make_metadata_commit_notifier(senders_for_notifier, metrics_for_notifier); + shard.plane.metadata().set_commit_notifier(Some(notifier)); + } else { + drop(senders_for_notifier); + drop(metrics_for_notifier); + } + let (stop_tx, stop_rx) = channel(1); let pump_shard = Rc::clone(&shard); let pump_handle = compio::runtime::spawn(async move { @@ -814,6 +837,33 @@ async fn shard_main( }); bus.track_background(pump_handle); + let reconciler_ctx = Rc::new(crate::partition_reconciler::ReconcilerCtx::new( + Rc::clone(&shard), + total_shards, + Rc::new(config.clone()), + topology.cluster_id, + topology.self_replica_id, + topology.replica_count, + )); + let reconcile_periodic = config + .system + .sharding + .reconcile_periodic_interval + .get_duration(); + let reconciler_handle = compio::runtime::spawn({ + let ctx = Rc::clone(&reconciler_ctx); + async move { + crate::partition_reconciler::run_reconciler( + ctx, + reconcile_wake_rx, + reconcile_stop_rx, + reconcile_periodic, + ) + .await; + } + }); + bus.track_background(reconciler_handle); + // Listeners (replica + every client transport) bind on shard 0 only. // Shard 0's coordinator round-robins inbound TCP/WS connections to // peer shards via fd-transfer. QUIC and TCP-TLS clients terminate @@ -837,12 +887,14 @@ async fn shard_main( start_tcp_runtime(&shard, config, &topology, accepted_replica, accepted_client).await { let _ = stop_tx.try_send(()); + let _ = reconcile_stop_tx.try_send(()); return Err(error); } } bus.token().wait().await; let _ = stop_tx.try_send(()); + let _ = reconcile_stop_tx.try_send(()); info!(shard = shard_id, "server-ng shard exited cleanly"); Ok(()) @@ -1009,7 +1061,7 @@ async fn build_shard_for_thread( let owned_partitions_capacity = total_partitions .div_ceil(usize::from(total_shards).max(1)) .saturating_mul(2); - let mut partitions = IggyPartitions::with_capacity( + let partitions = IggyPartitions::with_capacity( shard_local_id, PartitionsConfig { messages_required_to_save: config.system.partition.messages_required_to_save, @@ -1040,12 +1092,12 @@ async fn build_shard_for_thread( if owning_shard == shard_id { owned.push((stream.id, topic_id, topic.stats.clone(), partition.clone())); } else { - // Non-owning entry: only `shard_id` is consulted - // by the router; `local_idx` is meaningful only on - // the owning shard, so a sentinel `0` is safe. shards_table.insert( namespace, - PartitionLocation::new(ShardId::new(owning_shard), LocalIdx::new(0)), + PartitionLocation::new( + ShardId::new(owning_shard), + partition.created_revision, + ), ); } } @@ -1059,7 +1111,7 @@ async fn build_shard_for_thread( // here only add their per-partition deltas, so the shared // `Arc` atomics race only against other atomic adds. for (stream_id, topic_id, topic_stats, partition_metadata) in owned { - validate_recovered_namespace(config, stream_id, topic_id, partition_metadata.id)?; + validate_namespace_bounds(config, stream_id, topic_id, partition_metadata.id)?; let namespace = IggyNamespace::new(stream_id, topic_id, partition_metadata.id); let partition = load_partition( config, @@ -1072,10 +1124,10 @@ async fn build_shard_for_thread( Rc::clone(&bus), ) .await?; - let local_idx = partitions.insert(namespace, partition); + partitions.insert(namespace, partition); shards_table.insert( namespace, - PartitionLocation::new(ShardId::new(shard_id), LocalIdx::new(*local_idx)), + PartitionLocation::new(ShardId::new(shard_id), partition_metadata.created_revision), ); } @@ -1101,7 +1153,11 @@ async fn build_shard_for_thread( senders, inbox, shards_table, - PartitionConsensusConfig::new(topology.cluster_id, topology.replica_count, Rc::clone(&bus)), + PartitionConsensusConfig::new( + topology.cluster_id, + shard::ReplicaTopology::new(topology.self_replica_id, topology.replica_count), + Rc::clone(&bus), + ), CoordinatorConfig::default(), metrics, ) @@ -1173,30 +1229,6 @@ fn restore_metadata_consensus( consensus } -const fn validate_recovered_namespace( - config: &ServerNgConfig, - stream_id: usize, - topic_id: usize, - partition_id: usize, -) -> Result<(), ServerNgError> { - let namespace = &config.extra.namespace; - if stream_id < namespace.max_streams - && topic_id < namespace.max_topics - && partition_id < namespace.max_partitions - { - return Ok(()); - } - - Err(ServerNgError::RecoveredNamespaceOutOfBounds { - stream_id, - topic_id, - partition_id, - max_streams: namespace.max_streams, - max_topics: namespace.max_topics, - max_partitions: namespace.max_partitions, - }) -} - #[allow(clippy::too_many_arguments)] async fn load_partition( config: &ServerNgConfig, @@ -1461,222 +1493,6 @@ async fn load_segment_max_timestamp( Ok(indexes_max_timestamp(&indexes)) } -fn configure_consumer_offsets( - partition: &mut IggyPartition>, - config: &ServerNgConfig, - namespace: IggyNamespace, - current_offset: u64, -) -> Result<(), ServerNgError> { - let stream_id = namespace.stream_id(); - let topic_id = namespace.topic_id(); - let partition_id = namespace.partition_id(); - let consumer_offsets_path = - config - .system - .get_consumer_offsets_path(stream_id, topic_id, partition_id); - let consumer_group_offsets_path = - config - .system - .get_consumer_group_offsets_path(stream_id, topic_id, partition_id); - - let loaded_consumer_offsets = load_partition_consumer_offsets( - &consumer_offsets_path, - "consumer", - stream_id, - topic_id, - partition_id, - )?; - let consumer_offsets = ConsumerOffsets::with_capacity(loaded_consumer_offsets.len()); - { - let guard = consumer_offsets.pin(); - for offset in loaded_consumer_offsets { - let recovered_offset = offset.offset.load(Ordering::Relaxed); - if recovered_offset > current_offset { - return Err(ServerNgError::RecoveredConsumerOffsetOutOfBounds { - consumer_kind: "consumer", - consumer_id: offset.consumer_id as usize, - offset: recovered_offset, - current_offset, - stream_id, - topic_id, - partition_id, - }); - } - guard.insert(offset.consumer_id as usize, offset); - } - } - - let loaded_group_offsets = load_partition_consumer_group_offsets( - &consumer_group_offsets_path, - stream_id, - topic_id, - partition_id, - )?; - let consumer_group_offsets = ConsumerGroupOffsets::with_capacity(loaded_group_offsets.len()); - { - let guard = consumer_group_offsets.pin(); - for (group_id, offset) in loaded_group_offsets { - let recovered_offset = offset.offset.load(Ordering::Relaxed); - if recovered_offset > current_offset { - return Err(ServerNgError::RecoveredConsumerOffsetOutOfBounds { - consumer_kind: "consumer group", - consumer_id: group_id.0, - offset: recovered_offset, - current_offset, - stream_id, - topic_id, - partition_id, - }); - } - guard.insert(group_id, offset); - } - } - - partition.configure_consumer_offset_storage( - consumer_offsets_path, - consumer_group_offsets_path, - consumer_offsets, - consumer_group_offsets, - config.system.partition.enforce_fsync, - ); - Ok(()) -} - -fn load_partition_consumer_offsets( - path: &str, - consumer_kind: &'static str, - stream_id: usize, - topic_id: usize, - partition_id: usize, -) -> Result, ServerNgError> { - if !Path::new(path).exists() { - return Ok(Vec::new()); - } - - load_consumer_offsets(path).or_else(|source| { - if matches!(&source, IggyError::CannotReadConsumerOffsets(missing_path) if !Path::new(missing_path).exists()) - { - return Ok(Vec::new()); - } - - Err(ServerNgError::ConsumerOffsetsLoad { - consumer_kind, - stream_id, - topic_id, - partition_id, - path: path.to_string(), - source: Box::new(source), - }) - }) -} - -fn load_partition_consumer_group_offsets( - path: &str, - stream_id: usize, - topic_id: usize, - partition_id: usize, -) -> Result, ServerNgError> { - if !Path::new(path).exists() { - return Ok(Vec::new()); - } - - load_consumer_group_offsets(path).or_else(|source| { - if matches!(&source, IggyError::CannotReadConsumerOffsets(missing_path) if !Path::new(missing_path).exists()) - { - return Ok(Vec::new()); - } - - Err(ServerNgError::ConsumerOffsetsLoad { - consumer_kind: "consumer group", - stream_id, - topic_id, - partition_id, - path: path.to_string(), - source: Box::new(source), - }) - }) -} - -async fn ensure_initial_segment( - partition: &mut IggyPartition>, - config: &ServerNgConfig, - stream_id: usize, - topic_id: usize, - partition_id: usize, -) -> Result<(), ServerNgError> { - if partition.log.has_segments() { - return Ok(()); - } - - // TODO: decouple segment storage creation from the `server` crate. - let storage = - create_segment_storage(&config.system, stream_id, topic_id, partition_id, 0, 0, 0) - .await - .map_err(|source| { - error!( - stream_id, - topic_id, - partition_id, - error = %source, - "failed to create initial segment storage" - ); - source - })?; - let messages_path = config - .system - .get_messages_file_path(stream_id, topic_id, partition_id, 0); - let index_path = config - .system - .get_index_path(stream_id, topic_id, partition_id, 0); - partition.log.add_persisted_segment( - Segment::new(0, config.system.segment.size), - storage, - Some(Rc::new( - MessagesWriter::new( - &messages_path, - Rc::new(AtomicU64::new(0)), - config.system.partition.enforce_fsync, - false, - ) - .await - .map_err(|source| { - error!( - stream_id, - topic_id, - partition_id, - path = %messages_path, - error = %source, - "failed to initialize initial messages writer" - ); - source - })?, - )), - Some(Rc::new( - IggyIndexWriter::new( - &index_path, - Rc::new(AtomicU64::new(0)), - config.system.partition.enforce_fsync, - false, - ) - .await - .map_err(|source| { - error!( - stream_id, - topic_id, - partition_id, - path = %index_path, - error = %source, - "failed to initialize initial sparse index writer" - ); - source - })?, - )), - ); - partition.stats.increment_segments_count(1); - - Ok(()) -} - fn resolve_tcp_topology( config: &ServerNgConfig, current_replica_id: Option, @@ -2374,6 +2190,66 @@ fn socket_addr_from_parts( Ok(SocketAddr::new(ip, port)) } +/// Build the closure that broadcasts a +/// [`LifecycleFrame::MetadataCommitTick`] to every shard's inbox after a +/// partition-shaped metadata operation commits on shard 0. +/// +/// The receiver-side partition reconciliation loop listens for these +/// wake-ups; coalescing is intentional, so `Full` is recorded as a metric +/// and dropped (the periodic tick recovers). Installed via +/// [`metadata::IggyMetadata::set_commit_notifier`] on shard 0 only, the +/// sole writer of the metadata state machine. +fn make_metadata_commit_notifier( + senders: Vec, + metrics: ShardMetrics, +) -> metadata::CommitNotifier { + Rc::new(move |operation: Operation| { + if !operation_triggers_partition_reconcile(operation) { + return; + } + for sender in &senders { + let frame = ShardFrame::lifecycle(LifecycleFrame::MetadataCommitTick); + match sender.try_send(frame) { + Ok(()) => {} + Err(crossfire::TrySendError::Full(_)) => { + metrics.record_frame_drop( + frame_drop_variant::METADATA_COMMIT_TICK, + frame_drop_reason::FULL, + ); + } + Err(crossfire::TrySendError::Disconnected(_)) => { + metrics.record_frame_drop( + frame_drop_variant::METADATA_COMMIT_TICK, + frame_drop_reason::DISCONNECTED, + ); + } + } + } + }) +} + +/// Filter at the broadcast site, keeping unrelated ops off the SDK reply +/// path. Any new partition-shape op must be added here. +/// +/// The bare `CreateTopic` / `CreatePartitions` arms are unreachable: the +/// leader's prepare-builder in `IggyMetadata` rewrites both into their +/// `*WithAssignments` form, stamping each partition's `consensus_group_id` +/// before journaling, so a committed prepare only ever carries the +/// assignment-bearing variant. Kept as defense-in-depth against a future +/// commit path that emits a bare op. +const fn operation_triggers_partition_reconcile(op: Operation) -> bool { + matches!( + op, + Operation::CreateTopic + | Operation::CreateTopicWithAssignments + | Operation::CreatePartitions + | Operation::CreatePartitionsWithAssignments + | Operation::DeleteTopic + | Operation::DeleteStream + | Operation::DeletePartitions + ) +} + #[cfg(test)] mod tests { use super::*; diff --git a/core/server-ng/src/lib.rs b/core/server-ng/src/lib.rs index 0346719442..94d96c638a 100644 --- a/core/server-ng/src/lib.rs +++ b/core/server-ng/src/lib.rs @@ -22,6 +22,8 @@ pub mod bootstrap; pub mod config_writer; pub mod dispatch; pub mod login_register; +pub mod partition_helpers; +pub mod partition_reconciler; pub mod pat; pub mod responses; pub mod server_error; diff --git a/core/server-ng/src/partition_helpers.rs b/core/server-ng/src/partition_helpers.rs new file mode 100644 index 0000000000..f1245fecf9 --- /dev/null +++ b/core/server-ng/src/partition_helpers.rs @@ -0,0 +1,537 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Helpers shared between the recovery path in [`crate::bootstrap`] and +//! the runtime partition reconciliation loop. +//! +//! Recovery hydrates an [`IggyPartition`] from on-disk state; the +//! reconciler builds one from scratch when a committed +//! `CreateTopic` / `CreatePartitions` metadata event has no matching +//! local partition yet. The two paths share namespace-bounds validation, +//! consumer-offset configuration, and initial-segment provisioning. + +use crate::server_error::ServerNgError; +use compio::fs::create_dir_all; +use configs::server_ng::ServerNgConfig; +use consensus::{LocalPipeline, VsrConsensus}; +use iggy_common::{ + ConsumerGroupOffsets, ConsumerOffsets, IggyError, IggyTimestamp, PartitionStats, TopicStats, +}; +use message_bus::IggyMessageBus; +use partitions::{IggyIndexWriter, IggyPartition, MessagesWriter, Segment}; +use server::io::fs_utils::remove_dir_all; +use server::streaming::partitions::storage::{load_consumer_group_offsets, load_consumer_offsets}; +use server::streaming::segments::storage::create_segment_storage; +use server_common::sharding::IggyNamespace; +use std::path::Path; +use std::rc::Rc; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use tracing::error; + +/// Validate that a namespace fits within the static caps declared in +/// `config.extra.namespace`. +/// +/// Bootstrap calls this for every recovered namespace; the reconciler +/// calls this before materialising a freshly committed partition. Same +/// error variant either way so operators see one root cause label. +/// +/// # Errors +/// +/// Returns [`ServerNgError::RecoveredNamespaceOutOfBounds`] if any of +/// `stream_id`, `topic_id`, or `partition_id` exceed the configured +/// maxima. +pub const fn validate_namespace_bounds( + config: &ServerNgConfig, + stream_id: usize, + topic_id: usize, + partition_id: usize, +) -> Result<(), ServerNgError> { + let namespace = &config.extra.namespace; + if stream_id < namespace.max_streams + && topic_id < namespace.max_topics + && partition_id < namespace.max_partitions + { + return Ok(()); + } + + Err(ServerNgError::RecoveredNamespaceOutOfBounds { + stream_id, + topic_id, + partition_id, + max_streams: namespace.max_streams, + max_topics: namespace.max_topics, + max_partitions: namespace.max_partitions, + }) +} + +/// Create the on-disk directory hierarchy for a partition. +/// +/// Builds the partition root, offsets, consumer offsets, and consumer +/// group offsets directories. Idempotent: every step short-circuits when +/// the directory already exists, so a reconciler retry after a partial +/// failure is safe. +/// +/// # Errors +/// +/// Returns [`IggyError::CannotCreatePartitionDirectory`] or +/// [`IggyError::CannotCreatePartition`] on directory creation failure. +pub async fn create_partition_file_hierarchy( + stream_id: usize, + topic_id: usize, + partition_id: usize, + config: &ServerNgConfig, +) -> Result<(), IggyError> { + let partition_path = config + .system + .get_partition_path(stream_id, topic_id, partition_id); + if !Path::new(&partition_path).exists() && create_dir_all(&partition_path).await.is_err() { + return Err(IggyError::CannotCreatePartitionDirectory( + partition_id, + stream_id, + topic_id, + )); + } + + let offset_path = config + .system + .get_offsets_path(stream_id, topic_id, partition_id); + if !Path::new(&offset_path).exists() && create_dir_all(&offset_path).await.is_err() { + error!( + stream_id, + topic_id, partition_id, "Failed to create offsets directory for partition" + ); + return Err(IggyError::CannotCreatePartition( + partition_id, + stream_id, + topic_id, + )); + } + + let consumer_offset_path = + config + .system + .get_consumer_offsets_path(stream_id, topic_id, partition_id); + if !Path::new(&consumer_offset_path).exists() + && create_dir_all(&consumer_offset_path).await.is_err() + { + error!( + stream_id, + topic_id, partition_id, "Failed to create consumer offsets directory for partition" + ); + return Err(IggyError::CannotCreatePartition( + partition_id, + stream_id, + topic_id, + )); + } + + let consumer_group_offsets_path = + config + .system + .get_consumer_group_offsets_path(stream_id, topic_id, partition_id); + if !Path::new(&consumer_group_offsets_path).exists() + && create_dir_all(&consumer_group_offsets_path).await.is_err() + { + error!( + stream_id, + topic_id, + partition_id, + "Failed to create consumer group offsets directory for partition" + ); + return Err(IggyError::CannotCreatePartition( + partition_id, + stream_id, + topic_id, + )); + } + + Ok(()) +} + +/// Populate `partition` with consumer-offset / consumer-group-offset storage. +/// +/// Hydrates from on-disk state if files exist (recovery path) or +/// configures empty maps (fresh partition path). `current_offset` bounds +/// recovered offsets so a partition that lost its tail does not surface +/// consumer offsets ahead of its current log head. +/// +/// # Errors +/// +/// Returns [`ServerNgError::RecoveredConsumerOffsetOutOfBounds`] when a +/// stored offset exceeds `current_offset`, or +/// [`ServerNgError::ConsumerOffsetsLoad`] when the on-disk files exist +/// but fail to decode. +pub fn configure_consumer_offsets( + partition: &mut IggyPartition>, + config: &ServerNgConfig, + namespace: IggyNamespace, + current_offset: u64, +) -> Result<(), ServerNgError> { + let stream_id = namespace.stream_id(); + let topic_id = namespace.topic_id(); + let partition_id = namespace.partition_id(); + let consumer_offsets_path = + config + .system + .get_consumer_offsets_path(stream_id, topic_id, partition_id); + let consumer_group_offsets_path = + config + .system + .get_consumer_group_offsets_path(stream_id, topic_id, partition_id); + + let loaded_consumer_offsets = load_partition_consumer_offsets( + &consumer_offsets_path, + "consumer", + stream_id, + topic_id, + partition_id, + )?; + let consumer_offsets = ConsumerOffsets::with_capacity(loaded_consumer_offsets.len()); + { + let guard = consumer_offsets.pin(); + for offset in loaded_consumer_offsets { + let recovered_offset = offset.offset.load(Ordering::Relaxed); + if recovered_offset > current_offset { + return Err(ServerNgError::RecoveredConsumerOffsetOutOfBounds { + consumer_kind: "consumer", + consumer_id: offset.consumer_id as usize, + offset: recovered_offset, + current_offset, + stream_id, + topic_id, + partition_id, + }); + } + guard.insert(offset.consumer_id as usize, offset); + } + } + + let loaded_group_offsets = load_partition_consumer_group_offsets( + &consumer_group_offsets_path, + stream_id, + topic_id, + partition_id, + )?; + let consumer_group_offsets = ConsumerGroupOffsets::with_capacity(loaded_group_offsets.len()); + { + let guard = consumer_group_offsets.pin(); + for (group_id, offset) in loaded_group_offsets { + let recovered_offset = offset.offset.load(Ordering::Relaxed); + if recovered_offset > current_offset { + return Err(ServerNgError::RecoveredConsumerOffsetOutOfBounds { + consumer_kind: "consumer group", + consumer_id: group_id.0, + offset: recovered_offset, + current_offset, + stream_id, + topic_id, + partition_id, + }); + } + guard.insert(group_id, offset); + } + } + + partition.configure_consumer_offset_storage( + consumer_offsets_path, + consumer_group_offsets_path, + consumer_offsets, + consumer_group_offsets, + config.system.partition.enforce_fsync, + ); + Ok(()) +} + +fn load_partition_consumer_offsets( + path: &str, + consumer_kind: &'static str, + stream_id: usize, + topic_id: usize, + partition_id: usize, +) -> Result, ServerNgError> { + if !Path::new(path).exists() { + return Ok(Vec::new()); + } + + load_consumer_offsets(path).or_else(|source| { + if matches!(&source, IggyError::CannotReadConsumerOffsets(missing_path) if !Path::new(missing_path).exists()) + { + return Ok(Vec::new()); + } + + Err(ServerNgError::ConsumerOffsetsLoad { + consumer_kind, + stream_id, + topic_id, + partition_id, + path: path.to_string(), + source: Box::new(source), + }) + }) +} + +fn load_partition_consumer_group_offsets( + path: &str, + stream_id: usize, + topic_id: usize, + partition_id: usize, +) -> Result, ServerNgError> { + if !Path::new(path).exists() { + return Ok(Vec::new()); + } + + load_consumer_group_offsets(path).or_else(|source| { + if matches!(&source, IggyError::CannotReadConsumerOffsets(missing_path) if !Path::new(missing_path).exists()) + { + return Ok(Vec::new()); + } + + Err(ServerNgError::ConsumerOffsetsLoad { + consumer_kind: "consumer group", + stream_id, + topic_id, + partition_id, + path: path.to_string(), + source: Box::new(source), + }) + }) +} + +/// Provision an initial segment + writers for a partition that has none. +/// +/// No-op when `partition.log.has_segments()` already returns `true` +/// (recovery hydrated existing segments), so callers can invoke this +/// unconditionally. +/// +/// # Errors +/// +/// Returns [`ServerNgError`] on segment-storage creation failure or +/// writer initialisation failure. +pub async fn ensure_initial_segment( + partition: &mut IggyPartition>, + config: &ServerNgConfig, + stream_id: usize, + topic_id: usize, + partition_id: usize, +) -> Result<(), ServerNgError> { + if partition.log.has_segments() { + return Ok(()); + } + + // TODO: decouple segment storage creation from the `server` crate. + let storage = + create_segment_storage(&config.system, stream_id, topic_id, partition_id, 0, 0, 0) + .await + .map_err(|source| { + error!( + stream_id, + topic_id, + partition_id, + error = %source, + "failed to create initial segment storage" + ); + source + })?; + let messages_path = config + .system + .get_messages_file_path(stream_id, topic_id, partition_id, 0); + let index_path = config + .system + .get_index_path(stream_id, topic_id, partition_id, 0); + partition.log.add_persisted_segment( + Segment::new(0, config.system.segment.size), + storage, + Some(Rc::new( + MessagesWriter::new( + &messages_path, + Rc::new(AtomicU64::new(0)), + config.system.partition.enforce_fsync, + false, + ) + .await + .map_err(|source| { + error!( + stream_id, + topic_id, + partition_id, + path = %messages_path, + error = %source, + "failed to initialize initial messages writer" + ); + source + })?, + )), + Some(Rc::new( + IggyIndexWriter::new( + &index_path, + Rc::new(AtomicU64::new(0)), + config.system.partition.enforce_fsync, + false, + ) + .await + .map_err(|source| { + error!( + stream_id, + topic_id, + partition_id, + path = %index_path, + error = %source, + "failed to initialize initial sparse index writer" + ); + source + })?, + )), + ); + partition.stats.increment_segments_count(1); + + Ok(()) +} + +/// Materialise a brand-new [`IggyPartition`] for a namespace that has no on-disk state yet. +/// +/// Counterpart to bootstrap's `load_partition`, which hydrates from +/// on-disk state during recovery; this builder is the runtime path +/// invoked by the reconciliation loop when a committed +/// `CreateTopic` / `CreatePartitions` metadata event names a partition +/// the local shard has not yet materialised. +/// +/// Steps performed (all idempotent on retry after a partial failure): +/// 1. Validate namespace fits within the configured caps. +/// 2. Create directory hierarchy on disk. +/// 3. Build per-partition VSR consensus group at view 0. +/// 4. Configure empty consumer-offset storage with the on-disk paths set. +/// 5. Provision the initial segment + writers (offset 0). +/// +/// The returned partition's `offset` / `dirty_offset` are `0` and +/// `should_increment_offset` is `false`, mirroring a clean append starting +/// at the empty segment. +/// +/// # Errors +/// +/// Returns [`ServerNgError`] when bounds validation, directory creation, +/// or segment provisioning fails. +pub async fn build_partition_fresh( + config: &ServerNgConfig, + namespace: IggyNamespace, + topic_stats: Arc, + cluster_id: u128, + self_replica_id: u8, + replica_count: u8, + bus: Rc, +) -> Result>, ServerNgError> { + let stream_id = namespace.stream_id(); + let topic_id = namespace.topic_id(); + let partition_id = namespace.partition_id(); + + validate_namespace_bounds(config, stream_id, topic_id, partition_id)?; + create_partition_file_hierarchy(stream_id, topic_id, partition_id, config) + .await + .map_err(|source| { + error!( + stream_id, + topic_id, + partition_id, + error = %source, + "failed to create partition file hierarchy for fresh partition" + ); + source + })?; + + let stats = Arc::new(PartitionStats::new(topic_stats)); + let consensus = VsrConsensus::new( + cluster_id, + self_replica_id, + replica_count, + namespace.inner(), + bus, + LocalPipeline::new(), + ); + consensus.init(); + + let mut partition = IggyPartition::new(stats, consensus); + partition.created_at = IggyTimestamp::now(); + partition.offset.store(0, Ordering::Release); + partition.dirty_offset.store(0, Ordering::Relaxed); + partition.should_increment_offset = false; + partition.stats.set_current_offset(0); + debug_assert!( + !partition.log.has_segments(), + "fresh partition must not carry recovered segments" + ); + + configure_consumer_offsets(&mut partition, config, namespace, 0)?; + ensure_initial_segment(&mut partition, config, stream_id, topic_id, partition_id).await?; + + Ok(partition) +} + +/// Recursive delete of partition root. Idempotent: `NotFound` is treated +/// as success so a prior crashed pass cannot arm perpetual backoff. +/// +/// # Errors +/// +/// [`IggyError::CannotDeletePartitionDirectory`] on any non-`NotFound` +/// OS error. +pub async fn delete_partitions_from_disk( + stream_id: usize, + topic_id: usize, + partition_id: usize, + config: &ServerNgConfig, +) -> Result<(), IggyError> { + let partition_path = config + .system + .get_partition_path(stream_id, topic_id, partition_id); + match remove_dir_all(&partition_path).await { + Ok(()) => { + tracing::info!( + stream_id, + topic_id, + partition_id, + path = %partition_path, + "deleted partition directory" + ); + Ok(()) + } + Err(source) if source.kind() == std::io::ErrorKind::NotFound => { + tracing::debug!( + stream_id, + topic_id, + partition_id, + path = %partition_path, + "partition directory already absent" + ); + Ok(()) + } + Err(source) => { + error!( + stream_id, + topic_id, + partition_id, + path = %partition_path, + error = %source, + "failed to delete partition directory" + ); + // Variant format: {0}=partition_id, {1}=stream_id, {2}=topic_id. + Err(IggyError::CannotDeletePartitionDirectory( + partition_id, + stream_id, + topic_id, + )) + } + } +} diff --git a/core/server-ng/src/partition_reconciler.rs b/core/server-ng/src/partition_reconciler.rs new file mode 100644 index 0000000000..5bd707c5ad --- /dev/null +++ b/core/server-ng/src/partition_reconciler.rs @@ -0,0 +1,1441 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Partition reconciliation loop. +//! +//! One task per shard. On wake (commit tick or periodic safety tick), +//! diff committed `Streams` STM against local `IggyPartitions`: +//! - non-owned namespaces: seed `shards_table` row pointing at owner. +//! - owned namespaces: `build_partition_fresh` then enqueue +//! `ReconcileOp::InsertOwned` for pump-side apply. +//! - ghosts: two-phase tombstone, disk delete, `ConfirmRemove`. +//! +//! # Dormant race: reply ships before partition materialises +//! +//! `metadata::on_ack` fires the commit notifier and emits the wire reply +//! immediately after STM apply, but the owning shard's reconciler wakes +//! asynchronously and only enqueues `ReconcileOp::InsertOwned` once +//! `build_partition_fresh` finishes (mkdir + segment open + fallocate, +//! multi-millisecond). Until the pump drains the queue, +//! `shards_table.shard_for(ns)` returns `None` and `router::route_typed` +//! drops any partition op silently with +//! `frame_drops_total{variant=partition,reason=unroutable}` (see +//! `shard/src/router.rs:147-162`). +//! +//! Today this race is **unreachable** from any SDK: the `vsr` feature +//! gate only wires VSR framing for `users` + `personal_access_tokens`; +//! `topics.rs` and `partitions.rs` `binary_impls` still emit pre-VSR +//! encoding server-ng can't dispatch. The first SDK trait impl that adds +//! a `#[cfg(feature = "vsr")]` branch for `create_topic` or +//! `send_messages` surfaces the race as a silent first-produce drop after +//! every `create_topic`. +//! +//! TODO: block VSR-ification of `topics.rs` / `partitions.rs` +//! `binary_impls` on a materialization barrier (cross-shard +//! `MaterializedAck` from each assigned shard back to shard 0; shard 0 +//! holds the reply in `client_table` until all acks arrive). Sync-block +//! alternative was deferred. + +use crate::bootstrap::ServerNgShard; +use crate::partition_helpers::{build_partition_fresh, delete_partitions_from_disk}; +use ahash::{AHashMap, AHashSet}; +use configs::server_ng::ServerNgConfig; +use consensus::{MetadataHandle, PartitionsHandle}; +use futures::FutureExt; +use metadata::impls::metadata::StreamsFrontend; +use server_common::sharding::{IggyNamespace, ShardId}; +use shard::ReconcileOp; +use shard::shards_table::{ShardsTable, calculate_shard_assignment}; +use shard::{Receiver, Sender}; +use std::cell::{Cell, RefCell}; +use std::rc::Rc; +use std::sync::Arc; +use std::time::{Duration, Instant}; +use tracing::{debug, error, trace}; + +const BACKOFF_BASE: Duration = Duration::from_secs(1); +const BACKOFF_MAX: Duration = Duration::from_mins(1); + +/// Doubles per attempt, clamped at `BACKOFF_MAX`. +fn next_backoff(attempts: u32) -> Duration { + let shift = attempts.saturating_sub(1).min(6); + let multiplier = 1_u32.checked_shl(shift).unwrap_or(1); + BACKOFF_BASE.saturating_mul(multiplier).min(BACKOFF_MAX) +} + +#[derive(Debug, Clone, Copy)] +struct FailureRecord { + attempts: u32, + next_retry_at: Instant, +} + +/// Separate retry budgets so a stuck disk-delete cannot throttle a re-create. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +enum FailureCause { + Add, + Delete, +} + +pub struct ReconcilerCtx { + pub shard: Rc, + pub total_shards: u16, + pub config: Rc, + pub cluster_id: u128, + pub self_replica_id: u8, + pub replica_count: u8, + failure_state: RefCell>, + /// `Streams::revision` observed at the end of the last pass that fully + /// converged. Paired with `last_pass_noop` for the fast-skip in + /// [`reconcile_once`] (no O(N) scan when nothing changed). + last_revision: Cell>, + /// `true` when the previous pass made no changes. Only then is a + /// same-`revision` pass safe to skip. + last_pass_noop: Cell, +} + +impl ReconcilerCtx { + #[must_use] + pub fn new( + shard: Rc, + total_shards: u16, + config: Rc, + cluster_id: u128, + self_replica_id: u8, + replica_count: u8, + ) -> Self { + Self { + shard, + total_shards, + config, + cluster_id, + self_replica_id, + replica_count, + failure_state: RefCell::new(AHashMap::new()), + last_revision: Cell::new(None), + last_pass_noop: Cell::new(false), + } + } + + fn is_backed_off(&self, ns: IggyNamespace, cause: FailureCause, now: Instant) -> bool { + let state = self.failure_state.borrow(); + if state.is_empty() { + return false; + } + state + .get(&(ns, cause)) + .is_some_and(|record| record.next_retry_at > now) + } + + /// `true` when a prior teardown of `ns` recorded a disk-delete failure + /// not since cleared. Teardown clears the `FailureCause::Delete` record + /// (via [`Self::record_success`]) exactly when it enqueues + /// `ConfirmRemove`, and sets it only on a delete that failed without + /// enqueuing one, so it doubles as "no `ConfirmRemove` in flight for + /// `ns`": the signal [`reconcile_additions`] uses to tell a + /// permanently-wedged tombstone (retry the delete) from one whose drop + /// is genuinely pending (defer). + fn has_pending_delete_failure(&self, ns: IggyNamespace) -> bool { + let state = self.failure_state.borrow(); + if state.is_empty() { + return false; + } + state.contains_key(&(ns, FailureCause::Delete)) + } + + fn record_success(&self, ns: IggyNamespace, cause: FailureCause) { + if self.failure_state.borrow().is_empty() { + return; + } + self.failure_state.borrow_mut().remove(&(ns, cause)); + } + + fn record_failure(&self, ns: IggyNamespace, cause: FailureCause, now: Instant) { + let mut state = self.failure_state.borrow_mut(); + let entry = state.entry((ns, cause)).or_insert(FailureRecord { + attempts: 0, + next_retry_at: now, + }); + entry.attempts = entry.attempts.saturating_add(1); + entry.next_retry_at = now + next_backoff(entry.attempts); + } + + /// Drop records whose namespace left both target and local sets; + /// otherwise a failed-then-deleted namespace's stale backoff + /// would throttle a future same-namespace re-create. + fn prune_failure_state_stale( + &self, + target_set: &AHashSet, + local_set: &AHashSet, + ) { + let mut state = self.failure_state.borrow_mut(); + if state.is_empty() { + return; + } + state.retain(|(ns, _cause), _record| target_set.contains(ns) || local_set.contains(ns)); + } +} + +pub type WakeTx = Sender<()>; +pub type WakeRx = Receiver<()>; + +/// One initial reconcile before the wait loop so a shard that comes up +/// before shard 0's first `MetadataCommitTick` still converges. +pub async fn run_reconciler( + ctx: Rc, + wake_rx: WakeRx, + stop_rx: Receiver<()>, + periodic: Duration, +) { + debug!( + shard = ctx.shard.id, + total_shards = ctx.total_shards, + periodic_ms = periodic.as_millis(), + "partition reconciler starting" + ); + reconcile_once(&ctx).await; + + loop { + let sleep = compio::time::sleep(periodic); + futures::select! { + _ = stop_rx.recv().fuse() => break, + recv = wake_rx.recv().fuse() => { + if recv.is_err() { + break; + } + while wake_rx.try_recv().is_ok() {} + reconcile_once(&ctx).await; + } + () = sleep.fuse() => { + reconcile_once(&ctx).await; + } + } + } + + debug!(shard = ctx.shard.id, "partition reconciler exited"); +} + +#[derive(Default)] +struct PassCounters { + materialised: usize, + routed: usize, + removed_local: usize, + removed_routed: usize, + backoff_skipped: usize, + /// Stale incarnations (slab-key reuse) torn down for rebuild. + stale: usize, +} + +impl PassCounters { + const fn total(&self) -> usize { + self.materialised + + self.routed + + self.removed_local + + self.removed_routed + + self.backoff_skipped + + self.stale + } +} + +/// Diff target vs local; materialise missing, tear down ghosts. Idempotent. +/// Returns `false` when the pass fast-skipped (nothing changed), `true` +/// when it ran the full diff. Callers in production discard the result; +/// tests assert the skip. +async fn reconcile_once(ctx: &ReconcilerCtx) -> bool { + let shard_id = ctx.shard.id; + let revision = current_revision(ctx); + + // Fast-skip: committed partition set unchanged since the last + // fully-converged pass and no backoff retry due, so the O(N) diff is + // pure waste. Safe because reconcile is level-triggered: the next + // partition-shaping commit bumps `revision` and a pending retry keeps + // `failure_state` non-empty, either of which forces the next pass. + if ctx.last_revision.get() == Some(revision) + && ctx.last_pass_noop.get() + && ctx.failure_state.borrow().is_empty() + { + trace!( + shard = shard_id, + revision, "reconciler fast-skip (no change)" + ); + return false; + } + + let target = snapshot_target_namespaces(ctx); + let target_set: AHashSet = target.iter().map(|(ns, _)| *ns).collect(); + let mut counters = PassCounters::default(); + + reconcile_additions(ctx, target, &mut counters).await; + reconcile_removals(ctx, &target_set, &mut counters).await; + + let local_set: AHashSet = + ctx.shard.plane.partitions().namespaces().copied().collect(); + ctx.prune_failure_state_stale(&target_set, &local_set); + + // Arm the fast-skip only when this pass converged (did nothing). A + // working pass (including a staleness teardown that rebuilds on the + // next pass) leaves `last_pass_noop = false` so the follow-up pass + // still runs even though `revision` did not change. + let did_work = counters.total() > 0; + ctx.last_revision.set(Some(revision)); + ctx.last_pass_noop.set(!did_work); + + if did_work { + debug!( + shard = shard_id, + revision, + materialised = counters.materialised, + routed = counters.routed, + removed_local = counters.removed_local, + removed_routed = counters.removed_routed, + backoff_skipped = counters.backoff_skipped, + stale = counters.stale, + "partition reconciler pass complete" + ); + } else { + trace!( + shard = shard_id, + "partition reconciler pass complete (no-op)" + ); + } + + true +} + +async fn reconcile_additions( + ctx: &ReconcilerCtx, + target: Vec<(IggyNamespace, u64)>, + counters: &mut PassCounters, +) { + let shard_id = ctx.shard.id; + let partitions = ctx.shard.plane.partitions(); + let total_shards = u32::from(ctx.total_shards); + + for (ns, epoch) in target { + if partitions.contains(&ns) { + // Tombstoned but still in the map. Two cases, told apart by + // whether teardown's disk delete succeeded: + // + // * Succeeded -> a `ConfirmRemove` is in flight. The pump + // drops the partition and clears the tombstone, then + // `signal_reconcile_wake` re-wakes us to rebuild within one + // pump-iter. Building over a path mid-unlink would race, so + // defer. + // * Failed -> no `ConfirmRemove` enqueued, so the tombstone + // never lifts. Paired with a same-key recreate landing `ns` + // back in the target, this pass would defer forever while + // `reconcile_removals` no longer sees a ghost: the partition + // is fenced permanently and every data-plane frame dropped. + // Re-drive teardown to retry the delete. + // + // A recorded `FailureCause::Delete` is the authoritative "no + // ConfirmRemove in flight" signal (see + // [`ReconcilerCtx::has_pending_delete_failure`]). + if partitions.is_tombstoned(&ns) { + if !ctx.has_pending_delete_failure(ns) { + trace!( + shard = shard_id, + ns_raw = ns.inner(), + "additions: ns tombstoned + in-map; rebuild deferred to post-ConfirmRemove wake" + ); + continue; + } + trace!( + shard = shard_id, + ns_raw = ns.inner(), + "additions: ns tombstoned + in-map with failed disk delete; re-driving teardown to retry delete" + ); + tear_down_owned_partition(ctx, ns, counters).await; + continue; + } + + // Staleness: the namespace tuple is built from reused slab + // keys, so a delete+recreate of the same (stream, topic, + // partition) yields an identical `ns` whose committed + // `created_revision` differs from the epoch recorded when the + // local partition materialised. A mismatch (or a missing + // routing row on a live partition, an invariant violation) + // means the local partition is a prior incarnation carrying + // stale segments/offsets/log. Tear it down; the + // post-ConfirmRemove wake rebuilds it fresh next pass. + if ctx.shard.shards_table().epoch_for(ns) == Some(epoch) { + continue; + } + trace!( + shard = shard_id, + ns_raw = ns.inner(), + target_epoch = epoch, + "additions: stale incarnation (slab-key reuse); tearing down for rebuild" + ); + counters.stale += 1; + tear_down_owned_partition(ctx, ns, counters).await; + continue; + } + + let owning_shard = calculate_shard_assignment(&ns, total_shards); + if owning_shard != shard_id { + if !shards_table_contains(ctx, ns) { + ctx.shard.enqueue_reconcile_op(ReconcileOp::InsertRouted { + namespace: ns, + owner: ShardId::new(owning_shard), + epoch, + }); + counters.routed += 1; + } + continue; + } + + let now = Instant::now(); + if ctx.is_backed_off(ns, FailureCause::Add, now) { + counters.backoff_skipped += 1; + continue; + } + + // Clone the parent `Arc` only for namespaces actually + // built, not once per committed partition every pass. A topic that + // vanished between the target snapshot and this read defers to the + // next pass. + let Some(topic_stats) = fetch_topic_stats(ctx, ns) else { + continue; + }; + + match build_partition_fresh( + ctx.config.as_ref(), + ns, + topic_stats, + ctx.cluster_id, + ctx.self_replica_id, + ctx.replica_count, + Rc::clone(&ctx.shard.bus), + ) + .await + { + Ok(partition) => { + ctx.shard.enqueue_reconcile_op(ReconcileOp::InsertOwned { + namespace: ns, + partition: Box::new(partition), + epoch, + }); + ctx.record_success(ns, FailureCause::Add); + counters.materialised += 1; + } + Err(err) => { + ctx.record_failure(ns, FailureCause::Add, now); + ctx.shard.metrics().record_partition_reconcile_failure(); + error!( + shard = shard_id, + stream_id = ns.stream_id(), + topic_id = ns.topic_id(), + partition_id = ns.partition_id(), + error = %err, + "reconciler failed to materialize partition" + ); + } + } + } +} + +async fn reconcile_removals( + ctx: &ReconcilerCtx, + target_set: &AHashSet, + counters: &mut PassCounters, +) { + let partitions = ctx.shard.plane.partitions(); + let shards_table = ctx.shard.shards_table(); + + let owned_ghosts: Vec = partitions + .namespaces() + .copied() + .filter(|ns| !target_set.contains(ns)) + .collect(); + for ns in owned_ghosts { + tear_down_owned_partition(ctx, ns, counters).await; + } + + // Skip namespaces still locally owned (disk-delete-failed ghosts): + // pruning their shards_table row would strand peer routing. + let still_owned: AHashSet = partitions.namespaces().copied().collect(); + let routed_ghosts: Vec = shards_table + .namespaces() + .into_iter() + .filter(|ns| !target_set.contains(ns) && !still_owned.contains(ns)) + .collect(); + for ns in routed_ghosts { + ctx.shard + .enqueue_reconcile_op(ReconcileOp::RemoveRouted { namespace: ns }); + counters.removed_routed += 1; + } +} + +/// Two-phase owned-partition teardown shared by the removals pass (a ghost +/// no longer in the committed target) and the additions pass (a stale +/// incarnation after slab-key reuse). Fences writes synchronously +/// (tombstone + `shards_table` row removal), unlinks the on-disk +/// hierarchy, then enqueues `ConfirmRemove` so the pump drops the +/// in-memory partition. On disk-delete failure the namespace stays +/// tombstoned + backed off and retries on a later pass; the in-memory +/// partition is never dropped before its data is gone. +async fn tear_down_owned_partition( + ctx: &ReconcilerCtx, + ns: IggyNamespace, + counters: &mut PassCounters, +) { + let shard_id = ctx.shard.id; + let partitions = ctx.shard.plane.partitions(); + let shards_table = ctx.shard.shards_table(); + + // Partition paths share one on-disk root across all shards on a node + // (`get_partition_path` has no `shard_id` prefix), so a delete here + // unlinks data any other shard owning the same ns would see. If hashing + // now points at a peer (stale reader-mode STM during a + // delete-then-recreate race, or a hash-function change across an + // upgrade), refuse the delete and surface the inconsistency instead of + // panicking the pump; the partition stays addressable via its existing + // local entry until an operator resolves the conflict. + let hash_owner = calculate_shard_assignment(&ns, u32::from(ctx.total_shards)); + if hash_owner != shard_id { + ctx.shard.metrics().record_partition_reconcile_failure(); + error!( + shard = shard_id, + ns_raw = ns.inner(), + hash_owner, + "teardown target hashes to peer shard; refusing disk delete to avoid cross-shard data loss" + ); + ctx.record_failure(ns, FailureCause::Delete, Instant::now()); + return; + } + + let now = Instant::now(); + if ctx.is_backed_off(ns, FailureCause::Delete, now) { + counters.backoff_skipped += 1; + return; + } + + // Fence writes BEFORE awaiting disk delete. Tombstone is RefCell + // (cross-task callable) and shards_table is papaya, both safe to mutate + // directly from the reconciler. Routing through the pump's ReconcileOp + // queue here would race the unlink against in-flight on_request / + // on_replicate / on_ack frames that haven't observed the queued + // tombstone yet. Idempotent on retry: already-tombstoned namespace + // stays tombstoned; already-removed shards_table row is a no-op. + if !partitions.is_tombstoned(&ns) { + partitions.tombstone(ns); + } + shards_table.remove(&ns); + + if let Err(err) = delete_partitions_from_disk( + ns.stream_id(), + ns.topic_id(), + ns.partition_id(), + ctx.config.as_ref(), + ) + .await + { + ctx.record_failure(ns, FailureCause::Delete, now); + ctx.shard.metrics().record_partition_reconcile_failure(); + error!( + shard = shard_id, + stream_id = ns.stream_id(), + topic_id = ns.topic_id(), + partition_id = ns.partition_id(), + error = %err, + "reconciler failed to delete partition directory" + ); + return; + } + + ctx.shard + .enqueue_reconcile_op(ReconcileOp::ConfirmRemove { namespace: ns }); + ctx.record_success(ns, FailureCause::Delete); + counters.removed_local += 1; +} + +/// Committed `(namespace, created_revision)` pairs. The epoch lets the +/// additions pass detect a stale local incarnation after slab-key reuse +/// without an `Arc` clone per partition; stats are fetched +/// lazily in [`fetch_topic_stats`] only for namespaces actually built. +fn snapshot_target_namespaces(ctx: &ReconcilerCtx) -> Vec<(IggyNamespace, u64)> { + ctx.shard.plane.metadata().mux_stm.streams().read(|inner| { + // TODO(krishna): O(committed partitions) per non-skipped pass (here + + // reconcile_removals). The revision fast-skip hides this in steady + // state but not under sustained churn; switch to an incremental diff + // keyed on the changed namespaces if it bottlenecks large clusters. + let mut entries = Vec::new(); + for (_, stream) in &inner.items { + for (topic_id, topic) in &stream.topics { + for partition in &topic.partitions { + let ns = IggyNamespace::new(stream.id, topic_id, partition.id); + entries.push((ns, partition.created_revision)); + } + } + } + entries + }) +} + +/// Monotonic `Streams::revision`. Stable between passes iff no +/// partition-shaping op committed since, which is the fast-skip signal. +fn current_revision(ctx: &ReconcilerCtx) -> u64 { + ctx.shard + .plane + .metadata() + .mux_stm + .streams() + .read(|inner| inner.revision) +} + +/// Clone the parent topic's `Arc` for a single namespace. +/// `None` if the topic vanished between the target snapshot and this read. +fn fetch_topic_stats( + ctx: &ReconcilerCtx, + ns: IggyNamespace, +) -> Option> { + ctx.shard.plane.metadata().mux_stm.streams().read(|inner| { + let stream = inner.items.get(ns.stream_id())?; + let topic = stream.topics.get(ns.topic_id())?; + Some(topic.stats.clone()) + }) +} + +fn shards_table_contains(ctx: &ReconcilerCtx, ns: IggyNamespace) -> bool { + ctx.shard.shards_table().shard_for(ns).is_some() +} + +pub fn install_tick_handler(shard: &Rc, wake_tx: WakeTx) { + let shard_id = shard.id; + let handler = Rc::new(move || { + if let Err(err) = wake_tx.try_send(()) { + trace!(shard = shard_id, "tick wake dropped: {err}"); + } + }); + shard.set_metadata_tick_handler(Some(handler)); +} + +#[cfg(test)] +mod tests { + use super::{FailureCause, FailureRecord, ReconcilerCtx, reconcile_once}; + use configs::server_ng::ServerNgConfig; + use consensus::{MetadataHandle, PartitionsHandle}; + use iggy_binary_protocol::codec::WireEncode; + use iggy_binary_protocol::primitives::identifier::WireName; + use iggy_binary_protocol::primitives::partition_assignment::CreatedPartitionAssignment; + use iggy_binary_protocol::requests::partitions::{ + CreatePartitionsRequest, CreatePartitionsWithAssignmentsRequest, + }; + use iggy_binary_protocol::requests::streams::{CreateStreamRequest, DeleteStreamRequest}; + use iggy_binary_protocol::requests::topics::{ + CreateTopicRequest, CreateTopicWithAssignmentsRequest, DeleteTopicRequest, + }; + use iggy_binary_protocol::{Command2, Operation, PrepareHeader, WireIdentifier}; + use message_bus::IggyMessageBus; + use metadata::IggyMetadata; + use metadata::MuxStateMachine; + use metadata::impls::metadata::IggySnapshot; + use metadata::stm::StateMachine; + use metadata::stm::consumer_group::ConsumerGroups; + use metadata::stm::stream::Streams; + use metadata::stm::user::Users; + use partitions::{IggyPartitions, PartitionsConfig}; + use server_common::Message; + use server_common::sharding::{IggyNamespace, ShardId}; + use shard::shards_table::{PapayaShardsTable, ShardsTable, calculate_shard_assignment}; + use shard::{IggyShard, PartitionConsensusConfig, ShardIdentity}; + use std::mem::size_of; + use std::rc::Rc; + use std::sync::Arc; + use std::time::Instant; + use tempfile::TempDir; + + type TestMux = MuxStateMachine; + type TestShard = IggyShard< + Rc, + journal::prepare_journal::PrepareJournal, + IggySnapshot, + TestMux, + PapayaShardsTable, + >; + + const CLUSTER_ID: u128 = 1; + + /// Sanity test that ensures the `()` channel can coalesce wakes + /// without blocking the producer when the consumer hasn't drained + /// yet. Production behaviour relies on this: the metadata commit + /// notifier runs on the metadata commit path and cannot await. + #[test] + fn wake_channel_coalesces_drops_when_full() { + let (tx, rx) = shard::channel::<()>(1); + assert!(tx.try_send(()).is_ok()); + assert!( + tx.try_send(()).is_err(), + "second send must fail; capacity 1 enforces coalescing" + ); + assert!(rx.try_recv().is_ok()); + assert!(rx.try_recv().is_err()); + } + + /// Build a `Message` carrying `request` as its body and + /// `operation` stamped in the header. Bypasses the VSR pipeline (no + /// journal, no view, no client): the state machine reads only + /// `header.operation` and `header.size`, so the rest is left zeroed. + fn build_prepare( + op: u64, + operation: Operation, + request: &R, + ) -> Message { + let body = request.to_bytes(); + let header_size = size_of::(); + let total_size = header_size + body.len(); + let mut msg = Message::::new(total_size); + msg.as_mut_slice()[header_size..total_size].copy_from_slice(&body); + let header = bytemuck::checked::try_from_bytes_mut::( + &mut msg.as_mut_slice()[..header_size], + ) + .expect("zeroed bytes form a valid PrepareHeader"); + header.command = Command2::Prepare; + header.size = u32::try_from(total_size).expect("prepare size fits u32"); + header.op = op; + header.operation = operation; + msg + } + + fn assignment(partition_id: u32, consensus_group_id: u64) -> CreatedPartitionAssignment { + CreatedPartitionAssignment { + partition_id, + consensus_group_id, + } + } + + /// Drive a `CreateStream` commit through the state machine. The STM + /// assigns slab keys from 0 for the first stream on a fresh STM. + fn seed_stream(mux: &TestMux, op: u64, name: &str) { + let req = CreateStreamRequest { + name: WireName::new(name).expect("test stream name fits WireName"), + }; + mux.update(build_prepare(op, Operation::CreateStream, &req)) + .expect("CreateStream apply succeeds"); + } + + /// Drive a `CreateTopicWithAssignments` commit. + fn seed_topic( + mux: &TestMux, + op: u64, + stream_id: u32, + name: &str, + assignments: Vec, + ) { + let req = CreateTopicWithAssignmentsRequest { + request: CreateTopicRequest { + stream_id: WireIdentifier::numeric(stream_id), + partitions_count: u32::try_from(assignments.len()) + .expect("partitions count fits u32"), + compression_algorithm: 0, + message_expiry: 0, + max_topic_size: 0, + replication_factor: 1, + name: WireName::new(name).expect("test topic name fits WireName"), + }, + partitions: assignments, + }; + mux.update(build_prepare( + op, + Operation::CreateTopicWithAssignments, + &req, + )) + .expect("CreateTopicWithAssignments apply succeeds"); + } + + fn seed_delete_topic(mux: &TestMux, op: u64, stream_id: u32, topic_id: u32) { + let req = DeleteTopicRequest { + stream_id: WireIdentifier::numeric(stream_id), + topic_id: WireIdentifier::numeric(topic_id), + }; + mux.update(build_prepare(op, Operation::DeleteTopic, &req)) + .expect("DeleteTopic apply succeeds"); + } + + fn seed_delete_stream(mux: &TestMux, op: u64, stream_id: u32) { + let req = DeleteStreamRequest { + stream_id: WireIdentifier::numeric(stream_id), + }; + mux.update(build_prepare(op, Operation::DeleteStream, &req)) + .expect("DeleteStream apply succeeds"); + } + + fn test_config(tmp: &TempDir) -> ServerNgConfig { + let mut cfg = ServerNgConfig::default(); + // `SystemConfig` is not `Clone`, so `Arc::make_mut` is out; build a + // fresh value via struct-update syntax and swap the Arc wholesale. + // Only `path` differs from the default; every other field uses the + // runtime's defaults. + let system = configs::system::SystemConfig { + path: tmp.path().to_string_lossy().into_owned(), + ..configs::system::SystemConfig::default() + }; + cfg.system = Arc::new(system); + cfg + } + + /// Assemble a fully functional `ServerNgShard` for reconciler tests. + /// Uses `IggyShard::without_inbox` so no inter-shard pump runs; the + /// reconciler can be driven directly by `reconcile_once`. + fn build_test_shard(shard_id: u16, config: &ServerNgConfig, mux: TestMux) -> Rc { + let bus = Rc::new(IggyMessageBus::with_config(shard_id, config)); + let metadata: IggyMetadata< + consensus::VsrConsensus>, + journal::prepare_journal::PrepareJournal, + IggySnapshot, + _, + > = IggyMetadata::new(None, None, None, mux, None); + let partitions = IggyPartitions::new( + ShardId::new(shard_id), + PartitionsConfig { + messages_required_to_save: 1, + size_of_messages_required_to_save: iggy_common::IggyByteSize::from(1024_u64), + enforce_fsync: false, + segment_size: config.system.segment.size, + }, + ); + let shards_table = PapayaShardsTable::new(); + let partition_consensus = PartitionConsensusConfig::new( + CLUSTER_ID, + shard::ReplicaTopology::new(0, 1), + Rc::clone(&bus), + ); + let shard = TestShard::without_inbox( + ShardIdentity::new(shard_id, format!("test-shard-{shard_id}")), + Rc::clone(&bus), + metadata, + partitions, + shards_table, + partition_consensus, + ); + Rc::new(shard) + } + + fn make_ctx( + shard: Rc, + total_shards: u16, + config: Rc, + ) -> Rc { + Rc::new(ReconcilerCtx::new( + shard, + total_shards, + config, + CLUSTER_ID, + 0, + 1, + )) + } + + /// Tests run reconcile + pump-side apply inline since no real pump exists. + async fn reconcile_pass(ctx: &ReconcilerCtx) { + reconcile_once(ctx).await; + ctx.shard.apply_reconcile_ops(); + } + + /// Single-shard scenario: every committed partition is owned locally. + /// After one reconcile pass every namespace must be materialised in + /// `partitions` and addressable through `shards_table`. Disk + /// hierarchy is created under the tempdir's system path; idempotent + /// retries are exercised by a second pass. + #[compio::test] + async fn reconcile_materialises_owned_partitions_single_shard() { + let tmp = TempDir::new().expect("tempdir for system path"); + let config = test_config(&tmp); + let mux = TestMux::default(); + seed_stream(&mux, 1, "stream-a"); + seed_topic( + &mux, + 2, + 0, + "topic-a", + vec![assignment(0, 1), assignment(1, 2), assignment(2, 3)], + ); + + let shard = build_test_shard(0, &config, mux); + let ctx = make_ctx(Rc::clone(&shard), 1, Rc::new(config)); + + reconcile_pass(&ctx).await; + + let partitions = shard.plane.partitions(); + let shards_table = shard.shards_table(); + for partition_id in 0..3 { + let ns = IggyNamespace::new(0, 0, partition_id); + assert!( + partitions.contains(&ns), + "namespace {ns:?} must be materialised on its owning shard" + ); + assert_eq!( + shards_table.shard_for(ns), + Some(0), + "shards_table must point at the owning shard" + ); + } + assert_eq!(partitions.len(), 3, "exactly three partitions materialised"); + + // Idempotency: a second pass with no new commits must not double- + // insert or re-create disk hierarchy. + reconcile_pass(&ctx).await; + assert_eq!( + partitions.len(), + 3, + "second pass over an unchanged target must be a no-op" + ); + } + + /// Regression (deferred-apply window): the reconciler stages + /// `ReconcileOp::InsertOwned` from a task separate from the pump that + /// applies it, so under a commit burst it can run a second pass before + /// the pump drains the first pass's staged ops. Both passes then + /// observe `!contains(ns)` and build the same namespace. The pump's + /// apply must be idempotent, else the second `insert` orphans the first + /// partition (leaked VSR group + writers) and inflates `len`. + /// `reconcile_pass` applies inline and cannot surface this, so here we + /// run two passes and only then drain once. + #[compio::test] + async fn deferred_apply_window_does_not_duplicate_owned_partition() { + let tmp = TempDir::new().expect("tempdir for system path"); + let config = test_config(&tmp); + let mux = TestMux::default(); + seed_stream(&mux, 1, "stream-a"); + seed_topic( + &mux, + 2, + 0, + "topic-a", + vec![assignment(0, 1), assignment(1, 2)], + ); + + let shard = build_test_shard(0, &config, mux); + let ctx = make_ctx(Rc::clone(&shard), 1, Rc::new(config)); + + // Two passes with no pump drain in between: models the reconciler, + // woken by a second commit tick, running pass N+1 before the pump + // applies pass N's `InsertOwned`. Both passes see the namespaces as + // unmaterialised and stage a build for each, so the queue holds two + // `InsertOwned` per namespace when the pump finally drains. + reconcile_once(&ctx).await; + reconcile_once(&ctx).await; + + ctx.shard.apply_reconcile_ops(); + + let partitions = shard.plane.partitions(); + assert_eq!( + partitions.len(), + 2, + "deferred-apply window must not duplicate partitions: \ + each namespace materialises exactly once" + ); + for partition_id in 0..2 { + let ns = IggyNamespace::new(0, 0, partition_id); + assert!( + partitions.contains(&ns), + "namespace {ns:?} must be addressable exactly once" + ); + assert_eq!( + shard.shards_table().shard_for(ns), + Some(0), + "shards_table must point at the owning shard" + ); + } + } + + /// Multi-shard scenario: only the partition whose hash maps to + /// `self.shard_id` is materialised; every other namespace gets a + /// `shards_table` row pointing at the owning shard but no + /// `IggyPartition` instance. + #[compio::test] + async fn reconcile_only_materialises_namespaces_owned_by_self() { + let tmp = TempDir::new().expect("tempdir for system path"); + let config = test_config(&tmp); + let total_shards: u16 = 4; + + // Pick a partition count where the murmur3 distribution lands + // entries on at least two distinct shards out of four, then + // run the test against the most-loaded shard. This makes the + // assertion "self_owned > 0 && routed_only > 0" structural + // rather than dependent on a fixed shard_id matching the + // arbitrary hash output. + let partition_count: u32 = 16; + let mut counts: std::collections::HashMap = std::collections::HashMap::new(); + for partition_id in 0..partition_count { + let ns = IggyNamespace::new(0, 0, partition_id as usize); + *counts + .entry(calculate_shard_assignment(&ns, u32::from(total_shards))) + .or_insert(0) += 1; + } + let (shard_id, _) = counts + .iter() + .max_by_key(|(_, count)| *count) + .map(|(s, c)| (*s, *c)) + .expect("hash distribution must populate at least one shard"); + assert!( + counts.len() >= 2, + "test partition count must yield a multi-shard distribution; got {counts:?}" + ); + + let mux = TestMux::default(); + seed_stream(&mux, 1, "stream-shard-aware"); + let assignments: Vec = (0..partition_count) + .map(|partition_id| assignment(partition_id, u64::from(partition_id) + 10)) + .collect(); + seed_topic(&mux, 2, 0, "topic-shard-aware", assignments); + + let shard = build_test_shard(shard_id, &config, mux); + let ctx = make_ctx(Rc::clone(&shard), total_shards, Rc::new(config)); + + reconcile_pass(&ctx).await; + + let partitions = shard.plane.partitions(); + let shards_table = shard.shards_table(); + let mut owned = 0usize; + let mut routed_only = 0usize; + for partition_id in 0..partition_count { + let ns = IggyNamespace::new(0, 0, partition_id as usize); + let expected_owner = calculate_shard_assignment(&ns, u32::from(total_shards)); + if expected_owner == shard_id { + assert!( + partitions.contains(&ns), + "namespace {ns:?} owned by self must be materialised" + ); + owned += 1; + } else { + assert!( + !partitions.contains(&ns), + "namespace {ns:?} owned by shard {expected_owner} \ + must NOT be materialised on shard {shard_id}" + ); + routed_only += 1; + } + assert_eq!( + shards_table.shard_for(ns), + Some(expected_owner), + "shards_table must always resolve the owning shard" + ); + } + assert_eq!( + partitions.len(), + owned, + "IggyPartitions size must match the count of self-owned namespaces" + ); + assert!( + owned > 0, + "test must run on a shard that owns ≥ 1 partition" + ); + assert!( + routed_only > 0, + "test must run with ≥ 1 partition owned by another shard" + ); + } + + /// `CreatePartitions` on an existing topic adds new namespaces; the + /// reconciler picks them up on the next pass without touching the + /// partitions it already materialised. + #[compio::test] + async fn reconcile_picks_up_create_partitions_increments() { + let tmp = TempDir::new().expect("tempdir for system path"); + let config = test_config(&tmp); + let mux = TestMux::default(); + seed_stream(&mux, 1, "stream-b"); + seed_topic( + &mux, + 2, + 0, + "topic-b", + vec![assignment(0, 1), assignment(1, 2)], + ); + + let shard = build_test_shard(0, &config, mux); + let ctx = make_ctx(Rc::clone(&shard), 1, Rc::new(config)); + + reconcile_pass(&ctx).await; + assert_eq!(shard.plane.partitions().len(), 2); + + // Now commit two additional partitions on the same topic. + // `CreatePartitionsWithAssignments` applies request-relative + // offsets, so partition_id=0,1 below resolve to absolute ids + // 2,3 once the STM adds the base offset. + shard + .plane + .metadata() + .mux_stm + .update(build_prepare( + 3, + Operation::CreatePartitionsWithAssignments, + &CreatePartitionsWithAssignmentsRequest { + request: CreatePartitionsRequest { + stream_id: WireIdentifier::numeric(0), + topic_id: WireIdentifier::numeric(0), + partitions_count: 2, + }, + partitions: vec![assignment(0, 3), assignment(1, 4)], + }, + )) + .expect("CreatePartitions apply succeeds"); + + reconcile_pass(&ctx).await; + assert_eq!( + shard.plane.partitions().len(), + 4, + "reconciler must materialise the two new partitions" + ); + for partition_id in 0..4 { + let ns = IggyNamespace::new(0, 0, partition_id); + assert!( + shard.plane.partitions().contains(&ns), + "namespace {ns:?} must be materialised after CreatePartitions" + ); + } + } + + /// `DeleteTopic` removes every partition under the topic on the next + /// reconcile pass: owning shard drops the `IggyPartition`, every + /// shard prunes its `shards_table` row, and the on-disk hierarchy + /// is removed. + #[compio::test] + async fn reconcile_removes_partitions_on_delete_topic() { + let tmp = TempDir::new().expect("tempdir for system path"); + let config = test_config(&tmp); + let mux = TestMux::default(); + seed_stream(&mux, 1, "stream-c"); + seed_topic( + &mux, + 2, + 0, + "topic-c", + vec![assignment(0, 1), assignment(1, 2)], + ); + + let shard = build_test_shard(0, &config, mux); + let ctx = make_ctx(Rc::clone(&shard), 1, Rc::new(config)); + + reconcile_pass(&ctx).await; + // Verify disk hierarchy exists before the delete commits. + let partition_root_before = ctx.config.system.get_partition_path(0, 0, 0); + assert!( + std::path::Path::new(&partition_root_before).exists(), + "partition directory must exist post-materialisation" + ); + + seed_delete_topic(&shard.plane.metadata().mux_stm, 3, 0, 0); + reconcile_pass(&ctx).await; + + assert_eq!( + shard.plane.partitions().len(), + 0, + "DeleteTopic must drop every partition under it" + ); + for partition_id in 0..2 { + let ns = IggyNamespace::new(0, 0, partition_id); + assert!( + !shard.plane.partitions().contains(&ns), + "namespace {ns:?} must be removed from IggyPartitions" + ); + assert_eq!( + shard.shards_table().shard_for(ns), + None, + "shards_table row must be pruned for {ns:?}" + ); + let path = ctx.config.system.get_partition_path( + ns.stream_id(), + ns.topic_id(), + ns.partition_id(), + ); + assert!( + !std::path::Path::new(&path).exists(), + "on-disk hierarchy for {ns:?} must be removed" + ); + } + } + + /// `DeleteStream` removes everything beneath it in one shot: every + /// topic, every partition, every routing row. + #[compio::test] + async fn reconcile_removes_partitions_on_delete_stream() { + let tmp = TempDir::new().expect("tempdir for system path"); + let config = test_config(&tmp); + let mux = TestMux::default(); + seed_stream(&mux, 1, "stream-d"); + seed_topic( + &mux, + 2, + 0, + "topic-d1", + vec![assignment(0, 1), assignment(1, 2)], + ); + seed_topic(&mux, 3, 0, "topic-d2", vec![assignment(0, 3)]); + + let shard = build_test_shard(0, &config, mux); + let ctx = make_ctx(Rc::clone(&shard), 1, Rc::new(config)); + + reconcile_pass(&ctx).await; + assert_eq!( + shard.plane.partitions().len(), + 3, + "two topics × (2+1) partitions must materialise before delete" + ); + + seed_delete_stream(&shard.plane.metadata().mux_stm, 4, 0); + reconcile_pass(&ctx).await; + assert_eq!( + shard.plane.partitions().len(), + 0, + "DeleteStream must remove every partition transitively" + ); + assert!( + shard.shards_table().namespaces().is_empty(), + "shards_table must be empty after DeleteStream" + ); + } + + /// A delete+recreate of the same (stream, topic, partition) tuple + /// reuses the freed slab key, so the namespace is byte-identical but + /// its committed `created_revision` is greater. The reconciler must + /// notice the stale local partition (old segments / offsets / log), + /// tear it down, and rebuild fresh rather than keep serving the prior + /// incarnation under the recycled identity. + #[compio::test] + async fn reconcile_rebuilds_stale_partition_after_slab_key_reuse() { + let tmp = TempDir::new().expect("tempdir for system path"); + let config = test_config(&tmp); + let mux = TestMux::default(); + seed_stream(&mux, 1, "stream-reuse"); + seed_topic(&mux, 2, 0, "topic-reuse", vec![assignment(0, 1)]); + + let shard = build_test_shard(0, &config, mux); + let ctx = make_ctx(Rc::clone(&shard), 1, Rc::new(config)); + + reconcile_pass(&ctx).await; + let ns = IggyNamespace::new(0, 0, 0); + assert!(shard.plane.partitions().contains(&ns)); + let epoch_before = shard + .shards_table() + .epoch_for(ns) + .expect("materialised row carries an epoch"); + + // Delete then recreate the SAME tuple. The STM frees + reuses + // topic slab key 0, so `ns` is identical but `created_revision` + // is strictly greater. The reconciler never ran between the two + // commits, so the stale partition is still materialised here. + seed_delete_topic(&shard.plane.metadata().mux_stm, 3, 0, 0); + seed_topic( + &shard.plane.metadata().mux_stm, + 4, + 0, + "topic-reuse", + vec![assignment(0, 1)], + ); + + // Pass 1: detect the stale incarnation and tear it down. The + // absent partition afterwards proves the old one was dropped, not + // merely left in place. + reconcile_pass(&ctx).await; + assert!( + !shard.plane.partitions().contains(&ns), + "stale partition must be torn down before rebuild" + ); + + // Pass 2: rebuild fresh at the new epoch. + reconcile_pass(&ctx).await; + assert!( + shard.plane.partitions().contains(&ns), + "fresh partition must materialise after the teardown" + ); + let epoch_after = shard.shards_table().epoch_for(ns); + assert!(epoch_after.is_some(), "rebuilt row must carry an epoch"); + assert_ne!( + epoch_after, + Some(epoch_before), + "rebuilt row must carry a new epoch, proving the stale partition was replaced" + ); + } + + /// Once converged, a pass with an unchanged + /// `Streams::revision` fast-skips the O(N) diff instead of re-scanning + /// every committed namespace every periodic tick. A fresh + /// partition-shaping commit bumps the revision and defeats the skip. + #[compio::test] + async fn reconcile_fast_skips_when_revision_unchanged() { + let tmp = TempDir::new().expect("tempdir for system path"); + let config = test_config(&tmp); + let mux = TestMux::default(); + seed_stream(&mux, 1, "stream-skip"); + seed_topic(&mux, 2, 0, "topic-skip", vec![assignment(0, 1)]); + + let shard = build_test_shard(0, &config, mux); + let ctx = make_ctx(Rc::clone(&shard), 1, Rc::new(config)); + + // First pass materialises (work); the verify pass that follows a + // working pass does nothing and arms the fast-skip. + assert!(reconcile_once(&ctx).await, "first pass must run"); + ctx.shard.apply_reconcile_ops(); + assert!( + reconcile_once(&ctx).await, + "the verify pass after a working pass must still run" + ); + ctx.shard.apply_reconcile_ops(); + + // No commit since: revision unchanged + last pass a no-op → skip. + assert!( + !reconcile_once(&ctx).await, + "unchanged revision after convergence must fast-skip the diff" + ); + + // A new partition-shaping commit bumps the revision → next pass runs. + seed_topic( + &shard.plane.metadata().mux_stm, + 3, + 0, + "topic-skip-2", + vec![assignment(0, 2)], + ); + assert!( + reconcile_once(&ctx).await, + "a fresh commit must defeat the fast-skip" + ); + } + + /// Permanent-tombstone-wedge regression: a teardown whose disk delete + /// fails sets the tombstone and removes the `shards_table` row but never + /// enqueues `ConfirmRemove`, so the tombstone never lifts. If the same + /// `(stream, topic, partition)` is then recreated, `ns` is back in the + /// committed target: the additions pass used to see `contains + + /// is_tombstoned` and defer forever while the removals pass no longer + /// treated `ns` as a ghost, fencing the partition for good and dropping + /// every data-plane frame. The additions pass must instead notice the + /// recorded delete failure (no `ConfirmRemove` in flight) and re-drive + /// teardown, retrying the delete so the partition recovers. + #[compio::test] + async fn reconcile_recovers_permanently_wedged_tombstone() { + let tmp = TempDir::new().expect("tempdir for system path"); + let config = test_config(&tmp); + let mux = TestMux::default(); + seed_stream(&mux, 1, "stream-wedge"); + seed_topic(&mux, 2, 0, "topic-wedge", vec![assignment(0, 1)]); + + let shard = build_test_shard(0, &config, mux); + let ctx = make_ctx(Rc::clone(&shard), 1, Rc::new(config)); + + reconcile_pass(&ctx).await; + let ns = IggyNamespace::new(0, 0, 0); + let partitions = shard.plane.partitions(); + assert!(partitions.contains(&ns)); + let partition_root = ctx.config.system.get_partition_path(0, 0, 0); + assert!(std::path::Path::new(&partition_root).exists()); + + // Reconstruct the post-failed-teardown state: tombstone set + + // shards_table row gone + a `FailureCause::Delete` record, but the + // partition still in the map and its directory still on disk (the + // disk delete "failed"). `ns` is still in the committed target, so + // this is the recreate-after-failed-delete shape. The injected + // record's `next_retry_at` is captured now, so it is already due by + // the time teardown checks the backoff (the monotonic clock only + // advances). + partitions.tombstone(ns); + shard.shards_table().remove(&ns); + ctx.failure_state.borrow_mut().insert( + (ns, FailureCause::Delete), + FailureRecord { + attempts: 1, + next_retry_at: Instant::now(), + }, + ); + + // Pass 1: additions must re-drive teardown (the delete now succeeds, + // the directory is present), enqueue `ConfirmRemove`, and the inline + // pump drops the partition + clears the tombstone. Without the fix + // this pass defers and leaves the partition tombstoned forever. + reconcile_pass(&ctx).await; + assert!( + !partitions.contains(&ns), + "re-driven teardown must drop the wedged partition" + ); + assert!( + !partitions.is_tombstoned(&ns), + "ConfirmRemove must clear the tombstone once the delete succeeds" + ); + assert!( + !std::path::Path::new(&partition_root).exists(), + "re-driven teardown must delete the on-disk hierarchy" + ); + + // Pass 2: with the tombstone cleared the partition rebuilds fresh + // and is addressable again. + reconcile_pass(&ctx).await; + assert!( + partitions.contains(&ns), + "partition must rebuild fresh after the wedge is cleared" + ); + assert!(!partitions.is_tombstoned(&ns)); + assert_eq!( + shard.shards_table().shard_for(ns), + Some(0), + "rebuilt partition must be addressable through shards_table" + ); + assert!( + std::path::Path::new(&partition_root).exists(), + "rebuilt partition must recreate its on-disk hierarchy" + ); + } + + /// The wedge fix must not break the legitimate defer: when teardown's + /// disk delete SUCCEEDED a `ConfirmRemove` is in flight, so the + /// additions pass must still defer the rebuild to the post-drain wake + /// rather than re-driving teardown. The absence of a + /// `FailureCause::Delete` record is exactly what separates this from the + /// wedge, so none is injected here. + #[compio::test] + async fn reconcile_defers_rebuild_while_confirm_remove_in_flight() { + let tmp = TempDir::new().expect("tempdir for system path"); + let config = test_config(&tmp); + let mux = TestMux::default(); + seed_stream(&mux, 1, "stream-defer"); + seed_topic(&mux, 2, 0, "topic-defer", vec![assignment(0, 1)]); + + let shard = build_test_shard(0, &config, mux); + let ctx = make_ctx(Rc::clone(&shard), 1, Rc::new(config)); + + reconcile_pass(&ctx).await; + let ns = IggyNamespace::new(0, 0, 0); + let partitions = shard.plane.partitions(); + assert!(partitions.contains(&ns)); + let partition_root = ctx.config.system.get_partition_path(0, 0, 0); + + // Post-successful-teardown, pre-drain state: tombstone set + + // shards_table row gone, NO delete failure (the disk delete + // succeeded and a `ConfirmRemove` is queued). The partition is left + // in the map to model the not-yet-drained pump queue. + partitions.tombstone(ns); + shard.shards_table().remove(&ns); + + // A pass with no inline drain must defer: the partition stays in the + // map, stays tombstoned, and its directory is untouched (teardown + // was NOT re-driven). + reconcile_once(&ctx).await; + assert!( + partitions.contains(&ns), + "defer must leave the partition in the map" + ); + assert!( + partitions.is_tombstoned(&ns), + "defer must not clear the tombstone" + ); + assert!( + std::path::Path::new(&partition_root).exists(), + "defer must not re-drive teardown: the directory must remain" + ); + } +} diff --git a/core/server-ng/tests/sdk_e2e.rs b/core/server-ng/tests/sdk_e2e.rs new file mode 100644 index 0000000000..e7f5fccf37 --- /dev/null +++ b/core/server-ng/tests/sdk_e2e.rs @@ -0,0 +1,335 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! End-to-end smoke test for the partition reconciliation loop. +//! +//! Spawns the `iggy-server-ng` binary with an isolated tempdir + ephemeral +//! TCP port, drives an SDK client through the metadata commit path +//! (`Login` → `CreateStream` → `CreateTopic` with N partitions), then +//! verifies the per-partition on-disk hierarchy exists. That hierarchy is +//! created only by +//! [`server_ng::partition_helpers::create_partition_file_hierarchy`], +//! inside the reconciler's +//! [`build_partition_fresh`](server_ng::partition_helpers::build_partition_fresh) +//! path, so a materialisation observable from outside the process proves +//! the full commit-notifier → reconciler → disk pipeline. +//! +//! Deliberately stops short of `SendMessages` / `PollMessages`: the +//! partition data-plane SDK round-trip in server-ng is still evolving, so +//! a smoke test failing for unrelated data-plane reasons would be noisy. +//! The on-disk hierarchy assertion proves the reconciler ran without +//! depending on the still-evolving wire surface. + +use assert_cmd::prelude::CommandCargoExt; +use iggy::prelude::{ + AutoLogin, Client, CompressionAlgorithm, IggyByteSize, IggyClient, IggyClientBuilder, + IggyDuration, IggyExpiry, MaxTopicSize, StreamClient, TopicClient, UserClient, +}; +use std::net::{SocketAddr, TcpStream as StdTcpStream}; +use std::path::{Path, PathBuf}; +use std::process::{Child, Command, Stdio}; +use std::thread::sleep; +use std::time::{Duration, Instant}; +use tempfile::TempDir; +use toml::Value; + +/// `iggy-server-ng` bootstrap can take ~5s on cold caches before the TCP +/// listener binds and `current_config.toml` is written; 30s is generous +/// enough for slow CI runners without hanging the suite indefinitely. +const STARTUP_TIMEOUT: Duration = Duration::from_secs(30); + +/// Cadence for polling `current_config.toml` and the partition +/// directories. Small enough to keep observable latency under a second +/// without wasting CPU. +const POLL_INTERVAL: Duration = Duration::from_millis(50); + +/// Spawned binary handle. Drops on test exit kill the child + wait so +/// the test does not leave zombie processes if it panics mid-assertion. +struct TestServer { + child: Child, + tcp_addr: SocketAddr, + data_dir: TempDir, +} + +impl TestServer { + fn start() -> Self { + let data_dir = TempDir::new().expect("tempdir for system.path"); + let mut cmd = Command::cargo_bin("iggy-server-ng") + .expect("iggy-server-ng binary must be built by the test runner"); + cmd.env("IGGY_SYSTEM_PATH", data_dir.path()) + // Ephemeral port; the actual bound port is read back from + // `runtime/current_config.toml` after the listener binds. + .env("IGGY_TCP_ADDRESS", "127.0.0.1:0") + // Disable every other transport so the test does not race + // unrelated listeners (HTTP UI bind, QUIC self-signed cert + // material, WebSocket handshake, etc.). + .env("IGGY_HTTP_ENABLED", "false") + .env("IGGY_QUIC_ENABLED", "false") + .env("IGGY_WEBSOCKET_ENABLED", "false") + // Tighten the reconciler tick so the post-CreateTopic + // materialisation window stays small under CI latency + // jitter. Hard-coded 200ms is still way above the per-tick + // re-read cost yet small enough to bound test wall-time. + .env("IGGY_SYSTEM_SHARDING_RECONCILE_PERIODIC_INTERVAL", "200 ms") + .stdout(Stdio::null()) + .stderr(Stdio::piped()); + + let mut child = cmd + .spawn() + .expect("iggy-server-ng must spawn under cargo test"); + + let runtime_path = data_dir.path().join("runtime").join("current_config.toml"); + let tcp_addr = match wait_for_bound_tcp(&runtime_path, STARTUP_TIMEOUT) { + Ok(addr) => addr, + Err(err) => { + // Reap the spawn-failed child so the test does not + // leave a zombie behind; the tempdir drops naturally + // when `data_dir` goes out of scope at the panic. + let _ = child.kill(); + let _ = child.wait(); + panic!("server-ng startup failed: {err}"); + } + }; + + Self { + child, + tcp_addr, + data_dir, + } + } + + fn data_path(&self) -> &Path { + self.data_dir.path() + } +} + +impl std::fmt::Debug for TestServer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TestServer") + .field("tcp_addr", &self.tcp_addr) + .field("data_dir", &self.data_dir.path()) + .field("child_pid", &self.child.id()) + .finish() + } +} + +impl Drop for TestServer { + fn drop(&mut self) { + // Best-effort: SIGKILL the child, drain its exit status so the + // OS reclaims the PID, then drop the tempdir. + let _ = self.child.kill(); + let _ = self.child.wait(); + } +} + +/// Poll `runtime/current_config.toml` until it exists and reports a +/// non-zero bound TCP port. Returns the resolved [`SocketAddr`]. +fn wait_for_bound_tcp(config_path: &Path, timeout: Duration) -> Result { + let deadline = Instant::now() + timeout; + let mut last_err = String::from("config file never appeared"); + while Instant::now() < deadline { + match read_bound_tcp(config_path) { + Ok(addr) => { + // Confirm the port is reachable before handing it back. + // The TOML write races slightly ahead of `bind()` + // becoming `connect()`-able on some kernels. + if StdTcpStream::connect_timeout(&addr, Duration::from_millis(100)).is_ok() { + return Ok(addr); + } + last_err = format!("bound TCP {addr} not connectable yet"); + } + Err(err) => last_err = err, + } + sleep(POLL_INTERVAL); + } + Err(format!( + "timed out after {timeout:?} waiting for {}: {last_err}", + config_path.display() + )) +} + +fn read_bound_tcp(config_path: &Path) -> Result { + let raw = std::fs::read_to_string(config_path) + .map_err(|e| format!("read {}: {e}", config_path.display()))?; + let parsed: Value = + toml::from_str(&raw).map_err(|e| format!("parse {}: {e}", config_path.display()))?; + let address = parsed + .get("tcp") + .and_then(|t| t.get("address")) + .and_then(Value::as_str) + .ok_or_else(|| format!("{} missing tcp.address", config_path.display()))?; + let addr: SocketAddr = address + .parse() + .map_err(|e| format!("invalid tcp.address {address:?}: {e}"))?; + if addr.port() == 0 { + return Err(format!("tcp.address {addr} still on port 0")); + } + Ok(addr) +} + +/// Wait until every per-partition directory has been created by the +/// reconciler. Returns `Ok(())` once all expected paths exist; surfaces +/// a structured error on timeout listing which paths were missing. +fn wait_for_partition_dirs( + data_dir: &Path, + stream_id: u32, + topic_id: u32, + partition_count: u32, + timeout: Duration, +) -> Result<(), String> { + let expected: Vec = (0..partition_count) + .map(|partition_id| partition_dir(data_dir, stream_id, topic_id, partition_id)) + .collect(); + let deadline = Instant::now() + timeout; + loop { + let missing: Vec = expected.iter().filter(|p| !p.exists()).cloned().collect(); + if missing.is_empty() { + return Ok(()); + } + if Instant::now() >= deadline { + return Err(format!( + "timed out after {timeout:?}; reconciler did not materialise: {missing:?}" + )); + } + sleep(POLL_INTERVAL); + } +} + +fn partition_dir(data_dir: &Path, stream_id: u32, topic_id: u32, partition_id: u32) -> PathBuf { + data_dir + .join("streams") + .join(stream_id.to_string()) + .join("topics") + .join(topic_id.to_string()) + .join("partitions") + .join(partition_id.to_string()) +} + +async fn connected_client(server_addr: SocketAddr) -> IggyClient { + let client = IggyClientBuilder::new() + .with_tcp() + .with_server_address(server_addr.to_string()) + .with_auto_sign_in(AutoLogin::Disabled) + .with_reconnection_max_retries(Some(3)) + .with_reconnection_interval(IggyDuration::from(200_000_u64)) + .build() + .expect("TCP client build"); + client.connect().await.expect("SDK TCP connect"); + client + .login_user("iggy", "iggy") + .await + .expect("SDK login as root"); + client +} + +/// Spawn `iggy-server-ng` and verify the production binary bootstraps +/// cleanly. Proves the test infrastructure (binary path, env-var +/// overrides, runtime config discovery) is wired correctly so the +/// `#[ignore]`d full end-to-end test below can be flipped on the +/// moment the wire bridge lands. +/// +/// Specifically asserts: +/// * The binary path is reachable via `cargo_bin`. +/// * Env overrides for `IGGY_TCP_ADDRESS=127.0.0.1:0` cause the +/// server to bind an OS-assigned port (`port != 0` in +/// `current_config.toml`). +/// * The bound port is `connect()`-able from this test process. +#[tokio::test(flavor = "current_thread")] +async fn server_ng_bootstraps_and_binds_ephemeral_tcp_port() { + let server = TestServer::start(); + assert_ne!( + server.tcp_addr.port(), + 0, + "TCP listener must bind an OS-assigned port; got {}", + server.tcp_addr + ); + // A successful `wait_for_bound_tcp` already proved the port is + // `connect()`-able once; doing it again here is a sanity check that + // the listener stays bound for the lifetime of the test harness. + let _ = StdTcpStream::connect_timeout(&server.tcp_addr, Duration::from_secs(1)) + .expect("server-ng TCP listener must accept connections post-bootstrap"); +} + +/// End-to-end smoke test against a live `iggy-server-ng` binary. +/// +/// Currently gated `#[ignore]` because server-ng's client-facing wire +/// surface is not yet SDK-compatible: the SDK's TCP framing (length- +/// prefixed binary, see `core/sdk/src/tcp/tcp_client.rs`) does not match +/// the server-ng client listener which speaks the consensus +/// [`RequestHeader`] / [`PrepareHeader`] frame layout. The TCP socket +/// accepts, but `login_user` either hangs on the framing read or +/// retries until the SDK surfaces +/// [`iggy_common::IggyError::CannotEstablishConnection`]. +/// +/// The structural pieces this test exercises are still useful as +/// infrastructure once the wire bridge lands (tracked as a follow-up): +/// * tempdir + ephemeral-port spawn of the production binary +/// * `runtime/current_config.toml` port discovery +/// * SDK-driven `CreateStream` + `CreateTopic` round-trip +/// * on-disk hierarchy assertion proving the reconciler +/// materialised every partition end-to-end +/// +/// Flip the `#[ignore]` to active once `iggy-server-ng` accepts SDK +/// frames; no other changes should be needed. +#[ignore = "server-ng client wire surface is not yet SDK-compatible"] +#[tokio::test(flavor = "current_thread")] +async fn reconciler_materialises_partitions_on_create_topic_e2e() { + let server = TestServer::start(); + let client = connected_client(server.tcp_addr).await; + + // CreateStream commits a `CreateStream` op; the metadata STM + // assigns slab key 0 (first stream on a fresh server). The notifier + // does NOT broadcast a `MetadataCommitTick` for plain stream commits + // (the reconciler only cares about partition-shaped ops) but the + // call validates the wire is reachable before we move on. + client + .create_stream("e2e-stream") + .await + .expect("create stream"); + + // CreateTopic commits as `CreateTopicWithAssignments` after the + // primary's allocator stamps consensus_group_id values; the + // notifier broadcasts `MetadataCommitTick`, the reconciler wakes, + // and `build_partition_fresh` calls + // `create_partition_file_hierarchy` for each assigned partition. + let partitions: u32 = 3; + client + .create_topic( + &"e2e-stream".try_into().expect("stream identifier"), + "e2e-topic", + partitions, + CompressionAlgorithm::default(), + None, + IggyExpiry::ServerDefault, + MaxTopicSize::Custom(IggyByteSize::from(1024_u64 * 1024 * 1024)), + ) + .await + .expect("create topic"); + + // Reconciler is asynchronous: wait for the on-disk hierarchy to + // appear. The 200ms periodic tick configured in `TestServer::start` + // bounds the worst-case wait to ~two ticks even if the post-commit + // wake-up frame got coalesced. + wait_for_partition_dirs( + server.data_path(), + /* stream_id */ 0, + /* topic_id */ 0, + partitions, + Duration::from_secs(5), + ) + .expect("reconciler materialised all partition directories"); +} diff --git a/core/server/src/main.rs b/core/server/src/main.rs index 9e7a9641f3..fc3dd4a4b4 100644 --- a/core/server/src/main.rs +++ b/core/server/src/main.rs @@ -47,7 +47,7 @@ use server::streaming::diagnostics::metrics::Metrics; use server::streaming::storage::SystemStorage; use server::streaming::utils::ptr::EternalPtr; use server_common::MemoryPool; -use server_common::sharding::{IggyNamespace, LocalIdx, PartitionLocation, ShardId}; +use server_common::sharding::{IggyNamespace, PartitionLocation, ShardId}; use std::panic::AssertUnwindSafe; use std::rc::Rc; use std::str::FromStr; @@ -343,8 +343,8 @@ fn main() -> Result<(), ServerError> { &ns, shard_assignment.len() as u32, )); - // TODO(hubcio): LocalIdx is 0 until IggyPartitions is integrated - let location = PartitionLocation::new(shard_id, LocalIdx::new(0)); + // epoch is reconciler-only; unused by legacy server. + let location = PartitionLocation::new(shard_id, 0); shards_table.insert(ns, location); } } diff --git a/core/server/src/shard/system/partitions.rs b/core/server/src/shard/system/partitions.rs index 4fb083d07d..519601bc1e 100644 --- a/core/server/src/shard/system/partitions.rs +++ b/core/server/src/shard/system/partitions.rs @@ -32,7 +32,7 @@ use iggy_common::Identifier; use iggy_common::IggyError; use iggy_common::IggyTimestamp; use server_common::sharding::IggyNamespace; -use server_common::sharding::{LocalIdx, PartitionLocation, ShardId}; +use server_common::sharding::{PartitionLocation, ShardId}; use std::sync::Arc; use std::time::Duration; use tracing::{info, warn}; @@ -102,9 +102,8 @@ impl IggyShard { let ns = IggyNamespace::new(stream, topic_id, partition_id); let shard_id = ShardId::new(calculate_shard_assignment(&ns, shards_count)); let is_current_shard = self.id == *shard_id; - // TODO(hubcio): LocalIdx(0) is wrong.. When IggyPartitions is integrated into - // IggyShard, this should use the actual index returned by IggyPartitions::insert(). - let location = PartitionLocation::new(shard_id, LocalIdx::new(0)); + // epoch is reconciler-only; unused by legacy server. + let location = PartitionLocation::new(shard_id, 0); self.insert_shard_table_record(ns, location); if is_current_shard { diff --git a/core/server_common/src/segment_storage/index_writer.rs b/core/server_common/src/segment_storage/index_writer.rs index c8a1c06a6b..59e11a4e27 100644 --- a/core/server_common/src/segment_storage/index_writer.rs +++ b/core/server_common/src/segment_storage/index_writer.rs @@ -48,6 +48,10 @@ impl IndexWriter { ) -> Result { let mut opts = OpenOptions::new(); opts.create(true).write(true); + // Mirror MessagesWriter: truncate on fresh-build retry. + if !file_exists { + opts.truncate(true); + } let file = opts .open(file_path) .await diff --git a/core/server_common/src/segment_storage/messages_writer.rs b/core/server_common/src/segment_storage/messages_writer.rs index 3749d99d47..214c719b0c 100644 --- a/core/server_common/src/segment_storage/messages_writer.rs +++ b/core/server_common/src/segment_storage/messages_writer.rs @@ -51,6 +51,11 @@ impl MessagesWriter { ) -> Result { let mut opts = OpenOptions::new(); opts.create(true).write(true); + // `file_exists = false` asserts a fresh start; truncate so a + // stale file from a partial prior attempt doesn't survive. + if !file_exists { + opts.truncate(true); + } let file = opts .open(file_path) .await diff --git a/core/server_common/src/sharding/partition_location.rs b/core/server_common/src/sharding/partition_location.rs index 2bd201f902..933d211575 100644 --- a/core/server_common/src/sharding/partition_location.rs +++ b/core/server_common/src/sharding/partition_location.rs @@ -15,20 +15,22 @@ // specific language governing permissions and limitations // under the License. -use super::{LocalIdx, ShardId}; +use super::ShardId; -/// Location of a partition: which shard owns it and its local index within that shard. +/// Which shard owns a partition, plus the committed-metadata epoch the row +/// was written at. #[derive(Debug, Clone, Copy)] pub struct PartitionLocation { pub shard_id: ShardId, - pub local_idx: LocalIdx, + /// `Partition::created_revision` from the committed metadata at the + /// time this row was written. The reconciler compares it against the + /// STM's current value to detect a stale local partition after a + /// delete+recreate reused the same slab-key namespace tuple. + pub epoch: u64, } impl PartitionLocation { - pub fn new(shard_id: ShardId, local_idx: LocalIdx) -> Self { - Self { - shard_id, - local_idx, - } + pub fn new(shard_id: ShardId, epoch: u64) -> Self { + Self { shard_id, epoch } } } diff --git a/core/shard/Cargo.toml b/core/shard/Cargo.toml index 9c315dfcf7..41ff8f2b0a 100644 --- a/core/shard/Cargo.toml +++ b/core/shard/Cargo.toml @@ -22,6 +22,14 @@ edition = "2024" license = "Apache-2.0" publish = false +[features] +# Simulator-only test hook (`IggyShard::init_partition`): bypasses the +# reconciler's `ReconcileOp::InsertOwned` funnel, mutating `IggyPartitions` +# off the pump task. A `-p iggy-server-ng` build excludes it; `cargo build +# --workspace` unifies features and compiles it into the shared `shard` +# unit (simulator requests it). Benign: no production caller. +simulator = [] + [dependencies] bytes = { workspace = true } compio = { workspace = true } diff --git a/core/shard/src/lib.rs b/core/shard/src/lib.rs index 0a7cc5c5b8..7c7f848b0e 100644 --- a/core/shard/src/lib.rs +++ b/core/shard/src/lib.rs @@ -24,14 +24,17 @@ pub mod shards_table; pub use config::CoordinatorConfig; +#[cfg(any(test, feature = "simulator"))] +use consensus::LocalPipeline; use consensus::{ - LocalPipeline, MetadataHandle, MuxPlane, PartitionsHandle, Pipeline, Plane, PlaneKind, - Sequencer, VsrAction, VsrConsensus, + MetadataHandle, MuxPlane, PartitionsHandle, Pipeline, Plane, PlaneKind, Sequencer, VsrAction, + VsrConsensus, }; use iggy_binary_protocol::{ Command2, CommitHeader, DoViewChangeHeader, GenericHeader, PrepareHeader, PrepareOkHeader, RequestHeader, StartViewChangeHeader, StartViewHeader, }; +#[cfg(any(test, feature = "simulator"))] use iggy_common::PartitionStats; use iggy_common::variadic; use journal::{Journal, JournalHandle}; @@ -44,10 +47,13 @@ use metadata::IggyMetadata; use metadata::impls::metadata::StreamsFrontend; use metadata::stm::StateMachine; use partitions::{IggyPartition, IggyPartitions}; -use server_common::sharding::IggyNamespace; +use server_common::sharding::{IggyNamespace, PartitionLocation, ShardId}; use server_common::{MESSAGE_ALIGN, Message, MessageBag, iobuf::Frozen}; use shards_table::ShardsTable; +use std::cell::RefCell; +use std::collections::VecDeque; use std::rc::Rc; +#[cfg(any(test, feature = "simulator"))] use std::sync::Arc; pub type ShardPlane = @@ -70,19 +76,43 @@ where B: MessageBus, { pub cluster_id: u128, + /// Cluster-wide VSR replica id; independent of `IggyShard::id`. + pub self_replica_id: u8, pub replica_count: u8, pub bus: B, } +/// Replica id + count bundle. +/// +/// Adjacent `u8` params (`self_replica_id`, `replica_count`) were a +/// silent-swap hazard at the call site; the named struct gives the type +/// system a chance to catch a misorder. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ReplicaTopology { + pub self_replica_id: u8, + pub replica_count: u8, +} + +impl ReplicaTopology { + #[must_use] + pub const fn new(self_replica_id: u8, replica_count: u8) -> Self { + Self { + self_replica_id, + replica_count, + } + } +} + impl PartitionConsensusConfig where B: MessageBus, { #[must_use] - pub const fn new(cluster_id: u128, replica_count: u8, bus: B) -> Self { + pub const fn new(cluster_id: u128, topology: ReplicaTopology, bus: B) -> Self { Self { cluster_id, - replica_count, + self_replica_id: topology.self_replica_id, + replica_count: topology.replica_count, bus, } } @@ -369,6 +399,48 @@ pub enum LifecycleFrame { ListClients { reply: Sender>, }, + /// Shard 0 broadcasts after a partition-shaped metadata commit; wakes + /// the per-shard reconciler. No payload: reconciler re-reads target + /// state. Drops covered by the periodic safety tick. + MetadataCommitTick, + /// Wake marker for the reconciler-to-pump funnel. Pump drains the + /// shard's `reconcile_queue` on receipt; tail drain on every frame + /// catches dropped markers. + ReconcileApply, +} + +/// Reconciler-staged partition mutation. +/// +/// Funnelling through the pump keeps `IggyPartitions` single-writer: +/// without it the cooperative `.await` scheduler would race +/// `insert` / `remove` against the pump's live `&mut IggyPartition` (UB). +pub enum ReconcileOp +where + B: MessageBus, +{ + /// Materialise an owned partition. Boxed to keep variants size-balanced + /// (`clippy::large_enum_variant`). `epoch` is the committed + /// `Partition::created_revision`, stored on the routing row so a later + /// reconcile pass can detect a slab-key-reused stale partition. + InsertOwned { + namespace: IggyNamespace, + partition: Box>, + epoch: u64, + }, + /// Seed a routing row for a partition owned by a peer shard. + InsertRouted { + namespace: IggyNamespace, + owner: ShardId, + epoch: u64, + }, + /// Final phase of teardown: drop the `IggyPartition` value and clear + /// the tombstone. The reconciler sets the tombstone + removes the + /// `shards_table` row synchronously *before* awaiting the disk delete + /// (writers are fenced via [`IggyPartitions::is_tombstoned`]), so by + /// the time this op runs the disk hierarchy is already gone. + ConfirmRemove { namespace: IggyNamespace }, + /// Drop a routing row (peer's partition gone from committed metadata). + RemoveRouted { namespace: IggyNamespace }, } /// Inter-shard channel envelope. @@ -475,6 +547,10 @@ where /// Partition namespace -> owning shard lookup. shards_table: T, + /// Stored for `init_partition` (simulator-only). Production materialises + /// VSR replicas through `partition_helpers::build_partition_fresh`, which + /// passes the topology + cluster id directly. + #[cfg_attr(not(any(test, feature = "simulator")), allow(dead_code))] partition_consensus: PartitionConsensusConfig, /// Shard 0 coordinator, supplied at construction. Holds round-robin @@ -485,11 +561,19 @@ where /// Per-shard observability counters. Cloned at metric increment sites, /// so cheap (`Arc` clone) regardless of label cardinality. metrics: crate::metrics::ShardMetrics, + + /// Late-bound `MetadataCommitTick` handler. `None` until reconciler + /// wires it; pre-wire ticks drop with a metric bump. + metadata_tick_handler: RefCell>>, + + /// Reconciler → pump funnel. Borrow discipline: every push / drain + /// runs without `.await` inside the borrow. + reconcile_queue: RefCell>>, } impl IggyShard where - B: MessageBus, + B: MessageBus + 'static, T: ShardsTable, { /// Create a new shard with channel links and a shards table. @@ -558,6 +642,8 @@ where partition_consensus, coordinator, metrics, + metadata_tick_handler: RefCell::new(None), + reconcile_queue: RefCell::new(VecDeque::new()), }) } @@ -705,6 +791,8 @@ where shards_table, partition_consensus, metrics: crate::metrics::ShardMetrics::for_shard(), + metadata_tick_handler: RefCell::new(None), + reconcile_queue: RefCell::new(VecDeque::new()), } } @@ -712,6 +800,140 @@ where pub const fn shards_table(&self) -> &T { &self.shards_table } + + #[must_use] + pub const fn metrics(&self) -> &crate::metrics::ShardMetrics { + &self.metrics + } + + /// `None` removes the handler; subsequent ticks drop with a metric bump. + pub fn set_metadata_tick_handler(&self, handler: Option>) { + *self.metadata_tick_handler.borrow_mut() = handler; + } + + /// Returns `true` if a handler ran. Pump bumps the drop metric on `false`. + pub fn dispatch_metadata_commit_tick(&self) -> bool { + self.signal_reconcile_wake() + } + + /// Internal: invoke the installed wake handler (same channel the + /// metadata commit tick uses). Called from `ConfirmRemove` so the + /// reconciler re-runs immediately after the pump drops a tombstoned + /// partition, tightening the delete-recreate-same-ns latency window + /// from one `reconcile_periodic_interval` to one pump-iter. + fn signal_reconcile_wake(&self) -> bool { + let handler = self.metadata_tick_handler.borrow().clone(); + handler.is_some_and(|handler| { + handler(); + true + }) + } + + /// Stage a partition mutation for the pump. + /// + /// Marker `try_send` is best-effort; the pump's tail drain on every + /// frame catches dropped markers, so the queue never strands ops. + pub fn enqueue_reconcile_op(&self, op: ReconcileOp) { + self.reconcile_queue.borrow_mut().push_back(op); + let Some(sender) = self.senders.get(self.id as usize) else { + return; + }; + let _ = sender.try_send(ShardFrame::lifecycle(LifecycleFrame::ReconcileApply)); + } + + /// Drain and apply staged [`ReconcileOp`]s on the pump task. + /// Synchronous: every arm is in-memory only. `ConfirmRemove`'s fsync + + /// blocking close is offloaded to a detached task so the pump doesn't + /// stall on bulk teardown. + pub fn apply_reconcile_ops(&self) + where + B: MessageBus + 'static, + { + let staged: Vec> = { + let mut q = self.reconcile_queue.borrow_mut(); + if q.is_empty() { + return; + } + q.drain(..).collect() + }; + let self_shard_id = self.id; + let partitions = self.plane.partitions(); + let mut confirmed_remove = false; + for op in staged { + match op { + ReconcileOp::InsertOwned { + namespace, + partition, + epoch, + } => { + // Idempotent apply, mirroring `ConfirmRemove` (idempotent + // via `remove`'s `None` early-return). The reconciler + // stages this from a task separate from the pump, so under + // a commit burst two passes can each observe + // `!contains(ns)` and build the same namespace before + // either drains here. A second unconditional `insert` + // would push a duplicate partition and overwrite the + // `ns -> idx` entry, orphaning the first (its VSR group + + // segment writers leak and `len` inflates). The discarded + // build is a fresh empty incarnation over the same on-disk + // path the kept one owns, so dropping it just closes a few + // fds. + if partitions.contains(&namespace) { + drop(partition); + continue; + } + partitions.insert(namespace, *partition); + self.shards_table.insert( + namespace, + PartitionLocation::new(ShardId::new(self_shard_id), epoch), + ); + self.metrics.record_partition_materialised(); + } + ReconcileOp::InsertRouted { + namespace, + owner, + epoch, + } => { + self.shards_table + .insert(namespace, PartitionLocation::new(owner, epoch)); + } + ReconcileOp::ConfirmRemove { namespace } => { + // Tombstone bit set + shards_table row removed synchronously + // by the reconciler before this op was enqueued, so no + // in-flight frame can reach the partition between `remove` + // and the drop here. Teardown already unlinked the on-disk + // hierarchy via `delete_partitions_from_disk`, so the + // partition drops inline: its compio file handles close + // through io_uring without blocking, and no fsync is wanted + // on data that is already gone. + let removed = partitions.remove(&namespace); + partitions.untombstone(&namespace); + self.metrics.record_partition_removed(); + confirmed_remove = true; + if removed.is_none() { + tracing::trace!( + shard = self_shard_id, + namespace_raw = namespace.inner(), + "ConfirmRemove with no in-memory partition (retry after disk-delete failure)" + ); + } + } + ReconcileOp::RemoveRouted { namespace } => { + self.shards_table.remove(&namespace); + } + } + } + + if confirmed_remove { + // Re-wake the reconciler once per drain batch so a delete→recreate + // of a namespace that landed in STM while the unlink was in-flight + // materialises within one pump-iter, not one + // `reconcile_periodic_interval`. The wake channel is capacity-1, so + // a per-op wake would coalesce anyway; firing once avoids K + // redundant handler borrows on a bulk DeleteStream. + self.signal_reconcile_wake(); + } + } } /// Local message processing — these methods handle messages that have been @@ -874,11 +1096,12 @@ where namespace_scratch.extend(planes.1.0.namespaces().copied()); for namespace in namespace_scratch.drain(..) { - let partition = planes - .1 - .0 - .get_by_ns(&namespace) - .expect("partition namespace must resolve during loopback drain"); + // `get_by_ns` returns `None` for tombstoned namespaces: skip + // draining their loopback queue so we don't surface PrepareOk + // frames targeting a partition the reconciler is tearing down. + let Some(partition) = planes.1.0.get_by_ns(&namespace) else { + continue; + }; partition.consensus().drain_loopback_into(buf); } let count = buf.len(); @@ -893,33 +1116,27 @@ where total } - /// Initializes a partition and its dedicated consensus instance on this shard. - /// - /// Used only by the `simulator` crate to seed partitions on the - /// in-process replica array. Production boots via the - /// `load_partition` helper in `server-ng` bootstrap, which takes the - /// cluster `self_replica_id` explicitly from `topology` and never - /// folds the local shard id into the consensus replica space. - /// - /// # Panics - /// Panics if `self.id > u8::MAX` (256+). The simulator currently - /// caps replica count at `MAX_REPLICAS = 256`, so the cast always - /// succeeds in the only caller; the constraint is recorded here so - /// any future caller in production code reads it before the cast. - pub fn init_partition(&mut self, namespace: IggyNamespace) + /// Simulator-only. Mutates `IggyPartitions` off the pump task, + /// bypassing the reconciler's `ReconcileOp::InsertOwned` funnel (the + /// production runtime path; bootstrap recovery uses `load_partition`), + /// so it must never run in production. VSR replica id comes from + /// `PartitionConsensusConfig`, not `self.id` (the local shard index). A + /// `-p iggy-server-ng` build excludes the `simulator` feature and this + /// method; `cargo build --workspace` compiles it in but with no + /// production caller. + #[cfg(any(test, feature = "simulator"))] + pub fn init_partition(&self, namespace: IggyNamespace) where B: MessageBus + Clone, { - let partitions = self.plane.partitions_mut(); + let partitions = self.plane.partitions(); if partitions.contains(&namespace) { return; } - let replica_id = - u8::try_from(self.id).expect("shard id must fit in u8 for partition consensus"); let consensus = VsrConsensus::new( self.partition_consensus.cluster_id, - replica_id, + self.partition_consensus.self_replica_id, self.partition_consensus.replica_count, namespace.inner(), self.partition_consensus.bus.clone(), diff --git a/core/shard/src/metrics.rs b/core/shard/src/metrics.rs index 6188fd485d..c1c45160e7 100644 --- a/core/shard/src/metrics.rs +++ b/core/shard/src/metrics.rs @@ -78,6 +78,7 @@ pub mod frame_drop_variant { pub const PARTITION: &str = "partition"; pub const FORWARD_CLIENT_SEND: &str = "forward_client_send"; pub const FORWARD_REPLICA_SEND: &str = "forward_replica_send"; + pub const METADATA_COMMIT_TICK: &str = "metadata_commit_tick"; } /// Reason labels used in `frame_drops_total`. @@ -97,7 +98,7 @@ pub mod frame_drop_reason { pub const MISROUTED: &str = "misrouted"; } -const VARIANT_COUNT: usize = 5; +const VARIANT_COUNT: usize = 6; const REASON_COUNT: usize = 5; const VARIANTS: [&str; VARIANT_COUNT] = [ @@ -106,6 +107,7 @@ const VARIANTS: [&str; VARIANT_COUNT] = [ frame_drop_variant::PARTITION, frame_drop_variant::FORWARD_CLIENT_SEND, frame_drop_variant::FORWARD_REPLICA_SEND, + frame_drop_variant::METADATA_COMMIT_TICK, ]; const REASONS: [&str; REASON_COUNT] = [ @@ -132,10 +134,18 @@ fn reason_index(s: &str) -> Option { /// at construction so the drop-site hot path never re-enters /// `Family::get_or_create` (which acquires a `RwLock` read guard per /// drop and stalls under VSR retransmit / drop-burst storms). +/// +/// `partitions_materialised_total` / `partitions_removed_total` / +/// `partitions_reconcile_failures_total` are simple unlabelled counters +/// bumped by the partition reconciliation loop; shard id is +/// resolved at scrape time via the per-shard registry, not as a label. #[derive(Clone)] pub struct ShardMetrics { frame_drops_total: Family, cached_counters: [[Counter; REASON_COUNT]; VARIANT_COUNT], + partitions_materialised_total: Counter, + partitions_removed_total: Counter, + partitions_reconcile_failures_total: Counter, } impl ShardMetrics { @@ -162,6 +172,9 @@ impl ShardMetrics { Self { frame_drops_total, cached_counters, + partitions_materialised_total: Counter::default(), + partitions_removed_total: Counter::default(), + partitions_reconcile_failures_total: Counter::default(), } } @@ -181,6 +194,50 @@ impl ShardMetrics { .inc(); } } + + /// Bumped on the owning shard each time the partition reconciliation + /// loop materialises a newly committed namespace via + /// `build_partition_fresh`. + pub fn record_partition_materialised(&self) { + self.partitions_materialised_total.inc(); + } + + /// Bumped on the owning shard each time the partition reconciliation + /// loop drops an `IggyPartition` whose namespace left the committed + /// metadata. + pub fn record_partition_removed(&self) { + self.partitions_removed_total.inc(); + } + + /// Bumped each time `build_partition_fresh` or + /// `delete_partitions_from_disk` returns `Err`. The reconciler retries + /// next tick, but a sustained climb surfaces a stuck partition (disk + /// full, permission denied, ENOENT on a path it cannot recreate, etc.). + pub fn record_partition_reconcile_failure(&self) { + self.partitions_reconcile_failures_total.inc(); + } + + /// Snapshot of `partitions_materialised_total`. Test-only accessor; + /// production scrape goes through the prometheus registry. + #[cfg(test)] + #[must_use] + pub fn partitions_materialised_value(&self) -> u64 { + self.partitions_materialised_total.get() + } + + /// Snapshot of `partitions_removed_total`. Test-only accessor. + #[cfg(test)] + #[must_use] + pub fn partitions_removed_value(&self) -> u64 { + self.partitions_removed_total.get() + } + + /// Snapshot of `partitions_reconcile_failures_total`. Test-only accessor. + #[cfg(test)] + #[must_use] + pub fn partitions_reconcile_failures_value(&self) -> u64 { + self.partitions_reconcile_failures_total.get() + } } #[cfg(test)] diff --git a/core/shard/src/router.rs b/core/shard/src/router.rs index 33bffc86c5..e4bf136d20 100644 --- a/core/shard/src/router.rs +++ b/core/shard/src/router.rs @@ -74,7 +74,7 @@ fn extract_routing(bag: MessageBag) -> (Operation, u64, Message) /// pump), preventing concurrent access from independent async tasks. impl IggyShard where - B: MessageBus + ConnectionInstaller, + B: MessageBus + ConnectionInstaller + 'static, T: ShardsTable, { /// Network-receive entry point. Classifies the raw @@ -233,7 +233,7 @@ where #[allow(clippy::future_not_send)] pub async fn run_message_pump(&self, stop: Receiver<()>) where - B: MessageBus, + B: MessageBus + 'static, MJ: JournalHandle, ::Target: Journal< ::Storage, @@ -259,6 +259,8 @@ where if self.accept_frame_for_self(&frame) { self.process_frame(frame).await; self.process_loopback(&mut loopback_buf, &mut namespace_scratch).await; + // Tail drain catches reconcile ops whose marker was dropped. + self.apply_reconcile_ops(); } } Err(_) => break, @@ -273,6 +275,7 @@ where self.process_frame(frame).await; self.process_loopback(&mut loopback_buf, &mut namespace_scratch) .await; + self.apply_reconcile_ops(); } } } @@ -313,7 +316,7 @@ where #[allow(clippy::future_not_send)] async fn process_frame(&self, frame: ShardFrame) where - B: MessageBus, + B: MessageBus + 'static, MJ: JournalHandle, ::Target: Journal< ::Storage, @@ -337,7 +340,7 @@ where #[allow(clippy::future_not_send)] async fn process_lifecycle(&self, payload: LifecycleFrame) where - B: MessageBus, + B: MessageBus + 'static, { match payload { LifecycleFrame::ReplicaConnectionSetup { fd, replica_id } => { @@ -418,6 +421,26 @@ where // and pushes the list over `reply`. (self.on_list_clients)(reply); } + LifecycleFrame::MetadataCommitTick => { + // Reconciler may not yet be wired (e.g. mid-bootstrap, or + // single-shard tests that never enable the reconciler loop). + // Count the drop so operators can detect a stuck handler + // slot; the reconciler also runs a periodic tick that + // recovers any missed wake-up. + if !self.dispatch_metadata_commit_tick() { + self.metrics.record_frame_drop( + frame_drop_variant::METADATA_COMMIT_TICK, + frame_drop_reason::UNROUTABLE, + ); + tracing::trace!( + shard = self.id, + "metadata commit tick received before reconciler handler installed; dropping" + ); + } + } + LifecycleFrame::ReconcileApply => { + self.apply_reconcile_ops(); + } } } } diff --git a/core/shard/src/shards_table.rs b/core/shard/src/shards_table.rs index b7c7204e3f..a6833a5fb9 100644 --- a/core/shard/src/shards_table.rs +++ b/core/shard/src/shards_table.rs @@ -30,6 +30,17 @@ pub trait ShardsTable { /// namespace is not yet registered (partition not created or update /// hasn't propagated). fn shard_for(&self, namespace: IggyNamespace) -> Option; + + /// Epoch (`Partition::created_revision`) stored on the row, if any. + /// The reconciler compares it to the committed value to detect a + /// stale local partition after slab-key reuse. + fn epoch_for(&self, namespace: IggyNamespace) -> Option; + + fn insert(&self, namespace: IggyNamespace, location: PartitionLocation); + + fn remove(&self, namespace: &IggyNamespace) -> Option; + + fn namespaces(&self) -> Vec; } /// Always-`None` impl. Satisfies the trait for test / simulator paths @@ -41,6 +52,20 @@ impl ShardsTable for () { fn shard_for(&self, _namespace: IggyNamespace) -> Option { None } + + fn epoch_for(&self, _namespace: IggyNamespace) -> Option { + None + } + + fn insert(&self, _namespace: IggyNamespace, _location: PartitionLocation) {} + + fn remove(&self, _namespace: &IggyNamespace) -> Option { + None + } + + fn namespaces(&self) -> Vec { + Vec::new() + } } /// Lock-free shards table backed by [`papaya::HashMap`]. @@ -68,21 +93,32 @@ impl PapayaShardsTable { inner: papaya::HashMap::with_capacity(capacity), } } +} - pub fn insert(&self, namespace: IggyNamespace, location: PartitionLocation) { +impl ShardsTable for PapayaShardsTable { + fn shard_for(&self, namespace: IggyNamespace) -> Option { + let guard = self.inner.guard(); + self.inner.get(&namespace, &guard).map(|loc| *loc.shard_id) + } + + fn epoch_for(&self, namespace: IggyNamespace) -> Option { + let guard = self.inner.guard(); + self.inner.get(&namespace, &guard).map(|loc| loc.epoch) + } + + fn insert(&self, namespace: IggyNamespace, location: PartitionLocation) { self.inner.pin().insert(namespace, location); } - pub fn remove(&self, namespace: &IggyNamespace) -> Option { + fn remove(&self, namespace: &IggyNamespace) -> Option { let guard = self.inner.guard(); self.inner.remove(namespace, &guard).copied() } -} -impl ShardsTable for PapayaShardsTable { - fn shard_for(&self, namespace: IggyNamespace) -> Option { + /// Snapshot: papaya guards cannot escape the call site. + fn namespaces(&self) -> Vec { let guard = self.inner.guard(); - self.inner.get(&namespace, &guard).map(|loc| *loc.shard_id) + self.inner.iter(&guard).map(|(ns, _)| *ns).collect() } } diff --git a/core/simulator/Cargo.toml b/core/simulator/Cargo.toml index 3cd47974a7..45b1c15608 100644 --- a/core/simulator/Cargo.toml +++ b/core/simulator/Cargo.toml @@ -38,7 +38,7 @@ partitions = { path = "../partitions" } rand = { workspace = true } rand_xoshiro = { workspace = true } server_common = { path = "../server_common" } -shard = { path = "../shard" } +shard = { path = "../shard", features = ["simulator"] } strum = { workspace = true } tracing = { workspace = true } diff --git a/core/simulator/src/bus.rs b/core/simulator/src/bus.rs index a6c65915d9..f0247e139d 100644 --- a/core/simulator/src/bus.rs +++ b/core/simulator/src/bus.rs @@ -108,6 +108,8 @@ impl SimOutbox { } impl MessageBus for SimOutbox { + fn track_background(&self, _handle: message_bus::JoinHandle<()>) {} + async fn send_to_client( &self, client_id: u128, @@ -163,6 +165,10 @@ impl Deref for SharedSimOutbox { } impl MessageBus for SharedSimOutbox { + fn track_background(&self, handle: message_bus::JoinHandle<()>) { + self.0.track_background(handle); + } + async fn send_to_client( &self, client_id: u128, diff --git a/core/simulator/src/replica.rs b/core/simulator/src/replica.rs index 399706d769..2dc02436be 100644 --- a/core/simulator/src/replica.rs +++ b/core/simulator/src/replica.rs @@ -88,7 +88,7 @@ pub fn new_replica(id: u8, name: String, bus: &Arc, replica_count: u8 (), shard::PartitionConsensusConfig::new( CLUSTER_ID, - replica_count, + shard::ReplicaTopology::new(id, replica_count), SharedSimOutbox(Arc::clone(bus)), ), )