From f1409c790ea81cafdb5cd090e302fb8fda6f8a0a Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Fri, 3 Jul 2026 12:55:29 -0500
Subject: [PATCH 1/6] test: pin sign candidate-generation contract ahead of
 tiled internals

Independent oracle (score_all + full lexicographic sort by hamming asc,
doc_id asc) pins top_m_candidates and top_m_candidates_batched_serial_csr
exactly: random corpora across block boundaries, massive-tie and
duplicate-run corpora exercising boundary tie-breaks, edge geometries
(m >= n, single doc, empty batch), and the dim=1024 shape. Must pass
bit-identically before and after the tiling swap.
---
 tests/tiled_candgen.rs | 157 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 tests/tiled_candgen.rs
diff --git a/tests/tiled_candgen.rs b/tests/tiled_candgen.rs
new file mode 100644
index 0000000..5a416f5
--- /dev/null
+++ b/tests/tiled_candgen.rs
@@ -0,0 +1,157 @@
+//! Contract-pinning tests for sign candidate generation, written ahead of the
+//! tiled internals swap of `top_m_candidates` /
+//! `top_m_candidates_batched_serial_csr`. The oracle is independent of the
+//! implementation under test: `score_all` (dense agreement counts) plus a
+//! full lexicographic sort by `(hamming asc, doc_id asc)`. These tests pin
+//! today's behavior exactly — including tie handling at the m-th position —
+//! and must pass bit-identically before and after the swap.
+
+use ordvec::SignBitmap;
+
+/// Deterministic xorshift so corpora are reproducible without a rand dep.
+struct XorShift(u64);
+
+impl XorShift {
+    fn next_f32(&mut self) -> f32 {
+        self.0 ^= self.0 << 13;
+        self.0 ^= self.0 >> 7;
+        self.0 ^= self.0 << 17;
+        // Map to [-1, 1) with plenty of sign variety.
+        ((self.0 >> 40) as f32 / 8_388_608.0) - 1.0
+    }
+}
+
+fn random_corpus(dim: usize, n: usize, seed: u64) -> Vec<f32> {
+    let mut rng = XorShift(seed | 1);
+    (0..n * dim).map(|_| rng.next_f32()).collect()
+}
+
+/// Tie-heavy corpus: every coordinate is +/-1 drawn from a tiny pattern set,
+/// so hamming distances collide massively and the (hamming, doc_id)
+/// tie-break does real work at the selection boundary.
+fn tie_heavy_corpus(dim: usize, n: usize) -> Vec<f32> {
+    (0..n)
+        .flat_map(|doc| {
+            let pattern = doc % 4;
+            (0..dim).map(move |c| {
+                if (c + pattern) % 3 == 0 {
+                    -1.0
+                } else {
+                    1.0
+                }
+            })
+        })
+        .collect()
+}
+
+fn oracle_top_m(sign: &SignBitmap, q: &[f32], m: usize) -> Vec<u32> {
+    let dim_u32 = u32::try_from(q.len()).unwrap();
+    // score_all returns agreement (dim - hamming), higher is better.
+    let agreements = sign.score_all(q);
+    let mut ids: Vec<u32> = (0..agreements.len() as u32).collect();
+    ids.sort_by_key(|&i| (dim_u32 - agreements[i as usize], i));
+    ids.truncate(m.min(agreements.len()));
+    ids
+}
+
+fn assert_contract(dim: usize, vectors: &[f32], queries: &[f32], m: usize, label: &str) {
+    let mut sign = SignBitmap::new(dim);
+    sign.add(vectors);
+    let nq = queries.len() / dim;
+
+    // Single-query path.
+    for qi in 0..nq {
+        let q = &queries[qi * dim..(qi + 1) * dim];
+        let got = sign.top_m_candidates(q, m);
+        let want = oracle_top_m(&sign, q, m);
+        assert_eq!(got, want, "{label}: single-query mismatch at query {qi}, m={m}");
+    }
+
+    // Batched serial CSR path: row qi must equal the single-query result.
+    let cb = sign.top_m_candidates_batched_serial_csr(queries, m);
+    assert_eq!(cb.offsets.len(), nq + 1, "{label}: CSR offsets length");
+    for qi in 0..nq {
+        let row = &cb.candidates[cb.offsets[qi]..cb.offsets[qi + 1]];
+        let want = oracle_top_m(&sign, &queries[qi * dim..(qi + 1) * dim], m);
+        assert_eq!(row, &want[..], "{label}: CSR row mismatch at query {qi}, m={m}");
+    }
+}
+
+/// Random corpus large enough to span many doc blocks under any plausible
+/// tile size, at a SIMD-friendly dim.
+#[test]
+fn random_corpus_matches_oracle_across_block_boundaries() {
+    let dim = 128;
+    let n = 10_240;
+    let vectors = random_corpus(dim, n, 0xC0FFEE);
+    let queries = random_corpus(dim, 33, 0xBEEF);
+    for m in [1, 7, 256, 500] {
+        assert_contract(dim, &vectors, &queries, m, "random");
+    }
+}
+
+/// Massive hamming ties: selection at the boundary is decided purely by
+/// doc_id ascending. This is the case a streaming collector most easily gets
+/// subtly wrong.
+#[test]
+fn tie_heavy_corpus_selects_lowest_doc_ids_at_boundary() {
+    let dim = 64;
+    let n = 4_096;
+    let vectors = tie_heavy_corpus(dim, n);
+    let queries = random_corpus(dim, 9, 0xABCD);
+    for m in [1, 3, 100, 1_000] {
+        assert_contract(dim, &vectors, &queries, m, "tie-heavy");
+    }
+}
+
+/// Exact duplicate documents: every duplicate group is a 512-doc tie run, so
+/// m = 10/100 cut inside a run and m = 513 is one more than a run length.
+#[test]
+fn duplicate_documents_tie_runs_longer_than_m() {
+    let dim = 64;
+    let base = random_corpus(dim, 8, 0x1234);
+    // 8 distinct vectors, each repeated 512 times => tie runs of 512.
+    let mut vectors = Vec::with_capacity(8 * 512 * dim);
+    for _ in 0..512 {
+        // Copies interleave: the docs of one duplicate group are 8 ids apart.
+        vectors.extend_from_slice(&base);
+    }
+    let queries = random_corpus(dim, 5, 0x9999);
+    for m in [10, 100, 513] {
+        assert_contract(dim, &vectors, &queries, m, "duplicates");
+    }
+}
+
+/// Edge geometry: m > n, m == n, m == 1, single doc, and nq == 0.
+#[test]
+fn edge_geometries_match_oracle() {
+    let dim = 64;
+    let vectors = random_corpus(dim, 17, 0x42);
+    let queries = random_corpus(dim, 3, 0x43);
+    for m in [17, 25, 1] {
+        assert_contract(dim, &vectors, &queries, m, "edge");
+    }
+
+    let single_doc = random_corpus(dim, 1, 0x77);
+    assert_contract(dim, &single_doc, &queries, 4, "single-doc");
+
+    // Empty query batch: CSR must be a single zero offset and no candidates.
+    let mut sign = SignBitmap::new(dim);
+    sign.add(&vectors);
+    let cb = sign.top_m_candidates_batched_serial_csr(&[], 8);
+    assert_eq!(cb.offsets, vec![0]);
+    assert!(cb.candidates.is_empty());
+}
+
+/// Large-dim smoke at the shape the arXiv corpus uses (1024 dims), enough
+/// rows to cross several L2-sized doc blocks.
+#[test]
+fn dim_1024_shape_matches_oracle() {
+    let dim = 1024;
+    let n = 6_000;
+    let vectors = random_corpus(dim, n, 0xA5A5);
+    let queries = random_corpus(dim, 8, 0x5A5A);
+    for m in [256, 320] {
+        assert_contract(dim, &vectors, &queries, m, "dim1024");
+    }
+}

From 8af3352302fa5c72071901497ddfcab94c115d6d Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Fri, 3 Jul 2026 13:11:31 -0500
Subject: [PATCH 2/6] perf: stream the corpus once per call in sign candidate
 generation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

top_m_candidates_batched_serial_csr previously looped the single-query
path, re-streaming the full sign bitmap per query (documented-naive
Track-1). The internals now scan the corpus once per call in L2-sized
doc blocks, score every query of the call against each hot block in
query tiles via the existing batched kernel, and select per-query
top-m with bounded (hamming, doc_id) min-collectors — bit-identical
to a full sort by construction, independent of processing order (the
key IS the contract's sort key). top_m_candidates routes through the
same core, dropping its per-call n-row Hamming materialisation.

Per-query corpus traffic drops by a factor of the call's query count:
at 1.26M rows x 1024 dims, a 2048-query call reads the 161MB sidecar once
instead of 2048 times. Serial contract preserved (no rayon); the
oracle suite (tests/tiled_candgen.rs) pins bit-identical outputs
across random, tie-heavy, duplicate-run, and edge geometries.
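---
Review note (not part of the commit message): the first sign_bitmap.rs hunk
inserts top_m_candidates_streamed between the existing rustdoc/#[must_use]
lines and `pub fn top_m_candidates`, so those attributes now attach to the
private helper and the public function loses its docs and #[must_use].
Move the helper above that doc comment (or below the function) before merging.
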
 src/sign_bitmap.rs | 117 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 89 insertions(+), 28 deletions(-)

diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs
index 66f971a..e2296c9 100644
--- a/src/sign_bitmap.rs
+++ b/src/sign_bitmap.rs
@@ -39,6 +39,7 @@
 //! scalar path. See [`crate::avx512vpop_supported`].
 
 use rayon::prelude::*;
+use std::collections::BinaryHeap;
 
 use crate::OrdvecError;
 
@@ -220,6 +221,77 @@ impl SignBitmap {
     /// SIMD dispatch paths — same audit discipline as
     /// [`crate::Bitmap::top_m_candidates`].
     #[must_use = "this scans the corpus to generate candidates; dropping the result discards that work"]
+    /// Streamed exact top-m selection shared by [`Self::top_m_candidates`]
+    /// and [`Self::top_m_candidates_batched_serial_csr`]: the corpus is
+    /// scanned once per call in L2-sized doc blocks, each hot block is
+    /// scored against every query (in small query tiles), and per-query
+    /// bounded min-m collectors keyed by `(hamming, doc_id)` select exactly
+    /// the lexicographic top-m — bit-identical to a full sort, independent
+    /// of processing order. Serial by contract: no rayon.
+    fn top_m_candidates_streamed(&self, queries: &[f32], m_eff: usize) -> Vec<Vec<u32>> {
+        const TILE_QUERIES: usize = 32;
+        const BLOCK_BYTES: usize = 256 * 1024;
+
+        let dim = self.dim;
+        let nq = queries.len() / dim;
+        let qpv = self.qwords_per_vec;
+        let n = self.n_vectors;
+        debug_assert!(m_eff >= 1 && m_eff <= n);
+
+        let mut q_bitmaps = vec![0u64; nq * qpv];
+        for qi in 0..nq {
+            let qb = self.build_query_bitmap(&queries[qi * dim..(qi + 1) * dim]);
+            q_bitmaps[qi * qpv..(qi + 1) * qpv].copy_from_slice(&qb);
+        }
+
+        let block_docs = (BLOCK_BYTES / (qpv * 8)).max(64).min(n);
+        let tile = TILE_QUERIES.min(nq);
+        let mut block_scores = vec![0u32; tile * block_docs];
+        // Max-heap keeps the current worst kept key at the top, so the
+        // retained set is always the m lexicographically smallest
+        // (hamming, doc_id) keys seen so far.
+        let mut heaps: Vec<BinaryHeap<(u32, u32)>> = (0..nq)
+            .map(|_| BinaryHeap::with_capacity(m_eff + 1))
+            .collect();
+
+        let mut block_start = 0usize;
+        while block_start < n {
+            let bn = block_docs.min(n - block_start);
+            let block = &self.bitmaps[block_start * qpv..(block_start + bn) * qpv];
+            let mut tile_start = 0usize;
+            while tile_start < nq {
+                let tq = tile.min(nq - tile_start);
+                let qb_tile = &q_bitmaps[tile_start * qpv..(tile_start + tq) * qpv];
+                let scores = &mut block_scores[..tq * bn];
+                sign_scan_collect_batched(block, bn, qpv, qb_tile, tq, scores);
+                for ti in 0..tq {
+                    let heap = &mut heaps[tile_start + ti];
+                    let row = &scores[ti * bn..(ti + 1) * bn];
+                    for (d, &hamming) in row.iter().enumerate() {
+                        let key = (hamming, (block_start + d) as u32);
+                        if heap.len() < m_eff {
+                            heap.push(key);
+                        } else if key < *heap.peek().expect("non-empty full collector") {
+                            heap.pop();
+                            heap.push(key);
+                        }
+                    }
+                }
+                tile_start += tq;
+            }
+            block_start += bn;
+        }
+
+        heaps
+            .into_iter()
+            .map(|heap| {
+                let mut kept = heap.into_vec();
+                kept.sort_unstable();
+                kept.into_iter().map(|(_, doc)| doc).collect()
+            })
+            .collect()
+    }
+
     pub fn top_m_candidates(&self, q: &[f32], m: usize) -> Vec<u32> {
         assert_eq!(q.len(), self.dim);
         crate::util::assert_all_finite(q);
@@ -227,27 +299,9 @@ impl SignBitmap {
         if m_eff == 0 {
             return Vec::new();
         }
-        let qb = self.build_query_bitmap(q);
-        let mut scores = vec![0u32; self.n_vectors]; // Hamming distance per doc
-        sign_scan_collect(
-            &self.bitmaps,
-            self.n_vectors,
-            self.qwords_per_vec,
-            &qb,
-            &mut scores,
-        );
-        let mut idx: Vec<u32> = (0..self.n_vectors as u32).collect();
-        // Ascending Hamming = best candidates first. Composite key
-        // ensures deterministic partition at boundary ties.
-        let cmp = |a: &u32, b: &u32| {
-            scores[*a as usize]
-                .cmp(&scores[*b as usize])
-                .then_with(|| a.cmp(b))
-        };
-        idx.select_nth_unstable_by(m_eff - 1, cmp);
-        let mut head = idx[..m_eff].to_vec();
-        head.sort_unstable_by(cmp);
-        head
+        self.top_m_candidates_streamed(q, m_eff)
+            .pop()
+            .expect("streamed selection returns one row per query")
     }
 
     /// Batched variant: stream the sign bitmaps **once** and produce
@@ -313,10 +367,12 @@ impl SignBitmap {
     /// pool. (The existing [`Self::top_m_candidates_batched`] remains the
     /// internally-parallel standalone convenience.)
     ///
-    /// Track-1 implementation is intentionally naive — it loops the single-query
-    /// [`Self::top_m_candidates`] (which materialises a per-query `n` Hamming
-    /// row). A future release may replace the internals with streaming top-m
-    /// behind this frozen signature; the CSR output contract will not change.
+    /// The internals stream the corpus **once per call** in L2-sized doc
+    /// blocks, scoring every query of the call against each hot block and
+    /// selecting per-query top-m with bounded `(hamming, doc_id)` collectors
+    /// — corpus traffic per query drops by a factor of the call's query count
+    /// versus the historical per-query rescan. The CSR output contract is
+    /// unchanged and bit-identical to the previous implementation.
     ///
     /// # Example
     /// ```no_run
@@ -345,9 +401,14 @@ impl SignBitmap {
         let mut offsets = Vec::with_capacity(nq + 1);
         offsets.push(0usize);
         let mut candidates = Vec::with_capacity(nq.saturating_mul(m_eff));
-        for qi in 0..nq {
-            let q = &queries[qi * dim..(qi + 1) * dim];
-            let row = self.top_m_candidates(q, m);
+        if nq == 0 || m_eff == 0 {
+            offsets.extend(std::iter::repeat_n(0usize, nq));
+            return CandidateBatch {
+                candidates,
+                offsets,
+            };
+        }
+        for row in self.top_m_candidates_streamed(queries, m_eff) {
             candidates.extend_from_slice(&row);
             offsets.push(candidates.len());
         }

From 3f65e03c87f0eb970a3c5467b4bf2f50080d49f2 Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Fri, 3 Jul 2026 13:23:31 -0500
Subject: [PATCH 3/6] perf: keep the dense partition path for single-query
 candidates

Audit remediation: routing top_m_candidates through the streamed core
measured 50-90% slower at small/medium n with m in the hundreds (bounded
heap O(n log m) vs select_nth_unstable_by O(n)); with one query there
is no scan to share, so nq=1 stays on the dense path (parity-or-better
at every measured size). Also per audit: the block-boundary oracle
test now genuinely spans three blocks (the dim=128 shape fit one
block), and adds the dim=768 AVX-512 tail-residue x multi-block case
to the permanent suite.
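---
Review note (not part of the commit message): the rustdoc added in 2/6 still
describes top_m_candidates_streamed as shared by top_m_candidates; after this
patch the hunks shown leave top_m_candidates_batched_serial_csr as its only
caller, so that sentence should be updated here as well.
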
 src/sign_bitmap.rs     | 28 +++++++++++++++++++++++++---
 tests/tiled_candgen.rs | 38 ++++++++++++++++++++++++++++----------
 2 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs
index e2296c9..b47f379 100644
--- a/src/sign_bitmap.rs
+++ b/src/sign_bitmap.rs
@@ -299,9 +299,31 @@ impl SignBitmap {
         if m_eff == 0 {
             return Vec::new();
         }
-        self.top_m_candidates_streamed(q, m_eff)
-            .pop()
-            .expect("streamed selection returns one row per query")
+        // Single-query stays on the dense partition path: with one query
+        // there is no scan to share, and select_nth_unstable_by (O(n)
+        // average) measurably beats an O(n log m) bounded heap for m in the
+        // hundreds at small/medium n (audit: +50-90% regression otherwise).
+        let qb = self.build_query_bitmap(q);
+        let mut scores = vec![0u32; self.n_vectors]; // Hamming distance per doc
+        sign_scan_collect(
+            &self.bitmaps,
+            self.n_vectors,
+            self.qwords_per_vec,
+            &qb,
+            &mut scores,
+        );
+        let mut idx: Vec<u32> = (0..self.n_vectors as u32).collect();
+        // Ascending Hamming = best candidates first. Composite key
+        // ensures deterministic partition at boundary ties.
+        let cmp = |a: &u32, b: &u32| {
+            scores[*a as usize]
+                .cmp(&scores[*b as usize])
+                .then_with(|| a.cmp(b))
+        };
+        idx.select_nth_unstable_by(m_eff - 1, cmp);
+        let mut head = idx[..m_eff].to_vec();
+        head.sort_unstable_by(cmp);
+        head
     }
 
     /// Batched variant: stream the sign bitmaps **once** and produce
diff --git a/tests/tiled_candgen.rs b/tests/tiled_candgen.rs
index 5a416f5..33ac414 100644
--- a/tests/tiled_candgen.rs
+++ b/tests/tiled_candgen.rs
@@ -33,13 +33,7 @@ fn tie_heavy_corpus(dim: usize, n: usize) -> Vec<f32> {
     (0..n)
         .flat_map(|doc| {
             let pattern = doc % 4;
-            (0..dim).map(move |c| {
-                if (c + pattern) % 3 == 0 {
-                    -1.0
-                } else {
-                    1.0
-                }
-            })
+            (0..dim).map(move |c| if (c + pattern) % 3 == 0 { -1.0 } else { 1.0 })
         })
         .collect()
 }
@@ -64,7 +58,10 @@ fn assert_contract(dim: usize, vectors: &[f32], queries: &[f32], m: usize, label
         let q = &queries[qi * dim..(qi + 1) * dim];
         let got = sign.top_m_candidates(q, m);
         let want = oracle_top_m(&sign, q, m);
-        assert_eq!(got, want, "{label}: single-query mismatch at query {qi}, m={m}");
+        assert_eq!(
+            got, want,
+            "{label}: single-query mismatch at query {qi}, m={m}"
+        );
     }
 
     // Batched serial CSR path: row qi must equal the single-query result.
@@ -73,7 +70,11 @@ fn assert_contract(dim: usize, vectors: &[f32], queries: &[f32], m: usize, label
     for qi in 0..nq {
         let row = &cb.candidates[cb.offsets[qi]..cb.offsets[qi + 1]];
         let want = oracle_top_m(&sign, &queries[qi * dim..(qi + 1) * dim], m);
-        assert_eq!(row, &want[..], "{label}: CSR row mismatch at query {qi}, m={m}");
+        assert_eq!(
+            row,
+            &want[..],
+            "{label}: CSR row mismatch at query {qi}, m={m}"
+        );
     }
 }
 
@@ -81,7 +82,10 @@ fn assert_contract(dim: usize, vectors: &[f32], queries: &[f32], m: usize, label
 /// tile size, at a SIMD-friendly dim.
 #[test]
 fn random_corpus_matches_oracle_across_block_boundaries() {
-    let dim = 128;
+    // dim=512 -> 8 qwords/vec -> 4096-doc blocks; n=10240 spans three
+    // blocks including a final partial one (audit: the previous dim=128
+    // shape fit in a single block, so the loop never crossed a boundary).
+    let dim = 512;
     let n = 10_240;
     let vectors = random_corpus(dim, n, 0xC0FFEE);
     let queries = random_corpus(dim, 33, 0xBEEF);
@@ -155,3 +159,17 @@ fn dim_1024_shape_matches_oracle() {
         assert_contract(dim, &vectors, &queries, m, "dim1024");
     }
 }
+
+/// AVX-512 tail residue (dim=768 -> qpv=12, rem=4) composed with
+/// multi-block crossing and a final partial block — the kernel-shape case
+/// the audit flagged as untested in the permanent suite.
+#[test]
+fn dim_768_tail_residue_crosses_blocks() {
+    let dim = 768;
+    let n = 3_200; // block_docs = 262144/96 = 2730 -> 2 blocks, partial tail
+    let vectors = random_corpus(dim, n, 0x7E57);
+    let queries = random_corpus(dim, 7, 0x7E58);
+    for m in [64, 320] {
+        assert_contract(dim, &vectors, &queries, m, "dim768-tail");
+    }
+}

From 9350b41fd7e7fb5480718ffebaaed4031f1f2659 Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Fri, 3 Jul 2026 19:07:28 -0500
Subject: [PATCH 4/6] fix: assert whole-row query buffers in the streamed core
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bot-review remediation (Qodo, PR #278): the shared core derived nq by
integer division; a ragged buffer from a future caller would silently
truncate. All current callers validate upstream — this is the cheap
in-core invariant.
---
 src/sign_bitmap.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs
index b47f379..d44bb78 100644
--- a/src/sign_bitmap.rs
+++ b/src/sign_bitmap.rs
@@ -233,6 +233,10 @@ impl SignBitmap {
         const BLOCK_BYTES: usize = 256 * 1024;
 
         let dim = self.dim;
+        debug_assert!(
+            queries.len().is_multiple_of(dim),
+            "queries buffer must be a whole number of rows"
+        );
         let nq = queries.len() / dim;
         let qpv = self.qwords_per_vec;
         let n = self.n_vectors;

From cf357e536832b9b48084a1db8a77c89511f09b07 Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Fri, 3 Jul 2026 22:41:53 -0500
Subject: [PATCH 5/6] fix: checked selection-state bounds in the streamed
 candidate core
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bot-review remediation (Qodo, PR #278): nq * m_eff can overflow usize
on 32-bit/wasm32 targets, and the CSR wrapper's saturating_mul would
attempt a usize::MAX allocation. Both sites now use checked
multiplication with a clear tile-the-batch message, matching the
crate's checked-allocation discipline. The exact m_eff + 1 heap
reservation is kept deliberately: gradual growth double-allocates to
the next power of two (~2x peak per query) — the reservation is the
memory-optimal choice, now documented.
---
 src/sign_bitmap.rs | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs
index d44bb78..ec8cd20 100644
--- a/src/sign_bitmap.rs
+++ b/src/sign_bitmap.rs
@@ -254,6 +254,18 @@ impl SignBitmap {
         // Max-heap keeps the current worst kept key at the top, so the
         // retained set is always the m lexicographically smallest
         // (hamming, doc_id) keys seen so far.
+        //
+        // Selection state is O(nq * m_eff) on top of the CSR output; the
+        // product can overflow usize on 32-bit/wasm32 targets, so it is
+        // checked explicitly, per the crate's checked-allocation discipline.
+        // Exact per-heap reservation of m_eff + 1 is deliberate: gradual
+        // growth would double-allocate to the next power of two (~2x m_eff
+        // peak per query); callers with extreme nq * m_eff should tile the
+        // query batch (as OrdinalDB's chunk scheduler does).
+        assert!(
+            nq.checked_mul(m_eff).is_some(),
+            "selection state nq ({nq}) * m ({m_eff}) overflows usize; tile the query batch"
+        );
         let mut heaps: Vec<BinaryHeap<(u32, u32)>> = (0..nq)
             .map(|_| BinaryHeap::with_capacity(m_eff + 1))
             .collect();
@@ -426,7 +438,9 @@ impl SignBitmap {
         let m_eff = m.min(self.n_vectors);
         let mut offsets = Vec::with_capacity(nq + 1);
         offsets.push(0usize);
-        let mut candidates = Vec::with_capacity(nq.saturating_mul(m_eff));
+        let mut candidates = Vec::with_capacity(nq.checked_mul(m_eff).unwrap_or_else(|| {
+            panic!("CSR output nq ({nq}) * m ({m_eff}) overflows usize; tile the query batch")
+        }));
         if nq == 0 || m_eff == 0 {
             offsets.extend(std::iter::repeat_n(0usize, nq));
             return CandidateBatch {

From e091df5f174dfa4ac3fd5348509ae69e7262d69d Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Fri, 3 Jul 2026 23:12:07 -0500
Subject: [PATCH 6/6] perf: build query bitmaps in place in the streamed core

Bot-review remediation (Qodo, #283 inline): build_query_bitmap
allocated a fresh Vec and re-validated finiteness per query; the
entry points already validate the whole buffer and the destination
is preallocated. Oracle suites pin bit-identical output.
---
 src/sign_bitmap.rs | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs
index ec8cd20..8d2e764 100644
--- a/src/sign_bitmap.rs
+++ b/src/sign_bitmap.rs
@@ -242,10 +242,18 @@ impl SignBitmap {
         let n = self.n_vectors;
         debug_assert!(m_eff >= 1 && m_eff <= n);
 
+        // Build bitmaps in place (entry points already validated the buffer;
+        // build_query_bitmap would allocate and re-validate per query).
+        // NOTE(review): `> 0.0` must mirror build_query_bitmap's bit rule — confirm.
         let mut q_bitmaps = vec![0u64; nq * qpv];
         for qi in 0..nq {
-            let qb = self.build_query_bitmap(&queries[qi * dim..(qi + 1) * dim]);
-            q_bitmaps[qi * qpv..(qi + 1) * qpv].copy_from_slice(&qb);
+            let q = &queries[qi * dim..(qi + 1) * dim];
+            let bm = &mut q_bitmaps[qi * qpv..(qi + 1) * qpv];
+            for (j, &value) in q.iter().enumerate() {
+                if value > 0.0 {
+                    bm[j / 64] |= 1u64 << (j % 64);
+                }
+            }
         }
 
         let block_docs = (BLOCK_BYTES / (qpv * 8)).max(64).min(n);