From f1409c790ea81cafdb5cd090e302fb8fda6f8a0a Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 3 Jul 2026 12:55:29 -0500 Subject: [PATCH 1/6] test: pin sign candidate-generation contract ahead of tiled internals Independent oracle (score_all + full lexicographic sort by hamming asc, doc_id asc) pins top_m_candidates and top_m_candidates_batched_serial_csr exactly: random corpora across block boundaries, massive-tie and duplicate-run corpora exercising boundary tie-breaks, edge geometries (m >= n, single doc, empty batch), and the dim=1024 shape. Must pass bit-identically before and after the tiling swap. --- tests/tiled_candgen.rs | 157 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 tests/tiled_candgen.rs diff --git a/tests/tiled_candgen.rs b/tests/tiled_candgen.rs new file mode 100644 index 0000000..5a416f5 --- /dev/null +++ b/tests/tiled_candgen.rs @@ -0,0 +1,157 @@ +//! Contract-pinning tests for sign candidate generation, written ahead of the +//! tiled internals swap of `top_m_candidates` / +//! `top_m_candidates_batched_serial_csr`. The oracle is independent of the +//! implementation under test: `score_all` (dense agreement counts) plus a +//! full lexicographic sort by `(hamming asc, doc_id asc)`. These tests pin +//! today's behavior exactly — including tie handling at the m-th position — +//! and must pass bit-identically before and after the swap. + +use ordvec::SignBitmap; + +/// Deterministic xorshift so corpora are reproducible without a rand dep. +struct XorShift(u64); + +impl XorShift { + fn next_f32(&mut self) -> f32 { + self.0 ^= self.0 << 13; + self.0 ^= self.0 >> 7; + self.0 ^= self.0 << 17; + // Map to [-1, 1) with plenty of sign variety. + ((self.0 >> 40) as f32 / 8_388_608.0) - 1.0 + } +} + +fn random_corpus(dim: usize, n: usize, seed: u64) -> Vec { + let mut rng = XorShift(seed | 1); + (0..n * dim).map(|_| rng.next_f32()).collect() +} + +/// Tie-heavy corpus: every coordinate is +/-1 drawn from a tiny pattern set, +/// so hamming distances collide massively and the (hamming, doc_id) +/// tie-break does real work at the selection boundary. +fn tie_heavy_corpus(dim: usize, n: usize) -> Vec { + (0..n) + .flat_map(|doc| { + let pattern = doc % 4; + (0..dim).map(move |c| { + if (c + pattern) % 3 == 0 { + -1.0 + } else { + 1.0 + } + }) + }) + .collect() +} + +fn oracle_top_m(sign: &SignBitmap, q: &[f32], m: usize) -> Vec { + let dim_u32 = u32::try_from(q.len()).unwrap(); + // score_all returns agreement (dim - hamming), higher is better. + let agreements = sign.score_all(q); + let mut ids: Vec = (0..agreements.len() as u32).collect(); + ids.sort_by_key(|&i| (dim_u32 - agreements[i as usize], i)); + ids.truncate(m.min(agreements.len())); + ids +} + +fn assert_contract(dim: usize, vectors: &[f32], queries: &[f32], m: usize, label: &str) { + let mut sign = SignBitmap::new(dim); + sign.add(vectors); + let nq = queries.len() / dim; + + // Single-query path. + for qi in 0..nq { + let q = &queries[qi * dim..(qi + 1) * dim]; + let got = sign.top_m_candidates(q, m); + let want = oracle_top_m(&sign, q, m); + assert_eq!(got, want, "{label}: single-query mismatch at query {qi}, m={m}"); + } + + // Batched serial CSR path: row qi must equal the single-query result. + let cb = sign.top_m_candidates_batched_serial_csr(queries, m); + assert_eq!(cb.offsets.len(), nq + 1, "{label}: CSR offsets length"); + for qi in 0..nq { + let row = &cb.candidates[cb.offsets[qi]..cb.offsets[qi + 1]]; + let want = oracle_top_m(&sign, &queries[qi * dim..(qi + 1) * dim], m); + assert_eq!(row, &want[..], "{label}: CSR row mismatch at query {qi}, m={m}"); + } +} + +/// Random corpus large enough to span many doc blocks under any plausible +/// tile size, at a SIMD-friendly dim. +#[test] +fn random_corpus_matches_oracle_across_block_boundaries() { + let dim = 128; + let n = 10_240; + let vectors = random_corpus(dim, n, 0xC0FFEE); + let queries = random_corpus(dim, 33, 0xBEEF); + for m in [1, 7, 256, 500] { + assert_contract(dim, &vectors, &queries, m, "random"); + } +} + +/// Massive hamming ties: selection at the boundary is decided purely by +/// doc_id ascending. This is the case a streaming collector most easily gets +/// subtly wrong. +#[test] +fn tie_heavy_corpus_selects_lowest_doc_ids_at_boundary() { + let dim = 64; + let n = 4_096; + let vectors = tie_heavy_corpus(dim, n); + let queries = random_corpus(dim, 9, 0xABCD); + for m in [1, 3, 100, 1_000] { + assert_contract(dim, &vectors, &queries, m, "tie-heavy"); + } +} + +/// Exact duplicate documents: every duplicate group is one giant tie run, +/// longer than m, exercising equal-hamming runs that exceed the collector. +#[test] +fn duplicate_documents_tie_runs_longer_than_m() { + let dim = 64; + let base = random_corpus(dim, 8, 0x1234); + // 8 distinct vectors, each repeated 512 times => tie runs of 512. + let mut vectors = Vec::with_capacity(8 * 512 * dim); + for rep in 0..512 { + let _ = rep; + vectors.extend_from_slice(&base); + } + let queries = random_corpus(dim, 5, 0x9999); + for m in [10, 100, 513] { + assert_contract(dim, &vectors, &queries, m, "duplicates"); + } +} + +/// Edge geometry: m >= n, m == n, single doc, single query, nq == 0. +#[test] +fn edge_geometries_match_oracle() { + let dim = 64; + let vectors = random_corpus(dim, 17, 0x42); + let queries = random_corpus(dim, 3, 0x43); + for m in [17, 25, 1] { + assert_contract(dim, &vectors, &queries, m, "edge"); + } + + let single_doc = random_corpus(dim, 1, 0x77); + assert_contract(dim, &single_doc, &queries, 4, "single-doc"); + + // Empty query batch: CSR must be a single zero offset and no candidates. + let mut sign = SignBitmap::new(dim); + sign.add(&vectors); + let cb = sign.top_m_candidates_batched_serial_csr(&[], 8); + assert_eq!(cb.offsets, vec![0]); + assert!(cb.candidates.is_empty()); +} + +/// Large-dim smoke at the shape the arXiv corpus uses (1024 dims), enough +/// rows to cross several L2-sized doc blocks. +#[test] +fn dim_1024_shape_matches_oracle() { + let dim = 1024; + let n = 6_000; + let vectors = random_corpus(dim, n, 0xA5A5); + let queries = random_corpus(dim, 8, 0x5A5A); + for m in [256, 320] { + assert_contract(dim, &vectors, &queries, m, "dim1024"); + } +} From 8af3352302fa5c72071901497ddfcab94c115d6d Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 3 Jul 2026 13:11:31 -0500 Subject: [PATCH 2/6] perf: stream the corpus once per call in sign candidate generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit top_m_candidates_batched_serial_csr previously looped the single-query path, re-streaming the full sign bitmap per query (documented-naive Track-1). The internals now scan the corpus once per call in L2-sized doc blocks, score every query of the call against each hot block in query tiles via the existing batched kernel, and select per-query top-m with bounded (hamming, doc_id) min-collectors — bit-identical to a full sort by construction, independent of processing order (the key IS the contract's sort key). top_m_candidates routes through the same core, dropping its per-call n-row Hamming materialisation. Per-query corpus traffic drops by the call's query count: at 1.26M rows x 1024 dims, a 2048-query call reads the 161MB sidecar once instead of 2048 times. Serial contract preserved (no rayon); the oracle suite (tests/tiled_candgen.rs) pins bit-identical outputs across random, tie-heavy, duplicate-run, and edge geometries. --- src/sign_bitmap.rs | 117 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 89 insertions(+), 28 deletions(-) diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs index 66f971a..e2296c9 100644 --- a/src/sign_bitmap.rs +++ b/src/sign_bitmap.rs @@ -39,6 +39,7 @@ //! scalar path. See [`crate::avx512vpop_supported`]. use rayon::prelude::*; +use std::collections::BinaryHeap; use crate::OrdvecError; @@ -220,6 +221,77 @@ impl SignBitmap { /// SIMD dispatch paths — same audit discipline as /// [`crate::Bitmap::top_m_candidates`]. #[must_use = "this scans the corpus to generate candidates; dropping the result discards that work"] + /// Streamed exact top-m selection shared by [`Self::top_m_candidates`] + /// and [`Self::top_m_candidates_batched_serial_csr`]: the corpus is + /// scanned once per call in L2-sized doc blocks, each hot block is + /// scored against every query (in small query tiles), and per-query + /// bounded min-m collectors keyed by `(hamming, doc_id)` select exactly + /// the lexicographic top-m — bit-identical to a full sort, independent + /// of processing order. Serial by contract: no rayon. + fn top_m_candidates_streamed(&self, queries: &[f32], m_eff: usize) -> Vec> { + const TILE_QUERIES: usize = 32; + const BLOCK_BYTES: usize = 256 * 1024; + + let dim = self.dim; + let nq = queries.len() / dim; + let qpv = self.qwords_per_vec; + let n = self.n_vectors; + debug_assert!(m_eff >= 1 && m_eff <= n); + + let mut q_bitmaps = vec![0u64; nq * qpv]; + for qi in 0..nq { + let qb = self.build_query_bitmap(&queries[qi * dim..(qi + 1) * dim]); + q_bitmaps[qi * qpv..(qi + 1) * qpv].copy_from_slice(&qb); + } + + let block_docs = (BLOCK_BYTES / (qpv * 8)).max(64).min(n); + let tile = TILE_QUERIES.min(nq); + let mut block_scores = vec![0u32; tile * block_docs]; + // Max-heap keeps the current worst kept key at the top, so the + // retained set is always the m lexicographically smallest + // (hamming, doc_id) keys seen so far. + let mut heaps: Vec> = (0..nq) + .map(|_| BinaryHeap::with_capacity(m_eff + 1)) + .collect(); + + let mut block_start = 0usize; + while block_start < n { + let bn = block_docs.min(n - block_start); + let block = &self.bitmaps[block_start * qpv..(block_start + bn) * qpv]; + let mut tile_start = 0usize; + while tile_start < nq { + let tq = tile.min(nq - tile_start); + let qb_tile = &q_bitmaps[tile_start * qpv..(tile_start + tq) * qpv]; + let scores = &mut block_scores[..tq * bn]; + sign_scan_collect_batched(block, bn, qpv, qb_tile, tq, scores); + for ti in 0..tq { + let heap = &mut heaps[tile_start + ti]; + let row = &scores[ti * bn..(ti + 1) * bn]; + for (d, &hamming) in row.iter().enumerate() { + let key = (hamming, (block_start + d) as u32); + if heap.len() < m_eff { + heap.push(key); + } else if key < *heap.peek().expect("non-empty full collector") { + heap.pop(); + heap.push(key); + } + } + } + tile_start += tq; + } + block_start += bn; + } + + heaps + .into_iter() + .map(|heap| { + let mut kept = heap.into_vec(); + kept.sort_unstable(); + kept.into_iter().map(|(_, doc)| doc).collect() + }) + .collect() + } + pub fn top_m_candidates(&self, q: &[f32], m: usize) -> Vec { assert_eq!(q.len(), self.dim); crate::util::assert_all_finite(q); @@ -227,27 +299,9 @@ impl SignBitmap { if m_eff == 0 { return Vec::new(); } - let qb = self.build_query_bitmap(q); - let mut scores = vec![0u32; self.n_vectors]; // Hamming distance per doc - sign_scan_collect( - &self.bitmaps, - self.n_vectors, - self.qwords_per_vec, - &qb, - &mut scores, - ); - let mut idx: Vec = (0..self.n_vectors as u32).collect(); - // Ascending Hamming = best candidates first. Composite key - // ensures deterministic partition at boundary ties. - let cmp = |a: &u32, b: &u32| { - scores[*a as usize] - .cmp(&scores[*b as usize]) - .then_with(|| a.cmp(b)) - }; - idx.select_nth_unstable_by(m_eff - 1, cmp); - let mut head = idx[..m_eff].to_vec(); - head.sort_unstable_by(cmp); - head + self.top_m_candidates_streamed(q, m_eff) + .pop() + .expect("streamed selection returns one row per query") } /// Batched variant: stream the sign bitmaps **once** and produce @@ -313,10 +367,12 @@ impl SignBitmap { /// pool. (The existing [`Self::top_m_candidates_batched`] remains the /// internally-parallel standalone convenience.) /// - /// Track-1 implementation is intentionally naive — it loops the single-query - /// [`Self::top_m_candidates`] (which materialises a per-query `n` Hamming - /// row). A future release may replace the internals with streaming top-m - /// behind this frozen signature; the CSR output contract will not change. + /// The internals stream the corpus **once per call** in L2-sized doc + /// blocks, scoring every query of the call against each hot block and + /// selecting per-query top-m with bounded `(hamming, doc_id)` collectors + /// — per-query corpus traffic drops by the call's query count relative + /// to the historical per-query rescan. The CSR output contract is + /// unchanged and bit-identical to the previous implementation. /// /// # Example /// ```no_run @@ -345,9 +401,14 @@ impl SignBitmap { let mut offsets = Vec::with_capacity(nq + 1); offsets.push(0usize); let mut candidates = Vec::with_capacity(nq.saturating_mul(m_eff)); - for qi in 0..nq { - let q = &queries[qi * dim..(qi + 1) * dim]; - let row = self.top_m_candidates(q, m); + if nq == 0 || m_eff == 0 { + offsets.extend(std::iter::repeat_n(0usize, nq)); + return CandidateBatch { + candidates, + offsets, + }; + } + for row in self.top_m_candidates_streamed(queries, m_eff) { candidates.extend_from_slice(&row); offsets.push(candidates.len()); } From 3f65e03c87f0eb970a3c5467b4bf2f50080d49f2 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 3 Jul 2026 13:23:31 -0500 Subject: [PATCH 3/6] perf: keep the dense partition path for single-query candidates Audit remediation: routing top_m_candidates through the streamed core measured +50-90% at small/medium n with m in the hundreds (bounded heap O(n log m) vs select_nth_unstable_by O(n)); with one query there is no scan to share, so nq=1 stays on the dense path (parity-or-better at every measured size). Also per audit: the block-boundary oracle test now genuinely spans three blocks (the dim=128 shape fit one block), and adds the dim=768 AVX-512 tail-residue x multi-block case to the permanent suite. --- src/sign_bitmap.rs | 28 +++++++++++++++++++++++++--- tests/tiled_candgen.rs | 38 ++++++++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs index e2296c9..b47f379 100644 --- a/src/sign_bitmap.rs +++ b/src/sign_bitmap.rs @@ -299,9 +299,31 @@ impl SignBitmap { if m_eff == 0 { return Vec::new(); } - self.top_m_candidates_streamed(q, m_eff) - .pop() - .expect("streamed selection returns one row per query") + // Single-query stays on the dense partition path: with one query + // there is no scan to share, and select_nth_unstable_by (O(n) + // average) measurably beats an O(n log m) bounded heap for m in the + // hundreds at small/medium n (audit: +50-90% regression otherwise). + let qb = self.build_query_bitmap(q); + let mut scores = vec![0u32; self.n_vectors]; // Hamming distance per doc + sign_scan_collect( + &self.bitmaps, + self.n_vectors, + self.qwords_per_vec, + &qb, + &mut scores, + ); + let mut idx: Vec = (0..self.n_vectors as u32).collect(); + // Ascending Hamming = best candidates first. Composite key + // ensures deterministic partition at boundary ties. + let cmp = |a: &u32, b: &u32| { + scores[*a as usize] + .cmp(&scores[*b as usize]) + .then_with(|| a.cmp(b)) + }; + idx.select_nth_unstable_by(m_eff - 1, cmp); + let mut head = idx[..m_eff].to_vec(); + head.sort_unstable_by(cmp); + head } /// Batched variant: stream the sign bitmaps **once** and produce diff --git a/tests/tiled_candgen.rs b/tests/tiled_candgen.rs index 5a416f5..33ac414 100644 --- a/tests/tiled_candgen.rs +++ b/tests/tiled_candgen.rs @@ -33,13 +33,7 @@ fn tie_heavy_corpus(dim: usize, n: usize) -> Vec { (0..n) .flat_map(|doc| { let pattern = doc % 4; - (0..dim).map(move |c| { - if (c + pattern) % 3 == 0 { - -1.0 - } else { - 1.0 - } - }) + (0..dim).map(move |c| if (c + pattern) % 3 == 0 { -1.0 } else { 1.0 }) }) .collect() } @@ -64,7 +58,10 @@ fn assert_contract(dim: usize, vectors: &[f32], queries: &[f32], m: usize, label let q = &queries[qi * dim..(qi + 1) * dim]; let got = sign.top_m_candidates(q, m); let want = oracle_top_m(&sign, q, m); - assert_eq!(got, want, "{label}: single-query mismatch at query {qi}, m={m}"); + assert_eq!( + got, want, + "{label}: single-query mismatch at query {qi}, m={m}" + ); } // Batched serial CSR path: row qi must equal the single-query result. @@ -73,7 +70,11 @@ fn assert_contract(dim: usize, vectors: &[f32], queries: &[f32], m: usize, label for qi in 0..nq { let row = &cb.candidates[cb.offsets[qi]..cb.offsets[qi + 1]]; let want = oracle_top_m(&sign, &queries[qi * dim..(qi + 1) * dim], m); - assert_eq!(row, &want[..], "{label}: CSR row mismatch at query {qi}, m={m}"); + assert_eq!( + row, + &want[..], + "{label}: CSR row mismatch at query {qi}, m={m}" + ); } } @@ -81,7 +82,10 @@ fn assert_contract(dim: usize, vectors: &[f32], queries: &[f32], m: usize, label /// tile size, at a SIMD-friendly dim. #[test] fn random_corpus_matches_oracle_across_block_boundaries() { - let dim = 128; + // dim=512 -> 8 qwords/vec -> 4096-doc blocks; n=10240 spans three + // blocks including a final partial one (audit: the previous dim=128 + // shape fit in a single block, so the loop never crossed a boundary). + let dim = 512; let n = 10_240; let vectors = random_corpus(dim, n, 0xC0FFEE); let queries = random_corpus(dim, 33, 0xBEEF); @@ -155,3 +159,17 @@ fn dim_1024_shape_matches_oracle() { assert_contract(dim, &vectors, &queries, m, "dim1024"); } } + +/// AVX-512 tail residue (dim=768 -> qpv=12, rem=4) composed with +/// multi-block crossing and a final partial block — the kernel-shape case +/// the audit flagged as untested in the permanent suite. +#[test] +fn dim_768_tail_residue_crosses_blocks() { + let dim = 768; + let n = 3_200; // block_docs = 262144/96 = 2730 -> 2 blocks, partial tail + let vectors = random_corpus(dim, n, 0x7E57); + let queries = random_corpus(dim, 7, 0x7E58); + for m in [64, 320] { + assert_contract(dim, &vectors, &queries, m, "dim768-tail"); + } +} From 9350b41fd7e7fb5480718ffebaaed4031f1f2659 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 3 Jul 2026 19:07:28 -0500 Subject: [PATCH 4/6] fix: assert whole-row query buffers in the streamed core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bot-review remediation (Qodo, PR #278): the shared core derived nq by integer division; a ragged buffer from a future caller would silently truncate. All current callers validate upstream — this is the cheap in-core invariant. --- src/sign_bitmap.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs index b47f379..d44bb78 100644 --- a/src/sign_bitmap.rs +++ b/src/sign_bitmap.rs @@ -233,6 +233,10 @@ impl SignBitmap { const BLOCK_BYTES: usize = 256 * 1024; let dim = self.dim; + debug_assert!( + queries.len().is_multiple_of(dim), + "queries buffer must be a whole number of rows" + ); let nq = queries.len() / dim; let qpv = self.qwords_per_vec; let n = self.n_vectors; From cf357e536832b9b48084a1db8a77c89511f09b07 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 3 Jul 2026 22:41:53 -0500 Subject: [PATCH 5/6] fix: checked selection-state bounds in the streamed candidate core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bot-review remediation (Qodo, PR #278): nq * m_eff can overflow usize on 32-bit/wasm32 targets, and the CSR wrapper's saturating_mul would attempt a usize::MAX allocation. Both sites now use checked multiplication with a clear tile-the-batch message, matching the crate's checked-allocation discipline. The exact m_eff + 1 heap reservation is kept deliberately: gradual growth double-allocates to the next power of two (~2x peak per query) — the reservation is the memory-optimal choice, now documented. --- src/sign_bitmap.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs index d44bb78..ec8cd20 100644 --- a/src/sign_bitmap.rs +++ b/src/sign_bitmap.rs @@ -254,6 +254,18 @@ impl SignBitmap { // Max-heap keeps the current worst kept key at the top, so the // retained set is always the m lexicographically smallest // (hamming, doc_id) keys seen so far. + // Selection state is O(nq * m_eff) on top of the CSR output — an + // explicit checked bound (32-bit/wasm32 targets can overflow the + // multiplication) with a clear message, per the crate's + // checked-allocation discipline. Exact per-heap reservation of + // m_eff + 1 is deliberate: gradual growth would double-allocate to + // the next power of two (~2x m_eff peak per query); callers with + // extreme nq * m_eff should tile the query batch (as OrdinalDB's + // chunk scheduler does). + let selection_cells = nq.checked_mul(m_eff).unwrap_or_else(|| { + panic!("selection state nq ({nq}) * m ({m_eff}) overflows usize; tile the query batch") + }); + let _ = selection_cells; let mut heaps: Vec> = (0..nq) .map(|_| BinaryHeap::with_capacity(m_eff + 1)) .collect(); @@ -426,7 +438,9 @@ impl SignBitmap { let m_eff = m.min(self.n_vectors); let mut offsets = Vec::with_capacity(nq + 1); offsets.push(0usize); - let mut candidates = Vec::with_capacity(nq.saturating_mul(m_eff)); + let mut candidates = Vec::with_capacity(nq.checked_mul(m_eff).unwrap_or_else(|| { + panic!("CSR output nq ({nq}) * m ({m_eff}) overflows usize; tile the query batch") + })); if nq == 0 || m_eff == 0 { offsets.extend(std::iter::repeat_n(0usize, nq)); return CandidateBatch { From e091df5f174dfa4ac3fd5348509ae69e7262d69d Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 3 Jul 2026 23:12:07 -0500 Subject: [PATCH 6/6] perf: build query bitmaps in place in the streamed core Bot-review remediation (Qodo, #283 inline): build_query_bitmap allocated a fresh Vec and re-validated finiteness per query; the entry points already validate the whole buffer and the destination is preallocated. Oracle suites pin bit-identical output. --- src/sign_bitmap.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs index ec8cd20..8d2e764 100644 --- a/src/sign_bitmap.rs +++ b/src/sign_bitmap.rs @@ -242,10 +242,18 @@ impl SignBitmap { let n = self.n_vectors; debug_assert!(m_eff >= 1 && m_eff <= n); + // Build bitmaps in place: the entry points already validated the + // whole query buffer, and build_query_bitmap would allocate a fresh + // Vec (and re-validate) per query on this hot path. let mut q_bitmaps = vec![0u64; nq * qpv]; for qi in 0..nq { - let qb = self.build_query_bitmap(&queries[qi * dim..(qi + 1) * dim]); - q_bitmaps[qi * qpv..(qi + 1) * qpv].copy_from_slice(&qb); + let q = &queries[qi * dim..(qi + 1) * dim]; + let bm = &mut q_bitmaps[qi * qpv..(qi + 1) * qpv]; + for (j, &value) in q.iter().enumerate() { + if value > 0.0 { + bm[j / 64] |= 1u64 << (j % 64); + } + } } let block_docs = (BLOCK_BYTES / (qpv * 8)).max(64).min(n);