From 5c5a4b56b3da44e5b5f6069ce4f503cd72d88f90 Mon Sep 17 00:00:00 2001 From: Serhii Savchuk Date: Mon, 8 Jun 2026 14:50:56 +0300 Subject: [PATCH] perf(query): gate the bounded-multikey count+take fast path by table size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bounded_multikey_count_take_candidate gate triggers an eval-level single-threaded scan that walks every row with an O(found) per-row linear group lookup. Profitable on small inputs (skips the full DAG group HT construction) but for large multi-key inputs the serial scan loses to the parallel mk_par_v2 fused_group path. ClickBench 10M q17 — `(select {c: (count UserID) by: {UserID, SearchPhrase} take: 10})` over 10M rows with ~2.1M distinct composite keys — used to land here and spend ~340 ms on the linear scan even though the result only needs any 10 (UserID, SearchPhrase, count) tuples. Gate the candidate on `nrows < 100000` so big inputs fall through to the parallel filtered_group multi-key path. ClickBench 10M: q17 ~354 → ~161 ms (-54%, -193ms) --- src/ops/query.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ops/query.c b/src/ops/query.c index 7474645a..877b31e7 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -5371,7 +5371,16 @@ ray_t* ray_select(ray_t** args, int64_t n) { break; } } + /* The bounded-multikey count-take candidate uses an + * eval-level single-threaded scan with O(found) per-row + * group lookup. Profitable on small inputs (skips the + * full DAG group HT construction) but at 10M rows × multi- + * key composite (ClickBench q17), the serial scan loses + * to the parallel mk_par_v2 filtered_group below. Gate + * on table size — let big inputs through to the fused + * multi-key path. */ if (!use_eval_group && + ray_table_nrows(tbl) < 100000 && bounded_multikey_count_take_candidate( dict_elems, dict_n, from_id, where_id, by_id, take_id, asc_id, desc_id, ray_table_nrows(tbl), 1024)) {