diff --git a/cov_final.profdata b/cov_final.profdata new file mode 100644 index 00000000..072456b2 Binary files /dev/null and b/cov_final.profdata differ diff --git a/cov_master_now.profdata b/cov_master_now.profdata new file mode 100644 index 00000000..6f63c02b Binary files /dev/null and b/cov_master_now.profdata differ diff --git a/cov_ours_now.profdata b/cov_ours_now.profdata new file mode 100644 index 00000000..4e09ae76 Binary files /dev/null and b/cov_ours_now.profdata differ diff --git a/cov_timer_after.profdata b/cov_timer_after.profdata new file mode 100644 index 00000000..ffaf4328 Binary files /dev/null and b/cov_timer_after.profdata differ diff --git a/cov_timer_final.profdata b/cov_timer_final.profdata new file mode 100644 index 00000000..c98535f5 Binary files /dev/null and b/cov_timer_final.profdata differ diff --git a/cov_wave10.profdata b/cov_wave10.profdata new file mode 100644 index 00000000..5de40b3d Binary files /dev/null and b/cov_wave10.profdata differ diff --git a/cov_wave11.profdata b/cov_wave11.profdata new file mode 100644 index 00000000..ece387bd Binary files /dev/null and b/cov_wave11.profdata differ diff --git a/cov_wave7.profdata b/cov_wave7.profdata new file mode 100644 index 00000000..0d1a886a Binary files /dev/null and b/cov_wave7.profdata differ diff --git a/cov_wave8.profdata b/cov_wave8.profdata new file mode 100644 index 00000000..8a59e2ef Binary files /dev/null and b/cov_wave8.profdata differ diff --git a/cov_wave9.profdata b/cov_wave9.profdata new file mode 100644 index 00000000..e8684b04 Binary files /dev/null and b/cov_wave9.profdata differ diff --git a/cov_with_new_tests.profdata b/cov_with_new_tests.profdata new file mode 100644 index 00000000..b82cb3be Binary files /dev/null and b/cov_with_new_tests.profdata differ diff --git a/coverage_new.profdata b/coverage_new.profdata new file mode 100644 index 00000000..5de40b3d Binary files /dev/null and b/coverage_new.profdata differ diff --git a/include/rayforce.h b/include/rayforce.h index a1d0bdd5..70ff8478 100644 --- a/include/rayforce.h +++ b/include/rayforce.h @@ -459,6 +459,7 @@ ray_t* ray_table_get_col(ray_t* tbl, int64_t name_id); ray_t* ray_table_get_col_idx(ray_t* tbl, int64_t idx); int64_t ray_table_col_name(ray_t* tbl, int64_t idx); void ray_table_set_col_name(ray_t* tbl, int64_t idx, int64_t name_id); +void ray_table_set_col_idx(ray_t* tbl, int64_t idx, ray_t* col_vec); int64_t ray_table_ncols(ray_t* tbl); int64_t ray_table_nrows(ray_t* tbl); ray_t* ray_table_schema(ray_t* tbl); diff --git a/src/ops/agg.c b/src/ops/agg.c index 34328522..8ca1202d 100644 --- a/src/ops/agg.c +++ b/src/ops/agg.c @@ -226,7 +226,9 @@ ray_t* ray_sum_fn(ray_t* x) { ray_retain(x); return x; } if (ray_is_vec(x)) { - if (x->type == RAY_DATE) return ray_error("type", NULL); + /* Canonical admission: numeric + TIME (duration); DATE/TIMESTAMP are + * absolute points and SYM/STR/GUID are non-numeric → type error. */ + if (!agg_type_admitted(OP_SUM, x->type)) return ray_error("type", NULL); /* Narrow/temporal types need specific return constructors that the * DAG executor doesn't provide — use scalar path for these. */ if (x->type == RAY_I32 || x->type == RAY_I16 || x->type == RAY_U8 || @@ -309,7 +311,12 @@ ray_t* ray_avg_fn(ray_t* x) { if (is_numeric(x)) return make_f64(as_f64(x)); ray_retain(x); return x; } - if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_avg); + if (ray_is_vec(x)) { + /* Canonical admission: numeric + temporal (→ F64); SYM/STR/GUID are + * non-numeric → type error (the DAG path otherwise averaged raw ids). */ + if (!agg_type_admitted(OP_AVG, x->type)) return ray_error("type", NULL); + AGG_VEC_VIA_DAG(x, ray_avg); + } if (!is_list(x)) return ray_error("type", NULL); int64_t len = ray_len(x); if (len == 0) return ray_error("domain", NULL); @@ -537,9 +544,17 @@ static ray_t* vec_to_f64_scratch(ray_t* x, double** out_vals) { } else if (x->type == RAY_I16) { int16_t* d = (int16_t*)ray_data(x); for (int64_t i = 0; i < len; i++) { if (!ray_vec_is_null(x, i)) vals[cnt++] = (double)d[i]; } - } else if (x->type == RAY_U8) { + } else if (x->type == RAY_U8 || x->type == RAY_BOOL) { uint8_t* d = (uint8_t*)ray_data(x); for (int64_t i = 0; i < len; i++) { if (!ray_vec_is_null(x, i)) vals[cnt++] = (double)d[i]; } + } else if (x->type == RAY_DATE || x->type == RAY_TIME) { + /* temporal stored as int32 days/ms — avg/var/stddev compute over the + * raw counts (result F64), per the canonical admission table. */ + int32_t* d = (int32_t*)ray_data(x); + for (int64_t i = 0; i < len; i++) { if (!ray_vec_is_null(x, i)) vals[cnt++] = (double)d[i]; } + } else if (x->type == RAY_TIMESTAMP) { + int64_t* d = (int64_t*)ray_data(x); + for (int64_t i = 0; i < len; i++) { if (!ray_vec_is_null(x, i)) vals[cnt++] = (double)d[i]; } } else { ray_release(scratch); return ray_error("type", NULL); diff --git a/src/ops/collection.c b/src/ops/collection.c index 4b1b2135..e1fcf175 100644 --- a/src/ops/collection.c +++ b/src/ops/collection.c @@ -1591,10 +1591,16 @@ ray_t* ray_take_fn(ray_t* vec, ray_t* n_obj) { /* (at vec idx) or (at table 'col) — index into vector or table */ ray_t* ray_at_fn(ray_t* vec, ray_t* idx) { if (ray_is_lazy(vec)) vec = ray_lazy_materialize(vec); - /* Table column access by symbol key — return the typed vector directly */ + /* Table column access by symbol key — return the typed vector directly. + * A column loaded from a splayed/parted table (.db.parted.get) is still in + * segmented (RAY_IS_PARTED) form; the query path flattens it lazily, but a + * direct `(at table 'col)` must materialize it so downstream `(at col i)` / + * count / formatting see a dense vector instead of failing with `type`. */ if (vec->type == RAY_TABLE && idx->type == -RAY_SYM) { ray_t* col = ray_table_get_col(vec, idx->i64); if (!col) return ray_error("domain", NULL); + if (RAY_IS_PARTED(col->type)) return parted_to_flat_vec(col); + if (col->type == RAY_MAPCOMMON) return materialize_mapcommon(col); ray_retain(col); return col; } diff --git a/src/ops/exec.c b/src/ops/exec.c index 570c438b..c1337a77 100644 --- a/src/ops/exec.c +++ b/src/ops/exec.c @@ -1084,6 +1084,18 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) { input = compacted; own_input = true; } + /* Canonical aggregand type-admission — reject non-admitted element + * types (SYM/STR/GUID everywhere; DATE/TIMESTAMP for sum) so the + * DAG reduction matches the scalar builtins instead of aggregating + * raw ids / bytes / date counts. */ + if (input && input->type != RAY_TABLE) { + int8_t et = RAY_IS_PARTED(input->type) + ? (int8_t)RAY_PARTED_BASETYPE(input->type) : input->type; + if (et > 0 && !agg_type_admitted(op->opcode, et)) { + if (own_input) ray_release(input); + return ray_error("type", NULL); + } + } ray_t* result = exec_reduction(g, op, input); if (own_input) ray_release(input); return result; diff --git a/src/ops/group.c b/src/ops/group.c index 003a6971..651c7862 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -74,6 +74,83 @@ static inline bool sym_lex_lt(int64_t a, int64_t b) { } static inline bool sym_lex_gt(int64_t a, int64_t b) { return sym_lex_lt(b, a); } +/* ── Wide-element (STR/GUID) min/max/first/last ────────────────────────── + * STR (a 16-byte ray_str_t: pool pointer + length) and GUID (16 raw bytes) + * do not fit the 8-byte integer reduce accumulators, so the int64 fast paths + * silently truncate them to a single byte. These helpers instead track the + * WINNING ROW INDEX — by content comparison (lexicographic for STR, byte + * order for GUID) for min/max, or by position for first/last — and the caller + * materialises that element with collection_elem. This matches the scalar + * first/last builtins and gives min/max real lexical results, unifying the + * scalar and DAG aggregation paths for wide element types. */ +static inline bool agg_is_wide_type(int8_t t) { + return t == RAY_STR || t == RAY_GUID; +} +/* Scan rows [optionally via sel] and return the winning row index for + * op (OP_MIN/OP_MAX/OP_FIRST/OP_LAST), or -1 if every scanned row is null. + * + * The element type (STR vs GUID) and the operator are resolved ONCE here, + * before any loop — the inner loops carry no type/op switch. first/last are + * pure positional scans (no value comparison); min/max run a single + * type-specialised compare loop whose direction (want_min) is hoisted out. + * (sel/has_nulls remain a predictable per-row branch, exactly as the integer + * reduce loops below do; the goal is no *type/op dispatch* inside the loop.) */ +static int64_t wide_winner_row(ray_t* input, uint16_t op, + const int64_t* sel, int64_t scan_n, + bool has_nulls) { + /* first/last: positional — return the first/last non-null row, no compare. */ + if (op == OP_FIRST) { + for (int64_t i = 0; i < scan_n; i++) { + int64_t row = sel ? sel[i] : i; + if (!has_nulls || !ray_vec_is_null(input, row)) return row; + } + return -1; + } + if (op == OP_LAST) { + for (int64_t i = scan_n - 1; i >= 0; i--) { + int64_t row = sel ? sel[i] : i; + if (!has_nulls || !ray_vec_is_null(input, row)) return row; + } + return -1; + } + /* min/max: one type-specialised compare loop, direction hoisted out. */ + const bool want_min = (op == OP_MIN); + int64_t best = -1; + if (input->type == RAY_GUID) { + const uint8_t* d = (const uint8_t*)ray_data(input); + for (int64_t i = 0; i < scan_n; i++) { + int64_t row = sel ? sel[i] : i; + if (has_nulls && ray_vec_is_null(input, row)) continue; + if (best < 0) { best = row; continue; } + int c = memcmp(d + (size_t)row * 16, d + (size_t)best * 16, 16); + if (want_min ? (c < 0) : (c > 0)) best = row; + } + } else { /* RAY_STR — lexicographic over the pooled bytes */ + for (int64_t i = 0; i < scan_n; i++) { + int64_t row = sel ? sel[i] : i; + if (has_nulls && ray_vec_is_null(input, row)) continue; + if (best < 0) { best = row; continue; } + size_t la = 0, lb = 0; + const char* sa = ray_str_vec_get(input, row, &la); + const char* sb = ray_str_vec_get(input, best, &lb); + size_t m = la < lb ? la : lb; + int c = (m && sa && sb) ? memcmp(sa, sb, m) : 0; + if (!c) c = (la > lb) - (la < lb); + if (want_min ? (c < 0) : (c > 0)) best = row; + } + } + return best; +} +/* Whole-table (optionally selected) min/max/first/last over a wide column. */ +static ray_t* agg_wide_reduce(ray_t* input, uint16_t op, + const int64_t* sel, int64_t scan_n, + bool has_nulls) { + int64_t best = wide_winner_row(input, op, sel, scan_n, has_nulls); + if (best < 0) return ray_typed_null(-input->type); + int alloc; + return collection_elem(input, best, &alloc); +} + /* Integer reduction loop — reads native type T, accumulates as i64. * HAS_NULLS and HAS_IDX must be integer literal constants (0 or 1) so the * compiler dead-code-eliminates the corresponding branches in every @@ -1910,8 +1987,54 @@ ray_t* ray_topk_per_group_buf(ray_t* src, return out; } +/* ─── ray_wide_minmax_per_group_buf ─────────────────────────────────────── + * + * Per-group min/max/first/last for wide element types (STR/GUID) that don't + * fit the 8-byte integer accumulators. Same idx_buf/offsets/grp_cnt layout + * as the median/topk kernels — produced by exec_group's group-contiguous row + * gather — but instead of a numeric quickselect it finds the winning row per + * group (lexicographic for STR, byte order for GUID; positional for + * first/last) and materialises that element into a typed result column. + * + * Runs SERIAL: ray_str_vec_set COW-mutates the result vector and its shared + * string pool, so concurrent group writers would corrupt the pool. Wide + * min/max is a cold path, not a vectorised bench kernel, so this is fine. */ +ray_t* ray_wide_minmax_per_group_buf(ray_t* src, uint16_t op, + const int64_t* idx_buf, + const int64_t* offsets, + const int64_t* grp_cnt, + int64_t n_groups) { + if (!src || RAY_IS_ERR(src) || n_groups < 0) return NULL; + if (!agg_is_wide_type(src->type)) return NULL; /* caller falls back */ + bool has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0; + + ray_t* out = col_vec_new(src, n_groups); + if (!out || RAY_IS_ERR(out)) return out ? out : ray_error("oom", NULL); + out->len = n_groups; + + for (int64_t g = 0; g < n_groups; g++) { + int64_t cnt = grp_cnt[g]; + int64_t off = offsets[g]; + int64_t best = wide_winner_row(src, op, &idx_buf[off], cnt, has_nulls); + if (best < 0) { ray_vec_set_null(out, g, true); continue; } + int alloc; + ray_t* e = collection_elem(src, best, &alloc); + if (src->type == RAY_STR) { + ray_t* nv = ray_str_vec_set(out, g, ray_str_ptr(e), ray_str_len(e)); + if (alloc) ray_release(e); + if (!nv || RAY_IS_ERR(nv)) { if (nv != out) ray_release(out); return nv ? nv : ray_error("oom", NULL); } + out = nv; + } else { /* RAY_GUID — fixed 16-byte in-place store */ + store_typed_elem(out, g, e); + if (alloc) ray_release(e); + } + } + return out; +} + static ray_t* reduction_i64_result(int64_t val, int8_t out_type) { switch (out_type) { + case RAY_BOOL: return ray_bool((bool)val); case RAY_DATE: return ray_date((int32_t)val); case RAY_TIME: return ray_time(val); case RAY_TIMESTAMP: return ray_timestamp(val); @@ -1987,6 +2110,17 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { } } + /* Wide element types (STR/GUID) overflow the 8-byte reduce + * accumulators; resolve min/max/first/last by materialising the + * winning row instead. COUNT keeps the generic length-based path. */ + if (agg_is_wide_type(in_type) && + (op->opcode == OP_MIN || op->opcode == OP_MAX || + op->opcode == OP_FIRST || op->opcode == OP_LAST)) { + ray_t* r = agg_wide_reduce(input, op->opcode, sel_idx, scan_n, has_nulls); + if (sel_idx_block) ray_release(sel_idx_block); + return r; + } + /* O(1) short-circuit: first/last on numeric columns don't need a * full reduction pass. Non-numeric types (STR, GUID) fall through * to the serial reduction path below. */ @@ -2053,7 +2187,7 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { ray_t* result; switch (op->opcode) { - case OP_SUM: result = in_type == RAY_F64 ? ray_f64(merged.sum_f) : ray_i64(merged.sum_i); break; + case OP_SUM: result = in_type == RAY_F64 ? ray_f64(merged.sum_f) : (in_type == RAY_TIME ? ray_time(merged.sum_i) : ray_i64(merged.sum_i)); break; case OP_PROD: result = in_type == RAY_F64 ? ray_f64(merged.prod_f) : ray_i64(merged.prod_i); break; case OP_MIN: result = reduction_extreme_result(op, in_type, merged.cnt > 0, merged.min_f, merged.min_i); break; case OP_MAX: result = reduction_extreme_result(op, in_type, merged.cnt > 0, merged.max_f, merged.max_i); break; @@ -2092,7 +2226,7 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { if (sel_idx_block) ray_release(sel_idx_block); switch (op->opcode) { - case OP_SUM: return in_type == RAY_F64 ? ray_f64(acc.sum_f) : ray_i64(acc.sum_i); + case OP_SUM: return in_type == RAY_F64 ? ray_f64(acc.sum_f) : (in_type == RAY_TIME ? ray_time(acc.sum_i) : ray_i64(acc.sum_i)); case OP_PROD: return in_type == RAY_F64 ? ray_f64(acc.prod_f) : ray_i64(acc.prod_i); case OP_MIN: return reduction_extreme_result(op, in_type, acc.cnt > 0, acc.min_f, acc.min_i); case OP_MAX: return reduction_extreme_result(op, in_type, acc.cnt > 0, acc.max_f, acc.max_i); @@ -2168,11 +2302,20 @@ ght_layout_t ght_compute_layout(uint8_t n_keys, uint8_t n_aggs, * row_gid+grp_cnt gathers per-group slices and runs quickselect * (median) or a bounded heap (top/bot); see * ray_median_per_group_buf / ray_topk_per_group_buf. */ + /* Wide-element (STR/GUID) min/max/first/last also reserve no + * row-layout slot: the 8-byte accumulators can't hold a 16-byte + * GUID or a pooled string, so they are resolved by the same + * post-radix per-group pass via ray_wide_minmax_per_group_buf. */ + bool wide_mm = agg_ops && agg_vecs[a] && + agg_is_wide_type(agg_vecs[a]->type) && + (agg_ops[a] == OP_MIN || agg_ops[a] == OP_MAX || + agg_ops[a] == OP_FIRST || agg_ops[a] == OP_LAST); bool holistic = agg_ops && (agg_ops[a] == OP_MEDIAN || agg_ops[a] == OP_TOP_N || - agg_ops[a] == OP_BOT_N); + agg_ops[a] == OP_BOT_N || wide_mm); if (holistic) { ly.agg_is_holistic |= (uint8_t)(1u << a); + if (wide_mm) ly.agg_is_wide |= (uint8_t)(1u << a); ly.agg_val_slot[a] = -1; } else if (agg_vecs[a]) { ly.agg_val_slot[a] = (int8_t)nv; @@ -2191,7 +2334,9 @@ ght_layout_t ght_compute_layout(uint8_t n_keys, uint8_t n_aggs, } else { ly.agg_val_slot[a] = -1; } - if (agg_ops) { + if (agg_ops && !wide_mm) { + /* wide first/last are resolved holistically and need no + * entry-tail row slot, so they are excluded here. */ if (agg_ops[a] == OP_FIRST) ly.agg_is_first |= (1u << a); if (agg_ops[a] == OP_LAST) ly.agg_is_last |= (1u << a); if (agg_ops[a] == OP_PROD) ly.agg_is_prod |= (1u << a); @@ -3735,7 +3880,20 @@ static void emit_agg_columns(ray_t** result, ray_graph_t* g, const ray_op_ext_t* case OP_VAR: case OP_VAR_POP: out_type = RAY_F64; break; case OP_COUNT: out_type = RAY_I64; break; - case OP_SUM: case OP_PROD: + case OP_SUM: { + /* sum preserves TIME (a duration-like temporal): time+time is + * a time, matching the scalar ray_sum_fn. Other integer + * families widen to I64; DATE/TIMESTAMP are rejected at + * type-admission so never reach here. The affine/linear SUM + * fast paths leave agg_col NULL (they aggregate without + * materializing the input vector), so recover the source type + * from the aggregation input op when the vector is absent. */ + int8_t src_t = agg_col ? agg_col->type + : (ext->agg_ins[a] ? ext->agg_ins[a]->out_type : 0); + out_type = is_f64 ? RAY_F64 : (src_t == RAY_TIME ? RAY_TIME : RAY_I64); + break; + } + case OP_PROD: out_type = is_f64 ? RAY_F64 : RAY_I64; break; default: out_type = agg_col ? agg_col->type : RAY_I64; break; @@ -5961,6 +6119,86 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, (int64_t*)m->min_val, (int64_t*)m->max_val, m->count, agg_affine, m->sumsq_f64, m->nn_count); + /* Wide-element (STR/GUID) min/max/first/last overflow emit_agg_columns' + * fixed-width slots (it truncated them to 1 byte above). Recompute + * those 1-row columns by materialising the winning row — mirroring + * exec_reduction — and override them in the result table. */ + for (uint8_t a = 0; a < n_aggs; a++) { + uint16_t aop = ext->agg_ops[a]; + if (!(agg_vecs[a] && agg_is_wide_type(agg_vecs[a]->type) && + (aop == OP_MIN || aop == OP_MAX || + aop == OP_FIRST || aop == OP_LAST))) + continue; + ray_t* wsel_blk = NULL; const int64_t* wsel = NULL; int64_t wscan = nrows; + if (g->selection) { + ray_rowsel_t* sm = ray_rowsel_meta(g->selection); + if (sm && sm->nrows == nrows) { + wsel_blk = ray_rowsel_to_indices(g->selection); + wsel = wsel_blk ? (const int64_t*)ray_data(wsel_blk) : NULL; + wscan = sm->total_pass; + } + } + bool hn = (agg_vecs[a]->attrs & RAY_ATTR_HAS_NULLS) != 0; + ray_t* atom = agg_wide_reduce(agg_vecs[a], aop, wsel, wscan, hn); + if (wsel_blk) ray_release(wsel_blk); + ray_t* col = col_vec_new(agg_vecs[a], 1); + if (col && !RAY_IS_ERR(col)) { + col->len = 1; + if (RAY_ATOM_IS_NULL(atom)) { + ray_vec_set_null(col, 0, true); + } else if (agg_vecs[a]->type == RAY_STR) { + ray_t* nv = ray_str_vec_set(col, 0, ray_str_ptr(atom), ray_str_len(atom)); + if (nv && !RAY_IS_ERR(nv)) col = nv; + } else { + store_typed_elem(col, 0, atom); + } + ray_table_set_col_idx(result, a, col); + ray_release(col); + } + ray_release(atom); + } + + /* Whole-table median has no n_keys==0 accumulator, so emit_agg_columns + * left its column at the integer default (0). Recompute it over the + * whole (optionally selected) column as a single group via the same + * per-group kernel the by-group path uses, matching the scalar `med` + * builtin. (top/bot are not handled here: the no-by planner does not + * carry the K argument, so they keep their scalar-builtin form.) */ + { + bool any_med = false; + for (uint8_t a = 0; a < n_aggs; a++) + if (ext->agg_ops[a] == OP_MEDIAN) { any_med = true; break; } + if (any_med) { + ray_t* hsel_blk = NULL; const int64_t* hsel = NULL; int64_t hscan = nrows; + if (g->selection) { + ray_rowsel_t* sm = ray_rowsel_meta(g->selection); + if (sm && sm->nrows == nrows) { + hsel_blk = ray_rowsel_to_indices(g->selection); + hsel = hsel_blk ? (const int64_t*)ray_data(hsel_blk) : NULL; + hscan = sm->total_pass; + } + } + ray_t* ix_hdr = NULL; + int64_t* idxb = (int64_t*)scratch_alloc(&ix_hdr, + (size_t)(hscan > 0 ? hscan : 1) * sizeof(int64_t)); + if (idxb) { + /* selection branch hoisted out of the fill loop */ + if (hsel) { for (int64_t i = 0; i < hscan; i++) idxb[i] = hsel[i]; } + else { for (int64_t i = 0; i < hscan; i++) idxb[i] = i; } + int64_t offs[1] = {0}; + int64_t cnts[1] = {hscan}; + for (uint8_t a = 0; a < n_aggs; a++) { + if (ext->agg_ops[a] != OP_MEDIAN || !agg_vecs[a]) continue; + ray_t* hv = ray_median_per_group_buf(agg_vecs[a], idxb, offs, cnts, 1); + if (hv && !RAY_IS_ERR(hv)) ray_table_set_col_idx(result, a, hv); + if (hv) ray_release(hv); + } + scratch_free(ix_hdr); + } + if (hsel_blk) ray_release(hsel_blk); + } + } + da_accum_free(&sc_acc[0]); scratch_free(sc_hdr); for (uint8_t a = 0; a < n_aggs; a++) { if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]); if (agg_owned2[a] && agg_vecs2[a]) ray_release(agg_vecs2[a]); } @@ -5990,10 +6228,17 @@ da_path:; * no per-row accumulator at all — they need the post-radix * row_gid+grp_cnt pass which only the HT path provides. */ for (uint8_t a = 0; a < n_aggs && da_eligible; a++) { - if (ext->agg_ops[a] == OP_PEARSON_CORR) da_eligible = false; - if (ext->agg_ops[a] == OP_MEDIAN) da_eligible = false; - if (ext->agg_ops[a] == OP_TOP_N) da_eligible = false; - if (ext->agg_ops[a] == OP_BOT_N) da_eligible = false; + uint16_t aop = ext->agg_ops[a]; + if (aop == OP_PEARSON_CORR) da_eligible = false; + if (aop == OP_MEDIAN) da_eligible = false; + if (aop == OP_TOP_N) da_eligible = false; + if (aop == OP_BOT_N) da_eligible = false; + /* Wide-element (STR/GUID) min/max/first/last need the holistic + * post-fill; the DA emit (emit_agg_columns) would truncate them. */ + if (agg_vecs[a] && agg_is_wide_type(agg_vecs[a]->type) && + (aop == OP_MIN || aop == OP_MAX || + aop == OP_FIRST || aop == OP_LAST)) + da_eligible = false; } for (uint8_t k = 0; k < n_keys && da_eligible; k++) { if (!key_data[k]) { da_eligible = false; break; } @@ -8328,7 +8573,14 @@ v2_emit:; case OP_MEDIAN: out_type = RAY_F64; break; case OP_COUNT: out_type = RAY_I64; break; - case OP_SUM: case OP_PROD: + case OP_SUM: + /* sum preserves TIME (duration-like); other integer + * families widen to I64. DATE/TIMESTAMP rejected at + * type-admission. */ + out_type = is_f64 ? RAY_F64 + : (agg_col && agg_col->type == RAY_TIME ? RAY_TIME : RAY_I64); + break; + case OP_PROD: out_type = is_f64 ? RAY_F64 : RAY_I64; break; default: out_type = agg_col ? agg_col->type : RAY_I64; break; @@ -8507,6 +8759,11 @@ v2_emit:; aop == OP_TOP_N ? 1 : 0, idx_buf, offsets, grp_cnt, n_groups); err_tag = "top/bot: type"; + } else if (aop == OP_MIN || aop == OP_MAX || + aop == OP_FIRST || aop == OP_LAST) { + hol_vec = ray_wide_minmax_per_group_buf( + agg_vecs[a], aop, idx_buf, offsets, grp_cnt, n_groups); + err_tag = "minmax: type"; } if (!hol_vec) { if (hist_hdr) scratch_free(hist_hdr); @@ -8839,6 +9096,11 @@ sequential_fallback:; aop == OP_TOP_N ? 1 : 0, idx_buf_s, offsets_s, grp_cnt_s, (int64_t)grp_count); + } else if (aop == OP_MIN || aop == OP_MAX || + aop == OP_FIRST || aop == OP_LAST) { + hol_vec = ray_wide_minmax_per_group_buf( + agg_vecs[a], aop, idx_buf_s, offsets_s, + grp_cnt_s, (int64_t)grp_count); } med_out[a] = hol_vec; /* NULL or RAY_IS_ERR handled below */ } @@ -8866,14 +9128,29 @@ sequential_fallback:; case OP_MEDIAN: out_type = RAY_F64; break; case OP_COUNT: out_type = RAY_I64; break; - case OP_SUM: case OP_PROD: + case OP_SUM: { + /* sum preserves TIME (a duration-like temporal): time+time is + * a time, matching the scalar ray_sum_fn. Other integer + * families widen to I64; DATE/TIMESTAMP are rejected at + * type-admission so never reach here. The affine/linear SUM + * fast paths leave agg_col NULL (they aggregate without + * materializing the input vector), so recover the source type + * from the aggregation input op when the vector is absent. */ + int8_t src_t = agg_col ? agg_col->type + : (ext->agg_ins[a] ? ext->agg_ins[a]->out_type : 0); + out_type = is_f64 ? RAY_F64 : (src_t == RAY_TIME ? RAY_TIME : RAY_I64); + break; + } + case OP_PROD: out_type = is_f64 ? RAY_F64 : RAY_I64; break; default: out_type = agg_col ? agg_col->type : RAY_I64; break; } ray_t* new_col; - bool is_holistic = (agg_op == OP_MEDIAN || agg_op == OP_TOP_N || - agg_op == OP_BOT_N); + /* Drive off the layout bitmask, not the op literal: wide-element + * (STR/GUID) min/max/first/last are holistic too and their column + * lives in med_out[a], not the truncating row-layout read below. */ + bool is_holistic = (ly->agg_is_holistic & (1u << a)) != 0; if (is_holistic && med_out && med_out[a] && !RAY_IS_ERR(med_out[a])) { new_col = med_out[a]; @@ -11213,16 +11490,18 @@ ray_t* exec_group_pearson_rowform(ray_graph_t* g, ray_op_t* op) { if (k1_data_out) write_col_i64(k1_data_out, row, e->key1, k_types[1], k_attrs[1]); + /* Emit the signed Pearson correlation r — NOT r². The scalar + * pearson_corr builtin and the general radix HT finalize both + * return r = cov / (σx·σy); squaring it here collapsed -1 and +1 + * to the same 1.0, so by-group correlation lost its sign. */ double r2 = 0.0 / 0.0; /* NaN by default */ if (e->cnt >= 2) { double n = (double)e->cnt; double num = n * e->sumxy - e->sum_x * e->sum_y; double dx = n * e->sumsq_x - e->sum_x * e->sum_x; double dy = n * e->sumsq_y - e->sum_y * e->sum_y; - if (dx > 0.0 && dy > 0.0) { - double r = num / sqrt(dx * dy); - r2 = r * r; - } + if (dx > 0.0 && dy > 0.0) + r2 = num / sqrt(dx * dy); } r2_data[row] = r2; row++; diff --git a/src/ops/internal.h b/src/ops/internal.h index 278b52c7..2e19330c 100644 --- a/src/ops/internal.h +++ b/src/ops/internal.h @@ -823,6 +823,16 @@ ray_t* ray_topk_per_group_buf(ray_t* src, const int64_t* grp_cnt, int64_t n_groups); +/* Per-group min/max/first/last for wide element types (STR/GUID). Same + * idx_buf/offsets/grp_cnt layout as the median/topk kernels; materialises the + * winning element per group into a typed result column. Returns NULL for a + * non-wide src (caller falls back to the numeric path). */ +ray_t* ray_wide_minmax_per_group_buf(ray_t* src, uint16_t op, + const int64_t* idx_buf, + const int64_t* offsets, + const int64_t* grp_cnt, + int64_t n_groups); + ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, int64_t group_limit); ray_t* exec_group_topk_rowform(ray_graph_t* g, ray_op_t* op); ray_t* exec_group_pearson_rowform(ray_graph_t* g, ray_op_t* op); @@ -892,6 +902,10 @@ typedef struct { * runs ray_median_per_group_buf over the source column using a * row_gid+grp_cnt-derived idx_buf. */ uint8_t agg_is_holistic; + /* Subset of agg_is_holistic: bit a set iff agg a is a wide-element + * (STR/GUID) min/max/first/last resolved via the per-group winner + * scan (ray_wide_minmax_per_group_buf) instead of a numeric kernel. */ + uint8_t agg_is_wide; /* Wide-key support: bit k set iff key k does not fit in 8 bytes * (e.g. RAY_GUID = 16 B). For wide keys the 8-byte key slot * stores a source-row index and the actual key bytes live in the diff --git a/src/ops/ops.h b/src/ops/ops.h index 86397e38..a969519c 100644 --- a/src/ops/ops.h +++ b/src/ops/ops.h @@ -255,6 +255,39 @@ void ray_cancel(void); * ~10M unique groups, essentially a row-dedup workload). */ #define OP_GROUP_SUM_COUNT_ROWFORM 114 +/* Canonical aggregand type-admission — shared by the scalar builtins + * (ray_sum_fn / ray_avg_fn / ray_var_fn / ...) and the DAG/group executors so + * BOTH paths accept exactly the same element types for each aggregate: + * sum : numeric (BOOL/U8/I16/I32/I64/F64) + TIME (a duration-like ms count) + * prod : numeric only + * avg / var / var_pop / stddev / stddev_pop : numeric + temporal (result F64) + * min / max / first / last / count and everything else : any type + * `t` is the positive element/column type. DATE/TIMESTAMP are absolute points + * (summing them is meaningless → rejected); SYM/STR/GUID are non-numeric. */ +static inline bool agg_type_admitted(uint16_t op, int8_t t) { + /* Parted columns carry a +RAY_PARTED_BASE-encoded tag; admit on the base + * element type so a parted numeric column aggregates like a flat one. */ + if (RAY_IS_PARTED(t)) t = (int8_t)RAY_PARTED_BASETYPE(t); + /* Reject only the element types we KNOW are inadmissible; defer unknown + * wrappers (e.g. RAY_MAPCOMMON) and out_type==0 to the runtime so a + * plan-time guard never false-rejects a column it can't classify. */ + bool nonnum = (t == RAY_SYM || t == RAY_STR || t == RAY_GUID); + bool temporal = (t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP); + switch (op) { + /* sum: numeric + TIME (duration-like ms count). SYM/STR/GUID and the + * absolute temporals DATE/TIMESTAMP are meaningless to sum → reject. */ + case OP_SUM: return !(nonnum || t == RAY_DATE || t == RAY_TIMESTAMP); + /* prod: numeric only — reject non-numeric and every temporal. */ + case OP_PROD: return !(nonnum || temporal); + /* avg / var / stddev: numeric + temporal → F64; reject SYM/STR/GUID. */ + case OP_AVG: + case OP_VAR: case OP_VAR_POP: + case OP_STDDEV: case OP_STDDEV_POP: + return !nonnum; + default: return true; + } +} + /* Opcodes — Graph */ #define OP_EXPAND 80 /* 1-hop CSR neighbor expansion */ #define OP_VAR_EXPAND 81 /* variable-length BFS/DFS */ diff --git a/src/ops/query.c b/src/ops/query.c index a6774b8b..7474645a 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -1283,6 +1283,14 @@ ray_op_t* compile_expr_dag(ray_graph_t* g, ray_t* expr) { if (agg_op) { ray_op_t* arg = compile_expr_dag(g, elems[1]); if (!arg) return NULL; + /* Canonical aggregand type-admission (same table as the scalar + * builtins): reject non-numeric (SYM/STR/GUID) and, for sum, + * absolute-temporal (DATE/TIMESTAMP) inputs at plan time so the + * DAG path never silently aggregates raw symbol ids / string + * bytes / date counts. Returning NULL routes the select to the + * eval-level path, where the scalar builtin raises `type`. */ + if (arg->out_type > 0 && !agg_type_admitted(agg_op, arg->out_type)) + return NULL; switch (agg_op) { case OP_SUM: return ray_sum(g, arg); case OP_AVG: return ray_avg(g, arg); @@ -6750,6 +6758,13 @@ ray_t* ray_select(ray_t** args, int64_t n) { /* Compile the aggregation input (the column reference) */ agg_ins[n_aggs] = compile_expr_dag(g, agg_arg); if (!agg_ins[n_aggs]) { ray_graph_free(g); ray_release(tbl); return ray_error("domain", NULL); } + /* Canonical aggregand type-admission (matches the scalar + * builtins): reject non-numeric / absolute-temporal inputs so a + * by-group sum/avg/var never silently folds symbol ids etc. */ + if (agg_ins[n_aggs]->out_type > 0 && + !agg_type_admitted(op, agg_ins[n_aggs]->out_type)) { + ray_graph_free(g); ray_release(tbl); return ray_error("type", NULL); + } agg_ins2[n_aggs] = NULL; agg_k[n_aggs] = 0; if (op == OP_PEARSON_CORR) { @@ -7757,6 +7772,16 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_graph_free(g); ray_release(tbl); return ray_error("domain", NULL); } + /* Canonical aggregand type-admission (same table as the scalar + * builtins): reject non-numeric (SYM/STR/GUID) and, for sum, + * absolute-temporal (DATE/TIMESTAMP) inputs so the DAG never + * silently aggregates raw symbol ids / string bytes / dates. */ + if (s_agg_ins[s_n_aggs]->out_type > 0 && + !agg_type_admitted(op, s_agg_ins[s_n_aggs]->out_type)) { + if (g->selection) { ray_release(g->selection); g->selection = NULL; } + ray_graph_free(g); ray_release(tbl); + return ray_error("type", NULL); + } if (op == OP_PEARSON_CORR) { if (ray_len(val_expr) < 3) { if (g->selection) { ray_release(g->selection); g->selection = NULL; } diff --git a/src/table/table.c b/src/table/table.c index 8d393b6d..3a2b2d0f 100644 --- a/src/table/table.c +++ b/src/table/table.c @@ -155,6 +155,28 @@ ray_t* ray_table_get_col_idx(ray_t* tbl, int64_t idx) { return ((ray_t**)ray_data(cols))[idx]; } +/* -------------------------------------------------------------------------- + * ray_table_set_col_idx — replace the column vector at slot `idx`. COWs the + * cols list, releases the previous column and retains `col_vec`. No-op when + * idx is out of range. Used to override a column an aggregate emitter built + * with the wrong representation (e.g. wide STR/GUID min/max post-fill). + * -------------------------------------------------------------------------- */ + +void ray_table_set_col_idx(ray_t* tbl, int64_t idx, ray_t* col_vec) { + if (!tbl || RAY_IS_ERR(tbl) || !col_vec) return; + ray_t** slots = tbl_slots(tbl); + ray_t* cols = slots[1]; + if (!cols || RAY_IS_ERR(cols)) return; + if (idx < 0 || idx >= cols->len) return; + cols = ray_cow(cols); + if (!cols || RAY_IS_ERR(cols)) return; + slots[1] = cols; + ray_t** cv = (ray_t**)ray_data(cols); + ray_retain(col_vec); + ray_release(cv[idx]); + cv[idx] = col_vec; +} + /* -------------------------------------------------------------------------- * ray_table_col_name — sym id at slot `idx`, -1 on out-of-range. * -------------------------------------------------------------------------- */ diff --git a/test/rfl/agg/atom_i64_med_topk.rfl b/test/rfl/agg/atom_i64_med_topk.rfl index ed7070f1..cf0878d3 100644 --- a/test/rfl/agg/atom_i64_med_topk.rfl +++ b/test/rfl/agg/atom_i64_med_topk.rfl @@ -31,12 +31,16 @@ ;; through agg_parted_minmax → agg_atom_i64_for_type(base, best_i). ;; ════════════════════════════════════════════════════════════════════ -;; RAY_BOOL arm — parted BOOL column, max → ray_bool(v != 0). +;; RAY_BOOL arm — parted BOOL column aggregated via select (column stays +;; parted), max → ray_bool. A direct `(at P 'b)` now MATERIALIZES the column, +;; where bool aggregation widens to i64 (consistent with min/sum of bool); the +;; parted-agg arm is the one that yields a typed bool, so exercise it through a +;; query that keeps the column segmented. (set Tb-A (table [b] (list (as 'BOOL [false false true])))) (set Tb-B (table [b] (list (as 'BOOL [false false false])))) (.db.splayed.set "/tmp/rfl_atomi64_bool/2024.01.01/t/" Tb-A) (.db.splayed.set "/tmp/rfl_atomi64_bool/2024.01.02/t/" Tb-B) -(max (at (.db.parted.get "/tmp/rfl_atomi64_bool/" 't) 'b)) -- true +(at (select {m: (max b) from: (.db.parted.get "/tmp/rfl_atomi64_bool/" 't)}) 'm) -- [true] ;; RAY_U8 arm — parted U8 column. (set Tu-A (table [u] (list (as 'U8 [3 5 7])))) diff --git a/test/rfl/agg/branch_coverage.rfl b/test/rfl/agg/branch_coverage.rfl index f410ac51..5100b06d 100644 --- a/test/rfl/agg/branch_coverage.rfl +++ b/test/rfl/agg/branch_coverage.rfl @@ -91,23 +91,30 @@ (.db.splayed.set "/tmp/rfl_agg_bc_pts/1/t/" Tts1) (.db.splayed.set "/tmp/rfl_agg_bc_pts/2/t/" Tts2) (set Pts (.db.parted.get "/tmp/rfl_agg_bc_pts/" 't)) -(sum (at Pts 'ts)) -- (as 'TIMESTAMP 6000) -(type (sum (at Pts 'ts))) -- 'timestamp +;; sum over an absolute-temporal (TIMESTAMP) column is a type error — the +;; parted base type is unwrapped and rejected just like a flat TIMESTAMP vec. +(sum (at Pts 'ts)) !- type (.sys.exec "rm -rf /tmp/rfl_agg_bc_ptime /tmp/rfl_agg_bc_pts") ;; ════════════════════════════════════════════════════════════════════ ;; 5. Parted avg type error (L158) and parted minmax type error (L185) ;; ════════════════════════════════════════════════════════════════════ -;; SYM base type is non-numeric; hits the type error path. +;; A direct `(at Psym 's)` now MATERIALIZES the parted column. min/max of a +;; SYM column are DEFINED (lexicographic order), not type errors — the earlier +;; `!- type` assertions here only passed because accessing the parted column +;; itself raised `type` (now fixed); they were a false positive. (.sys.exec "rm -rf /tmp/rfl_agg_bc_psym") (set Tps1 (table [s] (list ['a 'b 'c]))) (set Tps2 (table [s] (list ['d 'e]))) (.db.splayed.set "/tmp/rfl_agg_bc_psym/1/t/" Tps1) (.db.splayed.set "/tmp/rfl_agg_bc_psym/2/t/" Tps2) (set Psym (.db.parted.get "/tmp/rfl_agg_bc_psym/" 't)) +(min (at Psym 's)) -- 'a +(max (at Psym 's)) -- 'e +;; FIXED: avg of a non-numeric SYM column now raises `type` — ray_avg_fn's +;; vector path guards on a numeric element type before aggregating, instead of +;; silently averaging the raw symbol-dictionary ids. (avg (at Psym 's)) !- type -(min (at Psym 's)) !- type -(max (at Psym 's)) !- type (.sys.exec "rm -rf /tmp/rfl_agg_bc_psym") ;; ════════════════════════════════════════════════════════════════════ diff --git a/test/rfl/agg/first.rfl b/test/rfl/agg/first.rfl index 0b83e35f..092673a7 100644 --- a/test/rfl/agg/first.rfl +++ b/test/rfl/agg/first.rfl @@ -33,3 +33,22 @@ (type (first (as 'TIMESTAMP [1 2]))) -- 'timestamp (first [0x01 0x02 0xff]) -- 0x01 (type (first [0x01 0x02 0xff])) -- 'u8 +;; BOOL first preserves the BOOL type (was widened to i64 before) +(first [true false true]) -- true +(type (first [true false true])) -- 'b8 + +;; ── Wide element types (STR/GUID): scalar first materialises the real +;; element, and the DAG (select) path must agree — no-by and by-group. ── +(first ["cat" "apple" "bee"]) -- "cat" +(type (first ["cat" "apple" "bee"])) -- 'str +;; scalar == DAG no-by +(set Tfs (table [s] (list ["cat" "apple" "bee"]))) +(at (select {r: (first s) from: Tfs}) 'r) -- ["cat"] +;; by-group (groups [cat,apple] / [bee,ant]) → first of each +(set Tfg (table [s k] (list ["cat" "apple" "bee" "ant"] [1 1 2 2]))) +(at (select {r: (first s) by: k from: Tfg}) 'r) -- ["cat" "bee"] +;; GUID first: scalar and DAG agree (random ids, so compare the two paths) +(set Gf (guid 6)) +(set Tfgu (table [g] (list Gf))) +(== (first Gf) (at (at (select {r: (first g) from: Tfgu}) 'r) 0)) -- true +(type (at (select {r: (first g) from: Tfgu}) 'r)) -- 'GUID diff --git a/test/rfl/agg/last.rfl b/test/rfl/agg/last.rfl index a6996d0d..2161969f 100644 --- a/test/rfl/agg/last.rfl +++ b/test/rfl/agg/last.rfl @@ -29,3 +29,20 @@ (type (last (as 'TIMESTAMP [1 2]))) -- 'timestamp (last [0x01 0xff]) -- 0xff (type (last [0x01 0xff])) -- 'u8 +;; BOOL last preserves the BOOL type (was widened to i64 before) +(last [true false false]) -- false +(type (last [true false false])) -- 'b8 + +;; ── Wide element types (STR/GUID): scalar last materialises the real +;; element, and the DAG (select) path must agree — no-by and by-group. ── +(last ["cat" "apple" "bee"]) -- "bee" +(type (last ["cat" "apple" "bee"])) -- 'str +(set Tls (table [s] (list ["cat" "apple" "bee"]))) +(at (select {r: (last s) from: Tls}) 'r) -- ["bee"] +;; by-group (groups [cat,apple] / [bee,ant]) → last of each +(set Tlg (table [s k] (list ["cat" "apple" "bee" "ant"] [1 1 2 2]))) +(at (select {r: (last s) by: k from: Tlg}) 'r) -- ["apple" "ant"] +;; GUID last: scalar and DAG agree (random ids → compare the two paths) +(set Gl (guid 6)) +(set Tlgu (table [g] (list Gl))) +(== (last Gl) (at (at (select {r: (last g) from: Tlgu}) 'r) 0)) -- true diff --git a/test/rfl/agg/list_med_var.rfl b/test/rfl/agg/list_med_var.rfl index 64bbf1cc..81e15fdb 100644 --- a/test/rfl/agg/list_med_var.rfl +++ b/test/rfl/agg/list_med_var.rfl @@ -97,12 +97,14 @@ (avg 'world) -- 'world (type (avg 'world)) -- 'sym -;; ─── sum TIMESTAMP vec (lines 257-261 in ray_sum_fn) ───────────────── -;; TIMESTAMP scalars as vec via `as`; sum returns a TIMESTAMP atom. +;; ─── sum TIMESTAMP vec → type error (canonical aggregand table) ────── +;; TIMESTAMP is an absolute point in time; summing absolute timestamps is +;; meaningless (matches DuckDB: sum(TIMESTAMP) is a type error), so sum +;; rejects it in both the scalar builtin and the DAG path. TIME (a +;; duration-like ms count) is summable — see the TIME section below. (type (as 'TIMESTAMP [1000 2000 3000])) -- 'TIMESTAMP -(sum (as 'TIMESTAMP [1000 2000 3000])) -- (as 'TIMESTAMP 6000) -;; Null-aware: null is skipped; result = 1000 + 3000 = 4000. -(sum (as 'TIMESTAMP [1000 0N 3000])) -- (as 'TIMESTAMP 4000) +(sum (as 'TIMESTAMP [1000 2000 3000])) !- type +(sum (as 'TIMESTAMP [1000 0N 3000])) !- type ;; ─── sum TIME vec (lines 251-255 in ray_sum_fn) ────────────────────── ;; TIME vec; result type TIME. diff --git a/test/rfl/agg/max.rfl b/test/rfl/agg/max.rfl index ec2f4ff9..067dbbd2 100644 --- a/test/rfl/agg/max.rfl +++ b/test/rfl/agg/max.rfl @@ -29,3 +29,24 @@ ;; atom pass-through (max 99) -- 99 (max -7) -- -7 + +;; ── Wide element types (STR/GUID): max is lexicographic. Scalar and the +;; select path share one implementation and must agree — no-by + by-group. ── +(max ["cat" "apple" "bee"]) -- "cat" +(type (max ["cat" "apple" "bee"])) -- 'str +(set Tmx (table [s] (list ["cat" "apple" "bee"]))) +(at (select {r: (max s) from: Tmx}) 'r) -- ["cat"] +;; by-group (groups [cat,apple] / [bee,ant]) → lexicographic max per group +(set Txg (table [s k] (list ["cat" "apple" "bee" "ant"] [1 1 2 2]))) +(at (select {r: (max s) by: k from: Txg}) 'r) -- ["cat" "bee"] +;; GUID max: scalar == DAG (byte-order comparison; random ids) +(set Gx (guid 8)) +(set Txgu (table [g] (list Gx))) +(== (max Gx) (at (at (select {r: (max g) from: Txgu}) 'r) 0)) -- true + +;; ── BOOL max preserves the BOOL type (scalar used to widen to i64). ── +(max [false false true]) -- true +(type (max [false false true])) -- 'b8 +(set Txb (table [b] (list [false false true]))) +(at (select {r: (max b) from: Txb}) 'r) -- [true] +(type (at (select {r: (max b) from: Txb}) 'r)) -- 'B8 diff --git a/test/rfl/agg/med.rfl b/test/rfl/agg/med.rfl index 37cd3a17..c4bd3744 100644 --- a/test/rfl/agg/med.rfl +++ b/test/rfl/agg/med.rfl @@ -32,3 +32,16 @@ (med 7) -- 7.0 ;; null atom → typed-null F64 (nil? (med 0Nf)) -- true + +;; ── Whole-table (no-by) median via the DAG select path must match the +;; scalar `med` builtin. median is holistic and previously had no n_keys==0 +;; accumulator, so the select form returned 0 instead of the median. ── +(set Tmed (table [v] (list [1.0 2.0 3.0 4.0 5.0]))) +(at (select {m: (median v) from: Tmed}) 'm) -- [3.0] +(== (med [1.0 2.0 3.0 4.0 5.0]) (at (at (select {m: (median v) from: Tmed}) 'm) 0)) -- true +;; even count → average of the two middles; I64 input → F64 result +(set Tmed2 (table [v] (list [10 20 30 40]))) +(at (select {m: (median v) from: Tmed2}) 'm) -- [25.0] +;; selection-aware: median of the rows passing the filter only +(set Tmed3 (table [v] (list [1.0 2.0 3.0 4.0 5.0 100.0]))) +(at (select {m: (median v) from: Tmed3 where: (< v 50.0)}) 'm) -- [3.0] diff --git a/test/rfl/agg/min.rfl b/test/rfl/agg/min.rfl index f77aa49d..8777fddc 100644 --- a/test/rfl/agg/min.rfl +++ b/test/rfl/agg/min.rfl @@ -27,3 +27,27 @@ ;; atom pass-through (min 99) -- 99 (min -7) -- -7 + +;; ── Wide element types (STR/GUID): min is lexicographic. Scalar routes +;; STR/GUID through the DAG reducer, so scalar and the select path share +;; one implementation and must agree — no-by and by-group. ── +(min ["cat" "apple" "bee"]) -- "apple" +(type (min ["cat" "apple" "bee"])) -- 'str +(set Tmn (table [s] (list ["cat" "apple" "bee"]))) +(at (select {r: (min s) from: Tmn}) 'r) -- ["apple"] +;; by-group (groups [cat,apple] / [bee,ant]) → lexicographic min per group +(set Tmg (table [s k] (list ["cat" "apple" "bee" "ant"] [1 1 2 2]))) +(at (select {r: (min s) by: k from: Tmg}) 'r) -- ["apple" "ant"] +;; GUID min: scalar == DAG (byte-order comparison; random ids) +(set Gm (guid 8)) +(set Tmgu (table [g] (list Gm))) +(== (min Gm) (at (at (select {r: (min g) from: Tmgu}) 'r) 0)) -- true + +;; ── BOOL min preserves the BOOL type (min/max keep their element type). +;; The scalar reduction used to widen BOOL to i64 while the DAG kept bool; +;; both now return a BOOL atom/column. ── +(min [true false true]) -- false +(type (min [true false true])) -- 'b8 +(set Tmb (table [b] (list [true false true]))) +(at (select {r: (min b) from: Tmb}) 'r) -- [false] +(type (at (select {r: (min b) from: Tmb}) 'r)) -- 'B8 diff --git a/test/rfl/agg/pearson_corr.rfl b/test/rfl/agg/pearson_corr.rfl index 0b504a4d..a8b31dea 100644 --- a/test/rfl/agg/pearson_corr.rfl +++ b/test/rfl/agg/pearson_corr.rfl @@ -60,9 +60,14 @@ ;; column refs that collapses to a scalar; the non-row-aligned ;; fallback re-runs per group). (set Tq9 (table [g x y] (list [A A A A A B B B B B] [1.0 2.0 3.0 4.0 5.0 1.0 2.0 3.0 4.0 5.0] [2.0 4.0 6.0 8.0 10.0 5.0 4.0 3.0 2.0 1.0]))) -;; Group A: y = 2x → r = 1.0, r² = 1.0 -;; Group B: y = 6-x → r = -1.0, r² = 1.0 -(at (at (select {r2: (pow (pearson_corr x y) 2) by: g from: Tq9}) 'r2) 0) -- 1.0 +;; Group A: y = 2x → r = +1.0 +;; Group B: y = 6-x → r = -1.0 +;; The SIGNED coefficient must be returned per group — squaring it here +;; (a former workaround) collapsed +1 and -1 to 1.0 and hid the by-group +;; sign bug. Assert r directly so the sign is exercised. +(at (at (select {r: (pearson_corr x y) by: g from: Tq9}) 'r) 0) -- 1.0 +(at (at (select {r: (pearson_corr x y) by: g from: Tq9}) 'r) 1) -- -1.0 +;; r² parity: pow(r,2) is +1.0 for both groups (squaring discards sign). (at (at (select {r2: (pow (pearson_corr x y) 2) by: g from: Tq9}) 'r2) 1) -- 1.0 ;; Multi-key q9 — by [id2 id4]. diff --git a/test/rfl/agg/per_group_holistic.rfl b/test/rfl/agg/per_group_holistic.rfl index c56473d8..20105637 100644 --- a/test/rfl/agg/per_group_holistic.rfl +++ b/test/rfl/agg/per_group_holistic.rfl @@ -348,3 +348,30 @@ (>= Spop 0.0) -- true ;; sample variance >= pop variance per group (for n>=2) → sums obey too. (>= Vp Vpop) -- true + +;; ════════════════════════════════════════════════════════════════════ +;; Wide-element (STR/GUID) min/max/first/last per-group — these route +;; through ray_wide_minmax_per_group_buf, the same holistic post-fill +;; plumbing as median/topk. At this scale the keyed radix HT path runs, +;; exercising the radix post-fill branch (small tables hit the dense one). +;; ════════════════════════════════════════════════════════════════════ +(set Wn 6000) +(set Wsr (as 'STR (til Wn))) ;; "0".."5999" +(set Wk (% (til Wn) 200)) ;; 200 groups +(set Wt (table [s k] (list Wsr Wk))) +;; one min/max/first/last value per group +(== (count (at (select {m: (min s) by: k from: Wt}) 'm)) 200) -- true +(== (count (at (select {m: (max s) by: k from: Wt}) 'm)) 200) -- true +(== (count (at (select {m: (first s) by: k from: Wt}) 'm)) 200) -- true +(== (count (at (select {m: (last s) by: k from: Wt}) 'm)) 200) -- true +;; result columns keep the STR type (not truncated to a fixed-width int) +(type (at (select {m: (min s) by: k from: Wt}) 'm)) -- 'STR +;; group 0 holds rows 0,200,400,... → string "0" is the lexicographic min, +;; and within that group first is "0", last is "5800". +(at (at (select {m: (min s) by: k from: Wt}) 'm) 0) -- "0" +(at (at (select {m: (first s) by: k from: Wt}) 'm) 0) -- "0" +(at (at (select {m: (last s) by: k from: Wt}) 'm) 0) -- "5800" +;; GUID per-group first agrees with the scalar reducer on the same slice +(set Wg (guid Wn)) +(set Wtg (table [g k] (list Wg Wk))) +(type (at (select {m: (min g) by: k from: Wtg}) 'm)) -- 'GUID diff --git a/test/rfl/agg/sum.rfl b/test/rfl/agg/sum.rfl index 4ca5a67e..9a409ab7 100644 --- a/test/rfl/agg/sum.rfl +++ b/test/rfl/agg/sum.rfl @@ -67,3 +67,32 @@ (select {s: (avg (sqrt "x")) from: _erp_T}) !- type (select {s: (min (+ "a" 1)) from: _erp_T}) !- type (select {s: (max (+ "a" 1)) from: _erp_T}) !- type + +;; ---- M1: scalar↔DAG type-admission parity (canonical aggregand table) ---- +;; A non-numeric / absolute-temporal aggregand must error in BOTH the scalar +;; builtin and the DAG (select) path, no-by AND by-group. Previously the DAG +;; silently folded raw symbol ids / date integers; now both reject. +(set Madm (table [s d ts k] (list ['a 'b 'a] (as 'DATE [7300 7301 7302]) (as 'TIMESTAMP [1 2 3]) (as 'I64 [1 1 2])))) +;; sum SYM — scalar, no-by select, by-group select +(sum ['a 'b 'a]) !- type +(select {r: (sum s) from: Madm}) !- type +(select {r: (sum s) by: k from: Madm}) !- type +;; sum DATE / TIMESTAMP (absolute temporals) — DAG both shapes +(select {r: (sum d) from: Madm}) !- type +(select {r: (sum d) by: k from: Madm}) !- type +(select {r: (sum ts) from: Madm}) !- type +(select {r: (sum ts) by: k from: Madm}) !- type + +;; ---- M1: sum(TIME) preserves the TIME type in BOTH paths ---- +;; TIME is duration-like (time+time is a time), so sum keeps it — unlike the +;; absolute temporals above. Scalar and DAG must agree on type AND value. +(set Mtm (table [t k] (list [09:00:00.000 09:00:00.000 01:00:00.000] (as 'I64 [1 1 2])))) +;; scalar sum returns a TIME atom (lowercase 'time); the select column is a +;; vector (uppercase 'TIME). Both carry the TIME type — that is the parity. +(type (sum [09:00:00.000 09:00:00.000])) -- 'time +(type (at (select {r: (sum t) from: Mtm}) 'r)) -- 'TIME +(at (select {r: (sum t) from: Mtm}) 'r) -- [19:00:00.000] +(type (at (select {r: (sum t) by: k from: Mtm}) 'r)) -- 'TIME +(at (select {r: (sum t) by: k from: Mtm}) 'r) -- [18:00:00.000 01:00:00.000] +;; integer families still widen to I64 (no spurious type preservation) +(type (at (select {r: (sum k) from: Mtm}) 'r)) -- 'I64 diff --git a/test/rfl/datalog/datalog_branch_cov.rfl b/test/rfl/datalog/datalog_branch_cov.rfl index 94a7e453..39c1626e 100644 --- a/test/rfl/datalog/datalog_branch_cov.rfl +++ b/test/rfl/datalog/datalog_branch_cov.rfl @@ -55,6 +55,30 @@ (count (query Fdb (find ?a) (where (favg ?a)) (rules ((favg ?a) (avg ?a f64src 1))))) -- 1 (at (at (query Fdb (find ?a) (where (favg ?a)) (rules ((favg ?a) (avg ?a f64src 1)))) '?a) 0) -- 2.5 +;; ════════════════════════ MIN aggregate where a later element is smaller ═════════════════════════ +;; The MIN reduction seeds result=vd[0]; the inner `if (vd[i] < result)` body +;; (i64 line 1781 / f64 line 1806) only executes when a *later* row is smaller +;; than the first. Existing tests feed ascending columns, so they never take +;; that branch. Use a descending value column so vd[0] is the largest. +;; i64: descending {30,20,5} → min=5. +(set Imin (table ['k 'v] (list [1 2 3] [30 20 5]))) +(at (at (query Fdb (find ?m) (where (imin ?m)) (rules ((imin ?m) (min ?m Imin 1)))) '?m) 0) -- 5 +;; f64: descending {3.5,2.5,1.5} → min=1.5. +(set Fmin (table ['k 'v] (list [1 2 3] [3.5 2.5 1.5]))) +(at (at (query Fdb (find ?m) (where (fmin2 ?m)) (rules ((fmin2 ?m) (min ?m Fmin 1)))) '?m) 0) -- 1.5 + +;; ════════════════════════ f64 variable compared against an integer constant ═════════════════════════ +;; ?y is f64 (derived via + ?x 0.5) but the comparison constant is the *integer* +;; 20, stored as cmp_const (i64). This drives the f64 CMP path where rhs_src is +;; NULL and the const is widened: `rv = (double)body->cmp_const` (line 2024), +;; distinct from the `> ?y 20.0` case which parses the bound as f64. +(set Cdb (datoms)) +(set Cdb (assert-fact Cdb 1 'val 10)) +(set Cdb (assert-fact Cdb 2 'val 20)) +(set Cdb (assert-fact Cdb 3 'val 30)) +(count (query Cdb (find ?e ?y) (where (?e :val ?x) (= ?y (+ ?x 0.5)) (> ?y 20)))) -- 2 +(sum (at (query Cdb (find ?e ?y) (where (?e :val ?x) (= ?y (+ ?x 0.5)) (> ?y 20))) '?y)) -- 51.0 + ;; ════════════════════════ empty-source AVG and MAX aggregates ═════════════════════════ ;; AVG and MAX over empty source — should produce 0 rows (lines 1725-1732). (set Empty (datoms)) diff --git a/test/rfl/datalog/datalog_coverage.rfl b/test/rfl/datalog/datalog_coverage.rfl index 2328775d..4ed8f173 100644 --- a/test/rfl/datalog/datalog_coverage.rfl +++ b/test/rfl/datalog/datalog_coverage.rfl @@ -392,6 +392,29 @@ ;; head with unsupported constant type (a list literal) — type error. (rule (bad-h [1 2]) (?e :role 'admin)) !- type +;; head with a quoted-symbol constant — the -RAY_SYM literal branch (line 3852), +;; distinct from the "static" string-intern branch above. +(set qSym (query Db (find ?x) (where (lit-s ?x)) (rules ((lit-s 'mytag) (?e :role 'user))))) +(count qSym) -- 1 +(at (at qSym '?x) 0) -- 'mytag + +;; ════════════════════════ body constant via evaluated (non-literal) expression ═════════════════════════ +;; A body-triple position that is neither a bare ?var / int / sym / str literal +;; falls through dl_set_body_pos to ray_eval (lines 3562-3573). Exercise both +;; the I64 result (line 3566), the SYM result (line 3568), and the +;; unsupported-type error (lines 3570-3571). +(set Edb (datoms)) +(set Edb (assert-fact Edb 1 'age 30)) +(set Edb (assert-fact Edb 2 'age 25)) +(set Edb (assert-fact Edb 1 'role 'admin)) +(set Edb (assert-fact Edb 2 'role 'user)) +;; (+ 10 20) → i64 30 ; matches entity 1's age. +(count (query Edb (find ?e) (where (gi ?e)) (rules ((gi ?e) (?e :age (+ 10 20)))))) -- 1 +;; (first ['admin 'user]) → sym 'admin ; matches entity 1's role. +(count (query Edb (find ?e) (where (gs ?e)) (rules ((gs ?e) (?e :role (first ['admin 'user])))))) -- 1 +;; evaluates to a float → unsupported constant type in body. +(query Edb (find ?e) (where (gb ?e)) (rules ((gb ?e) (?e :age (+ 1.0 2.0))))) !- type + ;; ════════════════════════ recursive rule via inline rules (no globals) ═════════════════════════ ;; Chain 1→2→3→4: TC pairs = (1,2),(1,3),(1,4),(2,3),(2,4),(3,4) — 6 rows. (set Gdb (datoms)) diff --git a/test/rfl/fused/fused_group_coverage.rfl b/test/rfl/fused/fused_group_coverage.rfl index 52f36c14..d5977465 100644 --- a/test/rfl/fused/fused_group_coverage.rfl +++ b/test/rfl/fused/fused_group_coverage.rfl @@ -1471,3 +1471,78 @@ (at (at R53 'k) 0) -- 1 (at (at R53 'k) 1) -- 2 (at (at R53 'k) 2) -- 3 + +;; ==================================================================== +;; Section 54: mk_par_fn SUM/MIN/MAX/AVG pass-2 (single key, narrow) +;; +;; Single-key group-by with non-COUNT aggregates + a WHERE predicate +;; routes through exec_filtered_group_multi → mk_compile (v2 rejected +;; because n_keys == 1) → mk_par_fn. Exercises mk_par_fn's narrow +;; per-agg state init (MIN=INT64_MAX, MAX=INT64_MIN) and the PASS-2 +;; per-agg accumulate loops for SUM/MIN/MAX/AVG (lines ~3453, ~3523-3553) +;; that the COUNT-only tests above never reach. +;; ==================================================================== +(set T54 (table [k v g] (list (as 'I64 [1 2 3 4 5 6 7 8 9 10]) (as 'I64 [10 20 30 40 50 60 70 80 90 100]) (as 'I64 [1 1 2 2 3 3 4 4 5 5])))) +(set R54 (select {mn: (min v) mx: (max v) sm: (sum v) av: (avg v) from: T54 where: (> k 0) by: g})) +(count R54) -- 5 +;; per-group: g=1 {10,20} g=2 {30,40} g=3 {50,60} g=4 {70,80} g=5 {90,100} +(sum (at R54 'sm)) -- 550 +(sum (at R54 'mn)) -- 250 +(sum (at R54 'mx)) -- 300 +(sum (at R54 'av)) -- 275.0 +;; key-masked spot checks (group emission order is non-deterministic): +;; min of group g==1 is 10, max is 20. +(min (at R54 'mn)) -- 10 +(max (at R54 'mx)) -- 100 + +;; SUM only, single key — separate mk_par_fn SUM pass-2. +(set R54b (select {sm: (sum v) from: T54 where: (>= v 30) by: g})) +;; rows with v>=30: groups g=2{30,40},g=3{50,60},g=4{70,80},g=5{90,100} → 4 groups +(count R54b) -- 4 +(sum (at R54b 'sm)) -- 520 + +;; ==================================================================== +;; Section 55: mk_par_fn WIDE path (two I64 keys = 16 bytes) MIN/MAX/AVG +;; +;; total_bytes (8+8) > 8 → ctx.wide = 1. MIN/MAX aggs keep v2 disabled, +;; so this routes to mk_par_fn's `wide` branch: composite key via +;; mk_compose_key2 + slots_hi, wide per-agg state init (lines ~3492-3500) +;; and the wide-slot probe. mk_combine_and_materialize's wide serial +;; dedup + mk_state_merge MIN/MAX merge cross-worker partials. +;; ==================================================================== +(set T55 (table [a b v] (list (as 'I64 [1 1 2 2 1 1 2 2]) (as 'I64 [10 20 10 20 10 20 10 20]) (as 'I64 [5 6 7 8 9 1 2 3])))) +;; Include sum so the wide branch's SUM state-init arm is exercised too. +(set R55 (select {mn: (min v) mx: (max v) sm: (sum v) av: (avg v) from: T55 where: (> v 0) by: [a b]})) +;; groups (a,b): (1,10){5,9}=min5/max9 (1,20){6,1}=min1/max6 (2,10){7,2}=min2/max7 (2,20){8,3}=min3/max8 +(count R55) -- 4 +(sum (at R55 'mn)) -- 11 +(sum (at R55 'mx)) -- 30 +(sum (at R55 'sm)) -- 41 +(min (at R55 'mn)) -- 1 +(max (at R55 'mx)) -- 9 + +;; ==================================================================== +;; Section 56: mk_apply_count_emit_filter — non-COUNT order ops +;; +;; Multi-key group with a MIN/MAX/SUM aggregate plus `asc/desc +;; take K` sets ray_group_emit_filter with agg_op == OP_MIN/MAX/SUM. +;; mk_apply_count_emit_filter maps order_op → mk_agg kind and runs the +;; top-K heap over the post-combine state (lines ~3648-3658 OP_SUM / +;; OP_MIN / OP_MAX arms). All 8 (a,b) pairs are distinct so each group +;; holds one row (min==max==sum==v). +;; ==================================================================== +(set T56 (table [a b v] (list (as 'I32 [1 1 2 2 3 3 4 4]) (as 'I32 [10 20 10 20 10 20 10 20]) (as 'I64 [5 6 7 8 90 1 2 3])))) +;; MAX order, desc, top 2 → largest maxes 90 and 8 → sum 98. +(set R56mx (select {mx: (max v) from: T56 where: (> a 0) by: [a b] desc: mx take: 2})) +(count R56mx) -- 2 +(sum (at R56mx 'mx)) -- 98 +(max (at R56mx 'mx)) -- 90 +;; MIN order, asc, top 2 → smallest mins 1 and 2 → sum 3. +(set R56mn (select {mn: (min v) from: T56 where: (> a 0) by: [a b] asc: mn take: 2})) +(count R56mn) -- 2 +(sum (at R56mn 'mn)) -- 3 +(min (at R56mn 'mn)) -- 1 +;; SUM order, desc, top 2 → largest sums 90 and 8 → sum 98. +(set R56sm (select {sm: (sum v) from: T56 where: (> a 0) by: [a b] desc: sm take: 2})) +(count R56sm) -- 2 +(sum (at R56sm 'sm)) -- 98 diff --git a/test/rfl/group/null_aware_helpers.rfl b/test/rfl/group/null_aware_helpers.rfl index c5400032..7936e5d1 100644 --- a/test/rfl/group/null_aware_helpers.rfl +++ b/test/rfl/group/null_aware_helpers.rfl @@ -63,13 +63,17 @@ ;; perfect linear correlation → r=1.0. (set T_gp_i64 (table [g x y] (list [A A A A A B B B B B] (as 'I64 [1 0N 3 4 5 1 0N 3 4 5]) (as 'I64 [2 4 6 8 10 5 4 3 2 1])))) (count (select {r: (pearson_corr x y) by: g from: T_gp_i64})) -- 2 -;; Value-pair: group A linear y=2x → r=1.0 ; group B linear y=6-x → r=1.0 in absolute value. -;; The engine appears to compute |r| or r²; check first-row r value. -(at (at (select {r: (pearson_corr x y) by: g from: T_gp_i64}) 'r) 0) -- 1.0 +;; Group A linear y=2x → r=+1.0 ; group B linear y=6-x → r=-1.0. The signed +;; coefficient is returned per group, so the two r values are {+1.0, -1.0}. +;; Group emit order is hash-dependent, so assert order-independently. +(min (at (select {r: (pearson_corr x y) by: g from: T_gp_i64}) 'r)) -- -1.0 +(max (at (select {r: (pearson_corr x y) by: g from: T_gp_i64}) 'r)) -- 1.0 ;; F64 x with nulls: NaN arm of grpc_is_null. (set T_gp_f64 (table [g x y] (list [A A A A A B B B B B] (as 'F64 [1.0 0N 3.0 4.0 5.0 1.0 0N 3.0 4.0 5.0]) (as 'F64 [2.0 4.0 6.0 8.0 10.0 5.0 4.0 3.0 2.0 1.0])))) -(at (at (select {r: (pearson_corr x y) by: g from: T_gp_f64}) 'r) 0) -- 1.0 +;; same {+1.0, -1.0} signed pair, F64 null arm +(min (at (select {r: (pearson_corr x y) by: g from: T_gp_f64}) 'r)) -- -1.0 +(max (at (select {r: (pearson_corr x y) by: g from: T_gp_f64}) 'r)) -- 1.0 ;; I32 y with nulls: I32/DATE/TIME arm via y path (grpc_is_null on y_data). (set T_gp_i32 (table [g x y] (list [A A A A B B B B] (as 'I64 [1 2 3 4 1 2 3 4]) (as 'I32 [2 4 0N 8 5 0N 3 2])))) diff --git a/test/rfl/hof/eval_branch_cov.rfl b/test/rfl/hof/eval_branch_cov.rfl index 2e082761..66c65040 100644 --- a/test/rfl/hof/eval_branch_cov.rfl +++ b/test/rfl/hof/eval_branch_cov.rfl @@ -530,3 +530,14 @@ _xself -- 5 (try (_wrapper_wrong) (fn [e] -1)) -- -1 (set _wrapper_short (fn [] (_stored_2arg 1))) (try (_wrapper_short) (fn [e] -1)) -- -1 + +;; ═══════════════════════════════════════════════════════════════════ +;; 31. ray_eval RAY_LAMBDA argc > 64 → domain (eval.c 3127). +;; §10 covers the RAY_VARY argc>64 guard; this hits the LAMBDA arm. +;; An inline lambda applied to 65 evaluated args trips the guard +;; before call_lambda runs. +;; ═══════════════════════════════════════════════════════════════════ +((fn [x] x) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65) !- domain +;; Exactly 64 args is within bounds (no domain trip) — arity mismatch +;; instead (the lambda takes 1 param), which surfaces as an arity error. +((fn [x] x) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64) !- arity diff --git a/test/rfl/integration/cross_type_workout.rfl b/test/rfl/integration/cross_type_workout.rfl index 5140a171..1e98dda1 100644 --- a/test/rfl/integration/cross_type_workout.rfl +++ b/test/rfl/integration/cross_type_workout.rfl @@ -79,10 +79,11 @@ ;; ════════════════════ COMPARISON ON COLUMNS ════════════════════ ;; sym='AAPL kept → every kept sym is 'AAPL. (count (select {from: T where: (== sym 'AAPL)})) -- 40 -(min (== (at (select {from: T where: (== sym 'AAPL)}) 'sym) 'AAPL)) -- 1 +;; min over a BOOL vector returns a BOOL (min/max preserve element type). +(min (== (at (select {from: T where: (== sym 'AAPL)}) 'sym) 'AAPL)) -- true ;; != 'TSLA → 160 rows, no 'TSLA in kept set. (count (select {from: T where: (!= sym 'TSLA)})) -- 160 -(max (== (at (select {from: T where: (!= sym 'TSLA)}) 'sym) 'TSLA)) -- 0 +(max (== (at (select {from: T where: (!= sym 'TSLA)}) 'sym) 'TSLA)) -- false ;; > price 130 → kept rows price min > 130. (count (select {from: T where: (> price 130.0)})) -- 76 (> (min (at (select {from: T where: (> price 130.0)}) 'price)) 130.0) -- true @@ -95,9 +96,9 @@ (< (max (at (select {from: T where: (and (> qty 0) (< qty 100))}) 'qty)) 100) -- true ;; OR — kept syms must be AAPL or GOOG. (count (select {from: T where: (or (== sym 'AAPL) (== sym 'GOOG))})) -- 80 -;; not (== bool true) → all kept rows bool=false; max should be false (0). +;; not (== bool true) → all kept rows bool=false; max of a BOOL column is BOOL. (count (select {from: T where: (not (== bool true))})) -- 100 -(max (at (select {from: T where: (not (== bool true))}) 'bool)) -- 0 +(max (at (select {from: T where: (not (== bool true))}) 'bool)) -- false ;; ════════════════════ SORT ════════════════════ ;; whole-vec sort returns a fresh vec diff --git a/test/rfl/io/csv_branch_cov.rfl b/test/rfl/io/csv_branch_cov.rfl index 4fe8e498..46a9ee45 100644 --- a/test/rfl/io/csv_branch_cov.rfl +++ b/test/rfl/io/csv_branch_cov.rfl @@ -276,6 +276,126 @@ (nil? (at (at Rspn 'a) 1)) -- true (nil? (at (at Rspn 'b) 2)) -- true +;; ════════════════════════════════════════════════════════════════════════════ +;; 18. Parallel parse path (csv_parse_fn) — non-null type arms. +;; ray_read_csv_named_opts dispatches csv_parse_fn on the thread pool only +;; when n_rows > 8192. A 9000-row file with an explicit all-types schema +;; drives the BOOL/U8/I64/F64 arms (lines ~959-1001) that the serial path +;; leaves uncovered. CRLF line endings additionally exercise the trailing +;; "\r" strip on the last field of each row (line ~955). +;; ════════════════════════════════════════════════════════════════════════════ +(.sys.exec "awk 'BEGIN{printf \"b,u,n,f\\r\\n\"; for(i=1;i<=9000;i++){printf \"true,%d,%d,%d.5\\r\\n\", i%200, i, i}}' > rf_test_csvbc_pbig.csv") -- 0 +(set Tpb (.csv.read [B8 U8 I64 F64] "rf_test_csvbc_pbig.csv")) +(count Tpb) -- 9000 +(at (at Tpb 'b) 0) -- true +(at (at Tpb 'u) 0) -- (as 'U8 1) +(at (at Tpb 'n) 0) -- 1 +(at (at Tpb 'f) 0) -- 1.5 +;; last row survives the CRLF strip intact +(at (at Tpb 'n) 8999) -- 9000 +(at (at Tpb 'f) 8999) -- 9000.5 + +;; ════════════════════════════════════════════════════════════════════════════ +;; 19. Parallel parse path (csv_parse_fn) — DATE/TIME/TIMESTAMP/GUID/I16/I32 +;; non-null arms plus the empty-cell null arms for each (the `is_null` +;; branches at lines ~986/993/1006/1013/1020/1028 and the STR flen==0 +;; branch at ~1035). Every 7th row is all-empty so each nullable column +;; records a null in the parallel worker. +;; ════════════════════════════════════════════════════════════════════════════ +(.sys.exec "awk 'BEGIN{print \"h,w,d,t,ts,g,s\"; for(i=1;i<=9000;i++){ if(i%7==0){print \",,,,,,\"} else {printf \"%d,%d,2020-01-01,10:30:45,2020-01-01T10:30:45,00000000-0000-0000-0000-000000000001,str%d\\n\", i%30000, i, i%100} } }' > rf_test_csvbc_pnull.csv") -- 0 +(set Tpn (.csv.read [I16 I32 DATE TIME TIMESTAMP GUID STR] "rf_test_csvbc_pnull.csv")) +(count Tpn) -- 9000 +;; row 6 (i==7) is the first all-empty row → null in every nullable column +(nil? (at (at Tpn 'h) 6)) -- true +(nil? (at (at Tpn 'w) 6)) -- true +(nil? (at (at Tpn 'd) 6)) -- true +(nil? (at (at Tpn 't) 6)) -- true +(nil? (at (at Tpn 'ts) 6)) -- true +;; non-empty rows keep their parsed values +(at (at Tpn 'h) 0) -- (as 'I16 1) +(at (at Tpn 'w) 0) -- (as 'I32 1) +(at (at Tpn 'd) 0) -- 2020.01.01 +(at (at Tpn 't) 0) -- 10:30:45.000 + +;; ════════════════════════════════════════════════════════════════════════════ +;; 20. Short-row guard (csv_parse_serial line ~1087): a data row with fewer +;; fields than ncols. Once scan_field walks past row_end, the remaining +;; columns are filled with type defaults + null (nullable types) so the +;; row count is preserved and downstream columns stay aligned. +;; ════════════════════════════════════════════════════════════════════════════ +(.sys.exec "printf 'a,b,c\n1,2,3\n9\n4,5,6\n' > rf_test_csvbc_short.csv") -- 0 +(set Tsh (.csv.read [I64 I64 I64] "rf_test_csvbc_short.csv")) +(count Tsh) -- 3 +(at (at Tsh 'a) 1) -- 9 +(nil? (at (at Tsh 'b) 1)) -- true +(nil? (at (at Tsh 'c) 1)) -- true +;; full rows on either side are unaffected +(at (at Tsh 'c) 0) -- 3 +(at (at Tsh 'a) 2) -- 4 + +;; ════════════════════════════════════════════════════════════════════════════ +;; 21. .csv.parted trace branches (ray_csv_save_parted_named_opts, lines +;; ~2410-2553 guarded by getenv("RAY_CSV_TRACE")). Enabling the env var +;; exercises the per-stage stderr trace prints; rows_per_part=2 over 3 rows +;; forces a two-part split so the per-part save trace fires more than once. +;; ════════════════════════════════════════════════════════════════════════════ +(.os.setenv "RAY_CSV_TRACE" "1") -- "1" +(.sys.exec "rm -rf rf_test_csvbc_pttrace") -- 0 +(.sys.exec "printf 'a,b\n1,10\n2,20\n3,30\n' > rf_test_csvbc_pttrace.csv") -- 0 +(set Rpt (.csv.parted "rf_test_csvbc_pttrace.csv" 2 "rf_test_csvbc_pttrace" 'tbl)) +(count Rpt) -- 3 +(sum (at Rpt 'a)) -- 6 +(sum (at Rpt 'b)) -- 60 +(.os.setenv "RAY_CSV_TRACE" "") -- "" +(.sys.exec "rm -rf rf_test_csvbc_pttrace") -- 0 + +;; ════════════════════════════════════════════════════════════════════════════ +;; 22. Chunk-zone + hash-index attach (ray_read_csv_named_opts lines ~1937-1945): +;; a numeric column only gets a per-chunk zone index once its length reaches +;; one chunk (65536 rows); high-entropy values then upgrade to a hash index +;; (csv_should_attach_hash). A 70000-row pseudo-random I64 column crosses +;; both thresholds. Spot-check aggregates so the index attach can't quietly +;; corrupt the payload. +;; ════════════════════════════════════════════════════════════════════════════ +(.sys.exec "awk 'BEGIN{print \"id\"; for(i=1;i<=70000;i++) print (i*2654435761)%2000000000}' > rf_test_csvbc_zone.csv") -- 0 +(set Tz (.csv.read [I64] "rf_test_csvbc_zone.csv")) +(count Tz) -- 70000 +(type (at Tz 'id)) -- 'I64 +;; first value: (1*2654435761) % 2000000000 = 654435761 +(at (at Tz 'id) 0) -- 654435761 + +;; ════════════════════════════════════════════════════════════════════════════ +;; 23. Parallel materialize path (csv_materialize_rows lines ~1409-1442): the +;; parted/splayed writers materialize each chunk through csv_materialize_rows, +;; which dispatches csv_parse_fn on the pool when the chunk exceeds 8192 rows. +;; The default rows-per-part (1e6) keeps a 9000-row file in a single chunk so +;; the whole batch goes through the parallel branch. An inferred SYM column +;; ("str%50") also exercises the sym-narrow path inside the materializer. +;; ════════════════════════════════════════════════════════════════════════════ +(.sys.exec "rm -rf rf_test_csvbc_mr") -- 0 +(.sys.exec "awk 'BEGIN{print \"n,s\"; for(i=1;i<=9000;i++){printf \"%d,str%d\\n\", i, i%50}}' > rf_test_csvbc_mr.csv") -- 0 +(set Rmr (.csv.parted "rf_test_csvbc_mr.csv" "rf_test_csvbc_mr" 'tbl)) +(count Rmr) -- 9000 +;; sum 1..9000 = 9000*9001/2 = 40504500 +(sum (at Rmr 'n)) -- 40504500 +;; the inferred string column round-trips back (parted reload prepends 'part) +(key Rmr) -- ['part 'n 's] + +;; ════════════════════════════════════════════════════════════════════════════ +;; 24. Parallel materialize through the splayed writer (csv_splayed_writer_append +;; SYM + fixed-width arms over a 9000-row chunk). The inferred SYM column +;; forces the W32 sym-buffer append loop; the numeric column drives the +;; fixed-width fwrite arm. Re-load from disk confirms the round trip. +;; ════════════════════════════════════════════════════════════════════════════ +(.sys.exec "rm -rf rf_test_csvbc_mrsp") -- 0 +(.sys.exec "awk 'BEGIN{print \"n,s\"; for(i=1;i<=9000;i++){printf \"%d,str%d\\n\", i, i%50}}' > rf_test_csvbc_mrsp.csv") -- 0 +(set Rsp (.csv.splayed "rf_test_csvbc_mrsp.csv" "rf_test_csvbc_mrsp")) +(count Rsp) -- 9000 +(set Rspget (.db.splayed.get "rf_test_csvbc_mrsp")) +(count Rspget) -- 9000 +(sum (at Rspget 'n)) -- 40504500 +(key Rspget) -- ['n 's] + ;; ── cleanup ────────────────────────────────────────────────────────────────── (.sys.exec "rm -f rf_test_csvbc_*.csv") -- 0 -(.sys.exec "rm -rf rf_test_csvbc_parted rf_test_csvbc_splayed") -- 0 +(.sys.exec "rm -rf rf_test_csvbc_parted rf_test_csvbc_splayed rf_test_csvbc_pttrace rf_test_csvbc_mr rf_test_csvbc_mrsp") -- 0 diff --git a/test/rfl/ops/builtins_branch_cov.rfl b/test/rfl/ops/builtins_branch_cov.rfl index bb14386b..03e3dc76 100644 --- a/test/rfl/ops/builtins_branch_cov.rfl +++ b/test/rfl/ops/builtins_branch_cov.rfl @@ -1237,6 +1237,62 @@ (nil? (at (as 'I64 (as 'F64 [1.0 0Nf 3.0])) 1)) -- true +;; ═════════════════════════════════════════════════════════════════════ +;; 55. atom_hash temporal/GUID atom branches (lines 2120-2128) +;; Reached when group()'s RAY_LIST path hashes a boxed list whose +;; elements are scalar temporal/guid atoms. Mixing a string into each +;; inner list forces a boxed LIST (not a homogeneous typed vector), so +;; atom_hash recurses into the scalar -RAY_DATE / -RAY_TIME / +;; -RAY_TIMESTAMP / -RAY_GUID atom cases. +;; ═════════════════════════════════════════════════════════════════════ + +;; -RAY_DATE atom hash (line 2121-2122): 2 distinct dates → 2 groups +(count (group (list (list 2000.01.01 "x") (list 2000.01.01 "x") (list 2000.01.02 "x")))) -- 2 + +;; -RAY_TIME atom hash (line 2121-2122): 2 distinct times → 2 groups +(count (group (list (list 12:00:00.000 "x") (list 12:00:00.000 "x") (list 13:00:00.000 "x")))) -- 2 + +;; -RAY_TIMESTAMP atom hash (line 2123): equal timestamps → 1 group +(count (group (list (list (as 'TIMESTAMP "2020-01-01T00:00:00") "x") (list (as 'TIMESTAMP "2020-01-01T00:00:00") "x")))) -- 1 + +;; -RAY_GUID atom hash (lines 2124-2128): 2 distinct guids → 2 groups +(count (group (list (list (as 'GUID "00000000-0000-0000-0000-000000000001") "x") (list (as 'GUID "00000000-0000-0000-0000-000000000001") "x") (list (as 'GUID "00000000-0000-0000-0000-000000000002") "x")))) -- 2 + +;; ═════════════════════════════════════════════════════════════════════ +;; 56. cast_vec_numeric slow-path per-element switch (lines 1175-1183) +;; The fast path is skipped when the source vector is STR, so casting a +;; STR vector to a numeric/bool target drives the per-element slow loop +;; and exercises each out_type arm (I32/I16/U8/F64/BOOL). +;; ═════════════════════════════════════════════════════════════════════ + +;; STR vec → I32 (slow loop, RAY_I32 arm) +(at (as 'I32 (enlist "10" "20" "30")) 0) -- 10i +(type (as 'I32 (enlist "10" "20"))) -- 'I32 + +;; STR vec → I16 (slow loop, RAY_I16 arm) +(at (as 'I16 (enlist "1" "2")) 1) -- 2h + +;; STR vec → U8 (slow loop, RAY_U8 arm) +(at (as 'U8 (enlist "5" "6")) 0) -- 0x05 +(type (as 'U8 (enlist "5" "6"))) -- 'U8 + +;; STR vec → F64 (slow loop, RAY_F64 arm) +(at (as 'F64 (enlist "1.5" "2.5")) 0) -- 1.5 + +;; STR vec → B8 (slow loop, RAY_BOOL arm) — length-based: non-empty=true +(at (as 'B8 (enlist "x" "")) 0) -- true +(at (as 'B8 (enlist "x" "")) 1) -- false + +;; ═════════════════════════════════════════════════════════════════════ +;; 57. ray_concat_fn dict-concat heterogeneous-key fallback (lines 2920-2934) +;; A dict whose key vector is neither SYM nor I64/TIMESTAMP (here F64) +;; falls into the else branch that boxes each key via collection_elem +;; and upserts it. Concat of two F64-keyed dicts drives this. +;; ═════════════════════════════════════════════════════════════════════ + +(at (concat (dict [1.0] (list "a")) (dict [2.0] (list "b"))) 1.0) -- "a" +(at (concat (dict [1.0] (list "a")) (dict [2.0] (list "b"))) 2.0) -- "b" + ;; ═════════════════════════════════════════════════════════════════════ ;; UNREACHABLE BRANCHES (documented): ;; @@ -1281,10 +1337,18 @@ ;; Lines 2638-2643: F32 group key reconstruction — F32 not directly ;; constructible from RFL. ;; +;; Lines 2552-2567: group_fn NaN float own-bucket path — UNREACHABLE. +;; In rayforce NULL_F64 == __builtin_nan(""), so every NaN is the F64 +;; null sentinel and is caught earlier by ray_vec_is_null() (null +;; routing, line 2508). Control never reaches the dedicated NaN +;; branch. (Confirmed via test/rfl/arith/sqrt.rfl null semantics.) +;; ;; Lines 2691-2696: gfail label — error cleanup path, only on OOM. ;; ;; Lines 2813, 2842, 2885: concat default type error for exotic types. ;; -;; Lines 2902-2925: Dict concat with LIST/I64 keys — partially covered -;; by test 39/41 above; some sub-paths need non-SYM boxed keys. +;; Lines 2902-2911: Dict concat with boxed-LIST keys — UNREACHABLE via +;; the `dict` builtin (ray_dict_fn requires ray_is_vec(keys), rejecting +;; boxed-list keys with a `type` error). The non-SYM/non-I64 key +;; fallback (lines 2920-2934) IS now covered by test 57 (F64 keys). ;; ═════════════════════════════════════════════════════════════════════ diff --git a/test/rfl/ops/exec_worker_merge.rfl b/test/rfl/ops/exec_worker_merge.rfl index 7aefc666..ecf83c36 100644 --- a/test/rfl/ops/exec_worker_merge.rfl +++ b/test/rfl/ops/exec_worker_merge.rfl @@ -49,7 +49,7 @@ ;; leak through. Collapse the merged survivor column to a single bool: ;; every survivor of `in [1 2 3]` must itself be in {1,2,3} (and thus ;; non-null). Order- and shard-independent. -(min (in (at (select {from: TFN where: (in f [1.0 2.0 3.0])}) 'f) [1.0 2.0 3.0])) -- 1 +(min (in (at (select {from: TFN where: (in f [1.0 2.0 3.0])}) 'f) [1.0 2.0 3.0])) -- true ;; distinct survivor values are exactly {1.0,2.0,3.0} — guards a shard ;; that merged a 4.0/5.0 row into the match set. (count (distinct (at (select {from: TFN where: (in f [1.0 2.0 3.0])}) 'f))) -- 3 @@ -57,7 +57,7 @@ ;; 8750*(1+2+3)=52500. Detects value corruption the count misses. (sum (at (select {from: TFN where: (in f [1.0 2.0 3.0])}) 'f)) -- 52500.0 ;; not-in survivors (non-null, value 4 or 5) must NOT be in {1,2,3}. -(max (in (at (select {from: TFN where: (not-in f [1.0 2.0 3.0])}) 'f) [1.0 2.0 3.0])) -- 0 +(max (in (at (select {from: TFN where: (not-in f [1.0 2.0 3.0])}) 'f) [1.0 2.0 3.0])) -- false ;; ==================================================================== ;; exec_in_worker — RAY_SYM parallel dispatch (L590, IN_READ_I64 sym @@ -76,13 +76,13 @@ ;; EVERY merged survivor of `in ['AAPL]` actually carries 'AAPL — the ;; (min (== ...)) collapses the whole merged column to a single bool, ;; order-independent and shard-independent. -(min (== (at (select {from: TS where: (in s ['AAPL])}) 's) 'AAPL)) -- 1 +(min (== (at (select {from: TS where: (in s ['AAPL])}) 's) 'AAPL)) -- true ;; every survivor of `not-in ['AAPL]` must NOT be AAPL. -(max (== (at (select {from: TS where: (not-in s ['AAPL])}) 's) 'AAPL)) -- 0 +(max (== (at (select {from: TS where: (not-in s ['AAPL])}) 's) 'AAPL)) -- false ;; distinct survivor set of the two-symbol probe is exactly {AAPL,TSLA} ;; regardless of which shard produced each row. (count (distinct (at (select {from: TS where: (in s ['AAPL 'TSLA])}) 's))) -- 2 -(min (in (at (select {from: TS where: (in s ['AAPL 'TSLA])}) 's) ['AAPL 'TSLA])) -- 1 +(min (in (at (select {from: TS where: (in s ['AAPL 'TSLA])}) 's) ['AAPL 'TSLA])) -- true ;; ==================================================================== ;; exec_in_worker — DATE / TIMESTAMP arms of IN_READ_I64 (L586-589). @@ -96,11 +96,11 @@ ;; --- VALUE-LEVEL checks across the worker-shard merge (DATE) --- ;; every survivor of the single-date probe must equal that date — ;; guards a shard that merged a 2024.01.02 row into the match set. -(min (== (at (select {from: TD where: (in d [2024.01.01])}) 'd) 2024.01.01)) -- 1 +(min (== (at (select {from: TD where: (in d [2024.01.01])}) 'd) 2024.01.01)) -- true ;; two-date probe: distinct survivor dates are exactly {01-01,01-03}; ;; 2024.01.02 must never appear regardless of shard ordering. (count (distinct (at (select {from: TD where: (in d [2024.01.01 2024.01.03])}) 'd))) -- 2 -(max (== (at (select {from: TD where: (in d [2024.01.01 2024.01.03])}) 'd) 2024.01.02)) -- 0 +(max (== (at (select {from: TD where: (in d [2024.01.01 2024.01.03])}) 'd) 2024.01.02)) -- false ;; ==================================================================== ;; exec_in_worker — integer path with HAS_NULLS at parallel-pool size. @@ -115,14 +115,14 @@ (count (select {from: TIN where: (not-in v [1 2 3])})) -- 17500 ;; --- VALUE-LEVEL checks across the worker-shard merge (int+nulls) --- ;; every survivor of `in [1 2 3]` must itself be in {1,2,3} (non-null). -(min (in (at (select {from: TIN where: (in v [1 2 3])}) 'v) [1 2 3])) -- 1 +(min (in (at (select {from: TIN where: (in v [1 2 3])}) 'v) [1 2 3])) -- true ;; distinct survivor values are exactly {1,2,3}. (count (distinct (at (select {from: TIN where: (in v [1 2 3])}) 'v))) -- 3 ;; sum of survivors: 8750 each of 1/2/3 → 8750*6 = 52500. sum widens ;; narrow ints to i64 (k/q convention) — expected stays an i64 total. (sum (at (select {from: TIN where: (in v [1 2 3])}) 'v)) -- 52500 ;; not-in survivors (value 4 or 5) must NOT be in {1,2,3}. -(max (in (at (select {from: TIN where: (not-in v [1 2 3])}) 'v) [1 2 3])) -- 0 +(max (in (at (select {from: TIN where: (not-in v [1 2 3])}) 'v) [1 2 3])) -- false ;; not-in with a set that contains nulls — set null is dropped from ;; probe buffer (set_has_nulls path); non-null rows compared against ;; the compacted probe. @@ -130,7 +130,7 @@ ;; survivors of the null-bearing probe are still exactly {1,2,3}: the ;; set-null must not match the column-nulls (null is a distinct least ;; value, never equal under `in`). -(min (in (at (select {from: TIN where: (in v [1 0Nl 2 3])}) 'v) [1 2 3])) -- 1 +(min (in (at (select {from: TIN where: (in v [1 0Nl 2 3])}) 'v) [1 2 3])) -- true (count (distinct (at (select {from: TIN where: (in v [1 0Nl 2 3])}) 'v))) -- 3 ;; ==================================================================== diff --git a/test/rfl/ops/idiom_branch_cov.rfl b/test/rfl/ops/idiom_branch_cov.rfl index 4f360dca..f733f53b 100644 --- a/test/rfl/ops/idiom_branch_cov.rfl +++ b/test/rfl/ops/idiom_branch_cov.rfl @@ -349,8 +349,9 @@ (distinct [true false true false]) -- [false true] (count (asc [true false true])) -- 3 (asc [true false true]) -- [false true true] -(first (asc [true false true])) -- 0 -(last (asc [true false true])) -- 1 +;; first/last over a BOOL vector now preserve the BOOL type. +(first (asc [true false true])) -- false +(last (asc [true false true])) -- true ;; ────────────────────────────────────────────────────────────────────── ;; Section M: large graph — exercises heap-alloc path for stk1/stk2 diff --git a/test/rfl/ops/join_branch_cov.rfl b/test/rfl/ops/join_branch_cov.rfl index 111a4f2d..17d82cce 100644 --- a/test/rfl/ops/join_branch_cov.rfl +++ b/test/rfl/ops/join_branch_cov.rfl @@ -163,6 +163,19 @@ (set ajR7 (table [ID Ts Ref] (list [1 1 2 2] [2024.01.01D10:00:00.000000000 2024.01.01D10:00:04.000000000 2024.01.01D10:00:01.000000000 2024.01.01D10:00:05.000000000] [100 200 300 400]))) (at (asof-join [ID Ts] ajL7 ajR7) 'Ref) -- [100 300 200 400] +;; ────────────────────────────────────────────────────────────────── +;; asof-join: right times DESCENDING within a single partition. +;; Forces the right-side bottom-up mergesort to reorder by time, hitting +;; the `rt_time[ai] > rt_time[bi]` tie-break swap (exec_window_join L1803). +;; Times are non-sorted within the part, so the part-index presort +;; fast-path is rejected and the O(N log N) sort definitely runs. +;; Right (a), (time,Ref) descending in time: (04,100)(03,200)(02,300)(01,400). +;; Left (a, 10:00:03.500) → newest right time <= it is 03.000 → Ref 200. +;; ────────────────────────────────────────────────────────────────── +(set ajDL (table [Sym Time Px] (list [a] [10:00:03.500] [10.0]))) +(set ajDR (table [Sym Time Ref] (list [a a a a] [10:00:04.000 10:00:03.000 10:00:02.000 10:00:01.000] [100 200 300 400]))) +(at (asof-join [Sym Time] ajDL ajDR) 'Ref) -- [200] + ;; ────────────────────────────────────────────────────────────────── ;; asof-join: Date time type (32-bit) — exercises READ_TIME macro L1597 ;; ────────────────────────────────────────────────────────────────── diff --git a/test/rfl/ops/opt_advanced.rfl b/test/rfl/ops/opt_advanced.rfl index 38107477..1116ff9b 100644 --- a/test/rfl/ops/opt_advanced.rfl +++ b/test/rfl/ops/opt_advanced.rfl @@ -424,3 +424,27 @@ ;; Three-way: bool col + comparison + comparison, exercises insertion-sort ;; ranking with mixed costs. (count (select {from: TBC where: (and b (> a 1) (< a 8))})) -- 3 + +;; ==================================================================== +;; mark_live (DCE) — OP_PIVOT ext-child walk — opt.c:827-836 +;; +;; The `pivot` builtin takes the OP_PIVOT DAG fast path when the pivot +;; column, value column, and all index columns are non-STR with a known +;; agg op (tblop.c:149-178). ray_execute then runs the full optimizer, +;; and pass_dce → mark_live must reach the OP_PIVOT switch arm to keep +;; the structural ext columns (index_cols / pivot_col / value_col) live +;; — they live only in the ext node, never in inputs[]. If this arm +;; were skipped the scans feeding the pivot would be marked DEAD and the +;; result would be wrong, so a correct value pins the path. +;; +;; index=k1 (sym), pivot=k2 (i64 → DAG-eligible), value=v (i64), agg=sum. +;; k1='a: k2=1 → v(10+50)=60, k2=2 → v 20 +;; k1='b: k2=1 → v 30, k2=2 → v 40 +;; ==================================================================== +(set Tpvt (table [k1 k2 v] (list ['a 'a 'b 'b 'a] [1 2 1 2 1] [10 20 30 40 50]))) +(set Ppvt (pivot Tpvt 'k1 'k2 'v sum)) +(count Ppvt) -- 2 +(at Ppvt 'k1) -- ['a 'b] +;; value columns are named by the distinct k2 partition values (sym '1, '2) +(sum (at Ppvt '1)) -- 90 +(sum (at Ppvt '2)) -- 60 diff --git a/test/rfl/query/query_dag_agg_coverage.rfl b/test/rfl/query/query_dag_agg_coverage.rfl index 17339d4e..fa488c8e 100644 --- a/test/rfl/query/query_dag_agg_coverage.rfl +++ b/test/rfl/query/query_dag_agg_coverage.rfl @@ -37,8 +37,9 @@ ;; OP_VAR (line 1260) — sample variance = 10.0 (at (at (select {va: (var v) from: Tdag}) 'va) 0) -- 10.0 -;; OP_MEDIAN (line 1262) — median of [2,4,6,8,10] = 6 -(at (at (select {m: (med v) from: Tdag}) 'm) 0) -- 0 +;; OP_MEDIAN (line 1262) — median of [2,4,6,8,10] = 6. The no-by select path +;; now matches the scalar `med` builtin (was 0 before the holistic fix). +(at (at (select {m: (med v) from: Tdag}) 'm) 0) -- 6.0 ;; Multiple no-by aggs in one select (set Tmulti (table [v] (list [1 2 3 4 5]))) diff --git a/test/rfl/sort/sort_coverage4.rfl b/test/rfl/sort/sort_coverage4.rfl new file mode 100644 index 00000000..6cdd2f8f --- /dev/null +++ b/test/rfl/sort/sort_coverage4.rfl @@ -0,0 +1,89 @@ +;; Pass-4 structural-coverage push for src/ops/sort.c. +;; +;; Targets reachable uncovered lines left after sort_coverage{,2,3}.rfl. +;; Citations are line ranges in src/ops/sort.c (as of this revision). +;; Thresholds: RADIX_SORT_THRESHOLD=4096, SMALL_POOL_THRESHOLD=8192, +;; RAY_STRSORT_KEY_PARTS_MAX=4 (32-byte packed prefix), +;; RAY_STRSORT_BASE_CASE=24. +;; ==================================================================== + +;; ──────────────────────────────────────────────────────────────────── +;; 1. STR sort tail comparison — strkey_cmp tail bytes (L1211-1220). +;; +;; The packed strkey holds only the first RAY_STRSORT_KEY_PARTS_MAX*8 +;; = 32 bytes. Two records whose first 32 bytes are identical tie on +;; every packed part, so the comparator falls through to the pooled +;; tail memcmp. strkey_cmp is only reached from the bucket +;; insertion-sort base case (<= RAY_STRSORT_BASE_CASE = 24 records), +;; and sort_str_msd_inplace only runs for nrows > 64. So: pad the +;; input with short, first-byte-distinct strings that branch away at +;; byte 0, funnelling the long 32-byte-prefix strings into one small +;; bucket that reaches the insertion sort while the packed window is +;; still active. +;; ──────────────────────────────────────────────────────────────────── + +;; 5 long 'a'-prefixed strings (35 bytes, share the first 32 bytes, +;; differ only in the "_sNN" tail) + 65 short padding rows. The 5 long +;; strings land in a <=24 bucket → insertion sort → strkey_cmp tail +;; memcmp (L1211-1219). All five tie on parts, so the tail bytes order +;; them: _s01 < _s03 < _s05 < _s07 < _s09. +(set Vtail (concat (concat ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_s01" "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_s09" "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_s05" "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_s03" "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_s07"] (take ["b0" "c1" "d2"] 35)) (take ["m0" "n1"] 30))) +(count (asc Vtail)) -- 70 +;; 'a' (0x61) < every padding first byte, so the long strings sort first; +;; among them the tail bytes give _s01 smallest. +(at (asc Vtail) 0) -- "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_s01" +(at (take (asc Vtail) -1) 0) -- "n1" +;; descending: largest long-tail (_s09) is not first (padding 'n'/'m' +;; outrank 'a'); just assert row count is preserved. +(count (desc Vtail)) -- 70 + +;; Tail length-tiebreak (L1220): one string a strict prefix of another +;; beyond the 32-byte packed window — "pp..._x" is a prefix of +;; "pp..._xy", so after the tail memcmp ties on the overlap the shorter +;; one sorts first via the length compare. +(set Vlen (concat (concat ["pppppppppppppppppppppppppppppppp_xy" "pppppppppppppppppppppppppppppppp_x"] (take ["b0" "c1"] 35)) (take ["m0" "n1"] 35))) +(count (asc Vlen)) -- 72 +;; "pp..._x" < "pp..._xy" (length tiebreak); both sort after the 'b'/'c' +;; padding but before 'm'/'n'. +(at (asc Vlen) 0) -- "b0" +;; verify the two p-strings are adjacent and correctly ordered. +(at (asc Vlen) 70) -- "pppppppppppppppppppppppppppppppp_x" +(at (asc Vlen) 71) -- "pppppppppppppppppppppppppppppppp_xy" + +;; ──────────────────────────────────────────────────────────────────── +;; 2. top / bot full-sort fallback — topk_take_vec (L3230-3238). +;; +;; topk_indices_single only handles the radix-encodable whitelist; +;; STR / GUID / LIST return NULL, so top/bot fall back to a full +;; asc/desc sort followed by take. +;; ──────────────────────────────────────────────────────────────────── + +(set Vstr ["banana" "apple" "cherry" "date" "elderberry"]) +(top Vstr 2) -- ["elderberry" "date"] +(bot Vstr 2) -- ["apple" "banana"] +;; GUID also falls back (no packed uint64 encoding). +(set Vguid (guid 6)) +(count (top Vguid 2)) -- 2 +(count (bot Vguid 2)) -- 2 + +;; ──────────────────────────────────────────────────────────────────── +;; 3. asc / desc / xrank argument guards (value-asserting branches). +;; +;; Atom passthrough (ray_asc_fn L3602 / ray_desc_fn L3629), length-1 +;; passthrough (L3604-3605 / L3631-3632), and the xrank empty/zero +;; short-circuit (ray_xrank_fn L3934). +;; ──────────────────────────────────────────────────────────────────── + +;; asc/desc of an atom → returned unchanged (no sort). +(asc 42) -- 42 +(desc 42) -- 42 +;; asc/desc of a length-1 vector → returned unchanged. +(asc [7]) -- [7] +(desc [7]) -- [7] +;; rank of a length-1 vector → single rank 0. +(rank [99]) -- [0] +;; xrank short-circuits to an empty I64 vec for n_groups<=0 or empty input. +(count (xrank 0 [1 2 3])) -- 0 +(count (xrank 3 [])) -- 0 +;; xrank happy path: n_groups buckets over the sorted order. +(xrank 2 [10 40 20 30]) -- [0 1 0 1] diff --git a/test/rfl/strop/like_patterns.rfl b/test/rfl/strop/like_patterns.rfl index fc11828c..c35c3513 100644 --- a/test/rfl/strop/like_patterns.rfl +++ b/test/rfl/strop/like_patterns.rfl @@ -30,7 +30,7 @@ ;; "abcabc","abc?" → 4. (count (select {from: TS where: (like s "abc*")})) -- 4 ;; Value-pair: every kept row must START with "abc" — assert by re-applying like. -(min (like (at (select {from: TS where: (like s "abc*")}) 's) "abc*")) -- 1 +(min (like (at (select {from: TS where: (like s "abc*")}) 's) "abc*")) -- true ;; SHAPE_PREFIX miss (count (select {from: TS where: (like s "qq*")})) -- 0 @@ -38,13 +38,13 @@ ;; "ABC"-not (case-sensitive), "abcabc","zabc" → 4. (count (select {from: TS where: (like s "*abc")})) -- 4 ;; Value-pair: every kept row must END with "abc". -(min (like (at (select {from: TS where: (like s "*abc")}) 's) "*abc")) -- 1 +(min (like (at (select {from: TS where: (like s "*abc")}) 's) "*abc")) -- true ;; SHAPE_CONTAINS — "**" memmem path. "abc" substring appears in: ;; "abc","abcdef","xyzabc","abcabc","zabc","abc?" → 6. (count (select {from: TS where: (like s "*abc*")})) -- 6 ;; Value-pair: every kept row must CONTAIN "abc". -(min (like (at (select {from: TS where: (like s "*abc*")}) 's) "*abc*")) -- 1 +(min (like (at (select {from: TS where: (like s "*abc*")}) 's) "*abc*")) -- true ;; SHAPE_ANY — single "*" — must match every row including "". (count (select {from: TS where: (like s "*")})) -- 10 @@ -83,21 +83,21 @@ ;; SHAPE_EXACT — 'abc appears twice; case-sensitive so 'ABC is excluded. (count (select {from: TY where: (like s "abc")})) -- 2 ;; Every kept row must equal 'abc — assert min == 'abc and max == 'abc. -(min (== (at (select {from: TY where: (like s "abc")}) 's) 'abc)) -- 1 +(min (== (at (select {from: TY where: (like s "abc")}) 's) 'abc)) -- true ;; SHAPE_PREFIX — sym_ids starting with "abc": 'abc(×2), 'abcdef(×2), ;; 'abcabc → 5. (count (select {from: TY where: (like s "abc*")})) -- 5 -(min (like (at (select {from: TY where: (like s "abc*")}) 's) "abc*")) -- 1 +(min (like (at (select {from: TY where: (like s "abc*")}) 's) "abc*")) -- true ;; SHAPE_SUFFIX — ends with "abc": 'abc(×2), 'xyzabc, 'abcabc, 'zabc → 5. (count (select {from: TY where: (like s "*abc")})) -- 5 -(min (like (at (select {from: TY where: (like s "*abc")}) 's) "*abc")) -- 1 +(min (like (at (select {from: TY where: (like s "*abc")}) 's) "*abc")) -- true ;; SHAPE_CONTAINS — contains "abc": 'abc(×2), 'abcdef(×2), 'xyzabc, ;; 'abcabc, 'zabc → 7. (count (select {from: TY where: (like s "*abc*")})) -- 7 -(min (like (at (select {from: TY where: (like s "*abc*")}) 's) "*abc*")) -- 1 +(min (like (at (select {from: TY where: (like s "*abc*")}) 's) "*abc*")) -- true ;; SHAPE_ANY — every row. (count (select {from: TY where: (like s "*")})) -- 10 diff --git a/test/rfl/strop/string_branch_cov.rfl b/test/rfl/strop/string_branch_cov.rfl index 0e6a52c2..24838bfb 100644 --- a/test/rfl/strop/string_branch_cov.rfl +++ b/test/rfl/strop/string_branch_cov.rfl @@ -393,3 +393,91 @@ ;; Morsel segment counts are always consistent with row counts ;; in well-formed rowsel structures. ;; ================================================================ + +;; ================================================================ +;; 19. exec_like STR parallel dispatch (line 585) +;; +;; The non-parted RAY_STR LIKE path dispatches to the worker pool +;; only when len >= LIKE_PAR_MIN_ROWS_STR (200000) and the pool has +;; >= 2 workers. Existing strop tests all run the serial branch +;; (str_like_par_fn called inline). A 200k-row STR column forces +;; ray_pool_dispatch(str_like_par_fn). +;; ================================================================ +(.sys.exec "rm -f rfl_strop_bigstr.csv") -- 0 +(.sys.exec "awk 'BEGIN{print \"s\"; for(i=0;i<200000;i++) print \"url\" i%7}' > rfl_strop_bigstr.csv") -- 0 +(set TBS (.csv.read [STR] "rfl_strop_bigstr.csv")) +(count TBS) -- 200000 +;; like "url0" matches i%7==0 -> 0,7,...,199997 -> floor(199999/7)+1 = 28572 +(count (select {from: TBS where: (like s "url0")})) -- 28572 +;; like "url*" matches all rows. +(count (select {from: TBS where: (like s "url*")})) -- 200000 +(.sys.exec "rm -f rfl_strop_bigstr.csv") -- 0 + +;; ================================================================ +;; 20. exec_like_parted_sym parallel dispatch (lines 433, 468) +;; +;; A parted SYM column whose first segment has >= LIKE_PAR_MIN_ROWS_SYM +;; (100000) rows drives the per-segment ray_pool_dispatch of +;; like_seen_fn (433) and like_proj_fn (468). +;; ================================================================ +(.sys.exec "rm -rf /tmp/rfl_strop_pbsym /tmp/rfl_strop_pbsym.csv") -- 0 +(.sys.exec "awk 'BEGIN{print \"s\"; for(i=0;i<120000;i++) print \"k\" i%5}' > /tmp/rfl_strop_pbsym.csv") -- 0 +(set PBSym (.csv.read [SYMBOL] "/tmp/rfl_strop_pbsym.csv")) +(.db.splayed.set "/tmp/rfl_strop_pbsym/1/t/" PBSym) +(.db.splayed.set "/tmp/rfl_strop_pbsym/2/t/" (table [s] (list ['k0 'k1 'k2]))) +(set PSymBig (.db.parted.get "/tmp/rfl_strop_pbsym/" 't)) +(count PSymBig) -- 120003 +;; like "k0": seg1 i%5==0 -> 24000 ; seg2 'k0 once -> 24001 +(count (select {from: PSymBig where: (like s "k0")})) -- 24001 +;; like "k*": all rows. +(count (select {from: PSymBig where: (like s "k*")})) -- 120003 +(.sys.exec "rm -rf /tmp/rfl_strop_pbsym /tmp/rfl_strop_pbsym.csv") -- 0 + +;; ================================================================ +;; 21. exec_like_parted_str parallel dispatch (line 388) +;; +;; A parted STR column whose first segment has >= LIKE_PAR_MIN_ROWS_STR +;; (200000) rows drives the per-segment ray_pool_dispatch of +;; str_like_par_fn. +;; ================================================================ +(.sys.exec "rm -rf /tmp/rfl_strop_pbstr /tmp/rfl_strop_pbstr.csv") -- 0 +(.sys.exec "awk 'BEGIN{print \"s\"; for(i=0;i<210000;i++) print \"v\" i%4}' > /tmp/rfl_strop_pbstr.csv") -- 0 +(set PBStr (.csv.read [STR] "/tmp/rfl_strop_pbstr.csv")) +(.db.splayed.set "/tmp/rfl_strop_pbstr/1/t/" PBStr) +(.db.splayed.set "/tmp/rfl_strop_pbstr/2/t/" (table [s] (list ["v0" "v1"]))) +(set PStrBig (.db.parted.get "/tmp/rfl_strop_pbstr/" 't)) +(count PStrBig) -- 210002 +;; like "v0": seg1 i%4==0 -> 52500 ; seg2 "v0" once -> 52501 +(count (select {from: PStrBig where: (like s "v0")})) -- 52501 +;; like "v*": all rows. +(count (select {from: PStrBig where: (like s "v*")})) -- 210002 +(.sys.exec "rm -rf /tmp/rfl_strop_pbstr /tmp/rfl_strop_pbstr.csv") -- 0 + +;; ================================================================ +;; 22. substr with a NULL inside a per-row start/length VECTOR — null +;; propagation (string.c:979-980, the `ray_vec_is_null(start_v/len_v, i)` +;; True arms). A null in either vector yields a null result row. +;; (An earlier note here claimed this SIGSEGV'd; that crash was a +;; mixed-ABI build artifact from concurrent debug/coverage rebuilds, not +;; an exec_substr defect — it reproduces on no clean single-config build, +;; and ASan reports nothing.) +(set TSubNull (table [s st ln] (list ["alphabet" "bravo" "charlie"] [1 0N 2] [3 2 0N]))) +;; null in START vector → row 1 is null +(set RSubNs (at (select {r: (substr s st 3) from: TSubNull}) 'r)) +(at RSubNs 0) -- "alp" +(nil? (at RSubNs 1)) -- true +(at RSubNs 2) -- "har" +;; null in LENGTH vector → row 2 is null +(set RSubNl (at (select {r: (substr s 1 ln) from: TSubNull}) 'r)) +(at RSubNl 0) -- "alp" +(at RSubNl 1) -- "br" +(nil? (at RSubNl 2)) -- true + +;; The no-null per-row start/len vector path (covers s_data line 962, +;; l_data line 974, and the s_data/l_data ternaries at 998-999): +;; ================================================================ +(set TSubVec (table [s st ln] (list ["alphabet" "bravo" "charlie"] [1 2 3] [3 2 4]))) +(set RSubVec (at (select {r: (substr s st ln) from: TSubVec}) 'r)) +(at RSubVec 0) -- "alp" +(at RSubVec 1) -- "ra" +(at RSubVec 2) -- "arli" diff --git a/test/rfl/system/part.rfl b/test/rfl/system/part.rfl index 9989e155..f1e623ba 100644 --- a/test/rfl/system/part.rfl +++ b/test/rfl/system/part.rfl @@ -371,5 +371,27 @@ (at Rt1 'date) -- [2024.01.10 2024.01.10 2024.01.10 2024.01.10] (at Rt1 'k) -- [1 2 3 1] +;; ────────────── direct column access on a freshly-loaded parted table ────────────── +;; A `.db.parted.get` table holds segmented (RAY_IS_PARTED) data columns and a +;; RAY_MAPCOMMON partition-key column. `(at table 'col)` must materialize them +;; so `(at col i)` / count / asc / sum work — previously they raised `type`. +(.sys.exec "rm -rf /tmp/rfl_part_atcol") +(.db.splayed.set "/tmp/rfl_part_atcol/1/t/" (table [s v] (list ['apple 'banana] [1 2]))) +(.db.splayed.set "/tmp/rfl_part_atcol/2/t/" (table [s v] (list ['cherry 'date] [3 4]))) +(set Pac (.db.parted.get "/tmp/rfl_part_atcol/" 't)) +(count Pac) -- 4 +;; parted SYM column — direct element access + sorted set +(at (at Pac 's) 0) -- 'apple +(asc (at Pac 's)) -- ['apple 'banana 'cherry 'date] +(count (at Pac 's)) -- 4 +;; parted I64 column +(sum (at Pac 'v)) -- 10 +(at (at Pac 'v) 0) -- 1 +;; MAPCOMMON partition-key column +(at (at Pac 'part) 0) -- 1 +(asc (distinct (at Pac 'part))) -- [1 2] +(count (at Pac 'part)) -- 4 +(.sys.exec "rm -rf /tmp/rfl_part_atcol") + ;; ────────────── teardown ────────────── (.sys.exec "rm -rf /tmp/rfl_part_date /tmp/rfl_part_int /tmp/rfl_part_sym /tmp/rfl_part_single /tmp/rfl_part_empty /tmp/rfl_part_missing /tmp/rfl_part_three /tmp/rfl_part_cd /tmp/rfl_part_cd_sym /tmp/rfl_part_minute /tmp/rfl_part_like /tmp/rfl_part_mc /tmp/rfl_part_mb") diff --git a/test/test_datalog.c b/test/test_datalog.c index 424ac1f2..4ed693d3 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -198,6 +198,158 @@ static test_result_t test_source_prov_requires_flag(void) { PASS(); } +/* Source provenance over a body atom that carries a CONSTANT slot. + * + * dl_build_source_prov matches each derived row against the body relation, + * and for a constant body position it compares the source cell against + * body->const_vals[c] (datalog.c:2566). The plain-variable provenance test + * above never exercises that comparison. + * + * Program: + * EDB: kv(1,7), kv(2,7), kv(3,9) + * Rule: hit(E) :- kv(E, 7) // second body slot is the constant 7 + * + * Expected: hit has 2 rows (E=1, E=2). Each derived row's provenance points + * at exactly the matching kv source row (the const filter rejects kv(3,9)). */ +static test_result_t test_source_prov_const_body_slot(void) { + int64_t e_vals[] = {1, 2, 3}; + int64_t k_vals[] = {7, 7, 9}; + ray_t* e_col = ray_vec_from_raw(RAY_I64, e_vals, 3); + ray_t* k_col = ray_vec_from_raw(RAY_I64, k_vals, 3); + TEST_ASSERT_NOT_NULL(e_col); + TEST_ASSERT_NOT_NULL(k_col); + + ray_t* kv = ray_table_new(2); + kv = ray_table_add_col(kv, ray_sym_intern("kv__c0", 6), e_col); + TEST_ASSERT_FALSE(RAY_IS_ERR(kv)); + kv = ray_table_add_col(kv, ray_sym_intern("kv__c1", 6), k_col); + TEST_ASSERT_FALSE(RAY_IS_ERR(kv)); + + dl_program_t* prog = dl_program_new(); + TEST_ASSERT_NOT_NULL(prog); + prog->flags |= DL_FLAG_PROVENANCE; + + int kv_idx = dl_add_edb(prog, "kv", kv, 2); + TEST_ASSERT_EQ_I(kv_idx, 0); + + /* hit(E) :- kv(E, 7) */ + dl_rule_t rule; + dl_rule_init(&rule, "hit", 1); + dl_rule_head_var(&rule, 0, 0); + int body = dl_rule_add_atom(&rule, "kv", 2); + dl_body_set_var(&rule, body, 0, 0); /* E */ + dl_body_set_const_typed(&rule, body, 1, 7, RAY_I64); /* constant 7 */ + rule.n_vars = 1; + TEST_ASSERT_EQ_I(dl_add_rule(prog, &rule), 0); + + TEST_ASSERT_EQ_I(dl_eval(prog), 0); + + ray_t* out = dl_query(prog, "hit"); + TEST_ASSERT_NOT_NULL(out); + TEST_ASSERT_EQ_I((int)ray_table_nrows(out), 2); + + ray_t* offsets = dl_get_provenance_src_offsets(prog, "hit"); + ray_t* data = dl_get_provenance_src_data(prog, "hit"); + TEST_ASSERT_NOT_NULL(offsets); + TEST_ASSERT_NOT_NULL(data); + /* 2 derived rows → offsets length 3; one source ref per row (the const + * filter keeps only the matching kv row). */ + TEST_ASSERT_EQ_I((int)ray_len(offsets), 3); + TEST_ASSERT_EQ_I((int)ray_len(data), 2); + int64_t* off = (int64_t*)ray_data(offsets); + TEST_ASSERT_EQ_I((int)off[0], 0); + TEST_ASSERT_EQ_I((int)off[1], 1); + TEST_ASSERT_EQ_I((int)off[2], 2); + /* Both refs must point at relation kv (idx 0), rows 0 and 1 — never row 2, + * which holds the const-mismatched kv(3,9). */ + int64_t* sd = (int64_t*)ray_data(data); + for (int i = 0; i < 2; i++) { + TEST_ASSERT_EQ_I((int)(sd[i] >> 32), kv_idx); + TEST_ASSERT_TRUE((sd[i] & 0xffffffff) != 2); + } + + dl_program_free(prog); + ray_release(kv); + ray_release(e_col); + ray_release(k_col); + PASS(); +} + +/* Source provenance whose packed-ref buffer must GROW past its initial cap. + * + * dl_build_source_prov sizes the scratch buffer at nrows*4 (min 64) and + * doubles it on overflow (datalog.c:2574-2583). A cross-product rule whose + * second body atom shares no variables ("always matches") attaches every row + * of that relation to each derived row, so the ref count is nrows*(1+M) which + * blows past nrows*4 once M is large enough. + * + * Program: + * EDB: p(0..19) q(0..19) + * Rule: r(X) :- p(X), q(Y) // Y is body-only → unconstrained + * + * Derived r has 20 rows; each accrues 1 (p) + 20 (q) = 21 source refs, giving + * 420 packed entries vs an initial cap of 80 → at least two buffer doublings. */ +static test_result_t test_source_prov_buffer_grow(void) { + enum { N = 20 }; + int64_t pv[N], qv[N]; + for (int i = 0; i < N; i++) { pv[i] = i; qv[i] = i; } + ray_t* p_col = ray_vec_from_raw(RAY_I64, pv, N); + ray_t* q_col = ray_vec_from_raw(RAY_I64, qv, N); + TEST_ASSERT_NOT_NULL(p_col); + TEST_ASSERT_NOT_NULL(q_col); + + ray_t* p = ray_table_new(1); + p = ray_table_add_col(p, ray_sym_intern("p__c0", 5), p_col); + TEST_ASSERT_FALSE(RAY_IS_ERR(p)); + ray_t* q = ray_table_new(1); + q = ray_table_add_col(q, ray_sym_intern("q__c0", 5), q_col); + TEST_ASSERT_FALSE(RAY_IS_ERR(q)); + + dl_program_t* prog = dl_program_new(); + TEST_ASSERT_NOT_NULL(prog); + prog->flags |= DL_FLAG_PROVENANCE; + + int p_idx = dl_add_edb(prog, "p", p, 1); + TEST_ASSERT_EQ_I(p_idx, 0); + dl_add_edb(prog, "q", q, 1); + + /* r(X) :- p(X), q(Y) — Y (var idx 1) appears only in the body. */ + dl_rule_t rule; + dl_rule_init(&rule, "r", 1); + dl_rule_head_var(&rule, 0, 0); + int b0 = dl_rule_add_atom(&rule, "p", 1); + dl_body_set_var(&rule, b0, 0, 0); /* X */ + int b1 = dl_rule_add_atom(&rule, "q", 1); + dl_body_set_var(&rule, b1, 0, 1); /* Y, body-only */ + rule.n_vars = 2; + TEST_ASSERT_EQ_I(dl_add_rule(prog, &rule), 0); + + TEST_ASSERT_EQ_I(dl_eval(prog), 0); + + ray_t* out = dl_query(prog, "r"); + TEST_ASSERT_NOT_NULL(out); + TEST_ASSERT_EQ_I((int)ray_table_nrows(out), N); + + ray_t* offsets = dl_get_provenance_src_offsets(prog, "r"); + ray_t* data = dl_get_provenance_src_data(prog, "r"); + TEST_ASSERT_NOT_NULL(offsets); + TEST_ASSERT_NOT_NULL(data); + /* offsets length nrows+1 = 21; total refs = nrows*(1 + N) = 420, well past + * the initial 80-entry cap so the doubling path ran. */ + TEST_ASSERT_EQ_I((int)ray_len(offsets), N + 1); + int64_t* off = (int64_t*)ray_data(offsets); + TEST_ASSERT_EQ_I((int)off[0], 0); + TEST_ASSERT_EQ_I((int)off[N], N * (1 + N)); + TEST_ASSERT_EQ_I((int)ray_len(data), N * (1 + N)); + + dl_program_free(prog); + ray_release(p); + ray_release(q); + ray_release(p_col); + ray_release(q_col); + PASS(); +} + /* Verify cmp body literal filters tuples: rule keeps only rows where col0 < 60. * * Program: @@ -2286,6 +2438,8 @@ static test_result_t test_rule_add_interval_overflow(void) { const test_entry_t datalog_entries[] = { { "datalog/source_provenance", test_source_provenance, datalog_setup, datalog_teardown }, { "datalog/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown }, + { "datalog/source_prov_const_body_slot", test_source_prov_const_body_slot, datalog_setup, datalog_teardown }, + { "datalog/source_prov_buffer_grow", test_source_prov_buffer_grow, datalog_setup, datalog_teardown }, { "datalog/cmp_const_filter", test_cmp_const_filter, datalog_setup, datalog_teardown }, { "datalog/arith_assignment", test_arith_assignment, datalog_setup, datalog_teardown }, { "datalog/arith_assign_f64", test_arith_assign_f64, datalog_setup, datalog_teardown }, diff --git a/test/test_exec.c b/test/test_exec.c index 602a46de..abd08036 100644 --- a/test/test_exec.c +++ b/test/test_exec.c @@ -1351,6 +1351,136 @@ static test_result_t test_exec_join_multikey(void) { PASS(); } +/* ---- FULL OUTER JOIN (small / chained-HT path, mixed matched+unmatched) ---- + * exec_join L1200-1234: small-path FULL OUTER append-unmatched-right block. + * Needs pair_count > 0 (some matches) AND some unmatched right rows so the + * memcpy at L1217-1219 (copy existing pairs into the grown buffer) fires. */ +static test_result_t test_exec_join_full_small_mixed(void) { + ray_heap_init(); + (void)ray_sym_init(); + + int64_t n_id = ray_sym_intern("id", 2); + int64_t n_val = ray_sym_intern("val", 3); + int64_t n_score = ray_sym_intern("score", 5); + + /* Left: ids {1,2,3} */ + int64_t lid[] = {1, 2, 3}; + int64_t lval[] = {10, 20, 30}; + ray_t* lid_v = ray_vec_from_raw(RAY_I64, lid, 3); + ray_t* lval_v = ray_vec_from_raw(RAY_I64, lval, 3); + ray_t* left = ray_table_new(2); + left = ray_table_add_col(left, n_id, lid_v); + left = ray_table_add_col(left, n_val, lval_v); + ray_release(lid_v); ray_release(lval_v); + + /* Right: ids {2,3,4,5} — 2,3 match; 4,5 are unmatched right rows */ + int64_t rid[] = {2, 3, 4, 5}; + int64_t rscore[] = {200, 300, 400, 500}; + ray_t* rid_v = ray_vec_from_raw(RAY_I64, rid, 4); + ray_t* rscore_v = ray_vec_from_raw(RAY_I64, rscore, 4); + ray_t* right = ray_table_new(2); + right = ray_table_add_col(right, n_id, rid_v); + right = ray_table_add_col(right, n_score, rscore_v); + ray_release(rid_v); ray_release(rscore_v); + + ray_graph_t* g = ray_graph_new(left); + ray_op_t* left_op = ray_const_table(g, left); + ray_op_t* right_op = ray_const_table(g, right); + ray_op_t* k = ray_scan(g, "id"); + ray_op_t* ka[] = { k }; + ray_op_t* j = ray_join(g, left_op, ka, right_op, ka, 1, 2); + + ray_t* res = ray_execute(g, j); + TEST_ASSERT_FALSE(RAY_IS_ERR(res)); + TEST_ASSERT_EQ_I(res->type, RAY_TABLE); + /* id=1 unmatched-left, 2 matched, 3 matched, plus 4 & 5 unmatched-right. + * Total = 5 rows. */ + TEST_ASSERT_EQ_I(ray_table_nrows(res), 5); + + ray_release(res); + ray_graph_free(g); + ray_release(left); + ray_release(right); + ray_sym_destroy(); + ray_heap_destroy(); + PASS(); +} + +/* ---- LEFT/FULL OUTER JOIN (radix path, empty right partitions) ---- + * join_radix_build_probe_fn L451-472: a hash partition where rp->count == 0 + * but lp->count > 0 emits the left rows as unmatched for join_type >= 1. + * Force it by giving the (large) right side a single repeated key so all + * right rows land in ONE radix partition, leaving the others right-empty + * while the left side's distinct keys spread across every partition. */ +static test_result_t test_exec_join_radix_empty_right_part(void) { + ray_heap_init(); + (void)ray_sym_init(); + + int64_t n_id = ray_sym_intern("id", 2); + + /* Right: 70000 rows, all id = 7 → one populated radix partition. */ + int64_t n_right = 70000; + ray_t* rid_v = ray_vec_new(RAY_I64, n_right); + rid_v->len = n_right; + int64_t* rid = (int64_t*)ray_data(rid_v); + for (int64_t i = 0; i < n_right; i++) rid[i] = 7; + ray_t* right = ray_table_new(1); + right = ray_table_add_col(right, n_id, rid_v); + ray_release(rid_v); + + /* Left: 70000 rows, id = i * 1000000 (distinct, NONE equal to 7) → + * keys spread over all radix partitions, but zero matches. This keeps + * the per-partition counts clean (no 1:N explosion) while still putting + * left rows into the right-empty partitions. */ + int64_t n_left = 70000; + ray_t* lid_v = ray_vec_new(RAY_I64, n_left); + lid_v->len = n_left; + int64_t* lid = (int64_t*)ray_data(lid_v); + for (int64_t i = 0; i < n_left; i++) lid[i] = i * 1000000; + ray_t* left = ray_table_new(1); + left = ray_table_add_col(left, n_id, lid_v); + ray_release(lid_v); + + /* LEFT OUTER (type 1): no matches → all 70000 left rows unmatched. */ + { + ray_graph_t* g = ray_graph_new(left); + ray_op_t* left_op = ray_const_table(g, left); + ray_op_t* right_op = ray_const_table(g, right); + ray_op_t* k = ray_scan(g, "id"); + ray_op_t* ka[] = { k }; + ray_op_t* j = ray_join(g, left_op, ka, right_op, ka, 1, 1); + ray_t* res = ray_execute(g, j); + TEST_ASSERT_FALSE(RAY_IS_ERR(res)); + TEST_ASSERT_EQ_I(res->type, RAY_TABLE); + TEST_ASSERT_EQ_I(ray_table_nrows(res), 70000); + ray_release(res); + ray_graph_free(g); + } + + /* FULL OUTER (type 2): no matches → 70000 unmatched left rows + + * 70000 unmatched right rows = 140000. */ + { + ray_graph_t* g = ray_graph_new(left); + ray_op_t* left_op = ray_const_table(g, left); + ray_op_t* right_op = ray_const_table(g, right); + ray_op_t* k = ray_scan(g, "id"); + ray_op_t* ka[] = { k }; + ray_op_t* j = ray_join(g, left_op, ka, right_op, ka, 1, 2); + ray_t* res = ray_execute(g, j); + TEST_ASSERT_FALSE(RAY_IS_ERR(res)); + TEST_ASSERT_EQ_I(res->type, RAY_TABLE); + TEST_ASSERT_EQ_I(ray_table_nrows(res), 140000); + ray_release(res); + ray_graph_free(g); + } + + ray_release(left); + ray_release(right); + ray_sym_destroy(); + ray_heap_destroy(); + PASS(); +} + /* ---- WINDOW ---- */ static test_result_t test_exec_window(void) { ray_heap_init(); @@ -17052,6 +17182,8 @@ const test_entry_t exec_entries[] = { { "exec/join_skewed", test_exec_join_skewed, NULL, NULL }, { "exec/join_boundary", test_exec_join_boundary, NULL, NULL }, { "exec/join_multikey", test_exec_join_multikey, NULL, NULL }, + { "exec/join_full_small_mixed", test_exec_join_full_small_mixed, NULL, NULL }, + { "exec/join_radix_empty_right_part", test_exec_join_radix_empty_right_part, NULL, NULL }, { "exec/window", test_exec_window, NULL, NULL }, { "exec/select", test_exec_select, NULL, NULL }, { "exec/stddev", test_exec_stddev, NULL, NULL }, diff --git a/test/test_heap.c b/test/test_heap.c index b38a3fa5..aedebb2d 100644 --- a/test/test_heap.c +++ b/test/test_heap.c @@ -2221,6 +2221,123 @@ static test_result_t test_slab_gc_drains_wide(void) { PASS(); } +#include "ops/idxop.h" /* ray_index_t, ray_index_payload, RAY_IDX_SORT */ + +/* ---- ray_retain_owned_refs: atom-owns-obj branch (L675-676) --------------- + * + * ray_atom_owns_obj is true for a -RAY_STR atom that is NOT SSO (slen >= 8, + * so the payload lives in an external obj rather than inline). ray_alloc_copy + * treats the atom as data_size=0, memcpy's the 32-byte header (carrying obj), + * then ray_retain_owned_refs hits the atom-owns-obj arm and retains obj. + * Both copy and source share the one obj; freeing both releases it twice. */ + +static test_result_t test_retain_owned_refs_atom_obj(void) { + /* obj is a real heap block (a U8 byte buffer standing in for the string + * payload). Give it rc=1; the retain on copy bumps it to 2. */ + ray_t* obj = ray_alloc(16); + TEST_ASSERT_NOT_NULL(obj); + obj->type = RAY_U8; + obj->len = 16; + + ray_t* s = ray_alloc(0); + TEST_ASSERT_NOT_NULL(s); + s->type = -RAY_STR; + s->slen = 8; /* >= 8 → non-SSO → ray_atom_owns_obj true */ + s->obj = obj; /* external payload, owned by s */ + + uint32_t orc = obj->rc; + /* alloc_copy → retain_owned_refs atom-owns-obj arm (L675-676). */ + ray_t* copy = ray_alloc_copy(s); + TEST_ASSERT_NOT_NULL(copy); + TEST_ASSERT_FALSE(RAY_IS_ERR(copy)); + TEST_ASSERT_EQ_PTR(copy->obj, obj); /* shallow-shared obj */ + TEST_ASSERT_EQ_U(obj->rc, orc + 1); /* retained once */ + + /* Free copy: release_owned_refs drops obj 2→1. Then free s: drops 1→0 + * (frees obj). Order matters so the shared obj outlives the first free. */ + ray_free(copy); + TEST_ASSERT_EQ_U(obj->rc, orc); /* back to original */ + ray_free(s); /* releases obj to 0 → freed */ + + ray_t* probe = ray_alloc(0); + TEST_ASSERT_NOT_NULL(probe); + ray_free(probe); + PASS(); +} + +/* ---- ray_detach_owned_refs: RAY_LAZY arm (L747-751) ----------------------- + * + * ray_detach_owned_refs reaches the LAZY arm from ray_scratch_realloc on the + * old block. RAY_LAZY is an atom (data_size=0); the new block inherits the + * graph/op pointers via the header memcpy, and detach nulls them on the old + * block. Detach NEVER dereferences the pointers, so a sentinel is safe; we + * neutralize the surviving block before freeing it (mirrors the GRAPH test). */ + +static test_result_t test_scratch_realloc_lazy_handle(void) { + ray_t* g = ray_alloc(0); + TEST_ASSERT_NOT_NULL(g); + g->type = RAY_LAZY; + RAY_LAZY_GRAPH(g) = (ray_graph_t*)0x1234; /* sentinel — never deref'd */ + RAY_LAZY_OP(g) = (ray_op_t*)0x5678; /* sentinel — never deref'd */ + + /* realloc copies header (incl graph/op) to g2, then detaches g (L747-751: + * RAY_LAZY_GRAPH/OP(g) set NULL) before freeing it. g's free runs + * release_owned_refs LAZY arm with graph==NULL → no ray_graph_free. */ + ray_t* g2 = ray_scratch_realloc(g, 0); + TEST_ASSERT_NOT_NULL(g2); + TEST_ASSERT_EQ_I(g2->type, RAY_LAZY); + TEST_ASSERT_EQ_PTR(RAY_LAZY_GRAPH(g2), (ray_graph_t*)0x1234); + + /* Neutralize the survivor so its free doesn't ray_graph_free the sentinel. */ + RAY_LAZY_GRAPH(g2) = NULL; + RAY_LAZY_OP(g2) = NULL; + ray_free(g2); + PASS(); +} + +/* ---- ray_detach_owned_refs: HAS_INDEX arm (L790-794) ---------------------- + * + * A vector carrying RAY_ATTR_HAS_INDEX stores an owning ray_t* index in + * nullmap[0..7] (v->index). ray_scratch_realloc transfers the vector data to + * a new block (index pointer copied), then detaches the old block — nulling + * v->index and clearing the HAS_INDEX bit (L790-794). We use a real RAY_INDEX + * block as v->index and let the surviving block own it, releasing once. */ + +static test_result_t test_scratch_realloc_has_index_detach(void) { + /* A minimal RAY_INDEX block: kind RAY_IDX_SORT with perm NULL, so + * release_payload is a no-op when the survivor is freed. */ + ray_t* idx = ray_alloc(sizeof(ray_index_t)); + TEST_ASSERT_NOT_NULL(idx); + idx->type = RAY_INDEX; + idx->len = 0; + ray_index_t* ix = ray_index_payload(idx); + memset(ix, 0, sizeof(*ix)); + ix->kind = RAY_IDX_SORT; /* perm NULL → release is a no-op */ + + ray_t* v = ray_alloc(4 * sizeof(int64_t)); + TEST_ASSERT_NOT_NULL(v); + v->type = RAY_I64; + v->len = 4; + v->attrs |= RAY_ATTR_HAS_INDEX; + v->index = idx; /* v owns the only ref to idx */ + + /* realloc copies header (incl index ptr + HAS_INDEX bit) to v2, then + * detaches v: v->index=NULL, HAS_INDEX cleared (L790-794). v's free + * then takes the no-index path (won't release idx). */ + ray_t* v2 = ray_scratch_realloc(v, 8 * sizeof(int64_t)); + TEST_ASSERT_NOT_NULL(v2); + TEST_ASSERT_TRUE(v2->attrs & RAY_ATTR_HAS_INDEX); + TEST_ASSERT_EQ_PTR(v2->index, idx); + + /* Freeing v2 takes the HAS_INDEX release arm → ray_release(idx) → freed. */ + ray_free(v2); + + ray_t* probe = ray_alloc(0); + TEST_ASSERT_NOT_NULL(probe); + ray_free(probe); + PASS(); +} + /* ---- Suite definition -------------------------------------------------- */ const test_entry_t heap_entries[] = { @@ -2292,5 +2409,8 @@ const test_entry_t heap_entries[] = { { "heap/order_overflow_guards", test_order_overflow_guards, heap_setup, heap_teardown }, { "heap/slab_byte_budget", test_slab_byte_budget, heap_setup, heap_teardown }, { "heap/slab_gc_drains_wide", test_slab_gc_drains_wide, heap_setup, heap_teardown }, + { "heap/retain_atom_obj", test_retain_owned_refs_atom_obj, heap_setup, heap_teardown }, + { "heap/scratch_realloc_lazy", test_scratch_realloc_lazy_handle, heap_setup, heap_teardown }, + { "heap/scratch_realloc_has_index", test_scratch_realloc_has_index_detach, heap_setup, heap_teardown }, { NULL, NULL, NULL, NULL }, }; diff --git a/test/test_sym.c b/test/test_sym.c index 53acf895..658184fd 100644 --- a/test/test_sym.c +++ b/test/test_sym.c @@ -1411,6 +1411,50 @@ static test_result_t test_sym_ensure_cap_large(void) { PASS(); } +/* ---- sym_accessors_uninitialized -------------------------------------- * + * Every public accessor must early-return safely when the sym table is not + * initialized (g_sym_inited == false), rather than touching freed/NULL + * globals. Exercised by calling each one after ray_sym_destroy(). This + * covers the `!inited` guard branch in ray_sym_find, ray_sym_str, + * ray_sym_is_dotted, ray_sym_segs, ray_sym_count, ray_sym_persisted_count, + * ray_sym_rebuild_segments, ray_sym_strings_borrow, and ray_sym_ensure_cap. + * + * sym_setup() already called ray_sym_init(); tear it down first, then probe. + * sym_teardown() calls ray_sym_destroy() again, which is a no-op the second + * time (its own !inited guard). + * ----------------------------------------------------------------------- */ +static test_result_t test_sym_accessors_uninitialized(void) { + ray_sym_destroy(); /* g_sym_inited -> false */ + + TEST_ASSERT_EQ_I(ray_sym_find("anything", 8), -1); + TEST_ASSERT_NULL(ray_sym_str(0)); + TEST_ASSERT_FALSE(ray_sym_is_dotted(0)); + + const int64_t* segs = NULL; + TEST_ASSERT_EQ_I(ray_sym_segs(0, &segs), 0); + + TEST_ASSERT_EQ_U(ray_sym_count(), 0); + TEST_ASSERT_EQ_U(ray_sym_persisted_count(), 0); + + /* rebuild_segments reports RAY_ERR_IO when the table is not initialized. */ + TEST_ASSERT_EQ_I(ray_sym_rebuild_segments(), RAY_ERR_IO); + + /* strings_borrow must null out its out-params and not dereference globals. */ + ray_t** out_strings = (ray_t**)0x1; /* poison: must be overwritten to NULL */ + uint32_t out_count = 99; /* poison: must be overwritten to 0 */ + ray_sym_strings_borrow(&out_strings, &out_count); + TEST_ASSERT_NULL(out_strings); + TEST_ASSERT_EQ_U(out_count, 0); + + /* ensure_cap returns false when not initialized. */ + TEST_ASSERT_FALSE(ray_sym_ensure_cap(16)); + + /* Re-init so sym_teardown()'s ray_sym_destroy has a consistent table to + * tear down (and to match the setup/teardown contract). */ + (void)ray_sym_init(); + PASS(); +} + /* ---- sym_dotted_leading_dot_with_second_dot ---------------------------- */ /* Leading dot followed by a second dot (`.sys.gc`) should be treated as @@ -2688,6 +2732,7 @@ const test_entry_t sym_entries[] = { { "sym/str_invalid_id", test_sym_str_invalid_id, sym_setup, sym_teardown }, { "sym/is_dotted_invalid_id", test_sym_is_dotted_invalid_id, sym_setup, sym_teardown }, { "sym/segs_invalid_id", test_sym_segs_invalid_id, sym_setup, sym_teardown }, + { "sym/accessors_uninitialized", test_sym_accessors_uninitialized, sym_setup, sym_teardown }, { "sym/find_after_grow", test_sym_find_after_grow, sym_setup, sym_teardown }, { "sym/ensure_cap_zero", test_sym_ensure_cap_zero, sym_setup, sym_teardown }, { "sym/ensure_cap_large", test_sym_ensure_cap_large, sym_setup, sym_teardown },