Skip to content

Commit 1930f26

Browse files
committed
refactor(webapp): rely on cold safe-default for compute migration routing
Drop the per-trigger readiness gate (waitUntilReady) and the /healthcheck dependency; a cold registry read now falls back to not-migrated, matching the datastore/llm-pricing registries. Add the computeMigrationRequireTemplate flag (when on, migrated orgs build the compute template in required mode at deploy; when off, in shadow mode) and a reloading_registry_loaded gauge so that a never-loaded registry is alertable. Remove the now-unused GLOBAL_FLAGS_READY_TIMEOUT_MS env var.
1 parent 7b13f95 commit 1930f26

8 files changed

Lines changed: 33 additions & 103 deletions

File tree

apps/webapp/app/env.server.ts

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,6 @@ const EnvironmentSchema = z
161161
// How often each replica reloads the global flags snapshot from the DB.
162162
// Sets kill/ramp propagation latency.
163163
GLOBAL_FLAGS_RELOAD_INTERVAL_MS: z.coerce.number().int().min(1000).default(5000),
164-
// Max time the first trigger blocks waiting for the initial flags load
165-
// before falling back to defaults (off = container, the safe direction).
166-
GLOBAL_FLAGS_READY_TIMEOUT_MS: z.coerce.number().int().min(0).default(5000),
167164
WORKER_ENABLED: z.string().default("true"),
168165
GRACEFUL_SHUTDOWN_TIMEOUT: z.coerce.number().int().default(60000),
169166
DISABLE_SSE: z.string().optional(),

apps/webapp/app/runEngine/services/triggerTask.server.ts

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -362,15 +362,8 @@ export class RunEngineTriggerTaskService {
362362
const enableFastPath = workerQueueResult?.enableFastPath ?? false;
363363

364364
// Rewrite the region to its compute backing for migration-enrolled orgs,
365-
// from the in-memory flag snapshot (no DB query). The isLoaded gates only
366-
// block during cold start so the first request can't serve a default over
367-
// a real flag; once warm they're a synchronous no-op.
368-
if (!globalFlagsRegistry.isLoaded) {
369-
await globalFlagsRegistry.waitUntilReady(env.GLOBAL_FLAGS_READY_TIMEOUT_MS);
370-
}
371-
if (!workerRegionRegistry.isLoaded) {
372-
await workerRegionRegistry.waitUntilReady(env.GLOBAL_FLAGS_READY_TIMEOUT_MS);
373-
}
365+
// from the in-memory snapshots (no DB query). A cold read (registry not yet
366+
// loaded) returns undefined/[] and the resolver falls back to not-migrated.
374367
const workerGroups = workerRegionRegistry.current() ?? [];
375368
const region = baseWorkerQueue ? regionForQueue(baseWorkerQueue, workerGroups) : undefined;
376369
const backing = baseWorkerQueue ? backingForQueue(baseWorkerQueue, workerGroups) : undefined;

apps/webapp/app/utils/reloadingRegistry.server.ts

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,20 @@ const lastSuccessfulLoadAt = new Gauge({
1818
registers: [metricsRegister],
1919
});
2020

21+
// 0 until the first successful load, then 1. Auto-started registries publish 0 up
22+
// front (not absent), so a never-loaded registry is an alertable series, distinct from "feature off".
23+
const registryLoaded = new Gauge({
24+
name: "reloading_registry_loaded",
25+
help: "1 once the registry has loaded at least once, else 0 (0 = serving cold fallback)",
26+
labelNames: ["name"],
27+
registers: [metricsRegister],
28+
});
29+
2130
export type ReloadingRegistry<T> = {
2231
isReady: Promise<void>;
2332
readonly isLoaded: boolean;
2433
current(): T | undefined;
2534
reload(): Promise<void>;
26-
waitUntilReady(timeoutMs: number): Promise<void>;
2735
stop(): void;
2836
};
2937

@@ -42,15 +50,14 @@ export type ReloadingRegistryOptions<T> = {
4250

4351
/**
4452
* In-memory snapshot loaded at startup and refreshed on an interval. Reads are
45-
* synchronous (`current()`); the first read should gate on `waitUntilReady` so a
46-
* cold replica never serves a default over a real value. Mirrors the datastore /
47-
* LLM-pricing registries. Interval-only: no pub/sub (a follow-up if sub-second
48-
* propagation is ever needed).
53+
* synchronous (`current()`) and return undefined until the first load completes;
54+
* callers must tolerate that (e.g. fall back to a safe default). This is the same cold-start
55+
* contract as the datastore / LLM-pricing registries. Interval-only: no pub/sub
56+
* (a follow-up if sub-second propagation is ever needed).
4957
*/
5058
export function createReloadingRegistry<T>(opts: ReloadingRegistryOptions<T>): ReloadingRegistry<T> {
5159
let snapshot: T | undefined;
5260
let loaded = false;
53-
let started = false;
5461
let loadSeq = 0;
5562
let resolveReady!: () => void;
5663
const isReady = new Promise<void>((resolve) => {
@@ -65,14 +72,15 @@ export function createReloadingRegistry<T>(opts: ReloadingRegistryOptions<T>): R
6572
lastSuccessfulLoadAt.set({ name: opts.name }, Date.now() / 1000);
6673
if (!loaded) {
6774
loaded = true;
75+
registryLoaded.set({ name: opts.name }, 1);
6876
resolveReady();
6977
}
7078
}
7179

7280
let interval: ReturnType<typeof setInterval> | undefined;
7381

7482
if (opts.autoStart !== false) {
75-
started = true;
83+
registryLoaded.set({ name: opts.name }, 0); // visible cold series until first load
7684

7785
const startup = pRetry(() => doLoad(), {
7886
forever: opts.retry?.retries === undefined,
@@ -124,20 +132,6 @@ export function createReloadingRegistry<T>(opts: ReloadingRegistryOptions<T>): R
124132
},
125133
current: () => snapshot,
126134
reload: doLoad,
127-
async waitUntilReady(timeoutMs: number) {
128-
if (!started || loaded || timeoutMs <= 0) return;
129-
let timer: ReturnType<typeof setTimeout> | undefined;
130-
try {
131-
await Promise.race([
132-
isReady,
133-
new Promise<void>((resolve) => {
134-
timer = setTimeout(resolve, timeoutMs);
135-
}),
136-
]);
137-
} finally {
138-
if (timer) clearTimeout(timer);
139-
}
140-
},
141135
stop,
142136
};
143137
}

apps/webapp/app/v3/featureFlags.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ export const FEATURE_FLAG = {
1414
computeMigrationEnabled: "computeMigrationEnabled",
1515
computeMigrationFreePercentage: "computeMigrationFreePercentage",
1616
computeMigrationPaidPercentage: "computeMigrationPaidPercentage",
17+
computeMigrationRequireTemplate: "computeMigrationRequireTemplate",
1718
} as const;
1819

1920
export const FeatureFlagCatalog = {
@@ -37,6 +38,9 @@ export const FeatureFlagCatalog = {
3738
[FEATURE_FLAG.computeMigrationEnabled]: z.boolean(),
3839
[FEATURE_FLAG.computeMigrationFreePercentage]: z.coerce.number().int().min(0).max(100),
3940
[FEATURE_FLAG.computeMigrationPaidPercentage]: z.coerce.number().int().min(0).max(100),
41+
// When on, migrated orgs build their compute template in required mode at deploy
42+
// (fails the deploy on error) instead of shadow. Strict boolean (see above).
43+
[FEATURE_FLAG.computeMigrationRequireTemplate]: z.boolean(),
4044
};
4145

4246
export type FeatureFlagKey = keyof typeof FeatureFlagCatalog;

apps/webapp/app/v3/globalFlagsRegistry.server.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@ import { createReloadingRegistry } from "~/utils/reloadingRegistry.server";
77
/**
88
* In-memory snapshot of the global feature flags, refreshed every
99
* GLOBAL_FLAGS_RELOAD_INTERVAL_MS. `flags()` reads the DB-backed global values
10-
* (no per-org overrides). Read synchronously on the trigger hot path; callers
11-
* gate the first read on `waitUntilReady`.
10+
* (no per-org overrides). Read synchronously on the trigger hot path; a cold
11+
* read (before the first load) returns undefined and the resolver falls back to
12+
* not-migrated.
1213
*/
1314
export const globalFlagsRegistry = singleton("globalFlagsRegistry", () =>
1415
createReloadingRegistry<Partial<FeatureFlagCatalog>>({

apps/webapp/app/v3/services/computeTemplateCreation.server.ts

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -168,14 +168,9 @@ export class ComputeTemplateCreationService {
168168
// Migrated orgs route runs to the compute backing even though their stored
169169
// default is still the container region, so they need a compute template too.
170170
// Shadow mode by default: never fail a deploy over a backing the org didn't opt into; required mode applies only when computeMigrationRequireTemplate is on.
171-
if (!workerRegionRegistry.isLoaded) {
172-
await workerRegionRegistry.waitUntilReady(env.GLOBAL_FLAGS_READY_TIMEOUT_MS);
173-
}
171+
// A cold registry read returns no backing, so this is simply skipped until loaded.
174172
const defaultQueue = project.defaultWorkerGroup?.masterQueue;
175173
if (defaultQueue && backingForQueue(defaultQueue, workerRegionRegistry.current() ?? [])) {
176-
if (!globalFlagsRegistry.isLoaded) {
177-
await globalFlagsRegistry.waitUntilReady(env.GLOBAL_FLAGS_READY_TIMEOUT_MS);
178-
}
179174
const decision = {
180175
orgId: project.organization.id,
181176
orgFeatureFlags: project.organization.featureFlags as Record<string, unknown> | null,
@@ -198,7 +193,8 @@ export class ComputeTemplateCreationService {
198193
migrated = isOrgMigrated({ ...decision, planType });
199194
}
200195
if (migrated) {
201-
return "shadow";
196+
// required => template built at deploy (deploy fails on error); off => shadow.
197+
return decision.flags?.computeMigrationRequireTemplate ? "required" : "shadow";
202198
}
203199
}
204200

apps/webapp/app/v3/workerRegions.server.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,10 @@ export function backingForQueue(
5050

5151
/**
5252
* In-memory snapshot of every worker group's (queue, region, type, hidden),
53-
* refreshed on an interval. Read synchronously on the hot path; callers gate the
54-
* first read on `waitUntilReady`. DB-backed source of truth for region<->backing
55-
* resolution (replaces the old env-var backing map).
53+
* refreshed on an interval. Read synchronously on the hot path; a cold read
54+
* returns undefined (callers default safely - no backing, so not-migrated).
55+
* DB-backed source of truth for region<->backing resolution (replaces the old
56+
* env-var backing map).
5657
*/
5758
export const workerRegionRegistry = singleton("workerRegionRegistry", () =>
5859
createReloadingRegistry<WorkerGroupRegionRow[]>({
Lines changed: 2 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { describe, it, expect, vi } from "vitest";
1+
import { describe, it, expect } from "vitest";
22
import { createReloadingRegistry } from "~/utils/reloadingRegistry.server";
33

44
describe("createReloadingRegistry", () => {
@@ -15,32 +15,6 @@ describe("createReloadingRegistry", () => {
1515
reg.stop();
1616
});
1717

18-
it("waitUntilReady resolves once loaded", async () => {
19-
const reg = createReloadingRegistry({
20-
name: "test-b",
21-
intervalMs: 10_000,
22-
load: async () => 1,
23-
});
24-
await reg.waitUntilReady(1000);
25-
expect(reg.current()).toBe(1);
26-
reg.stop();
27-
});
28-
29-
it("waitUntilReady times out (and stays unloaded) when load never succeeds", async () => {
30-
const reg = createReloadingRegistry({
31-
name: "test-c",
32-
intervalMs: 10_000,
33-
retry: { retries: 0 },
34-
load: async () => {
35-
throw new Error("db down");
36-
},
37-
});
38-
await reg.waitUntilReady(50);
39-
expect(reg.isLoaded).toBe(false);
40-
expect(reg.current()).toBeUndefined();
41-
reg.stop();
42-
});
43-
4418
it("reload() picks up a changed value", async () => {
4519
let v = 1;
4620
const reg = createReloadingRegistry({
@@ -86,7 +60,7 @@ describe("createReloadingRegistry", () => {
8660
reg.stop();
8761
});
8862

89-
it("autoStart:false stays inert and non-blocking", async () => {
63+
it("autoStart:false stays inert (never loads)", async () => {
9064
let loadCalls = 0;
9165
const reg = createReloadingRegistry({
9266
name: "test-inert",
@@ -99,37 +73,7 @@ describe("createReloadingRegistry", () => {
9973
});
10074
expect(reg.isLoaded).toBe(false);
10175
expect(reg.current()).toBeUndefined();
102-
await reg.waitUntilReady(10_000); // must resolve ~immediately, not wait 10s
103-
expect(reg.isLoaded).toBe(false);
10476
expect(loadCalls).toBe(0); // never hit the DB/load
10577
reg.stop();
10678
});
107-
108-
it("waitUntilReady clears its timeout when ready wins", async () => {
109-
const clearSpy = vi.spyOn(global, "clearTimeout");
110-
// load resolves only when the test releases it, so waitUntilReady runs the
111-
// race while still unloaded (it would return early if already loaded)
112-
let releaseLoad!: () => void;
113-
const loadGate = new Promise<void>((resolve) => {
114-
releaseLoad = resolve;
115-
});
116-
const reg = createReloadingRegistry({
117-
name: "test-f",
118-
intervalMs: 10_000,
119-
load: async () => {
120-
await loadGate;
121-
return 1;
122-
},
123-
});
124-
125-
// long timeout so isReady is what actually wins the race
126-
const waiting = reg.waitUntilReady(10_000);
127-
releaseLoad();
128-
await reg.isReady;
129-
await waiting;
130-
131-
expect(clearSpy).toHaveBeenCalled();
132-
clearSpy.mockRestore();
133-
reg.stop();
134-
});
13579
});

0 commit comments

Comments
 (0)