diff --git a/README.md b/README.md
index e01550476..323865194 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ npm install react-native-webgpu
 ## With Expo
 
 Expo provides a React Native WebGPU template that works with React Three Fiber.
-The works on iOS, Android, and Web.
+This works on iOS, Android, and Web.
 
 ```
 npx create-expo-app@latest -e with-webgpu
@@ -174,8 +174,7 @@ ctx.canvas.height = ctx.canvas.clientHeight * PixelRatio.get();
 
 ### Frame Scheduling
 
-In React Native, we want to keep frame presentation as a manual operation as we plan to provide more advanced rendering options that are React Native specific.  
-This means that when you are ready to present a frame, you need to call `present` on the context.
+In React Native, frame presentation is a manual operation: when you are ready to present a frame, call `present()` on the context after submitting your commands to the queue. This works the same on every runtime: the main JS runtime, the Reanimated UI runtime, and dedicated worklet runtimes (`createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame processor). `present()` runs synchronously on the calling thread, so the frame is presented from whichever thread did the rendering.
 
 ```tsx
 // draw
@@ -185,6 +184,13 @@ device.queue.submit([commandEncoder.finish()]);
 context.present();
 ```
 
+### Threading model
+
+react-native-webgpu can drive WebGPU from more than one JavaScript runtime: the main JS runtime, the Reanimated UI runtime, and dedicated worklet runtimes (`createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame processor).
+This module also works well with [Bundle Mode](https://docs.swmansion.com/react-native-worklets/docs/bundleMode/) and lets you run complex Three.js scenes on the UI thread or dedicated worklet threads.
+
+There is a caveat with `device.lost` and `uncapturederror`: they are only delivered on the main JS runtime. This is usually fine because the GPU device is typically created on the main JS thread and then sent to the UI or a dedicated worklet thread. However, if you create the device outside the main JS thread, be aware that the `device.lost` promise will never resolve and `uncapturederror` events will never fire.
+
 ### Canvas Transparency
 
 On Android, the `alphaMode` property is ignored when configuring the canvas.
@@ -293,10 +299,10 @@ const render = () => {
 
   // ... encode a pass that samples `externalTexture`, then:
   device.queue.submit([encoder.finish()]);
+  context.present();
 
   // Release the surface's access window right after the submit that sampled it.
   externalTexture.destroy();
-  context.present();
 };
 ```
 
@@ -316,14 +322,21 @@ First, install the optional peer dependencies:
 npm install react-native-reanimated react-native-worklets
 ```
 
-WebGPU objects are automatically registered for Worklets serialization when the module loads. You can pass WebGPU objects like `GPUDevice` and `GPUCanvasContext` directly to worklets:
+WebGPU objects are automatically registered for Worklets serialization when the module loads. You can pass WebGPU objects like `GPUDevice` and `GPUCanvasContext` directly to worklets.
+Call `installWebGPU()` once at the top of the worklet to install flag constants like `GPUBufferUsage`, `GPUTextureUsage`, and so on.
 
 ```tsx
-import { Canvas } from "react-native-webgpu";
+import { Canvas, installWebGPU } from "react-native-webgpu";
 import { runOnUI } from "react-native-reanimated";
 
 const renderFrame = (device: GPUDevice, context: GPUCanvasContext) => {
   "worklet";
+  installWebGPU();
+  // WebGPU constants are now available on this worklet thread
+  const buffer = device.createBuffer({
+    size: 16, // size in bytes
+    usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+  });
   // WebGPU rendering code runs on the UI thread
   const commandEncoder = device.createCommandEncoder();
   // ... render ...
diff --git a/apps/example/ios/Podfile.lock b/apps/example/ios/Podfile.lock
index fd5ba968c..560141dff 100644
--- a/apps/example/ios/Podfile.lock
+++ b/apps/example/ios/Podfile.lock
@@ -1924,7 +1924,7 @@ PODS:
     - ReactCommon/turbomodule/core
     - SocketRocket
     - Yoga
-  - react-native-wgpu (0.5.12):
+  - react-native-webgpu (0.5.15):
     - boost
     - DoubleConversion
     - fast_float
@@ -2812,7 +2812,7 @@ DEPENDENCIES:
   - React-microtasksnativemodule (from `../../../node_modules/react-native/ReactCommon/react/nativemodule/microtasks`)
   - react-native-safe-area-context (from `../../../node_modules/react-native-safe-area-context`)
   - "react-native-skia (from `../../../node_modules/@shopify/react-native-skia`)"
-  - react-native-wgpu (from `../../../node_modules/react-native-wgpu`)
+  - react-native-webgpu (from `../../../node_modules/react-native-webgpu`)
   - React-NativeModulesApple (from `../../../node_modules/react-native/ReactCommon/react/nativemodule/core/platform/ios`)
   - React-oscompat (from `../../../node_modules/react-native/ReactCommon/oscompat`)
   - React-perflogger (from `../../../node_modules/react-native/ReactCommon/reactperflogger`)
@@ -2948,8 +2948,8 @@ EXTERNAL SOURCES:
     :path: "../../../node_modules/react-native-safe-area-context"
   react-native-skia:
     :path: "../../../node_modules/@shopify/react-native-skia"
-  react-native-wgpu:
-    :path: "../../../node_modules/react-native-wgpu"
+  react-native-webgpu:
+    :path: "../../../node_modules/react-native-webgpu"
   React-NativeModulesApple:
     :path: "../../../node_modules/react-native/ReactCommon/react/nativemodule/core/platform/ios"
   React-oscompat:
@@ -3074,7 +3074,7 @@ SPEC CHECKSUMS:
   React-microtasksnativemodule: 75b6604b667d297292345302cc5bfb6b6aeccc1b
   react-native-safe-area-context: c00143b4823773bba23f2f19f85663ae89ceb460
   react-native-skia: fc73e9bdc46ebb420a98c9c2be29fee80f565e79
-  react-native-wgpu: 274ffec11ee3a082260d9f3d1fb54030a5ca0873
+  react-native-webgpu: 02d51c1d86e4d653de06bdc954d2f693dcead7a5
   React-NativeModulesApple: 879fbdc5dcff7136abceb7880fe8a2022a1bd7c3
   React-oscompat: 93b5535ea7f7dff46aaee4f78309a70979bdde9d
   React-perflogger: 5536d2df3d18fe0920263466f7b46a56351c0510
diff --git a/apps/example/src/CanvasAPI/CanvasAPI.tsx b/apps/example/src/CanvasAPI/CanvasAPI.tsx
index a6fc2bd32..f5815169d 100644
--- a/apps/example/src/CanvasAPI/CanvasAPI.tsx
+++ b/apps/example/src/CanvasAPI/CanvasAPI.tsx
@@ -89,7 +89,6 @@ export const CanvasAPI = () => {
             passEncoder.end();
 
             device.queue.submit([commandEncoder.finish()]);
-
             context.present();
           })()
         }
diff --git a/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx b/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx
index 4027baf63..9fd16463e 100644
--- a/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx
+++ b/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx
@@ -244,10 +244,10 @@ export const ImportExternalTexture = () => {
 
       pass.end();
       device.queue.submit([encoder.finish()]);
+      context.present();
       // Now that the work sampling it has been submitted, end the external
       // texture's access window so the frame's surface is released promptly.
       externalTex?.destroy();
-      context.present();
       rafRef.current = requestAnimationFrame(render);
     };
     rafRef.current = requestAnimationFrame(render);
diff --git a/apps/example/src/Reanimated/AsyncBuffer.tsx b/apps/example/src/Reanimated/AsyncBuffer.tsx
new file mode 100644
index 000000000..9847b612b
--- /dev/null
+++ b/apps/example/src/Reanimated/AsyncBuffer.tsx
@@ -0,0 +1,232 @@
+import React, { useEffect, useRef, useState } from "react";
+import { Pressable, StyleSheet, Text, View } from "react-native";
+import type { CanvasRef, RNCanvasContext } from "react-native-webgpu";
+import { Canvas, GPUBufferUsage, GPUMapMode } from "react-native-webgpu";
+import type { SharedValue } from "react-native-reanimated";
+import { useSharedValue } from "react-native-reanimated";
+
+import { redFragWGSL, triangleVertWGSL } from "../Triangle/triangle";
+
+// A triangle demo that creates its adapter/device AND performs an async GPU
+// readback (buffer.mapAsync) every frame, all on the runtime this worklet runs
+// on. With the ProcessEvents async model the device must be created and used on
+// the same runtime, so requestAdapter/requestDevice happen here in the worklet
+// (the GPU object is passed in). The point: with the JS thread busy, the readback
+// keeps resolving on this runtime's own thread and the triangle keeps animating.
+//
+// GPUBufferUsage / GPUMapMode are imported from react-native-webgpu: the bare
+// globals are only installed on the main JS runtime, but importing them lets the
+// Worklets serializer capture them by closure, so they work on this runtime too.
+export const webGPUAsyncDemo = (
+  runAnimation: SharedValue<boolean>, // frames keep being scheduled while this is true
+  context: RNCanvasContext, // canvas context this worklet configures, draws to and presents
+  gpu: GPU, // used to request the adapter and device from inside the worklet
+  presentationFormat: GPUTextureFormat, // format for the canvas and the pipeline's color target
+) => {
+  "worklet";
+  if (!context) {
+    throw new Error("No context");
+  }
+
+  // Errors thrown on a worklet are forwarded to the JS thread by the worklets
+  // runtime; if the error object transitively references WebGPU host objects,
+  // JSON.stringify of it on the JS side can crash. So we catch everything here
+  // and log only a plain string built from the error's message (or the value).
+  const logError = (where: string, e: unknown) => {
+    console.error(
+      `[asyncBuffer] ${where}: ` +
+        String((e as { message?: string })?.message ?? e),
+    );
+  };
+
+  const run = async () => { // one-time setup, then starts the frame loop
+    const adapter = await gpu.requestAdapter();
+    if (!adapter) {
+      console.error("[asyncBuffer] failed to get adapter on worklet runtime");
+      return;
+    }
+    const device = await adapter.requestDevice();
+    if (!device) {
+      console.error("[asyncBuffer] failed to get device on worklet runtime");
+      return;
+    }
+    console.log("[asyncBuffer] device created on worklet runtime");
+
+    context.configure({
+      device,
+      format: presentationFormat,
+      alphaMode: "premultiplied",
+    });
+
+    const pipeline = device.createRenderPipeline({
+      layout: "auto",
+      vertex: {
+        module: device.createShaderModule({ code: triangleVertWGSL }),
+        entryPoint: "main",
+      },
+      fragment: {
+        module: device.createShaderModule({ code: redFragWGSL }),
+        entryPoint: "main",
+        targets: [{ format: presentationFormat }],
+      },
+      primitive: { topology: "triangle-list" },
+    });
+
+    const SIZE = 16; // 4 x f32: [frameId, r, g, b]
+    const readback = device.createBuffer({ // reused every frame as the copy destination
+      size: SIZE,
+      usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+    });
+
+    let frameId = 0; // incremented at the start of every frame
+
+    const frame = async () => {
+      try {
+        frameId += 1;
+        const commandEncoder = device.createCommandEncoder();
+        const textureView = context.getCurrentTexture().createView();
+
+        const time = Date.now() / 1000; // seconds; drives the animated clear color
+        const r = (Math.sin(time * 2) + 1) / 2;
+        const g = (Math.sin(time * 1.5 + Math.PI / 3) + 1) / 2;
+        const b = (Math.sin(time + Math.PI / 2) + 1) / 2;
+
+        const passEncoder = commandEncoder.beginRenderPass({
+          colorAttachments: [
+            {
+              view: textureView,
+              clearValue: [r, g, b, 1],
+              loadOp: "clear",
+              storeOp: "store",
+            },
+          ],
+        });
+        passEncoder.setPipeline(pipeline);
+        passEncoder.draw(3);
+        passEncoder.end();
+
+        const src = device.createBuffer({ // per-frame staging buffer, filled while mapped at creation
+          size: SIZE,
+          usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE,
+          mappedAtCreation: true,
+        });
+        new Float32Array(src.getMappedRange()).set([frameId, r, g, b]);
+        src.unmap();
+        commandEncoder.copyBufferToBuffer(src, 0, readback, 0, SIZE);
+
+        device.queue.submit([commandEncoder.finish()]);
+
+        // THE ASYNC OP. With the ProcessEvents model this Promise is pumped and
+        // settled on THIS runtime's own thread, so it resolves even while the JS
+        // thread is busy. Watch the logs against the "Make JS busy" button.
+        await readback.mapAsync(GPUMapMode.READ);
+        const data = Array.from(new Float32Array(readback.getMappedRange()));
+        readback.unmap();
+        src.destroy(); // NOTE(review): skipped if anything above throws
+        if (frameId % 30 === 0) {
+          console.log(`[asyncBuffer] frame ${frameId} resolved ->`, data);
+        }
+
+        context.present(); // presented only after the readback above has resolved
+
+        if (runAnimation.value) { // stop rescheduling once the caller clears the flag
+          requestAnimationFrame(frame);
+        }
+      } catch (e) { // NOTE(review): a caught error also ends the loop; nothing reschedules frame
+        logError("frame", e);
+      }
+    };
+    frame(); // kick off the loop; frame handles its own errors
+  };
+  run().catch((e) => logError("run", e)); // setup failures are logged, never rethrown
+};
+
+interface AsyncBufferExampleProps {
+  // Binds the worklet to a target runtime and returns the function that
+  // schedules it there (e.g. runOnUI, or (w) => runOnRuntime(runtime, w)).
+  run: (
+    worklet: typeof webGPUAsyncDemo,
+  ) => (
+    runAnimation: SharedValue<boolean>,
+    context: RNCanvasContext,
+    gpu: GPU,
+    presentationFormat: GPUTextureFormat,
+  ) => void;
+}
+
+export function AsyncBufferExample({ run }: AsyncBufferExampleProps) {
+  const runAnimation = useSharedValue(true); // cleared on unmount to stop the worklet loop
+  const ref = useRef<CanvasRef>(null);
+  const [busy, setBusy] = useState(false); // toggled by the button below
+
+  // Hammer the JS thread to prove the worklet's async readback + rendering are
+  // independent of it. Each tick blocks the JS thread for 250ms.
+  useEffect(() => {
+    if (!busy) {
+      return;
+    }
+    let job = requestAnimationFrame(function work() {
+      const start = Date.now();
+      while (Date.now() - start < 250) {
+        // Busy-wait, blocking the JS thread.
+      }
+      job = requestAnimationFrame(work);
+    });
+    return () => cancelAnimationFrame(job); // cancels whichever tick is currently pending
+  }, [busy]);
+
+  useEffect(() => {
+    const ctx = ref.current!.getContext("webgpu"); // NOTE(review): assumes the Canvas ref is attached when this effect runs
+    if (!ctx) {
+      console.error("Failed to get GPU canvas context");
+      return;
+    }
+    // The GPU object is created on the main runtime; we hand it to the worklet,
+    // which calls requestAdapter/requestDevice on its OWN runtime.
+    const { gpu } = navigator;
+    const presentationFormat = gpu.getPreferredCanvasFormat();
+    run(webGPUAsyncDemo)(runAnimation, ctx, gpu, presentationFormat);
+    return () => {
+      runAnimation.value = false;
+    };
+    // Init the GPU pipeline once on mount. Toggling `busy` must NOT re-run this
+    // (a second device + render loop would fight over the same surface and
+    // trigger a device-mismatch validation error).
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, []);
+
+  return (
+    <View style={style.container}>
+      <Canvas ref={ref} style={style.webgpu} />
+      <Pressable style={style.button} onPress={() => setBusy((b) => !b)}>
+        <Text style={style.buttonText}>
+          {busy ? "Stop busy JS" : "Make JS busy"}
+        </Text>
+      </Pressable>
+    </View>
+  );
+}
+
+const style = StyleSheet.create({
+  container: {
+    flex: 1,
+    backgroundColor: "rgb(90, 180, 255)",
+  },
+  webgpu: { // the canvas takes all the space of the container
+    flex: 1,
+  },
+  button: { // absolutely positioned overlay, centered 32 above the bottom edge
+    position: "absolute",
+    bottom: 32,
+    alignSelf: "center",
+    backgroundColor: "rgba(0,0,0,0.6)",
+    paddingHorizontal: 20,
+    paddingVertical: 12,
+    borderRadius: 24,
+  },
+  buttonText: {
+    color: "white",
+    fontSize: 16,
+    fontWeight: "600",
+  },
+});
diff --git a/apps/example/src/Reanimated/AsyncBufferDedicatedThread.tsx b/apps/example/src/Reanimated/AsyncBufferDedicatedThread.tsx
new file mode 100644
index 000000000..78d9efe10
--- /dev/null
+++ b/apps/example/src/Reanimated/AsyncBufferDedicatedThread.tsx
@@ -0,0 +1,14 @@
+import React, { useMemo } from "react";
+import { createWorkletRuntime, runOnRuntime } from "react-native-worklets";
+
+import { AsyncBufferExample } from "./AsyncBuffer";
+
+export const AsyncBufferDedicatedThread = () => { // async-buffer demo on its own worklet runtime
+  const runtime = useMemo(
+    () => createWorkletRuntime({ name: "WebGPUAsyncBufferRuntime" }),
+    [], // empty deps: the runtime is memoized across re-renders
+  );
+  return (
+    <AsyncBufferExample run={(worklet) => runOnRuntime(runtime, worklet)} />
+  );
+};
diff --git a/apps/example/src/Reanimated/AsyncBufferUIThread.tsx b/apps/example/src/Reanimated/AsyncBufferUIThread.tsx
new file mode 100644
index 000000000..c310e07a1
--- /dev/null
+++ b/apps/example/src/Reanimated/AsyncBufferUIThread.tsx
@@ -0,0 +1,8 @@
+import React from "react";
+import { runOnUI } from "react-native-reanimated";
+
+import { AsyncBufferExample } from "./AsyncBuffer";
+
+export const AsyncBufferUIThread = () => { // async-buffer demo scheduled with Reanimated's runOnUI
+  return <AsyncBufferExample run={runOnUI} />;
+};
diff --git a/apps/example/src/Reanimated/List.tsx b/apps/example/src/Reanimated/List.tsx
index 6531786aa..71446fd1d 100644
--- a/apps/example/src/Reanimated/List.tsx
+++ b/apps/example/src/Reanimated/List.tsx
@@ -19,6 +19,14 @@ export const examples = [
     screen: "FrameProcessor",
     title: "📷 Frame Processor",
   },
+  {
+    screen: "AsyncBufferUIThread",
+    title: "🧵 Async Buffer (UI)",
+  },
+  {
+    screen: "AsyncBufferDedicatedThread",
+    title: "🔀 Async Buffer (Dedicated)",
+  },
 ] as const;
 
 const styles = StyleSheet.create({
diff --git a/apps/example/src/Reanimated/Reanimated.tsx b/apps/example/src/Reanimated/Reanimated.tsx
index 74392d03d..f48266d05 100644
--- a/apps/example/src/Reanimated/Reanimated.tsx
+++ b/apps/example/src/Reanimated/Reanimated.tsx
@@ -78,8 +78,11 @@ export const webGPUDemo = (
     passEncoder.end();
 
     device.queue.submit([commandEncoder.finish()]);
-
+    // Present runs on the calling thread, so it works the same whether this
+    // renders on the UI runtime (UIThread) or a dedicated worklet runtime
+    // (DedicatedThread).
     context.present();
+
     if (runAnimation.value) {
       requestAnimationFrame(frame);
     }
diff --git a/apps/example/src/Reanimated/Routes.ts b/apps/example/src/Reanimated/Routes.ts
index d39029d66..51fedd064 100644
--- a/apps/example/src/Reanimated/Routes.ts
+++ b/apps/example/src/Reanimated/Routes.ts
@@ -3,4 +3,6 @@ export type Routes = {
   UIThread: undefined;
   DedicatedThread: undefined;
   FrameProcessor: undefined;
+  AsyncBufferUIThread: undefined;
+  AsyncBufferDedicatedThread: undefined;
 };
diff --git a/apps/example/src/Reanimated/index.tsx b/apps/example/src/Reanimated/index.tsx
index 7200678e2..1f2310317 100644
--- a/apps/example/src/Reanimated/index.tsx
+++ b/apps/example/src/Reanimated/index.tsx
@@ -6,6 +6,8 @@ import { List } from "./List";
 import { UIThread } from "./UIThread";
 import { DedicatedThread } from "./DedicatedThread";
 import { FrameProcessor } from "./FrameProcessor";
+import { AsyncBufferUIThread } from "./AsyncBufferUIThread";
+import { AsyncBufferDedicatedThread } from "./AsyncBufferDedicatedThread";
 
 const Stack = createStackNavigator<Routes>();
 export const Reanimated = () => {
@@ -40,6 +42,20 @@ export const Reanimated = () => {
           title: "📷 Frame Processor",
         }}
       />
+      <Stack.Screen
+        name="AsyncBufferUIThread"
+        component={AsyncBufferUIThread}
+        options={{
+          title: "🧵 Async Buffer (UI)",
+        }}
+      />
+      <Stack.Screen
+        name="AsyncBufferDedicatedThread"
+        component={AsyncBufferDedicatedThread}
+        options={{
+          title: "🔀 Async Buffer (Dedicated)",
+        }}
+      />
     </Stack.Navigator>
   );
 };
diff --git a/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx b/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx
index 0e48aa2b7..071bfb92e 100644
--- a/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx
+++ b/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx
@@ -185,8 +185,7 @@ export function StorageBufferVertices() {
 
     const commandBuffer = encoder.finish();
     device.queue.submit([commandBuffer]);
-    // eslint-disable-next-line @typescript-eslint/no-explicit-any
-    (context as any).present();
+    context.present();
   });
 
   return (
diff --git a/apps/example/src/ThreeJS/Backdrop.tsx b/apps/example/src/ThreeJS/Backdrop.tsx
index 64bd63bbe..12189ce60 100644
--- a/apps/example/src/ThreeJS/Backdrop.tsx
+++ b/apps/example/src/ThreeJS/Backdrop.tsx
@@ -150,7 +150,7 @@ export const Backdrop = () => {
       }
 
       renderer.render(scene, camera);
-      context!.present();
+      context.present();
     }
     return () => {
       renderer.setAnimationLoop(null);
diff --git a/apps/example/src/ThreeJS/Helmet.tsx b/apps/example/src/ThreeJS/Helmet.tsx
index cbf16011e..0dbb8dd91 100644
--- a/apps/example/src/ThreeJS/Helmet.tsx
+++ b/apps/example/src/ThreeJS/Helmet.tsx
@@ -49,7 +49,7 @@ export const Helmet = () => {
     function animate() {
       animateCamera();
       renderer.render(scene, camera);
-      context!.present();
+      context.present();
     }
 
     return () => {
diff --git a/apps/example/src/ThreeJS/InstancedMesh.tsx b/apps/example/src/ThreeJS/InstancedMesh.tsx
index 208c0afbc..42f489f0d 100644
--- a/apps/example/src/ThreeJS/InstancedMesh.tsx
+++ b/apps/example/src/ThreeJS/InstancedMesh.tsx
@@ -59,7 +59,6 @@ export const InstancedMesh = () => {
 
     function animate() {
       render();
-      context!.present();
     }
 
     function render() {
@@ -88,6 +87,7 @@ export const InstancedMesh = () => {
       }
 
       renderer.render(scene, camera);
+      context.present();
     }
     return () => {
       renderer.setAnimationLoop(null);
diff --git a/apps/example/src/ThreeJS/PostProcessing.tsx b/apps/example/src/ThreeJS/PostProcessing.tsx
index 2186b26b1..1698563c0 100644
--- a/apps/example/src/ThreeJS/PostProcessing.tsx
+++ b/apps/example/src/ThreeJS/PostProcessing.tsx
@@ -72,7 +72,7 @@ export const PostProcessing = () => {
         mixer.update(delta);
       }
       postProcessing.render();
-      context!.present();
+      context.present();
     }
     return () => {
       renderer.setAnimationLoop(null);
diff --git a/apps/example/src/ThreeJS/components/FiberCanvas.tsx b/apps/example/src/ThreeJS/components/FiberCanvas.tsx
index a72d18b13..8b6eaaf8f 100644
--- a/apps/example/src/ThreeJS/components/FiberCanvas.tsx
+++ b/apps/example/src/ThreeJS/components/FiberCanvas.tsx
@@ -66,7 +66,7 @@ export const FiberCanvas = ({
         const renderFrame = state.gl.render.bind(state.gl);
         state.gl.render = (s: THREE.Scene, c: THREE.Camera) => {
           renderFrame(s, c);
-          context?.present();
+          context.present();
         };
       },
     });
diff --git a/apps/example/src/Triangle/HelloTriangle.tsx b/apps/example/src/Triangle/HelloTriangle.tsx
index 56ec732bc..12bf56653 100644
--- a/apps/example/src/Triangle/HelloTriangle.tsx
+++ b/apps/example/src/Triangle/HelloTriangle.tsx
@@ -77,7 +77,6 @@ export function HelloTriangle() {
       passEncoder.end();
 
       device.queue.submit([commandEncoder.finish()]);
-
       context.present();
     })();
   }, [ref]);
diff --git a/apps/example/src/Triangle/HelloTriangleMSAA.tsx b/apps/example/src/Triangle/HelloTriangleMSAA.tsx
index 481063830..19a692a2f 100644
--- a/apps/example/src/Triangle/HelloTriangleMSAA.tsx
+++ b/apps/example/src/Triangle/HelloTriangleMSAA.tsx
@@ -84,10 +84,10 @@ export function HelloTriangleMSAA() {
         passEncoder.end();
 
         device.queue.submit([commandEncoder.finish()]);
+        context.present();
       }
 
       frame();
-      context.present();
     })();
   }, [ref]);
 
diff --git a/apps/example/src/VisionCamera/VisionCamera.tsx b/apps/example/src/VisionCamera/VisionCamera.tsx
index 8f196f937..c2571c4f8 100644
--- a/apps/example/src/VisionCamera/VisionCamera.tsx
+++ b/apps/example/src/VisionCamera/VisionCamera.tsx
@@ -613,11 +613,13 @@ const CameraView = () => {
           pass.draw(3);
           pass.end();
           device.queue.submit([encoder.finish()]);
+          // Vision Camera frame processors run on a dedicated worklet runtime;
+          // present runs on that thread, presenting the frame we just rendered.
+          context.present();
           // The work sampling it is submitted, so end the external texture's
           // access window now to release the camera frame's surface promptly
           // (don't wait for GC, which would starve the frame buffer pool).
           externalTex.destroy();
-          context.present();
         } finally {
           videoFrame.release();
         }
diff --git a/apps/example/src/components/useWebGPU.ts b/apps/example/src/components/useWebGPU.ts
index 196a39c26..68cce550f 100644
--- a/apps/example/src/components/useWebGPU.ts
+++ b/apps/example/src/components/useWebGPU.ts
@@ -4,10 +4,11 @@ import {
   useDevice,
   type CanvasRef,
   type NativeCanvas,
+  type RNCanvasContext,
 } from "react-native-webgpu";
 
 interface SceneProps {
-  context: GPUCanvasContext;
+  context: RNCanvasContext;
   device: GPUDevice;
   gpu: GPU;
   presentationFormat: GPUTextureFormat;
diff --git a/packages/webgpu-shim/README.md b/packages/webgpu-shim/README.md
index f23e4f6e7..8ae240b65 100644
--- a/packages/webgpu-shim/README.md
+++ b/packages/webgpu-shim/README.md
@@ -1,30 +1,9 @@
 # react-native-wgpu
 
-This package is a thin shim that re-exports [`react-native-webgpu`](https://www.npmjs.com/package/react-native-webgpu) under its previous npm name.
+This package has been renamed to [`react-native-webgpu`](https://www.npmjs.com/package/react-native-webgpu).
 
-It exists so that projects that depended on the older `react-native-wgpu` name keep working without an immediate code change. New projects should depend on `react-native-webgpu` directly.
-
-## Installation
+Please use `react-native-webgpu` instead.
 
 ```
-npm install react-native-wgpu
+npm install react-native-webgpu
 ```
-
-This installs `react-native-webgpu` as a dependency. All imports are forwarded:
-
-```ts
-import { Canvas } from "react-native-wgpu";
-// equivalent to
-import { Canvas } from "react-native-webgpu";
-```
-
-## Migrating
-
-Replace the dependency in your `package.json`:
-
-```diff
--  "react-native-wgpu": "^0.5.11"
-+  "react-native-webgpu": "^0.5.11"
-```
-
-and update your imports from `"react-native-wgpu"` to `"react-native-webgpu"`.
diff --git a/packages/webgpu-shim/package.json b/packages/webgpu-shim/package.json
index f1b29c1c0..f9318ed6d 100644
--- a/packages/webgpu-shim/package.json
+++ b/packages/webgpu-shim/package.json
@@ -1,6 +1,6 @@
 {
   "name": "react-native-wgpu",
-  "version": "0.5.14",
+  "version": "0.5.15",
   "description": "Shim that re-exports react-native-webgpu under its previous package name",
   "main": "lib/commonjs/index",
   "module": "lib/module/index",
diff --git a/packages/webgpu/README.md b/packages/webgpu/README.md
index e01550476..323865194 100644
--- a/packages/webgpu/README.md
+++ b/packages/webgpu/README.md
@@ -15,7 +15,7 @@ npm install react-native-webgpu
 ## With Expo
 
 Expo provides a React Native WebGPU template that works with React Three Fiber.
-The works on iOS, Android, and Web.
+This works on iOS, Android, and Web.
 
 ```
 npx create-expo-app@latest -e with-webgpu
@@ -174,8 +174,7 @@ ctx.canvas.height = ctx.canvas.clientHeight * PixelRatio.get();
 
 ### Frame Scheduling
 
-In React Native, we want to keep frame presentation as a manual operation as we plan to provide more advanced rendering options that are React Native specific.  
-This means that when you are ready to present a frame, you need to call `present` on the context.
+In React Native, frame presentation is a manual operation: when you are ready to present a frame, call `present()` on the context after submitting your commands to the queue. This works the same on every runtime: the main JS runtime, the Reanimated UI runtime, and dedicated worklet runtimes (`createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame processor). `present()` runs synchronously on the calling thread, so the frame is presented from whichever thread did the rendering.
 
 ```tsx
 // draw
@@ -185,6 +184,13 @@ device.queue.submit([commandEncoder.finish()]);
 context.present();
 ```
 
+### Threading model
+
+react-native-webgpu can drive WebGPU from more than one JavaScript runtime: the main JS runtime, the Reanimated UI runtime, and dedicated worklet runtimes (`createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame processor).
+This module also works well with [Bundle Mode](https://docs.swmansion.com/react-native-worklets/docs/bundleMode/) and lets you run complex Three.js scenes on the UI thread or dedicated worklet threads.
+
+There is a caveat with `device.lost` and `uncapturederror`: they are only delivered on the main JS runtime. This is usually fine because the GPU device is typically created on the main JS thread and then sent to the UI or a dedicated worklet thread. However, if you create the device outside the main JS thread, be aware that the `device.lost` promise will never resolve and `uncapturederror` events will never fire.
+
 ### Canvas Transparency
 
 On Android, the `alphaMode` property is ignored when configuring the canvas.
@@ -293,10 +299,10 @@ const render = () => {
 
   // ... encode a pass that samples `externalTexture`, then:
   device.queue.submit([encoder.finish()]);
+  context.present();
 
   // Release the surface's access window right after the submit that sampled it.
   externalTexture.destroy();
-  context.present();
 };
 ```
 
@@ -316,14 +322,21 @@ First, install the optional peer dependencies:
 npm install react-native-reanimated react-native-worklets
 ```
 
-WebGPU objects are automatically registered for Worklets serialization when the module loads. You can pass WebGPU objects like `GPUDevice` and `GPUCanvasContext` directly to worklets:
+WebGPU objects are automatically registered for Worklets serialization when the module loads. You can pass WebGPU objects like `GPUDevice` and `GPUCanvasContext` directly to worklets.
+Call `installWebGPU()` once at the top of the worklet to install flag constants like `GPUBufferUsage`, `GPUTextureUsage`, and so on.
 
 ```tsx
-import { Canvas } from "react-native-webgpu";
+import { Canvas, installWebGPU } from "react-native-webgpu";
 import { runOnUI } from "react-native-reanimated";
 
 const renderFrame = (device: GPUDevice, context: GPUCanvasContext) => {
   "worklet";
+  installWebGPU();
+  // WebGPU constants are now available on this worklet thread
+  const buffer = device.createBuffer({
+    size: 16, // size in bytes
+    usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+  });
   // WebGPU rendering code runs on the UI thread
   const commandEncoder = device.createCommandEncoder();
   // ... render ...
diff --git a/packages/webgpu/android/CMakeLists.txt b/packages/webgpu/android/CMakeLists.txt
index c2e25f54d..8f7321b7f 100644
--- a/packages/webgpu/android/CMakeLists.txt
+++ b/packages/webgpu/android/CMakeLists.txt
@@ -51,9 +51,8 @@ add_library(${PACKAGE_NAME} SHARED
     ../cpp/jsi/Promise.cpp
     ../cpp/jsi/RuntimeLifecycleMonitor.cpp
     ../cpp/jsi/RuntimeAwareCache.cpp
-    ../cpp/rnwgpu/async/AsyncRunner.cpp
+    ../cpp/rnwgpu/async/RuntimeContext.cpp
     ../cpp/rnwgpu/async/AsyncTaskHandle.cpp
-    ../cpp/rnwgpu/async/JSIMicrotaskDispatcher.cpp
 )
 
 target_include_directories(
diff --git a/packages/webgpu/cpp/jsi/NativeObject.h b/packages/webgpu/cpp/jsi/NativeObject.h
index a90927721..d3e09ed5c 100644
--- a/packages/webgpu/cpp/jsi/NativeObject.h
+++ b/packages/webgpu/cpp/jsi/NativeObject.h
@@ -439,6 +439,29 @@ class NativeObject : public jsi::NativeState,
     prototype.setProperty(runtime, name, func);
   }
 
+  /**
+   * Install a method whose native implementation needs the calling jsi::Runtime
+   * as its first parameter. Used by entry points that must act per-runtime
+   * (e.g. GPU::requestAdapter, which creates a per-runtime RuntimeContext).
+   */
+  template <typename ReturnType, typename... Args>
+  static void
+  installMethodWithRuntime(jsi::Runtime &runtime, jsi::Object &prototype,
+                           const char *name,
+                           ReturnType (Derived::*method)(jsi::Runtime &,
+                                                         Args...)) {
+    auto func = jsi::Function::createFromHostFunction(
+        runtime, jsi::PropNameID::forUtf8(runtime, name), sizeof...(Args),
+        [method](jsi::Runtime &rt, const jsi::Value &thisVal,
+                 const jsi::Value *args, size_t count) -> jsi::Value {
+          auto native = Derived::fromValue(rt, thisVal);
+          return callMethodWithRuntime(native.get(), method, rt, args,
+                                       std::index_sequence_for<Args...>{},
+                                       count);
+        });
+    prototype.setProperty(runtime, name, func);
+  }
+
   /**
    * Install a getter on the prototype.
    */
@@ -574,6 +597,22 @@ class NativeObject : public jsi::NativeState,
   }
 
 private:
+  // Helper to call a method that takes the calling jsi::Runtime as its first
+  // parameter, with JSI argument conversion for the rest and JSI conversion of
+  // the result.
+  template <typename ReturnType, typename... Args, size_t... Is>
+  static jsi::Value
+  callMethodWithRuntime(Derived *obj,
+                        ReturnType (Derived::*method)(jsi::Runtime &, Args...),
+                        jsi::Runtime &runtime, const jsi::Value *args,
+                        std::index_sequence<Is...>, size_t count) {
+    ReturnType result = (obj->*method)(
+        runtime, rnwgpu::JSIConverter<std::decay_t<Args>>::fromJSI(
+                     runtime, args[Is], Is >= count)...);
+    return rnwgpu::JSIConverter<std::decay_t<ReturnType>>::toJSI(
+        runtime, std::move(result));
+  }
+
   // Helper to call a method with JSI argument conversion
   template <typename ReturnType, typename... Args, size_t... Is>
   static jsi::Value callMethod(Derived *obj,
diff --git a/packages/webgpu/cpp/rnwgpu/RNWebGPUManager.cpp b/packages/webgpu/cpp/rnwgpu/RNWebGPUManager.cpp
index 56b0b5581..9db8ce387 100644
--- a/packages/webgpu/cpp/rnwgpu/RNWebGPUManager.cpp
+++ b/packages/webgpu/cpp/rnwgpu/RNWebGPUManager.cpp
@@ -64,6 +64,12 @@ RNWebGPUManager::RNWebGPUManager(
   // Register main runtime for RuntimeAwareCache
   BaseRuntimeAwareCache::setMainJsRuntime(_jsRuntime);
 
+  // Register the main runtime + its CallInvoker so spontaneous events
+  // (device.lost / uncapturederror) on main-runtime devices can be delivered to
+  // the JS thread without the ProcessEvents pump. Worklet-runtime devices have
+  // no invoker (best-effort; see README "Threading model").
+  async::RuntimeContext::registerMainRuntime(_jsRuntime, _jsCallInvoker);
+
   auto gpu = std::make_shared<GPU>(*_jsRuntime);
   auto rnWebGPU =
       std::make_shared<RNWebGPU>(gpu, _platformContext, _jsCallInvoker);
diff --git a/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h b/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h
index 110a45d44..5e96ee480 100644
--- a/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h
+++ b/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h
@@ -7,6 +7,12 @@
 
 #include "webgpu/webgpu_cpp.h"
 
+#ifdef __APPLE__
+namespace dawn::native::metal {
+void WaitForCommandsToBeScheduled(WGPUDevice device);
+} // namespace dawn::native::metal
+#endif
+
 namespace rnwgpu {
 
 struct NativeInfo {
@@ -113,7 +119,27 @@ class SurfaceInfo {
     height = newHeight;
   }
 
-  void present() {
+  // Present the current surface texture. Called synchronously from the thread
+  // that did getCurrentTexture / submit (via GPUCanvasContext::present), so it
+  // preserves Dawn surface thread-affinity. No-op when offscreen / unconfigured
+  // (no surface).
+  void presentFrame() {
+#ifdef __APPLE__
+    // Ensure command buffers are scheduled before presenting. Read the device
+    // under a shared lock, then wait without holding it (the wait can block).
+    // The device may be reconfigured between the two locks; that is safe because
+    // present() is called on the rendering thread right after submit(), the wait
+    // just flushes that thread's already-submitted work, and the Present() below
+    // re-checks `surface` under the unique lock before touching it.
+    wgpu::Device device;
+    {
+      std::shared_lock<std::shared_mutex> lock(_mutex);
+      device = config.device;
+    }
+    if (device) {
+      dawn::native::metal::WaitForCommandsToBeScheduled(device.Get());
+    }
+#endif
     std::unique_lock<std::shared_mutex> lock(_mutex);
     if (surface) {
       surface.Present();
@@ -131,6 +157,12 @@ class SurfaceInfo {
     }
   }
 
+  // True when an on-screen wgpu::Surface is attached (vs offscreen texture).
+  bool hasSurface() {
+    std::shared_lock<std::shared_mutex> lock(_mutex);
+    return surface != nullptr;
+  }
+
   NativeInfo getNativeInfo() {
     std::shared_lock<std::shared_mutex> lock(_mutex);
     return {.nativeSurface = nativeSurface, .width = width, .height = height};
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPU.cpp b/packages/webgpu/cpp/rnwgpu/api/GPU.cpp
index 11530f4da..92939b28c 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPU.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPU.cpp
@@ -9,11 +9,11 @@
 
 #include "Convertors.h"
 #include "JSIConverter.h"
-#include "rnwgpu/async/JSIMicrotaskDispatcher.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 namespace rnwgpu {
 
-GPU::GPU(jsi::Runtime &runtime) : NativeObject(CLASS_NAME) {
+GPU::GPU(jsi::Runtime & /*runtime*/) : NativeObject(CLASS_NAME) {
   static const auto kTimedWaitAny = wgpu::InstanceFeatureName::TimedWaitAny;
   wgpu::InstanceDescriptor instanceDesc{.requiredFeatureCount = 1,
                                         .requiredFeatures = &kTimedWaitAny};
@@ -48,12 +48,10 @@ GPU::GPU(jsi::Runtime &runtime) : NativeObject(CLASS_NAME) {
   instanceDesc.nextInChain = &toggles;
 
   _instance = wgpu::CreateInstance(&instanceDesc);
-
-  auto dispatcher = std::make_shared<async::JSIMicrotaskDispatcher>(runtime);
-  _async = async::AsyncRunner::getOrCreate(runtime, _instance, dispatcher);
 }
 
 async::AsyncTaskHandle GPU::requestAdapter(
+    jsi::Runtime &runtime,
     std::optional<std::shared_ptr<GPURequestAdapterOptions>> options) {
   wgpu::RequestAdapterOptions aOptions;
   Convertor conv;
@@ -66,12 +64,17 @@ async::AsyncTaskHandle GPU::requestAdapter(
   constexpr auto kDefaultBackendType = wgpu::BackendType::Vulkan;
 #endif
   aOptions.backendType = kDefaultBackendType;
-  return _async->postTask(
-      [this, aOptions](const async::AsyncTaskHandle::ResolveFunction &resolve,
-                       const async::AsyncTaskHandle::RejectFunction &reject) {
+
+  // Per-runtime context: async ops requested on this runtime resolve on this
+  // runtime's own thread (via its ProcessEvents pump).
+  auto context = async::RuntimeContext::getOrCreate(runtime, _instance);
+  return context->postTask(
+      [this, aOptions,
+       context](const async::AsyncTaskHandle::ResolveFunction &resolve,
+                const async::AsyncTaskHandle::RejectFunction &reject) {
         _instance.RequestAdapter(
             &aOptions, wgpu::CallbackMode::AllowProcessEvents,
-            [asyncRunner = _async, resolve,
+            [context, resolve,
              reject](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter,
                      wgpu::StringView message) {
               if (message.length) {
@@ -79,8 +82,8 @@ async::AsyncTaskHandle GPU::requestAdapter(
               }
 
               if (status == wgpu::RequestAdapterStatus::Success && adapter) {
-                auto adapterHost = std::make_shared<GPUAdapter>(
-                    std::move(adapter), asyncRunner);
+                auto adapterHost =
+                    std::make_shared<GPUAdapter>(std::move(adapter), context);
                 auto result =
                     std::variant<std::nullptr_t, std::shared_ptr<GPUAdapter>>(
                         adapterHost);
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPU.h b/packages/webgpu/cpp/rnwgpu/api/GPU.h
index f6bb4ede3..f42589fc7 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPU.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPU.h
@@ -9,8 +9,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -32,7 +32,10 @@ class GPU : public NativeObject<GPU> {
 public:
   std::string getBrand() { return CLASS_NAME; }
 
+  // requestAdapter needs the calling runtime so each runtime gets its own
+  // RuntimeContext (and ProcessEvents pump on its own thread).
   async::AsyncTaskHandle requestAdapter(
+      jsi::Runtime &runtime,
       std::optional<std::shared_ptr<GPURequestAdapterOptions>> options);
   wgpu::TextureFormat getPreferredCanvasFormat();
 
@@ -40,7 +43,8 @@ class GPU : public NativeObject<GPU> {
 
   static void definePrototype(jsi::Runtime &runtime, jsi::Object &prototype) {
     installGetter(runtime, prototype, "__brand", &GPU::getBrand);
-    installMethod(runtime, prototype, "requestAdapter", &GPU::requestAdapter);
+    installMethodWithRuntime(runtime, prototype, "requestAdapter",
+                             &GPU::requestAdapter);
     installMethod(runtime, prototype, "getPreferredCanvasFormat",
                   &GPU::getPreferredCanvasFormat);
     installGetter(runtime, prototype, "wgslLanguageFeatures",
@@ -48,11 +52,9 @@ class GPU : public NativeObject<GPU> {
   }
 
   inline const wgpu::Instance get() { return _instance; }
-  inline std::shared_ptr<async::AsyncRunner> getAsyncRunner() { return _async; }
 
 private:
   wgpu::Instance _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
 };
 
 } // namespace rnwgpu
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.cpp
index 085b582dc..04de74ed1 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.cpp
@@ -164,10 +164,9 @@ async::AsyncTaskHandle GPUAdapter::requestDevice(
         }
         _instance.RequestDevice(
             &deviceDesc, wgpu::CallbackMode::AllowProcessEvents,
-            [asyncRunner = _async, resolve, reject, label, creationRuntime,
+            [context = _async, resolve, reject, label, creationRuntime,
              deviceLostBinding](wgpu::RequestDeviceStatus status,
-                                wgpu::Device device,
-                                wgpu::StringView message) {
+                                wgpu::Device device, wgpu::StringView message) {
               if (message.length) {
                 fprintf(stderr, "%s", message.data);
               }
@@ -191,14 +190,12 @@ async::AsyncTaskHandle GPUAdapter::requestDevice(
                     case wgpu::LoggingType::Warning:
                       logLevel = "Warning";
                       Logger::warnToJavascriptConsole(
-                          *creationRuntime,
-                          std::string(msg.data, msg.length));
+                          *creationRuntime, std::string(msg.data, msg.length));
                       break;
                     case wgpu::LoggingType::Error:
                       logLevel = "Error";
                       Logger::errorToJavascriptConsole(
-                          *creationRuntime,
-                          std::string(msg.data, msg.length));
+                          *creationRuntime, std::string(msg.data, msg.length));
                       break;
                     case wgpu::LoggingType::Verbose:
                       logLevel = "Verbose";
@@ -216,7 +213,7 @@ async::AsyncTaskHandle GPUAdapter::requestDevice(
                   creationRuntime);
 
               auto deviceHost = std::make_shared<GPUDevice>(std::move(device),
-                                                            asyncRunner, label);
+                                                            context, label);
               *deviceLostBinding = deviceHost;
 
               // Register the device in the static registry so the uncaptured
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.h b/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.h
index 66acdc2f7..7f399f0a7 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.h
@@ -8,8 +8,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -27,7 +27,7 @@ class GPUAdapter : public NativeObject<GPUAdapter> {
   static constexpr const char *CLASS_NAME = "GPUAdapter";
 
   explicit GPUAdapter(wgpu::Adapter instance,
-                      std::shared_ptr<async::AsyncRunner> async)
+                      std::shared_ptr<async::RuntimeContext> async)
       : NativeObject(CLASS_NAME), _instance(instance), _async(async) {}
 
 public:
@@ -53,7 +53,7 @@ class GPUAdapter : public NativeObject<GPUAdapter> {
 
 private:
   wgpu::Adapter _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
+  std::shared_ptr<async::RuntimeContext> _async;
 };
 
 } // namespace rnwgpu
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.cpp
index 4d6012621..6ab4b5927 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.cpp
@@ -55,30 +55,29 @@ async::AsyncTaskHandle GPUBuffer::mapAsync(uint64_t modeIn,
       [bufferHandle, mode, resolvedOffset,
        rangeSize](const async::AsyncTaskHandle::ResolveFunction &resolve,
                   const async::AsyncTaskHandle::RejectFunction &reject) {
-        bufferHandle.MapAsync(mode, resolvedOffset, rangeSize,
-                              wgpu::CallbackMode::AllowProcessEvents,
-                              [resolve, reject](wgpu::MapAsyncStatus status,
-                                                wgpu::StringView message) {
-                                switch (status) {
-                                case wgpu::MapAsyncStatus::Success:
-                                  resolve(nullptr);
-                                  break;
-                                case wgpu::MapAsyncStatus::CallbackCancelled:
-                                  reject("MapAsyncStatus::CallbackCancelled");
-                                  break;
-                                case wgpu::MapAsyncStatus::Error:
-                                  reject("MapAsyncStatus::Error");
-                                  break;
-                                case wgpu::MapAsyncStatus::Aborted:
-                                  reject("MapAsyncStatus::Aborted");
-                                  break;
-                                default:
-                                  reject(
-                                      "MapAsyncStatus: " +
-                                      std::to_string(static_cast<int>(status)));
-                                  break;
-                                }
-                              });
+        bufferHandle.MapAsync(
+            mode, resolvedOffset, rangeSize, wgpu::CallbackMode::AllowProcessEvents,
+            [resolve, reject](wgpu::MapAsyncStatus status,
+                              wgpu::StringView message) {
+              switch (status) {
+              case wgpu::MapAsyncStatus::Success:
+                resolve(nullptr);
+                break;
+              case wgpu::MapAsyncStatus::CallbackCancelled:
+                reject("MapAsyncStatus::CallbackCancelled");
+                break;
+              case wgpu::MapAsyncStatus::Error:
+                reject("MapAsyncStatus::Error");
+                break;
+              case wgpu::MapAsyncStatus::Aborted:
+                reject("MapAsyncStatus::Aborted");
+                break;
+              default:
+                reject("MapAsyncStatus: " +
+                       std::to_string(static_cast<int>(status)));
+                break;
+              }
+            });
       });
 }
 
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.h b/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.h
index edfc8e41b..036b5af4b 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.h
@@ -9,8 +9,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -25,7 +25,7 @@ class GPUBuffer : public NativeObject<GPUBuffer> {
   static constexpr const char *CLASS_NAME = "GPUBuffer";
 
   explicit GPUBuffer(wgpu::Buffer instance,
-                     std::shared_ptr<async::AsyncRunner> async,
+                     std::shared_ptr<async::RuntimeContext> async,
                      std::string label)
       : NativeObject(CLASS_NAME), _instance(instance), _async(async),
         _label(label) {}
@@ -71,7 +71,7 @@ class GPUBuffer : public NativeObject<GPUBuffer> {
 
 private:
   wgpu::Buffer _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
+  std::shared_ptr<async::RuntimeContext> _async;
   std::string _label;
   struct Mapping {
     uint64_t start;
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
index d75eb7b0f..4da91d441 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
@@ -3,14 +3,6 @@
 #include "RNWebGPUManager.h"
 #include <memory>
 
-#ifdef __APPLE__
-namespace dawn::native::metal {
-
-void WaitForCommandsToBeScheduled(WGPUDevice device);
-
-}
-#endif
-
 namespace rnwgpu {
 
 void GPUCanvasContext::configure(
@@ -47,21 +39,26 @@ std::shared_ptr<GPUTexture> GPUCanvasContext::getCurrentTexture() {
   if (sizeHasChanged) {
     _surfaceInfo->reconfigure(width, height);
   }
+
   auto texture = _surfaceInfo->getCurrentTexture();
+
+  auto size = _surfaceInfo->getSize();
+  _canvas->setClientWidth(size.width);
+  _canvas->setClientHeight(size.height);
+
   // Pass reportsMemoryPressure=false to avoid triggering spurious Hermes GC
   // cycles every frame since the canvas texture doesn't own the buffer.
   return std::make_shared<GPUTexture>(texture, "", false);
 }
 
 void GPUCanvasContext::present() {
-#ifdef __APPLE__
-  dawn::native::metal::WaitForCommandsToBeScheduled(
-      _surfaceInfo->getDevice().Get());
-#endif
-  auto size = _surfaceInfo->getSize();
-  _canvas->setClientWidth(size.width);
-  _canvas->setClientHeight(size.height);
-  _surfaceInfo->present();
+  // Present runs synchronously on the calling thread (the one that did
+  // getCurrentTexture / submit), preserving Dawn surface thread-affinity.
+  // Required on every runtime (main JS, Reanimated UI, dedicated worklet);
+  // offscreen surfaces have no wgpu::Surface so they no-op.
+  if (_surfaceInfo->hasSurface()) {
+    _surfaceInfo->presentFrame();
+  }
 }
 
 } // namespace rnwgpu
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
index 4b97a7887..a5efc3c6a 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
@@ -55,6 +55,9 @@ class GPUCanvasContext : public NativeObject<GPUCanvasContext> {
   void configure(std::shared_ptr<GPUCanvasConfiguration> configuration);
   void unconfigure();
   std::shared_ptr<GPUTexture> getCurrentTexture();
+  // Present is explicit on every runtime (main JS, Reanimated UI, and dedicated
+  // worklet runtimes). It runs synchronously on the calling thread, preserving
+  // Dawn surface thread-affinity; offscreen surfaces no-op.
   void present();
 
 private:
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUDevice.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUDevice.cpp
index 58df56f89..624068fe6 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUDevice.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUDevice.cpp
@@ -6,6 +6,8 @@
 #include <utility>
 #include <vector>
 
+#include <ReactCommon/CallInvoker.h>
+
 #include "Convertors.h"
 #include "JSIConverter.h"
 
@@ -19,23 +21,33 @@ namespace rnwgpu {
 
 void GPUDevice::notifyDeviceLost(wgpu::DeviceLostReason reason,
                                  std::string message) {
-  if (_lostSettled) {
-    return;
-  }
+  std::optional<async::AsyncTaskHandle::ResolveFunction> resolveToCall;
+  std::shared_ptr<GPUDeviceLostInfo> info;
+  {
+    std::lock_guard<std::mutex> lock(_lostMutex);
+    if (_lostSettled) {
+      return;
+    }
+
+    _lostSettled = true;
+    _lostInfo = std::make_shared<GPUDeviceLostInfo>(reason, std::move(message));
+    info = _lostInfo;
+
+    if (_lostResolve.has_value()) {
+      resolveToCall = std::move(*_lostResolve);
+      _lostResolve.reset();
+    }
 
-  _lostSettled = true;
-  _lostInfo = std::make_shared<GPUDeviceLostInfo>(reason, std::move(message));
+    _lostHandle.reset();
+  }
 
-  if (_lostResolve.has_value()) {
-    auto resolve = std::move(*_lostResolve);
-    _lostResolve.reset();
-    resolve([info = _lostInfo](jsi::Runtime &runtime) mutable {
+  // Settle outside the lock: resolve() only enqueues onto the JS thread.
+  if (resolveToCall.has_value()) {
+    (*resolveToCall)([info](jsi::Runtime &runtime) mutable {
       return JSIConverter<std::shared_ptr<GPUDeviceLostInfo>>::toJSI(runtime,
                                                                      info);
     });
   }
-
-  _lostHandle.reset();
 }
 
 void GPUDevice::forceLossForTesting() {
@@ -367,9 +379,9 @@ async::AsyncTaskHandle GPUDevice::createComputePipelineAsync(
                   runtime, pipelineHolder);
             });
           } else {
-            std::string error =
-                msg.length ? std::string(msg.data, msg.length)
-                           : "Failed to create compute pipeline";
+            std::string error = msg.length
+                                    ? std::string(msg.data, msg.length)
+                                    : "Failed to create compute pipeline";
             reject(std::move(error));
           }
         });
@@ -409,9 +421,8 @@ async::AsyncTaskHandle GPUDevice::createRenderPipelineAsync(
                   runtime, pipelineHolder);
             });
           } else {
-            std::string error =
-                msg.length ? std::string(msg.data, msg.length)
-                           : "Failed to create render pipeline";
+            std::string error = msg.length ? std::string(msg.data, msg.length)
+                                           : "Failed to create render pipeline";
             reject(std::move(error));
           }
         });
@@ -498,6 +509,11 @@ std::unordered_set<std::string> GPUDevice::getFeatures() {
 }
 
 async::AsyncTaskHandle GPUDevice::getLost() {
+  // Held across the whole body: the postTask callback below runs synchronously
+  // on this (JS) thread and touches the same _lost* fields, so it must not
+  // re-lock. notifyDeviceLost() takes the same lock from its (possibly worker)
+  // thread.
+  std::lock_guard<std::mutex> lock(_lostMutex);
   if (_lostHandle.has_value()) {
     return *_lostHandle;
   }
@@ -512,7 +528,7 @@ async::AsyncTaskHandle GPUDevice::getLost() {
                 runtime, info);
           });
         },
-        false);
+        /*keepPumping=*/false);
   }
 
   auto handle = _async->postTask(
@@ -526,9 +542,10 @@ async::AsyncTaskHandle GPUDevice::getLost() {
           return;
         }
 
+        // Resolved later from notifyDeviceLost().
         _lostResolve = resolve;
       },
-      false);
+      /*keepPumping=*/false);
 
   _lostHandle = handle;
   return handle;
@@ -548,6 +565,24 @@ void GPUDevice::removeEventListener(std::string type, jsi::Function callback) {
 
 void GPUDevice::notifyUncapturedError(wgpu::ErrorType type,
                                       std::string message) {
+  // Dawn can surface an uncaptured error from any ProcessEvents pump (a worklet
+  // runtime sharing this instance may pump it on the wrong thread). Marshal to
+  // the owning runtime's JS thread via its CallInvoker before touching JSI. The
+  // invoker is wired only for the main JS runtime, so a device created on a
+  // worklet runtime does not deliver uncaptured errors to JS (best-effort; see
+  // README "Threading model").
+  auto invoker = _async ? _async->callInvoker() : nullptr;
+  if (!invoker) {
+    return;
+  }
+  auto self = shared_from_this();
+  invoker->invokeAsync([self, type, message = std::move(message)]() mutable {
+    self->deliverUncapturedError(type, std::move(message));
+  });
+}
+
+void GPUDevice::deliverUncapturedError(wgpu::ErrorType type,
+                                       std::string message) {
   auto it = _eventListeners.find("uncapturederror");
   if (it == _eventListeners.end() || it->second.empty()) {
     return;
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUDevice.h b/packages/webgpu/cpp/rnwgpu/api/GPUDevice.h
index ed5ff98ef..8df6909a2 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUDevice.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUDevice.h
@@ -15,8 +15,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -64,7 +64,7 @@ class GPUDevice : public NativeObject<GPUDevice> {
   static constexpr const char *CLASS_NAME = "GPUDevice";
 
   explicit GPUDevice(wgpu::Device instance,
-                     std::shared_ptr<async::AsyncRunner> async,
+                     std::shared_ptr<async::RuntimeContext> async,
                      std::string label)
       : NativeObject(CLASS_NAME), _instance(instance), _async(async),
         _label(label) {}
@@ -253,9 +253,18 @@ class GPUDevice : public NativeObject<GPUDevice> {
 private:
   friend class GPUAdapter;
 
+  // Runs the uncapturederror listeners on the creation runtime's JS thread.
+  // Invoked from notifyUncapturedError via the main CallInvoker.
+  void deliverUncapturedError(wgpu::ErrorType type, std::string message);
+
   wgpu::Device _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
+  std::shared_ptr<async::RuntimeContext> _async;
   std::string _label;
+  // Guards the device-lost state below. In the ProcessEvents model both
+  // notifyDeviceLost() (fired by Dawn during ProcessEvents) and getLost() run on
+  // the owning runtime's own thread, but device destruction can also trigger
+  // notifyDeviceLost() synchronously, so the mutex keeps these fields safe.
+  std::mutex _lostMutex;
   std::optional<async::AsyncTaskHandle> _lostHandle;
   std::shared_ptr<GPUDeviceLostInfo> _lostInfo;
   bool _lostSettled = false;
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUQueue.h b/packages/webgpu/cpp/rnwgpu/api/GPUQueue.h
index be824e781..f322392b7 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUQueue.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUQueue.h
@@ -8,8 +8,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -28,7 +28,7 @@ class GPUQueue : public NativeObject<GPUQueue> {
   static constexpr const char *CLASS_NAME = "GPUQueue";
 
   explicit GPUQueue(wgpu::Queue instance,
-                    std::shared_ptr<async::AsyncRunner> async,
+                    std::shared_ptr<async::RuntimeContext> async,
                     std::string label)
       : NativeObject(CLASS_NAME), _instance(instance), _async(async),
         _label(label) {}
@@ -74,7 +74,7 @@ class GPUQueue : public NativeObject<GPUQueue> {
 
 private:
   wgpu::Queue _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
+  std::shared_ptr<async::RuntimeContext> _async;
   std::string _label;
 };
 
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.h b/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.h
index ab8561090..0e59edf01 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.h
@@ -7,8 +7,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -23,7 +23,7 @@ class GPUShaderModule : public NativeObject<GPUShaderModule> {
   static constexpr const char *CLASS_NAME = "GPUShaderModule";
 
   explicit GPUShaderModule(wgpu::ShaderModule instance,
-                           std::shared_ptr<async::AsyncRunner> async,
+                           std::shared_ptr<async::RuntimeContext> async,
                            std::string label)
       : NativeObject(CLASS_NAME), _instance(instance), _async(async),
         _label(label) {}
@@ -59,7 +59,7 @@ class GPUShaderModule : public NativeObject<GPUShaderModule> {
 
 private:
   wgpu::ShaderModule _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
+  std::shared_ptr<async::RuntimeContext> _async;
   std::string _label;
 };
 
diff --git a/packages/webgpu/cpp/rnwgpu/async/AsyncDispatcher.h b/packages/webgpu/cpp/rnwgpu/async/AsyncDispatcher.h
deleted file mode 100644
index 0ec176824..000000000
--- a/packages/webgpu/cpp/rnwgpu/async/AsyncDispatcher.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-
-#include <functional>
-#include <memory>
-
-#include <jsi/jsi.h>
-
-namespace rnwgpu::async {
-
-namespace jsi = facebook::jsi;
-
-/**
- * Abstract dispatcher used by the AsyncRunner to enqueue work back onto the
- * JavaScript thread.
- */
-class AsyncDispatcher {
-public:
-  using Work = std::function<void(jsi::Runtime &)>;
-
-  virtual ~AsyncDispatcher() = default;
-
-  /**
-   * Enqueue a unit of work that will be executed on the JavaScript thread.
-   */
-  virtual void post(Work work) = 0;
-};
-
-} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.cpp b/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.cpp
deleted file mode 100644
index 94bbae230..000000000
--- a/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-#include "AsyncRunner.h"
-
-#include <chrono>
-#include <stdexcept>
-#include <utility>
-
-#include "AsyncTaskHandle.h"
-#include "WGPULogger.h"
-
-namespace rnwgpu::async {
-
-namespace {
-struct RuntimeData {
-  std::shared_ptr<AsyncRunner> runner;
-};
-constexpr const char *TAG = "AsyncRunner";
-} // namespace
-
-AsyncRunner::AsyncRunner(wgpu::Instance instance,
-                         std::shared_ptr<AsyncDispatcher> dispatcher)
-    : _instance(std::move(instance)), _dispatcher(std::move(dispatcher)),
-      _pendingTasks(0), _pumpTasks(0), _tickScheduled(false),
-      _lastTickTimeNs(0) {
-  if (!_dispatcher) {
-    throw std::runtime_error("AsyncRunner requires a valid dispatcher.");
-  }
-  Logger::logToConsole("[%s] Created runner (dispatcher=%p)", TAG,
-                       _dispatcher.get());
-}
-
-std::shared_ptr<AsyncRunner> AsyncRunner::get(jsi::Runtime &runtime) {
-  auto data = runtime.getRuntimeData(runtimeDataUUID());
-  if (!data) {
-    return nullptr;
-  }
-  auto stored = std::static_pointer_cast<RuntimeData>(data);
-  return stored->runner;
-}
-
-std::shared_ptr<AsyncRunner>
-AsyncRunner::getOrCreate(jsi::Runtime &runtime, wgpu::Instance instance,
-                         std::shared_ptr<AsyncDispatcher> dispatcher) {
-  auto existing = get(runtime);
-  if (existing) {
-    return existing;
-  }
-
-  auto runner =
-      std::make_shared<AsyncRunner>(std::move(instance), std::move(dispatcher));
-  auto data = std::make_shared<RuntimeData>();
-  data->runner = runner;
-  runtime.setRuntimeData(runtimeDataUUID(), data);
-  return runner;
-}
-
-AsyncTaskHandle AsyncRunner::postTask(const TaskCallback &callback,
-                                      bool keepPumping) {
-  auto handle = AsyncTaskHandle::create(shared_from_this(), keepPumping);
-  if (!handle.valid()) {
-    throw std::runtime_error("Failed to create AsyncTaskHandle.");
-  }
-
-  _pendingTasks.fetch_add(1, std::memory_order_acq_rel);
-  if (keepPumping) {
-    _pumpTasks.fetch_add(1, std::memory_order_acq_rel);
-  }
-  requestTick();
-
-  Logger::logToConsole(
-      "[%s] postTask (keepPumping=%s, pending=%zu, pumping=%zu)", TAG,
-      keepPumping ? "true" : "false",
-      _pendingTasks.load(std::memory_order_acquire),
-      _pumpTasks.load(std::memory_order_acquire));
-
-  auto resolve = handle.createResolveFunction();
-  auto reject = handle.createRejectFunction();
-
-  try {
-    callback(resolve, reject);
-  } catch (const std::exception &exception) {
-    reject(exception.what());
-  } catch (...) {
-    reject("Unknown native error in AsyncRunner::postTask.");
-  }
-
-  return handle;
-}
-
-void AsyncRunner::requestTick() {
-  bool expected = false;
-  if (!_tickScheduled.compare_exchange_strong(expected, true,
-                                              std::memory_order_acq_rel)) {
-    return;
-  }
-
-  auto self = shared_from_this();
-  _dispatcher->post([self](jsi::Runtime &runtime) {
-    auto tickCallback = jsi::Function::createFromHostFunction(
-        runtime, jsi::PropNameID::forAscii(runtime, "AsyncRunnerTick"), 0,
-        [self](jsi::Runtime &runtime, const jsi::Value & /*thisValue*/,
-               const jsi::Value * /*args*/, size_t /*count*/) -> jsi::Value {
-          self->tick(runtime);
-          return jsi::Value::undefined();
-        });
-
-#if defined(ANDROID) || defined(__ANDROID__)
-    auto global = runtime.global();
-    auto setImmediateValue = global.getProperty(runtime, "setImmediate");
-    constexpr auto kMinTickInterval = std::chrono::milliseconds(4);
-    const int64_t nowNs =
-        std::chrono::duration_cast<std::chrono::nanoseconds>(
-            std::chrono::steady_clock::now().time_since_epoch())
-            .count();
-    const int64_t lastNs =
-        self->_lastTickTimeNs.load(std::memory_order_acquire);
-    int delayMs = 0;
-    if (lastNs > 0) {
-      const int64_t elapsedNs = nowNs - lastNs;
-      const int64_t minIntervalNs = kMinTickInterval.count() * 1000000LL;
-      if (elapsedNs < minIntervalNs) {
-        const int64_t remainingNs = minIntervalNs - elapsedNs;
-        delayMs = static_cast<int>((remainingNs + 999999) / 1000000);
-      }
-    }
-
-    auto tryScheduleTimeout = [&](int ms) {
-      auto setTimeoutValue = global.getProperty(runtime, "setTimeout");
-      if (!setTimeoutValue.isObject()) {
-        return false;
-      }
-      auto setTimeoutObj = setTimeoutValue.asObject(runtime);
-      if (!setTimeoutObj.isFunction(runtime)) {
-        return false;
-      }
-      Logger::logToConsole("[%s] requestTick scheduled via setTimeout(%d)", TAG,
-                           ms);
-      auto setTimeoutFn = setTimeoutObj.asFunction(runtime);
-      jsi::Value callbackArg(runtime, tickCallback);
-      jsi::Value delayArg(static_cast<double>(ms));
-      setTimeoutFn.call(runtime, callbackArg, delayArg);
-      return true;
-    };
-
-    if (delayMs > 0) {
-      if (tryScheduleTimeout(delayMs)) {
-        return;
-      }
-      // If setTimeout unavailable fall through to immediate scheduling.
-    }
-
-    if (setImmediateValue.isObject()) {
-      auto setImmediateObj = setImmediateValue.asObject(runtime);
-      if (setImmediateObj.isFunction(runtime)) {
-        Logger::logToConsole("[%s] requestTick scheduled via setImmediate",
-                             TAG);
-        auto setImmediateFn = setImmediateObj.asFunction(runtime);
-        jsi::Value callbackArg(runtime, tickCallback);
-        setImmediateFn.call(runtime, callbackArg);
-        return;
-      }
-    }
-
-    int timeoutDelayMs = delayMs > 0 ? delayMs : 0;
-    if (tryScheduleTimeout(timeoutDelayMs)) {
-      return;
-    }
-
-    Logger::logToConsole("[%s] requestTick scheduled via microtask fallback",
-                         TAG);
-    runtime.queueMicrotask(std::move(tickCallback));
-#else
-    Logger::logToConsole("[%s] requestTick scheduled microtask (non-Android)",
-                         TAG);
-    runtime.queueMicrotask(std::move(tickCallback));
-#endif
-  });
-}
-
-void AsyncRunner::tick(jsi::Runtime & /*runtime*/) {
-  _tickScheduled.store(false, std::memory_order_release);
-  _instance.ProcessEvents();
-  const auto nowNs = std::chrono::duration_cast<std::chrono::nanoseconds>(
-                         std::chrono::steady_clock::now().time_since_epoch())
-                         .count();
-  _lastTickTimeNs.store(nowNs, std::memory_order_release);
-  Logger::logToConsole("[%s] tick processed events (pending=%zu, pumping=%zu)",
-                       TAG, _pendingTasks.load(std::memory_order_acquire),
-                       _pumpTasks.load(std::memory_order_acquire));
-  if (_pumpTasks.load(std::memory_order_acquire) > 0) {
-    requestTick();
-  }
-}
-
-void AsyncRunner::onTaskSettled(bool keepPumping) {
-  _pendingTasks.fetch_sub(1, std::memory_order_acq_rel);
-  if (keepPumping) {
-    _pumpTasks.fetch_sub(1, std::memory_order_acq_rel);
-  }
-  Logger::logToConsole(
-      "[%s] onTaskSettled (keepPumping=%s, pending=%zu, pumping=%zu)", TAG,
-      keepPumping ? "true" : "false",
-      _pendingTasks.load(std::memory_order_acquire),
-      _pumpTasks.load(std::memory_order_acquire));
-}
-
-std::shared_ptr<AsyncDispatcher> AsyncRunner::dispatcher() const {
-  return _dispatcher;
-}
-
-jsi::UUID AsyncRunner::runtimeDataUUID() {
-  static const auto uuid = jsi::UUID();
-  return uuid;
-}
-
-} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.h b/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.h
deleted file mode 100644
index f81101d10..000000000
--- a/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#pragma once
-
-#include <atomic>
-#include <cstdint>
-#include <functional>
-#include <memory>
-
-#include <jsi/jsi.h>
-
-#include "AsyncDispatcher.h"
-#include "AsyncTaskHandle.h"
-
-#include "webgpu/webgpu_cpp.h"
-
-namespace jsi = facebook::jsi;
-
-namespace rnwgpu::async {
-
-class AsyncRunner : public std::enable_shared_from_this<AsyncRunner> {
-public:
-  using TaskCallback =
-      std::function<void(const AsyncTaskHandle::ResolveFunction &,
-                         const AsyncTaskHandle::RejectFunction &)>;
-
-  AsyncRunner(wgpu::Instance instance,
-              std::shared_ptr<AsyncDispatcher> dispatcher);
-
-  static std::shared_ptr<AsyncRunner> get(jsi::Runtime &runtime);
-  static std::shared_ptr<AsyncRunner>
-  getOrCreate(jsi::Runtime &runtime, wgpu::Instance instance,
-              std::shared_ptr<AsyncDispatcher> dispatcher);
-
-  AsyncTaskHandle postTask(const TaskCallback &callback,
-                           bool keepPumping = true);
-
-  void requestTick();
-  void tick(jsi::Runtime &runtime);
-  void onTaskSettled(bool keepPumping);
-
-  std::shared_ptr<AsyncDispatcher> dispatcher() const;
-
-private:
-  static jsi::UUID runtimeDataUUID();
-
-  wgpu::Instance _instance;
-  std::shared_ptr<AsyncDispatcher> _dispatcher;
-  std::atomic<size_t> _pendingTasks;
-  std::atomic<size_t> _pumpTasks;
-  std::atomic<bool> _tickScheduled;
-  std::atomic<int64_t> _lastTickTimeNs;
-};
-
-} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.cpp b/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.cpp
index 6b262005a..e6ca59285 100644
--- a/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.cpp
+++ b/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.cpp
@@ -1,11 +1,13 @@
 #include "AsyncTaskHandle.h"
 
+#include <memory>
 #include <string>
 #include <utility>
 
-#include "Promise.h"
+#include <ReactCommon/CallInvoker.h>
 
-#include "AsyncRunner.h"
+#include "Promise.h"
+#include "RuntimeContext.h"
 
 namespace rnwgpu::async {
 
@@ -13,8 +15,8 @@ using Action = std::function<void(jsi::Runtime &, rnwgpu::Promise &)>;
 
 struct AsyncTaskHandle::State
     : public std::enable_shared_from_this<AsyncTaskHandle::State> {
-  State(std::shared_ptr<AsyncRunner> runner, bool keepPumping)
-      : runner(std::move(runner)), keepPumping(keepPumping) {}
+  State(std::shared_ptr<RuntimeContext> context, bool keepPumping)
+      : context(std::move(context)), keepPumping(keepPumping) {}
 
   void settle(Action action);
   void attachPromise(const std::shared_ptr<rnwgpu::Promise> &promise);
@@ -26,12 +28,12 @@ struct AsyncTaskHandle::State
   std::shared_ptr<rnwgpu::Promise> currentPromise();
 
   std::mutex mutex;
-  std::weak_ptr<AsyncRunner> runner;
+  std::shared_ptr<RuntimeContext> context;
+  bool keepPumping;
   std::shared_ptr<rnwgpu::Promise> promise;
   std::optional<Action> pendingAction;
   bool settled = false;
   std::shared_ptr<State> keepAlive;
-  bool keepPumping;
 };
 
 // MARK: - State helpers
@@ -77,30 +79,60 @@ void AsyncTaskHandle::State::attachPromise(
 }
 
 void AsyncTaskHandle::State::schedule(Action action) {
-  auto runnerRef = runner.lock();
-  if (!runnerRef) {
+  auto promiseRef = currentPromise();
+  if (!promiseRef) {
     return;
   }
 
-  auto promiseRef = currentPromise();
-  if (!promiseRef) {
-    runnerRef->onTaskSettled(keepPumping);
+  if (!context) {
+    // No context (shouldn't happen): best-effort inline settle.
+    action(promiseRef->runtime, *promiseRef);
+    std::lock_guard<std::mutex> lock(mutex);
+    keepAlive.reset();
     return;
   }
 
-  auto dispatcherRef = runnerRef->dispatcher();
-  if (!dispatcherRef) {
-    runnerRef->onTaskSettled(keepPumping);
+  auto self = shared_from_this();
+
+  if (!keepPumping) {
+    // Spontaneous task (e.g. device.lost): not driven by the ProcessEvents pump.
+    // Settle on the owning runtime's JS thread via its CallInvoker, which is
+    // wired only for the main JS runtime. A device created on a worklet runtime
+    // has no invoker, so its device.lost is dropped (best-effort; see the README
+    // "Threading model"). invokeAsync runs the closure on the main JS thread,
+    // where promiseRef->runtime lives for a main-runtime device.
+    auto invoker = context->callInvoker();
+    if (invoker) {
+      invoker->invokeAsync(
+          [self, action = std::move(action), promiseRef]() mutable {
+            action(promiseRef->runtime, *promiseRef);
+            std::lock_guard<std::mutex> lock(self->mutex);
+            self->keepAlive.reset();
+          });
+    } else {
+      std::lock_guard<std::mutex> lock(mutex);
+      keepAlive.reset();
+    }
     return;
   }
 
-  dispatcherRef->post([self = shared_from_this(), action = std::move(action),
-                       runnerRef, promiseRef](jsi::Runtime &runtime) mutable {
-    runnerRef->onTaskSettled(self->keepPumping);
-    action(runtime, *promiseRef);
-    std::lock_guard<std::mutex> lock(self->mutex);
-    self->keepAlive.reset();
-  });
+  // Pumping task (request/response op). The resolve/reject callback may fire on
+  // a thread that is NOT the owning runtime's thread: with a shared
+  // wgpu::Instance, another runtime's ProcessEvents() pump can consume this Dawn
+  // event. Touching the Promise's runtime off-thread would corrupt Hermes. So we
+  // deposit the actual settle (the only JSI-touching work) into the owning
+  // context's mailbox; the context drains it on its own thread during its next
+  // tick. The deposited closure captures only C++ state and runs no JSI until
+  // drained, so depositing from any thread is safe.
+  context->postSettle(
+      [self, action = std::move(action), promiseRef]() mutable {
+        action(promiseRef->runtime, *promiseRef);
+        if (self->context) {
+          self->context->onTaskSettled(/*keepPumping=*/true);
+        }
+        std::lock_guard<std::mutex> lock(self->mutex);
+        self->keepAlive.reset();
+      });
 }
 
 AsyncTaskHandle::ResolveFunction
@@ -149,9 +181,9 @@ AsyncTaskHandle::AsyncTaskHandle(std::shared_ptr<State> state)
 bool AsyncTaskHandle::valid() const { return _state != nullptr; }
 
 AsyncTaskHandle
-AsyncTaskHandle::create(const std::shared_ptr<AsyncRunner> &runner,
+AsyncTaskHandle::create(const std::shared_ptr<RuntimeContext> &context,
                         bool keepPumping) {
-  auto state = std::make_shared<State>(runner, keepPumping);
+  auto state = std::make_shared<State>(context, keepPumping);
   state->keepAlive = state;
   return AsyncTaskHandle(std::move(state));
 }
diff --git a/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.h b/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.h
index cb6c7a2a4..fea16c0f6 100644
--- a/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.h
+++ b/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.h
@@ -8,19 +8,22 @@
 
 #include <jsi/jsi.h>
 
-#include "AsyncDispatcher.h"
-
 namespace rnwgpu {
 class Promise;
 }
 
 namespace rnwgpu::async {
 
-class AsyncRunner;
+class RuntimeContext;
 
 /**
  * Represents a pending asynchronous WebGPU operation that can be converted into
  * a JavaScript Promise.
+ *
+ * The resolve/reject callbacks may run on any thread that pumps the shared
+ * wgpu::Instance (or synchronously from postTask), so they never touch JSI
+ * inline: the settle is handed to the owning RuntimeContext (its mailbox, or
+ * the CallInvoker for spontaneous tasks) and runs on that runtime's thread.
  */
 class AsyncTaskHandle {
 public:
@@ -34,7 +37,7 @@ class AsyncTaskHandle {
   AsyncTaskHandle();
 
   /**
-   * Internal constructor used by AsyncRunner.
+   * Internal constructor used by RuntimeContext.
    */
   explicit AsyncTaskHandle(std::shared_ptr<State> state);
 
@@ -45,7 +48,7 @@ class AsyncTaskHandle {
 
   void attachPromise(const std::shared_ptr<rnwgpu::Promise> &promise) const;
 
-  static AsyncTaskHandle create(const std::shared_ptr<AsyncRunner> &runner,
+  static AsyncTaskHandle create(const std::shared_ptr<RuntimeContext> &context,
                                 bool keepPumping);
 
 private:
diff --git a/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.cpp b/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.cpp
deleted file mode 100644
index 6231a833c..000000000
--- a/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-#include "JSIMicrotaskDispatcher.h"
-
-#include <utility>
-
-namespace rnwgpu::async {
-
-JSIMicrotaskDispatcher::JSIMicrotaskDispatcher(jsi::Runtime &runtime)
-    : _runtime(runtime) {}
-
-void JSIMicrotaskDispatcher::post(Work work) {
-  auto microtask = jsi::Function::createFromHostFunction(
-      _runtime, jsi::PropNameID::forAscii(_runtime, "AsyncMicrotask"), 0,
-      [work = std::move(work)](
-          jsi::Runtime &runtime, const jsi::Value & /*thisValue*/,
-          const jsi::Value * /*args*/, size_t /*count*/) -> jsi::Value {
-        work(runtime);
-        return jsi::Value::undefined();
-      });
-
-  _runtime.queueMicrotask(std::move(microtask));
-}
-
-} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.h b/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.h
deleted file mode 100644
index bae208c5d..000000000
--- a/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-
-#include "AsyncDispatcher.h"
-
-namespace rnwgpu::async {
-
-/**
- * Dispatcher implementation backed by `jsi::Runtime::queueMicrotask`.
- */
-class JSIMicrotaskDispatcher final
-    : public AsyncDispatcher,
-      public std::enable_shared_from_this<JSIMicrotaskDispatcher> {
-public:
-  explicit JSIMicrotaskDispatcher(jsi::Runtime &runtime);
-
-  void post(Work work) override;
-
-private:
-  jsi::Runtime &_runtime;
-};
-
-} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.cpp b/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.cpp
new file mode 100644
index 000000000..46754a40c
--- /dev/null
+++ b/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.cpp
@@ -0,0 +1,193 @@
+#include "RuntimeContext.h"
+
+#include <memory>
+#include <stdexcept>
+#include <utility>
+
+#include <ReactCommon/CallInvoker.h>
+
+#include "AsyncTaskHandle.h"
+#include "WGPULogger.h"
+
+namespace rnwgpu::async {
+
+namespace {
+struct RuntimeData {
+  std::shared_ptr<RuntimeContext> context;
+};
+constexpr const char *TAG = "RuntimeContext";
+
+// The main JS runtime and its CallInvoker, registered once on install. The
+// context created for sMainRuntime gets sMainInvoker; spontaneous events
+// (device.lost) on a main-runtime device are delivered through it without the
+// pump. Worklet runtimes have no invoker (best-effort, see the header doc).
+jsi::Runtime *sMainRuntime = nullptr;
+std::shared_ptr<facebook::react::CallInvoker> sMainInvoker;
+
+// Serializes ProcessEvents() across all runtimes that share a wgpu::Instance.
+// Held only across the ProcessEvents call itself, never while running JS / mailbox
+// settle-actions, so it cannot deadlock against the per-context mailbox mutex.
+std::mutex &processEventsMutex() {
+  static std::mutex mutex;
+  return mutex;
+}
+} // namespace
+
+void RuntimeContext::registerMainRuntime(
+    jsi::Runtime *runtime,
+    std::shared_ptr<facebook::react::CallInvoker> invoker) {
+  sMainRuntime = runtime;
+  sMainInvoker = std::move(invoker);
+}
+
+RuntimeContext::RuntimeContext(jsi::Runtime &runtime, wgpu::Instance instance)
+    : _runtime(runtime), _instance(std::move(instance)) {
+  Logger::logToConsole("[%s] Created (runtime=%p)", TAG, &runtime);
+}
+
+std::shared_ptr<RuntimeContext> RuntimeContext::get(jsi::Runtime &runtime) {
+  auto data = runtime.getRuntimeData(runtimeDataUUID());
+  if (!data) {
+    return nullptr;
+  }
+  return std::static_pointer_cast<RuntimeData>(data)->context;
+}
+
+std::shared_ptr<RuntimeContext>
+RuntimeContext::getOrCreate(jsi::Runtime &runtime, wgpu::Instance instance) {
+  if (auto existing = get(runtime)) {
+    return existing;
+  }
+  auto context = std::make_shared<RuntimeContext>(runtime, std::move(instance));
+  // Only the main JS runtime's context carries the CallInvoker; it is used to
+  // deliver spontaneous events (device.lost) without the pump.
+  if (&runtime == sMainRuntime) {
+    context->_callInvoker = sMainInvoker;
+  }
+  auto data = std::make_shared<RuntimeData>();
+  data->context = context;
+  runtime.setRuntimeData(runtimeDataUUID(), data);
+  return context;
+}
+
+AsyncTaskHandle RuntimeContext::postTask(const TaskCallback &callback,
+                                         bool keepPumping) {
+  auto handle = AsyncTaskHandle::create(shared_from_this(), keepPumping);
+  if (!handle.valid()) {
+    throw std::runtime_error("Failed to create AsyncTaskHandle.");
+  }
+
+  // Only pumping tasks (request/response ops) drive the ProcessEvents pump.
+  // Spontaneous tasks (keepPumping == false, e.g. device.lost) never touch the
+  // pump: they settle via the CallInvoker (see AsyncTaskHandle::State::schedule).
+  if (keepPumping) {
+    _pumpTasks.fetch_add(1, std::memory_order_acq_rel);
+    requestTick();
+  }
+
+  auto resolve = handle.createResolveFunction();
+  auto reject = handle.createRejectFunction();
+  try {
+    callback(resolve, reject);
+  } catch (const std::exception &exception) {
+    reject(exception.what());
+  } catch (...) {
+    reject("Unknown native error in RuntimeContext::postTask.");
+  }
+  return handle;
+}
+
+void RuntimeContext::onTaskSettled(bool keepPumping) {
+  if (keepPumping) {
+    _pumpTasks.fetch_sub(1, std::memory_order_acq_rel);
+  }
+}
+
+void RuntimeContext::postSettle(std::function<void()> job) {
+  if (!job) {
+    return;
+  }
+  std::lock_guard<std::mutex> lock(_mailboxMutex);
+  _mailbox.push_back(std::move(job));
+}
+
+void RuntimeContext::drainMailbox() {
+  std::vector<std::function<void()>> jobs;
+  {
+    std::lock_guard<std::mutex> lock(_mailboxMutex);
+    jobs.swap(_mailbox);
+  }
+  // Run settle-actions on this (the owning) thread, NOT under the ProcessEvents
+  // mutex, so JS continuations never execute while the pump lock is held.
+  for (auto &job : jobs) {
+    job();
+  }
+}
+
+void RuntimeContext::requestTick() {
+  bool expected = false;
+  if (!_tickScheduled.compare_exchange_strong(expected, true,
+                                              std::memory_order_acq_rel)) {
+    return;
+  }
+
+  // The pump only ever runs while a request/response op is outstanding, so it
+  // always schedules as soon as possible (delay 0). postTask and tick both run
+  // on the owning runtime's thread, so we schedule the next tick directly via
+  // that runtime's own timer. setTimeout is available on the main RN runtime and
+  // on worklet runtimes (backed by the worklets EventLoop); setImmediate /
+  // queueMicrotask are fallbacks. We do NOT use queueMicrotask as the primary
+  // mechanism: a self-rescheduling microtask never yields the microtask
+  // checkpoint, starving the runtime's task loop.
+  auto self = shared_from_this();
+  jsi::Runtime &rt = _runtime;
+  auto tickCallback = jsi::Function::createFromHostFunction(
+      rt, jsi::PropNameID::forAscii(rt, "RNWGPUAsyncTick"), 0,
+      [self](jsi::Runtime & /*runtime*/, const jsi::Value & /*thisVal*/,
+             const jsi::Value * /*args*/, size_t /*count*/) -> jsi::Value {
+        self->tick();
+        return jsi::Value::undefined();
+      });
+
+  auto global = rt.global();
+  auto setTimeoutValue = global.getProperty(rt, "setTimeout");
+  if (setTimeoutValue.isObject() &&
+      setTimeoutValue.asObject(rt).isFunction(rt)) {
+    setTimeoutValue.asObject(rt).asFunction(rt).call(
+        rt, jsi::Value(rt, tickCallback), jsi::Value(0.0));
+    return;
+  }
+  auto setImmediateValue = global.getProperty(rt, "setImmediate");
+  if (setImmediateValue.isObject() &&
+      setImmediateValue.asObject(rt).isFunction(rt)) {
+    setImmediateValue.asObject(rt).asFunction(rt).call(
+        rt, jsi::Value(rt, tickCallback));
+    return;
+  }
+  rt.queueMicrotask(std::move(tickCallback));
+}
+
+void RuntimeContext::tick() {
+  _tickScheduled.store(false, std::memory_order_release);
+  {
+    // Serialize ProcessEvents across runtimes sharing this instance. Callbacks
+    // fired here only deposit into mailboxes (postSettle), they do not run JS.
+    std::lock_guard<std::mutex> lock(processEventsMutex());
+    _instance.ProcessEvents();
+  }
+  // Settle this runtime's ready promises on this thread, outside the pump lock.
+  drainMailbox();
+  // Keep pumping only while a "pumping" task (active async work) is outstanding.
+  // Non-pumping tasks (e.g. device.lost) intentionally do NOT keep the pump
+  // alive: we prioritise battery over catching a device.lost fired while idle.
+  if (_pumpTasks.load(std::memory_order_acquire) > 0) {
+    requestTick();
+  }
+}
+
+jsi::UUID RuntimeContext::runtimeDataUUID() {
+  static const auto uuid = jsi::UUID();
+  return uuid;
+}
+
+} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.h b/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.h
new file mode 100644
index 000000000..cb0024d6d
--- /dev/null
+++ b/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.h
@@ -0,0 +1,122 @@
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#include <jsi/jsi.h>
+
+#include "AsyncTaskHandle.h"
+
+#include "webgpu/webgpu_cpp.h"
+
+namespace jsi = facebook::jsi;
+
+namespace facebook::react {
+class CallInvoker;
+} // namespace facebook::react
+
+namespace rnwgpu::async {
+
+/**
+ * Per-runtime coordinator for asynchronous WebGPU operations.
+ *
+ * Each JS runtime that uses WebGPU gets its own RuntimeContext, stored in the
+ * runtime's runtimeData. Async Dawn operations are registered with
+ * CallbackMode::AllowProcessEvents and driven to completion by pumping
+ * `instance.ProcessEvents()` on the runtime's OWN thread via a self-
+ * rescheduling tick (scheduled through that runtime's setTimeout). Dawn
+ * callbacks fired by ProcessEvents only queue a settle-action; the JS Promise
+ * is then settled on the owning runtime's own thread when that runtime drains
+ * its mailbox ("Shared-instance safety" below), with no background thread.
+ *
+ * The pump only runs while at least one "pumping" task is outstanding, so it
+ * costs nothing when idle and stops cleanly.
+ *
+ * Spontaneous events (keepPumping = false): events that may fire at any time,
+ * independent of any request/response op (today only GPUDevice::getLost, whose
+ * Dawn callback is registered AllowSpontaneous). These are NOT driven by the
+ * pump. Instead their settle is marshalled onto the owning runtime's JS thread
+ * via that runtime's CallInvoker, which is wired only for the MAIN JS runtime
+ * (callInvoker()). A device created on a worklet runtime has no invoker, so its
+ * device.lost is best-effort and may never fire. See the README "Threading
+ * model" section.
+ *
+ * Shared-instance safety (mailbox): multiple runtimes may share one
+ * wgpu::Instance. ProcessEvents() drains the whole instance queue and fires
+ * callbacks on the calling thread, which may NOT be the owning runtime's thread
+ * for a given promise. So a settled callback never touches JSI inline; it
+ * deposits a settle-action (a plain C++ closure, no JSI) into the OWNING
+ * context's thread-safe mailbox via postSettle(), and each context drains its
+ * own mailbox on its own thread during tick(). ProcessEvents() itself is
+ * serialized across runtimes by a process-wide mutex, since concurrent
+ * ProcessEvents on one instance is not guaranteed reentrant.
+ *
+ * Threading contract: a RuntimeContext must only be pumped from the runtime it
+ * was created for. Create and use a GPUDevice (and the buffers/queues derived
+ * from it) on the same runtime that requested the adapter.
+ */
+class RuntimeContext : public std::enable_shared_from_this<RuntimeContext> {
+public:
+  using TaskCallback =
+      std::function<void(const AsyncTaskHandle::ResolveFunction &,
+                         const AsyncTaskHandle::RejectFunction &)>;
+
+  RuntimeContext(jsi::Runtime &runtime, wgpu::Instance instance);
+
+  static std::shared_ptr<RuntimeContext> get(jsi::Runtime &runtime);
+  static std::shared_ptr<RuntimeContext> getOrCreate(jsi::Runtime &runtime,
+                                                     wgpu::Instance instance);
+
+  // Register the main JS runtime and its CallInvoker. The RuntimeContext created
+  // for this runtime gets the invoker (callInvoker() returns it); every other
+  // runtime's context returns null. Called once from RNWebGPUManager on install.
+  static void
+  registerMainRuntime(jsi::Runtime *runtime,
+                      std::shared_ptr<facebook::react::CallInvoker> invoker);
+
+  // CallInvoker for this runtime's JS thread, or null. Non-null only for the
+  // main JS runtime; used to deliver spontaneous events (device.lost) without
+  // the pump. See the class doc.
+  const std::shared_ptr<facebook::react::CallInvoker> &callInvoker() const {
+    return _callInvoker;
+  }
+
+  // The wgpu::Instance bound to this runtime.
+  wgpu::Instance instance() const { return _instance; }
+
+  AsyncTaskHandle postTask(const TaskCallback &callback,
+                           bool keepPumping = true);
+
+  // Deposit a settle-action to run on THIS context's runtime thread. Thread-safe
+  // (callable from any thread, e.g. another runtime that pumped ProcessEvents).
+  // The job must not touch JSI until it runs (it runs during drainMailbox on the
+  // owning thread).
+  void postSettle(std::function<void()> job);
+
+  // Invoked by a drained settle-action when its task settles. Runs on the owning
+  // runtime's thread.
+  void onTaskSettled(bool keepPumping);
+
+private:
+  static jsi::UUID runtimeDataUUID();
+
+  void requestTick();
+  void tick();
+  void drainMailbox();
+
+  jsi::Runtime &_runtime;
+  wgpu::Instance _instance;
+  // Non-null only for the main JS runtime's context (see registerMainRuntime).
+  std::shared_ptr<facebook::react::CallInvoker> _callInvoker;
+  std::atomic<std::size_t> _pumpTasks{0};
+  std::atomic<bool> _tickScheduled{false};
+
+  std::mutex _mailboxMutex;
+  std::vector<std::function<void()>> _mailbox;
+};
+
+} // namespace rnwgpu::async
diff --git a/packages/webgpu/package.json b/packages/webgpu/package.json
index b69932f21..ce48f8605 100644
--- a/packages/webgpu/package.json
+++ b/packages/webgpu/package.json
@@ -1,6 +1,6 @@
 {
   "name": "react-native-webgpu",
-  "version": "0.5.14",
+  "version": "0.5.15",
   "description": "React Native WebGPU",
   "main": "lib/commonjs/index",
   "module": "lib/module/index",
diff --git a/packages/webgpu/src/Canvas.tsx b/packages/webgpu/src/Canvas.tsx
index 1030f3e38..d5bca183d 100644
--- a/packages/webgpu/src/Canvas.tsx
+++ b/packages/webgpu/src/Canvas.tsx
@@ -20,6 +20,15 @@ export interface NativeCanvas {
 }
 
 export type RNCanvasContext = GPUCanvasContext & {
+  /**
+   * Present the current frame.
+   *
+   * Call this after `queue.submit()` on every runtime: the main JS runtime, the
+   * Reanimated UI runtime, and dedicated worklet runtimes (e.g.
+   * `createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame
+   * processor). It runs synchronously on the calling thread, so the frame is
+   * presented from whichever thread did the rendering.
+   */
   present: () => void;
 };
 
diff --git a/packages/webgpu/src/Offscreen.ts b/packages/webgpu/src/Offscreen.ts
index c4e460bb2..4deab8a1c 100644
--- a/packages/webgpu/src/Offscreen.ts
+++ b/packages/webgpu/src/Offscreen.ts
@@ -65,7 +65,7 @@ class GPUOffscreenCanvasContext implements GPUCanvasContext {
   }
 
   present() {
-    // Do nothing
+    // Offscreen contexts have nothing to present; readback is via getImageData.
   }
 
   getDevice() {
diff --git a/packages/webgpu/src/WebPolyfillGPUModule.ts b/packages/webgpu/src/WebPolyfillGPUModule.ts
index 9dcc1f1c5..3851733dd 100644
--- a/packages/webgpu/src/WebPolyfillGPUModule.ts
+++ b/packages/webgpu/src/WebPolyfillGPUModule.ts
@@ -40,9 +40,9 @@ function makeWebGPUCanvasContext(
   }
 
   const context = canvas.getContext("webgpu")!;
-  return Object.assign(context, {
-    present: () => {},
-  });
+  // On web there is no manual present; expose a no-op so RNCanvasContext's
+  // present() (called after queue.submit() on native) is callable here too.
+  return Object.assign(context, { present: () => {} });
 }
 
 // @ts-expect-error - polyfill for RNWebGPU native module
diff --git a/packages/webgpu/src/constants.ts b/packages/webgpu/src/constants.ts
new file mode 100644
index 000000000..c96970f46
--- /dev/null
+++ b/packages/webgpu/src/constants.ts
@@ -0,0 +1,37 @@
+/// <reference types="@webgpu/types" />
+
+// WebGPU flag constants as importable JS values.
+//
+// The native module installs `GPUBufferUsage`, `GPUTextureUsage`,
+// `GPUShaderStage`, `GPUColorWrite` and `GPUMapMode` as globals, but only on the
+// main JS runtime. Worklet runtimes (Reanimated UI, dedicated worklet runtimes,
+// Vision Camera frame processors) do not get those globals, so referencing the
+// bare global inside a worklet yields `undefined`.
+//
+// Rather than hardcode the bit values here (which could drift from the native
+// `wgpu::*` flag enums), we re-export the globals the native module already
+// installed (see `GPUBufferUsage.h` and friends, which derive their values from
+// the Dawn enums with `static_assert`s). This keeps a single source of truth.
+// Importing them into a worklet lets the Worklets serializer capture them by
+// closure (the same way module-level shader strings are captured), making them
+// available on every runtime without passing them in by hand:
+//
+//   import { GPUBufferUsage } from "react-native-webgpu";
+//   const work = () => {
+//     "worklet";
+//     device.createBuffer({ usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ });
+//   };
+//
+// The exports below are read once, at module evaluation time. The package
+// entry (`index.tsx`) re-exports `./main` before `./constants`, and `./main`
+// installs the native module synchronously, so the globals exist by then.
+
+export const GPUBufferUsage = globalThis.GPUBufferUsage;
+
+export const GPUTextureUsage = globalThis.GPUTextureUsage;
+
+export const GPUShaderStage = globalThis.GPUShaderStage;
+
+export const GPUColorWrite = globalThis.GPUColorWrite;
+
+export const GPUMapMode = globalThis.GPUMapMode;
diff --git a/packages/webgpu/src/index.tsx b/packages/webgpu/src/index.tsx
index 5bb19fd3a..58728ad32 100644
--- a/packages/webgpu/src/index.tsx
+++ b/packages/webgpu/src/index.tsx
@@ -13,6 +13,8 @@ import type {
 } from "./types";
 
 export * from "./main";
+export * from "./constants";
+export * from "./install";
 export type {
   NativeVideoFrame,
   VideoPlayer,
diff --git a/packages/webgpu/src/install.ts b/packages/webgpu/src/install.ts
new file mode 100644
index 000000000..3483a0e3d
--- /dev/null
+++ b/packages/webgpu/src/install.ts
@@ -0,0 +1,61 @@
+import {
+  GPUBufferUsage,
+  GPUColorWrite,
+  GPUMapMode,
+  GPUShaderStage,
+  GPUTextureUsage,
+} from "./constants";
+
+// Globals that `installWebGPU` installs on the calling runtime. These are the
+// native-derived flag constants re-exported from `./constants` (a single source
+// of truth, matching the native `wgpu::*` flag enums), so they are safe to set
+// on any runtime.
+const constants = {
+  GPUBufferUsage,
+  GPUTextureUsage,
+  GPUShaderStage,
+  GPUColorWrite,
+  GPUMapMode,
+};
+
+/**
+ * Install the WebGPU flag-constant globals on the runtime that calls it.
+ *
+ * The native module installs the WebGPU flag constants (`GPUBufferUsage`,
+ * `GPUTextureUsage`, `GPUShaderStage`, `GPUColorWrite`, `GPUMapMode`) as globals
+ * on the main JS runtime, but worklet runtimes (Reanimated UI, dedicated worklet
+ * runtimes, Vision Camera frame processors) start without them, so referencing
+ * the bare global inside a worklet yields `undefined`.
+ *
+ * Call `installWebGPU()` once at the top of a worklet to make those globals
+ * available there, instead of importing each constant by hand:
+ *
+ * ```tsx
+ * import { installWebGPU } from "react-native-webgpu";
+ *
+ * const work = (device: GPUDevice) => {
+ *   "worklet";
+ *   installWebGPU();
+ *   device.createBuffer({
+ *     usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+ *   });
+ * };
+ * ```
+ *
+ * The constants are captured into the worklet by closure (the same way a shader
+ * string is), so they work on every runtime. Globals that are already defined
+ * (e.g. on the main JS runtime) are left untouched, so repeat calls are safe.
+ *
+ * This is the explicit entry point for runtime setup; for now it only installs
+ * the flag constants, but it is the place where other per-runtime WebGPU setup
+ * (e.g. `navigator.gpu`) can be wired in later.
+ */
+export const installWebGPU = () => {
+  "worklet";
+  const g = globalThis as unknown as Record<string, unknown>;
+  for (const [key, value] of Object.entries(constants)) {
+    if (g[key] === undefined) {
+      g[key] = value;
+    }
+  }
+};
diff --git a/packages/webgpu/src/types.ts b/packages/webgpu/src/types.ts
index ef06c192c..cd94faa10 100644
--- a/packages/webgpu/src/types.ts
+++ b/packages/webgpu/src/types.ts
@@ -9,6 +9,15 @@ export interface NativeCanvas {
 }
 
 export type RNCanvasContext = GPUCanvasContext & {
+  /**
+   * Present the current frame.
+   *
+   * Call this after `queue.submit()`. It works the same on every runtime: the
+   * main JS runtime, the Reanimated UI runtime, and dedicated worklet runtimes
+   * (e.g. `createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame
+   * processor). It runs synchronously on the calling thread, so the frame is
+   * presented from whichever thread did the rendering.
+   */
   present: () => void;
 };