From 33b9835112076cac5f9a45a8d5f78dcbf9b9f6d5 Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 20 Jun 2026 14:46:52 +0300
Subject: [PATCH 1/8] feat(launcher): add CUDA backend path

Add a --cuda flag to llama-tornado that selects the TornadoVM CUDA backend,
mirroring the existing --opencl/--ptx/--metal plumbing: it loads the
tornado.drivers.cuda module and the cuda-exports export list. Also disambiguate
--ptx help text (was 'PTX/CUDA').
---
 llama-tornado | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/llama-tornado b/llama-tornado
index 1d6c3d23..78388295 100755
--- a/llama-tornado
+++ b/llama-tornado
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 llama-tornado: GPU-accelerated Java LLM runner with TornadoVM
-Run LLM models using either OpenCL or PTX backends.
+Run LLM models using OpenCL, PTX, CUDA, or Metal backends.
 """
 
 import argparse
@@ -19,6 +19,7 @@ from enum import Enum
 class Backend(Enum):
     OPENCL = "opencl"
     PTX = "ptx"
+    CUDA = "cuda"
     METAL = "metal"
 
 
@@ -178,6 +179,14 @@ class LlamaRunner:
                     "ALL-SYSTEM,jdk.incubator.vector,tornado.runtime,tornado.annotation,tornado.drivers.common,tornado.drivers.ptx",
                 ]
             )
+        elif args.backend == Backend.CUDA:
+            module_config.extend(
+                [
+                    f"@{self.tornado_sdk}/etc/exportLists/cuda-exports",
+                    "--add-modules",
+                    "ALL-SYSTEM,jdk.incubator.vector,tornado.runtime,tornado.annotation,tornado.drivers.common,tornado.drivers.cuda",
+                ]
+            )
         elif args.backend == Backend.METAL:
             module_config.extend(
                 [
@@ -426,7 +435,14 @@ def create_parser() -> argparse.ArgumentParser:
         dest="backend",
         action="store_const",
         const=Backend.PTX,
-        help="Use PTX/CUDA backend",
+        help="Use PTX backend",
+    )
+    hw_group.add_argument(
+        "--cuda",
+        dest="backend",
+        action="store_const",
+        const=Backend.CUDA,
+        help="Use CUDA backend (requires TornadoVM built with the CUDA backend)",
     )
     hw_group.add_argument(
         "--metal",

From 2e2fa90b3fc3fa144afe76adfe68a1248c66b6f1 Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 20 Jun 2026 14:46:52 +0300
Subject: [PATCH 2/8] build(pom): build against TornadoVM 4.0.2-jdk21-dev (CUDA
 backend)

The CUDA backend is only available in a dev build of TornadoVM (PR #861), so
point the JDK21 build at 4.0.2-jdk21-dev. The project's own version is unchanged.
---
 pom.xml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pom.xml b/pom.xml
index a83c7ecf..82e875e9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -39,9 +39,10 @@
         <properties>
             <!-- CI-friendly version: resolved by flatten-maven-plugin at build time -->
             <revision>0.4.0</revision>
-            <tornadovm.base.version>4.0.1</tornadovm.base.version>
+            <tornadovm.base.version>4.0.2</tornadovm.base.version>
             <jdk.version.suffix>-jdk21</jdk.version.suffix>
-            <tornadovm.version>${tornadovm.base.version}${jdk.version.suffix}</tornadovm.version>
+            <!-- CUDA backend is only available in a dev build of TornadoVM (PR #861) -->
+            <tornadovm.version>${tornadovm.base.version}${jdk.version.suffix}-dev</tornadovm.version>
             <!-- Compiler defaults (overridden by JDK profiles below) -->
             <maven.compiler.source>25</maven.compiler.source>
             <maven.compiler.target>25</maven.compiler.target>
@@ -147,7 +148,8 @@
                     <maven.compiler.source>21</maven.compiler.source>
                     <maven.compiler.target>21</maven.compiler.target>
                     <jdk.version.suffix>-jdk21</jdk.version.suffix>
-                    <tornadovm.version>${tornadovm.base.version}${jdk.version.suffix}</tornadovm.version>
+                    <!-- CUDA backend is only available in a dev build of TornadoVM (PR #861) -->
+                    <tornadovm.version>${tornadovm.base.version}${jdk.version.suffix}-dev</tornadovm.version>
                 </properties>
                 <build>
                     <plugins>

From 74b88c209e5ddb2ed2c2c465177bfd4d82bb3aeb Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 20 Jun 2026 14:46:52 +0300
Subject: [PATCH 3/8] docs: document CUDA backend and TornadoVM PR #861
 requirement

List CUDA among the supported backends, add a --cuda usage example, and note
that the CUDA backend requires a TornadoVM build with the CUDA backend from
PR #861 (https://github.com/beehive-lab/TornadoVM/pull/861).
---
 README.md | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 2e2db217..9cb3a0a9 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,8 @@ GPULlama3ChatModel model = GPULlama3ChatModel.builder()
 Ensure you have the following installed and configured:
 
 - **Java 21**: Required for Vector API support & TornadoVM.
-- [TornadoVM](https://github.com/beehive-lab/TornadoVM) with OpenCL or PTX backends.
+- [TornadoVM](https://github.com/beehive-lab/TornadoVM) with OpenCL, PTX, or CUDA backends.
+  - The `--cuda` backend requires a TornadoVM build that includes the CUDA backend from [TornadoVM PR #861](https://github.com/beehive-lab/TornadoVM/pull/861). This project currently builds against TornadoVM `4.0.2-jdk21-dev`.
 - GCC/G++ 13 or newer: Required to build and run TornadoVM native components.
 
 ### Install, Build, and Run
@@ -305,6 +306,12 @@ Run a model with a text prompt:
 ./llama-tornado --gpu --verbose-init --opencl --model beehive-llama-3.2-1b-instruct-fp16.gguf --prompt "Explain the benefits of GPU acceleration."
 ```
 
+Select a backend explicitly with `--opencl`, `--ptx`, or `--cuda` (NVIDIA), or `--metal` (Apple Silicon). For example, to run on the CUDA backend:
+
+```bash
+./llama-tornado --gpu --cuda --model beehive-llama-3.2-1b-instruct-fp16.gguf --prompt "Explain the benefits of GPU acceleration."
+```
+
 #### GPU Execution (FP16 Model)
 Enable GPU acceleration with Q8_0 quantization:
 ```bash
@@ -393,7 +400,7 @@ Supported command-line options include:
 ```bash
 cmd ➜ llama-tornado --help
 usage: llama-tornado [-h] --model MODEL_PATH [--prompt PROMPT] [-sp SYSTEM_PROMPT] [--temperature TEMPERATURE] [--top-p TOP_P] [--seed SEED] [-n MAX_TOKENS]
-                     [--stream STREAM] [--echo ECHO] [-i] [--instruct] [--gpu] [--opencl] [--ptx] [--gpu-memory GPU_MEMORY] [--heap-min HEAP_MIN] [--heap-max HEAP_MAX]
+                     [--stream STREAM] [--echo ECHO] [-i] [--instruct] [--gpu] [--opencl] [--ptx] [--cuda] [--metal] [--gpu-memory GPU_MEMORY] [--heap-min HEAP_MIN] [--heap-max HEAP_MAX]
                      [--debug] [--profiler] [--profiler-dump-dir PROFILER_DUMP_DIR] [--print-bytecodes] [--print-threads] [--print-kernel] [--full-dump]
                      [--show-command] [--execute-after-show] [--opencl-flags OPENCL_FLAGS] [--max-wait-events MAX_WAIT_EVENTS] [--verbose]
 
@@ -424,7 +431,9 @@ Mode Selection:
 Hardware Configuration:
   --gpu                 Enable GPU acceleration (default: False)
   --opencl              Use OpenCL backend (default) (default: None)
-  --ptx                 Use PTX/CUDA backend (default: None)
+  --ptx                 Use PTX backend (default: None)
+  --cuda                Use CUDA backend (requires TornadoVM built with the CUDA backend) (default: None)
+  --metal               Use Apple Metal backend (macOS only) (default: None)
   --gpu-memory GPU_MEMORY
                         GPU memory allocation (default: 7GB)
   --heap-min HEAP_MIN   Minimum JVM heap size (default: 20g)
@@ -480,9 +489,9 @@ View TornadoVM's internal behavior:
   - **Support for GGUF format models** with full FP16 and partial support for Q8_0 and Q4_0 quantization.
   - **Instruction-following and chat modes** for various use cases.
   - **Interactive CLI** with `--interactive` and `--instruct` modes.
-  - **Flexible backend switching** - choose OpenCL or PTX at runtime (need to build TornadoVM with both enabled).
+  - **Flexible backend switching** - choose OpenCL, PTX, or CUDA at runtime (need to build TornadoVM with the chosen backends enabled).
   - **Cross-platform compatibility**:
-    - ✅ NVIDIA GPUs (OpenCL & PTX )
+    - ✅ NVIDIA GPUs (OpenCL, PTX & CUDA)
     - ✅ Intel GPUs (OpenCL)
     - ✅ Apple GPUs (OpenCL)
 

From ed4db212e42977400eef82a3ee143e7945a504ec Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 20 Jun 2026 15:08:47 +0300
Subject: [PATCH 4/8] ci: add CUDA backend to build and inference matrices

Add a cuda variant to the build, standalone-inference, and quarkus-integration
backend matrices. The setup-tornadovm action now builds the CUDA backend from
the cuda2 branch (TornadoVM PR #861) until it is merged to master; other
backends still build from master. Shared inference steps run on CUDA via the
matrix; the PTX-only CUDA-graph steps remain gated to ptx.
---
 .github/actions/run-inference/action.yml   |  2 +-
 .github/actions/setup-tornadovm/action.yml | 20 ++++++++++++++++----
 .github/workflows/build-and-run.yml        |  5 ++++-
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/.github/actions/run-inference/action.yml b/.github/actions/run-inference/action.yml
index 314a8be5..fe3d574b 100644
--- a/.github/actions/run-inference/action.yml
+++ b/.github/actions/run-inference/action.yml
@@ -3,7 +3,7 @@ description: Run one llama-tornado inference pass and write the metrics + sideca
 
 inputs:
   backend:
-    description: 'GPU backend (opencl or ptx)'
+    description: 'GPU backend (opencl, ptx, or cuda)'
     required: true
   model_file:
     description: 'Model filename inside $MODELS_DIR (e.g. Llama-3.2-1B-Instruct-F16.gguf)'
diff --git a/.github/actions/setup-tornadovm/action.yml b/.github/actions/setup-tornadovm/action.yml
index 3b1c5070..01fc41ac 100644
--- a/.github/actions/setup-tornadovm/action.yml
+++ b/.github/actions/setup-tornadovm/action.yml
@@ -3,17 +3,29 @@ description: Build TornadoVM once per backend and reuse across runs via a local
 
 inputs:
   backend:
-    description: 'TornadoVM backend to build (opencl or ptx)'
+    description: 'TornadoVM backend to build (opencl, ptx, or cuda)'
     required: true
 
 runs:
   using: composite
   steps:
+    - name: Determine TornadoVM branch
+      id: branch
+      shell: bash
+      run: |
+        # The CUDA backend currently lives on the cuda2 branch (TornadoVM PR #861)
+        # until it is merged to master; all other backends build from master.
+        if [ "${{ inputs.backend }}" = "cuda" ]; then
+          echo "ref=cuda2" >> $GITHUB_OUTPUT
+        else
+          echo "ref=master" >> $GITHUB_OUTPUT
+        fi
+
     - name: Get TornadoVM HEAD SHA
       id: tornado_sha
       shell: bash
       run: |
-        SHA=$(git ls-remote https://github.com/beehive-lab/TornadoVM HEAD | cut -f1)
+        SHA=$(git ls-remote https://github.com/beehive-lab/TornadoVM ${{ steps.branch.outputs.ref }} | cut -f1)
         echo "sha=$SHA" >> $GITHUB_OUTPUT
 
     - name: Check local build sentinel
@@ -27,12 +39,12 @@ runs:
           echo "up-to-date=false" >> $GITHUB_OUTPUT
         fi
 
-    - name: Clone TornadoVM master
+    - name: Clone TornadoVM
       if: steps.sentinel.outputs.up-to-date != 'true'
       shell: bash
       run: |
         rm -rf $TORNADO_ROOT
-        git clone --depth 1 --branch master \
+        git clone --depth 1 --branch ${{ steps.branch.outputs.ref }} \
           https://github.com/beehive-lab/TornadoVM.git \
           $TORNADO_ROOT
 
diff --git a/.github/workflows/build-and-run.yml b/.github/workflows/build-and-run.yml
index f60083cc..7b7da0ad 100644
--- a/.github/workflows/build-and-run.yml
+++ b/.github/workflows/build-and-run.yml
@@ -31,7 +31,7 @@ jobs:
           # ./mvnw -T12C -Pspotless spotless:check
 
   # Build: TornadoVM → GPULlama3 → Quarkus LangChain4j
-  # max-parallel: 1 ensures the opencl and ptx variants run sequentially so
+  # max-parallel: 1 ensures the opencl, ptx and cuda variants run sequentially so
   # there are no workspace conflicts between matrix jobs.
   build:
     if: github.repository == 'beehive-lab/GPULlama3.java'
@@ -45,6 +45,7 @@ jobs:
         backend:
           - name: opencl
           - name: ptx
+          - name: cuda
 
     steps:
       - name: Checkout GPULlama3
@@ -99,6 +100,7 @@ jobs:
         backend:
           - name: opencl
           - name: ptx
+          - name: cuda
 
     steps:
       - name: Checkout GPULlama3
@@ -523,6 +525,7 @@ jobs:
         backend:
           - name: opencl
           - name: ptx
+          - name: cuda
 
     steps:
       - name: Checkout GPULlama3

From 3f13606766ea4273ad43078e674c1fedb8881c90 Mon Sep 17 00:00:00 2001
From: Orion Papadakis <orion.papadakis@manchester.ac.uk>
Date: Mon, 22 Jun 2026 13:37:10 +0300
Subject: [PATCH 5/8] [hack] force ci to run on strix

---
 .github/workflows/build-and-run.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-run.yml b/.github/workflows/build-and-run.yml
index 7b7da0ad..419b1698 100644
--- a/.github/workflows/build-and-run.yml
+++ b/.github/workflows/build-and-run.yml
@@ -18,7 +18,7 @@ env:
 jobs:
   code-quality:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: self-hosted
+    runs-on: [self-hosted, orion-strix]
     timeout-minutes: 30
 
     steps:

From bd4a70b679d50881eec421747668018c4079738f Mon Sep 17 00:00:00 2001
From: Orion Papadakis <orion.papadakis@manchester.ac.uk>
Date: Mon, 22 Jun 2026 14:03:39 +0300
Subject: [PATCH 6/8] [hack] force all ci jobs to run on strix

---
 .github/workflows/build-and-run.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-and-run.yml b/.github/workflows/build-and-run.yml
index 419b1698..f2ef0606 100644
--- a/.github/workflows/build-and-run.yml
+++ b/.github/workflows/build-and-run.yml
@@ -35,7 +35,7 @@ jobs:
   # there are no workspace conflicts between matrix jobs.
   build:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: [self-hosted]
+    runs-on: [self-hosted, orion-strix]
     needs: code-quality
     timeout-minutes: 30
     strategy:
@@ -91,7 +91,7 @@ jobs:
 
   standalone-inference:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: [self-hosted]
+    runs-on: [self-hosted, orion-strix]
     needs: build
     timeout-minutes: 30
     strategy:
@@ -516,7 +516,7 @@ jobs:
   # Test integration with Quarkus-langchain4j
   quarkus-langchain4j-integration:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: [self-hosted]
+    runs-on: [self-hosted, orion-strix]
     needs: build
     timeout-minutes: 10
     strategy:
@@ -618,7 +618,7 @@ jobs:
       github.event_name == 'push' &&
       github.ref == 'refs/heads/main'
 
-    runs-on: [self-hosted]
+    runs-on: [self-hosted, orion-strix]
     needs: standalone-inference
     timeout-minutes: 15
 

From 3efde927f567b1d9099738e645cdfbcbdad4024f Mon Sep 17 00:00:00 2001
From: Orion Papadakis <orion.papadakis@manchester.ac.uk>
Date: Mon, 22 Jun 2026 14:12:16 +0300
Subject: [PATCH 7/8] [hack] use correct runner custom label

---
 .github/workflows/build-and-run.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build-and-run.yml b/.github/workflows/build-and-run.yml
index f2ef0606..bdb48336 100644
--- a/.github/workflows/build-and-run.yml
+++ b/.github/workflows/build-and-run.yml
@@ -18,7 +18,7 @@ env:
 jobs:
   code-quality:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: [self-hosted, orion-strix]
+    runs-on: [self-hosted, 5090-laptop]
     timeout-minutes: 30
 
     steps:
@@ -35,7 +35,7 @@ jobs:
   # there are no workspace conflicts between matrix jobs.
   build:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: [self-hosted, orion-strix]
+    runs-on: [self-hosted, 5090-laptop]
     needs: code-quality
     timeout-minutes: 30
     strategy:
@@ -91,7 +91,7 @@ jobs:
 
   standalone-inference:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: [self-hosted, orion-strix]
+    runs-on: [self-hosted, 5090-laptop]
     needs: build
     timeout-minutes: 30
     strategy:
@@ -516,7 +516,7 @@ jobs:
   # Test integration with Quarkus-langchain4j
   quarkus-langchain4j-integration:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: [self-hosted, orion-strix]
+    runs-on: [self-hosted, 5090-laptop]
     needs: build
     timeout-minutes: 10
     strategy:
@@ -618,7 +618,7 @@ jobs:
       github.event_name == 'push' &&
       github.ref == 'refs/heads/main'
 
-    runs-on: [self-hosted, orion-strix]
+    runs-on: [self-hosted, 5090-laptop]
     needs: standalone-inference
     timeout-minutes: 15
 

From dcee6cc0ae80c975e831a0c8bc3ac847ed302ece Mon Sep 17 00:00:00 2001
From: Orion Papadakis <orion.papadakis@manchester.ac.uk>
Date: Tue, 23 Jun 2026 16:57:37 +0300
Subject: [PATCH 8/8] Revert specific workflow runner labels

---
 .github/workflows/build-and-run.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build-and-run.yml b/.github/workflows/build-and-run.yml
index bdb48336..3b33059c 100644
--- a/.github/workflows/build-and-run.yml
+++ b/.github/workflows/build-and-run.yml
@@ -18,7 +18,7 @@ env:
 jobs:
   code-quality:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: [self-hosted, 5090-laptop]
+    runs-on: [self-hosted]
     timeout-minutes: 30
 
     steps:
@@ -35,7 +35,7 @@ jobs:
   # there are no workspace conflicts between matrix jobs.
   build:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: [self-hosted, 5090-laptop]
+    runs-on: [self-hosted]
     needs: code-quality
     timeout-minutes: 30
     strategy:
@@ -91,7 +91,7 @@ jobs:
 
   standalone-inference:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: [self-hosted, 5090-laptop]
+    runs-on: [self-hosted]
     needs: build
     timeout-minutes: 30
     strategy:
@@ -516,7 +516,7 @@ jobs:
   # Test integration with Quarkus-langchain4j
   quarkus-langchain4j-integration:
     if: github.repository == 'beehive-lab/GPULlama3.java'
-    runs-on: [self-hosted, 5090-laptop]
+    runs-on: [self-hosted]
     needs: build
     timeout-minutes: 10
     strategy:
@@ -618,7 +618,7 @@ jobs:
       github.event_name == 'push' &&
       github.ref == 'refs/heads/main'
 
-    runs-on: [self-hosted, 5090-laptop]
+    runs-on: [self-hosted]
     needs: standalone-inference
     timeout-minutes: 15