From bed0adcc803b1b6f29d8cf0bc8af19f280e83904 Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Sun, 17 May 2026 20:13:01 -0700
Subject: [PATCH 01/13] feat: update llama.cpp to dd7cad719

---
 CHANGELOG.md           |  2 +-
 llama_cpp/llama_cpp.py | 21 +++++++++++++++++++++
 vendor/llama.cpp       |  2 +-
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 36e4fa168..3c63bc070 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings
+- feat: Update llama.cpp to ggml-org/llama.cpp@dd7cad719 and sync Python bindings
 - chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
 
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index a9c32a15b..6560b5178 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -516,6 +516,14 @@ def _warn_deprecated(symbol: str, hint: str) -> None:
 LLAMA_SPLIT_MODE_TENSOR = 3
 
 
+# enum llama_context_type {
+#     LLAMA_CONTEXT_TYPE_DEFAULT = 0,
+#     LLAMA_CONTEXT_TYPE_MTP     = 1,
+# };
+LLAMA_CONTEXT_TYPE_DEFAULT = 0
+LLAMA_CONTEXT_TYPE_MTP = 1
+
+
 # typedef struct llama_token_data {
 #     llama_token id; // token id
 #     float logit;    // log-odds of the token
@@ -894,9 +902,11 @@ class llama_sampler_seq_config(ctypes.Structure):
 #     uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
 #     uint32_t n_ubatch;          // physical maximum batch size
 #     uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
+#     uint32_t n_rs_seq;          // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
 #     int32_t  n_threads;         // number of threads to use for generation
 #     int32_t  n_threads_batch;   // number of threads to use for batch processing
 
+#     enum llama_context_type      ctx_type;          // set the context type (e.g. MTP)
 #     enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 #     enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
 #     enum llama_attention_type    attention_type;    // attention type to use for embeddings
@@ -947,8 +957,10 @@ class llama_context_params(ctypes.Structure):
         n_batch (int): logical maximum batch size that can be submitted to llama_decode
         n_ubatch (int): physical maximum batch size
         n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models)
+        n_rs_seq (int): number of recurrent-state snapshots per sequence for rollback (0 = no rollback) [EXPERIMENTAL]
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
+        ctx_type (int): context type, from `enum llama_context_type`
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
         pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         attention_type (int): attention type to use for embeddings
@@ -982,8 +994,10 @@ class llama_context_params(ctypes.Structure):
         n_batch: int
         n_ubatch: int
         n_seq_max: int
+        n_rs_seq: int
         n_threads: int
         n_threads_batch: int
+        ctx_type: int
         rope_scaling_type: int
         pooling_type: int
         attention_type: int
@@ -1016,8 +1030,10 @@ class llama_context_params(ctypes.Structure):
         ("n_batch", ctypes.c_uint32),
         ("n_ubatch", ctypes.c_uint32),
         ("n_seq_max", ctypes.c_uint32),
+        ("n_rs_seq", ctypes.c_uint32),
         ("n_threads", ctypes.c_int32),
         ("n_threads_batch", ctypes.c_int32),
+        ("ctx_type", ctypes.c_int),
         ("rope_scaling_type", ctypes.c_int),
         ("pooling_type", ctypes.c_int),
         ("attention_type", ctypes.c_int),
@@ -1591,6 +1607,11 @@ def llama_n_ubatch(ctx: llama_context_p, /) -> int: ...
 def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
 
 
+# LLAMA_API uint32_t llama_n_rs_seq   (const struct llama_context * ctx);
+@ctypes_function("llama_n_rs_seq", [llama_context_p_ctypes], ctypes.c_uint32)
+def llama_n_rs_seq(ctx: llama_context_p, /) -> int: ...
+
+
 # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
 @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
 def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 91e84fed6..dd7cad719 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f
+Subproject commit dd7cad7197f991b18ded6aca46ff095972b95318

From 71898123b8a354e6dbff8a32c9805bc9c2e90e7b Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Sun, 17 May 2026 23:20:06 -0700
Subject: [PATCH 02/13] feat: update llama.cpp to c3f95c1f0

---
 CHANGELOG.md     | 2 +-
 vendor/llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3c63bc070..9f6700884 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggml-org/llama.cpp@dd7cad719 and sync Python bindings
+- feat: Update llama.cpp to ggml-org/llama.cpp@c3f95c1f0 and sync Python bindings
 - chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index dd7cad719..c3f95c1f0 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit dd7cad7197f991b18ded6aca46ff095972b95318
+Subproject commit c3f95c1f069c91e21b8063b09907a5fba38d1695

From d1f4bf67677a575cab418465057ea50695f22b79 Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 18 May 2026 00:05:15 -0700
Subject: [PATCH 03/13] ci: add linux qwen35 failure diagnostics

---
 .github/workflows/test.yaml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 8a6845ff2..e68e4b6b1 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -33,6 +33,7 @@ jobs:
     needs: download-model
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
@@ -50,6 +51,11 @@ jobs:
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
+      - name: System Info
+        run: |
+          uname -a
+          lscpu
+          python -c "import platform; print(platform.machine(), platform.architecture())"
       - name: Install dependencies (Linux/MacOS)
         run: |
           python -m pip install --upgrade pip
@@ -58,7 +64,7 @@ jobs:
         shell: bash
       - name: Test with pytest
         run: |
-          python -m pytest
+          python -m pytest -q -s tests/test_llama.py::test_real_model
 
   build-windows:
     needs: download-model

From e2df6b4a0254eb49d390bdbe36b023aeb4cbd024 Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 18 May 2026 00:13:24 -0700
Subject: [PATCH 04/13] ci: instrument qwen35 native assertion

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index c3f95c1f0..aac47ea1e 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit c3f95c1f069c91e21b8063b09907a5fba38d1695
+Subproject commit aac47ea1e8d00ca5e52dde54008974980f81fe09

From 821905c52101753dd3627c9950e2bdefff6c798e Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 18 May 2026 00:13:58 -0700
Subject: [PATCH 05/13] ci: instrument get_rows_f32 assertion

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index aac47ea1e..8e34af8cc 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit aac47ea1e8d00ca5e52dde54008974980f81fe09
+Subproject commit 8e34af8cc4fbc6351914e653b3ca9cf564922d5e

From 8d3139ec37523874a28d0b326de5a0e6685efedd Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 18 May 2026 00:14:30 -0700
Subject: [PATCH 06/13] ci: target get_rows_f32 assertion instrumentation

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 8e34af8cc..8e602f69c 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 8e34af8cc4fbc6351914e653b3ca9cf564922d5e
+Subproject commit 8e602f69cc7abc9257e42a8034ea167f9585b7d7

From 40b2083a5cba034de9b2ce30e1f697619cb65a1f Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 18 May 2026 00:16:00 -0700
Subject: [PATCH 07/13] ci: restore upstream llama.cpp pointer

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 8e602f69c..c3f95c1f0 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 8e602f69cc7abc9257e42a8034ea167f9585b7d7
+Subproject commit c3f95c1f069c91e21b8063b09907a5fba38d1695

From 463316637acd152f48a802b964dd3888711ac27d Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 18 May 2026 00:16:32 -0700
Subject: [PATCH 08/13] ci: patch get_rows_f32 diagnostics on linux

---
 .github/workflows/test.yaml | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index e68e4b6b1..b87cf2d2f 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -56,6 +56,31 @@ jobs:
           uname -a
           lscpu
           python -c "import platform; print(platform.machine(), platform.architecture())"
+      - name: Instrument Linux get_rows_f32 assertion
+        run: |
+          python - <<'PY'
+          from pathlib import Path
+
+          path = Path("vendor/llama.cpp/ggml/src/ggml-cpu/ops.cpp")
+          text = path.read_text()
+          old = """        GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+        ggml_vec_cpy_f32(nc,"""
+          new = """        if (!(i01 >= 0 && i01 < ne01)) {
+            fprintf(stderr,
+                    "%s: invalid row index in get_rows_f32: dst=%s src0=%s src1=%s i=%lld i10=%lld i11=%lld i12=%lld i01=%lld ne01=%lld src1_ne=[%lld,%lld,%lld,%lld]\\n",
+                    __func__, dst->name, src0->name, src1->name,
+                    (long long) i, (long long) i10, (long long) i11, (long long) i12,
+                    (long long) i01, (long long) ne01,
+                    (long long) ne10, (long long) ne11, (long long) ne12, (long long) ne13);
+        }
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+        ggml_vec_cpy_f32(nc,"""
+          if old not in text:
+              raise SystemExit("get_rows_f32 assertion pattern not found")
+          path.write_text(text.replace(old, new, 1))
+          PY
       - name: Install dependencies (Linux/MacOS)
         run: |
           python -m pip install --upgrade pip

From 43b25bf9fcc04f8717f4d30f71d368af9917dddf Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 18 May 2026 00:21:36 -0700
Subject: [PATCH 09/13] ci: fix diagnostic workflow syntax

---
 .github/workflows/test.yaml | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index b87cf2d2f..623afd150 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -63,20 +63,19 @@ jobs:
 
           path = Path("vendor/llama.cpp/ggml/src/ggml-cpu/ops.cpp")
           text = path.read_text()
-          old = """        GGML_ASSERT(i01 >= 0 && i01 < ne01);
-
-        ggml_vec_cpy_f32(nc,"""
-          new = """        if (!(i01 >= 0 && i01 < ne01)) {
-            fprintf(stderr,
-                    "%s: invalid row index in get_rows_f32: dst=%s src0=%s src1=%s i=%lld i10=%lld i11=%lld i12=%lld i01=%lld ne01=%lld src1_ne=[%lld,%lld,%lld,%lld]\\n",
-                    __func__, dst->name, src0->name, src1->name,
-                    (long long) i, (long long) i10, (long long) i11, (long long) i12,
-                    (long long) i01, (long long) ne01,
-                    (long long) ne10, (long long) ne11, (long long) ne12, (long long) ne13);
-        }
-        GGML_ASSERT(i01 >= 0 && i01 < ne01);
-
-        ggml_vec_cpy_f32(nc,"""
+          old = "        GGML_ASSERT(i01 >= 0 && i01 < ne01);\\n\\n        ggml_vec_cpy_f32(nc,"
+          new = (
+              "        if (!(i01 >= 0 && i01 < ne01)) {\\n"
+              "            fprintf(stderr,\\n"
+              "                    \"%s: invalid row index in get_rows_f32: dst=%s src0=%s src1=%s i=%lld i10=%lld i11=%lld i12=%lld i01=%lld ne01=%lld src1_ne=[%lld,%lld,%lld,%lld]\\\\n\",\\n"
+              "                    __func__, dst->name, src0->name, src1->name,\\n"
+              "                    (long long) i, (long long) i10, (long long) i11, (long long) i12,\\n"
+              "                    (long long) i01, (long long) ne01,\\n"
+              "                    (long long) ne10, (long long) ne11, (long long) ne12, (long long) ne13);\\n"
+              "        }\\n"
+              "        GGML_ASSERT(i01 >= 0 && i01 < ne01);\\n\\n"
+              "        ggml_vec_cpy_f32(nc,"
+          )
           if old not in text:
               raise SystemExit("get_rows_f32 assertion pattern not found")
           path.write_text(text.replace(old, new, 1))

From d2b183742481847742c6bb1d873b81329c34c78a Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 18 May 2026 00:26:23 -0700
Subject: [PATCH 10/13] ci: fix diagnostic newline matching

---
 .github/workflows/test.yaml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 623afd150..32826b0ee 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -63,17 +63,17 @@ jobs:
 
           path = Path("vendor/llama.cpp/ggml/src/ggml-cpu/ops.cpp")
           text = path.read_text()
-          old = "        GGML_ASSERT(i01 >= 0 && i01 < ne01);\\n\\n        ggml_vec_cpy_f32(nc,"
+          old = "        GGML_ASSERT(i01 >= 0 && i01 < ne01);\n\n        ggml_vec_cpy_f32(nc,"
           new = (
-              "        if (!(i01 >= 0 && i01 < ne01)) {\\n"
-              "            fprintf(stderr,\\n"
-              "                    \"%s: invalid row index in get_rows_f32: dst=%s src0=%s src1=%s i=%lld i10=%lld i11=%lld i12=%lld i01=%lld ne01=%lld src1_ne=[%lld,%lld,%lld,%lld]\\\\n\",\\n"
-              "                    __func__, dst->name, src0->name, src1->name,\\n"
-              "                    (long long) i, (long long) i10, (long long) i11, (long long) i12,\\n"
-              "                    (long long) i01, (long long) ne01,\\n"
-              "                    (long long) ne10, (long long) ne11, (long long) ne12, (long long) ne13);\\n"
-              "        }\\n"
-              "        GGML_ASSERT(i01 >= 0 && i01 < ne01);\\n\\n"
+              "        if (!(i01 >= 0 && i01 < ne01)) {\n"
+              "            fprintf(stderr,\n"
+              "                    \"%s: invalid row index in get_rows_f32: dst=%s src0=%s src1=%s i=%lld i10=%lld i11=%lld i12=%lld i01=%lld ne01=%lld src1_ne=[%lld,%lld,%lld,%lld]\\\\n\",\n"
+              "                    __func__, dst->name, src0->name, src1->name,\n"
+              "                    (long long) i, (long long) i10, (long long) i11, (long long) i12,\n"
+              "                    (long long) i01, (long long) ne01,\n"
+              "                    (long long) ne10, (long long) ne11, (long long) ne12, (long long) ne13);\n"
+              "        }\n"
+              "        GGML_ASSERT(i01 >= 0 && i01 < ne01);\n\n"
               "        ggml_vec_cpy_f32(nc,"
           )
           if old not in text:

From 51e540e5fcf04da9f2e38064b58ae5785b9d887a Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 18 May 2026 00:38:34 -0700
Subject: [PATCH 11/13] ci: test qwen35 pre-norm flag initialization

---
 .github/workflows/test.yaml | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 32826b0ee..84dda9c3d 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -56,28 +56,21 @@ jobs:
           uname -a
           lscpu
           python -c "import platform; print(platform.machine(), platform.architecture())"
-      - name: Instrument Linux get_rows_f32 assertion
+      - name: Patch Linux Qwen3.5 pre-norm flag initialization
         run: |
           python - <<'PY'
           from pathlib import Path
 
-          path = Path("vendor/llama.cpp/ggml/src/ggml-cpu/ops.cpp")
+          path = Path("vendor/llama.cpp/src/llama-context.cpp")
           text = path.read_text()
-          old = "        GGML_ASSERT(i01 >= 0 && i01 < ne01);\n\n        ggml_vec_cpy_f32(nc,"
+          old = "    cparams.embeddings       = params.embeddings;\n    cparams.embeddings_pre_norm = false;\n"
           new = (
-              "        if (!(i01 >= 0 && i01 < ne01)) {\n"
-              "            fprintf(stderr,\n"
-              "                    \"%s: invalid row index in get_rows_f32: dst=%s src0=%s src1=%s i=%lld i10=%lld i11=%lld i12=%lld i01=%lld ne01=%lld src1_ne=[%lld,%lld,%lld,%lld]\\\\n\",\n"
-              "                    __func__, dst->name, src0->name, src1->name,\n"
-              "                    (long long) i, (long long) i10, (long long) i11, (long long) i12,\n"
-              "                    (long long) i01, (long long) ne01,\n"
-              "                    (long long) ne10, (long long) ne11, (long long) ne12, (long long) ne13);\n"
-              "        }\n"
-              "        GGML_ASSERT(i01 >= 0 && i01 < ne01);\n\n"
-              "        ggml_vec_cpy_f32(nc,"
+              "    cparams.embeddings       = params.embeddings;\n"
+              "    cparams.embeddings_pre_norm = false;\n"
+              "    cparams.embeddings_pre_norm_masked = false;\n"
           )
           if old not in text:
-              raise SystemExit("get_rows_f32 assertion pattern not found")
+              raise SystemExit("cparams pre-norm initialization pattern not found")
           path.write_text(text.replace(old, new, 1))
           PY
       - name: Install dependencies (Linux/MacOS)

From 7c7213e6a59632a9e342ca9c7a1a20b7f2c3a4da Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 18 May 2026 01:00:25 -0700
Subject: [PATCH 12/13] ci: test qwen35 fix from submodule

---
 .github/workflows/test.yaml | 25 +------------------------
 vendor/llama.cpp            |  2 +-
 2 files changed, 2 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 84dda9c3d..8a6845ff2 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -33,7 +33,6 @@ jobs:
     needs: download-model
     runs-on: ubuntu-latest
     strategy:
-      fail-fast: false
       matrix:
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
@@ -51,28 +50,6 @@ jobs:
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
-      - name: System Info
-        run: |
-          uname -a
-          lscpu
-          python -c "import platform; print(platform.machine(), platform.architecture())"
-      - name: Patch Linux Qwen3.5 pre-norm flag initialization
-        run: |
-          python - <<'PY'
-          from pathlib import Path
-
-          path = Path("vendor/llama.cpp/src/llama-context.cpp")
-          text = path.read_text()
-          old = "    cparams.embeddings       = params.embeddings;\n    cparams.embeddings_pre_norm = false;\n"
-          new = (
-              "    cparams.embeddings       = params.embeddings;\n"
-              "    cparams.embeddings_pre_norm = false;\n"
-              "    cparams.embeddings_pre_norm_masked = false;\n"
-          )
-          if old not in text:
-              raise SystemExit("cparams pre-norm initialization pattern not found")
-          path.write_text(text.replace(old, new, 1))
-          PY
       - name: Install dependencies (Linux/MacOS)
         run: |
           python -m pip install --upgrade pip
@@ -81,7 +58,7 @@ jobs:
         shell: bash
       - name: Test with pytest
         run: |
-          python -m pytest -q -s tests/test_llama.py::test_real_model
+          python -m pytest
 
   build-windows:
     needs: download-model
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index c3f95c1f0..e02216170 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit c3f95c1f069c91e21b8063b09907a5fba38d1695
+Subproject commit e02216170bf21d1a8f9c63540070ec1d428f8879

From 9b541bccad9cb35a27014120a873f05bb7d70260 Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 18 May 2026 07:24:46 -0700
Subject: [PATCH 13/13] feat: update llama.cpp to b9a2170fc

---
 CHANGELOG.md     | 2 +-
 vendor/llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9f6700884..18c6af161 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggml-org/llama.cpp@c3f95c1f0 and sync Python bindings
+- feat: Update llama.cpp to ggml-org/llama.cpp@b9a2170fc and sync Python bindings
 - chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index e02216170..b9a2170fc 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit e02216170bf21d1a8f9c63540070ec1d428f8879
+Subproject commit b9a2170fce1f3f33cb4934b34efecb806bbbb348