From 0728a4790e00b7c02117b55737cf0dbaef7fadb3 Mon Sep 17 00:00:00 2001
From: Tom Allsop <tom.allsop@arm.com>
Date: Fri, 26 Jun 2026 11:16:26 +0100
Subject: [PATCH] Arm backend: Add Qwen3 VL language model MXFP8 test

* Add an E2E TOSA test for the Qwen3 VL language model with
  linear layers converted to MXFP8.

Signed-off-by: Baris Demir <baris.demir@arm.com>
Signed-off-by: Tom Allsop <tom.allsop@arm.com>

Co-authored-by: Baris Demir <baris.demir@arm.com>
---
 .../models/Qwen3_VL/test_qwen3_vl_model.py    | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_model.py b/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_model.py
index ae4ba2dcbd0..b898ca62cf5 100644
--- a/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_model.py
+++ b/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_model.py
@@ -11,10 +11,12 @@
 import pytest
 import torch
 import torch.nn.functional as F
+from executorch.backends.arm.ao_ext import MXFPOpConfig
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.models.Qwen3_VL.qwen3_vl_test_config import (
     get_qwen3_vl_2b_instruct_checkpoint_config,
 )
+from executorch.backends.arm.test.ops.mxfp.common import MXFPTosaPipelineFP
 from executorch.backends.arm.test.tester.test_pipeline import (
     TosaPipelineFP,
     VgfPipeline,
@@ -25,6 +27,7 @@
 )
 
 input_t = Tuple[torch.Tensor, ...]
+aten_op_mxfp_linear = "torch.ops.tosa_mxfp.linear.default"
 
 
 def _make_qwen3_vl_2b_instruct_layer_config():
@@ -104,6 +107,10 @@ def _to_bfloat16_model_and_floating_inputs(
     )
 
 
+def _is_linear(module: torch.nn.Module, _fqn: str) -> bool:
+    return isinstance(module, torch.nn.Linear)
+
+
 class TextModelWrapper(Qwen3VLModelTestModule):
     def __init__(self, config) -> None:
         super().__init__()
@@ -300,3 +307,35 @@ def test_qwen3_vl_full_models_vgf_no_quant_bf16(test_case: Qwen3VLModelTestCase)
             tosa_spec="TOSA-1.0+FP+bf16",
         )
         pipeline.run()
+
+
+@pytest.mark.slow
+def test_qwen3_vl_text_model_tosa_mxfp8_bf16():
+    # The Qwen 3 VL FP8 model only quantizes the TextModel
+    model, inputs = TextModelWrapper.prepare_model_and_inputs()
+    model, inputs = _to_bfloat16_model_and_floating_inputs(model, inputs)
+    mxfp_config = MXFPOpConfig(weight_dtype=torch.float8_e4m3fn)
+    with torch.no_grad():
+        pipeline = MXFPTosaPipelineFP[input_t](
+            model,
+            inputs,
+            aten_op=aten_op_mxfp_linear,
+            exir_op=[],
+            filter_fn=_is_linear,
+            frobenius_threshold=0.1,
+            cosine_threshold=0.98,
+            mxfp_config=mxfp_config,
+            tosa_version="1.1",
+            tosa_extensions=["bf16", "mxfp"],
+        )
+        # Check all linear layers are converted to MXFP
+        linear_count = sum(
+            _is_linear(submodule, name) for name, submodule in model.named_modules()
+        )
+        pipeline.add_stage_after(
+            "export",
+            pipeline.tester.check_count,
+            {aten_op_mxfp_linear: linear_count},
+            suffix="mxfp_linear",
+        )
+        pipeline.run()