From b6d333d53e713ad01a4b32d2a3db07008ebfaa09 Mon Sep 17 00:00:00 2001
From: Eli Amesefe <eliamesefe@meta.com>
Date: Wed, 6 May 2026 11:37:15 -0700
Subject: [PATCH] route EthosU input/output memcpy through overridable hook
 (#19264)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:

The EthosU backend's input/output scratch shuffling currently does a plain
CPU std::memcpy of every input tensor into the scratch buffer and every
output tensor out of it on every inference. On Cortex-M55-based firmware
targets that have a DMA engine, this is a significant CPU load: inference
time is spent in memcpy that could instead be DMA-offloaded so the M55
sleeps while the transfer runs.

This change introduces a thin extern-C indirection — `arm_ethos_io_memcpy`
— that the EthosU backend uses everywhere it currently calls memcpy for
input/output scratch shuffling. The default (weak) implementation lives
in a separate translation unit (EthosUBackend_IoMemcpy.cpp) and just
calls std::memcpy, so behavior is unchanged for any consumer that doesn't
override it.

Firmware targets can supply a strong-symbol override (e.g. routing
through a DMA engine) without touching the upstream backend code.

Implementation notes:
- The weak default lives in its own TU so the compiler in the call-site
  TUs cannot inline its body and bypass the link-time override. This is
  the same pattern bolt_arm_memcpy_external uses.
- Three call sites updated: input scratch copy in EthosUBackend.cpp, the
  layout-adjustment chunk loop in EthosUBackend.cpp, and the output
  scratch copy in EthosUBackend_Cortex_M.cpp.

bypass-github-export-checks
bypass-github-pytorch-ci-checks
bypass-github-executorch-ci-checks

Reviewed By: rascani

Differential Revision: D103455766
---
 backends/arm/CMakeLists.txt                   |  6 ++++--
 backends/arm/runtime/EthosUBackend.cpp        | 14 +++++++++++---
 .../arm/runtime/EthosUBackend_Cortex_M.cpp    |  9 ++++++++-
 .../arm/runtime/EthosUBackend_IoMemcpy.cpp    | 19 +++++++++++++++++++
 backends/arm/runtime/targets.bzl              |  1 +
 5 files changed, 43 insertions(+), 6 deletions(-)
 create mode 100644 backends/arm/runtime/EthosUBackend_IoMemcpy.cpp

diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
index 12c46107104..0c8b241522c 100644
--- a/backends/arm/CMakeLists.txt
+++ b/backends/arm/CMakeLists.txt
@@ -54,8 +54,10 @@ if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX)
 
   set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
 
-  set(_arm_backend_sources backends/arm/runtime/EthosUBackend.cpp
-                           backends/arm/runtime/VelaBinStream.cpp
+  set(_arm_backend_sources
+      backends/arm/runtime/EthosUBackend.cpp
+      backends/arm/runtime/EthosUBackend_IoMemcpy.cpp
+      backends/arm/runtime/VelaBinStream.cpp
   )
   list(TRANSFORM _arm_backend_sources PREPEND "${EXECUTORCH_ROOT}/")
 
diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp
index 2b17cf2c43d..4b78f9a7e28 100644
--- a/backends/arm/runtime/EthosUBackend.cpp
+++ b/backends/arm/runtime/EthosUBackend.cpp
@@ -26,6 +26,12 @@
 #include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
 
+// Overridable memcpy used by the EthosU backend for input/output scratch
+// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does
+// std::memcpy. Firmware targets can supply a strong override (e.g. routing
+// through a DMA engine) to reduce CPU memcpy load on the host MCU.
+extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size);
+
 using namespace std;
 
 using executorch::aten::ScalarType;
@@ -237,8 +243,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
       if (both_char || both_int || both_short || both_bool) {
         EXECUTORCH_PROF_SCOPE(
             event_tracer, "+EthosUBackend::execute()handles.input.memcpy()");
-        // Sizes match and elt size matches so memcpy
-        memcpy(
+        // Sizes match and elt size matches so memcpy.
+        // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate.
+        arm_ethos_io_memcpy(
             scratch_addr,
             tensor_in.mutable_data_ptr<char>(),
             tensor_in.nbytes());
@@ -389,7 +396,8 @@ Error copy_with_layout_adjustment(
   }
   const char* src_bytes = src;
   for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
-    memcpy(dest, src_bytes, chunk_size);
+    // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate.
+    arm_ethos_io_memcpy(dest, src_bytes, chunk_size);
     src_bytes += vela_chunk_size;
     dest += chunk_size;
   }
diff --git a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp
index 7962ef846df..96398762302 100644
--- a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp
+++ b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp
@@ -42,6 +42,12 @@ extern "C" __attribute__((weak)) struct ethosu_driver* ethosu_reserve_driver_ex(
   return ethosu_reserve_driver();
 }
 
+// Overridable memcpy used by the EthosU backend for output scratch
+// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does
+// std::memcpy. Firmware targets can supply a strong override (e.g. routing
+// through a DMA engine) to reduce CPU memcpy load on the host MCU.
+extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size);
+
 namespace executorch {
 namespace backends {
 namespace arm {
@@ -136,7 +142,8 @@ Error platform_execute(
       }
       io_bytes_total += tensor_bytes;
     } else {
-      memcpy(
+      // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate.
+      arm_ethos_io_memcpy(
           tensor_out.mutable_data_ptr<char>(),
           static_cast<const char*>(output_addr),
           tensor_bytes);
diff --git a/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp b/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp
new file mode 100644
index 00000000000..1ef5b747b81
--- /dev/null
+++ b/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2026 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cstddef>
+#include <cstring>
+
+// Weak default for arm_ethos_io_memcpy: copies `size` bytes via std::memcpy.
+// Firmware can link a strong override (e.g. DMA on Cortex-M55) without editing
+// EthosUBackend. Kept in its own TU so call sites cannot inline this body and
+// bypass the link-time override (same trick as bolt_arm_memcpy_external).
+// NOTE(review): overrides presumably must finish copying before returning.
+extern "C" __attribute__((weak)) void
+arm_ethos_io_memcpy(void* dst, const void* src, size_t size) {
+  std::memcpy(dst, src, size);
+}
diff --git a/backends/arm/runtime/targets.bzl b/backends/arm/runtime/targets.bzl
index 42df03fb58b..51c0bf93f55 100644
--- a/backends/arm/runtime/targets.bzl
+++ b/backends/arm/runtime/targets.bzl
@@ -15,6 +15,7 @@ def define_common_targets():
         srcs = [
             "EthosUBackend.cpp",
             "EthosUBackend_Cortex_M.cpp",
+            "EthosUBackend_IoMemcpy.cpp",
         ],
         headers = ["EthosUBackend_Internal.h"],
         compatible_with = ["ovr_config//cpu:arm32-embedded", "ovr_config//cpu:arm32-embedded-fpu"],