From b6d333d53e713ad01a4b32d2a3db07008ebfaa09 Mon Sep 17 00:00:00 2001 From: Eli Amesefe Date: Wed, 6 May 2026 11:37:15 -0700 Subject: [PATCH] route EthosU input/output memcpy through overridable hook (#19264) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: The EthosU backend's input/output scratch shuffling currently does a plain CPU std::memcpy of every input tensor into the scratch buffer and every output tensor out of it on every inference. On Cortex-M55-based firmware targets that have a DMA engine, this is a significant CPU load: inference time is spent in memcpy that could instead be DMA-offloaded, letting the M55 sleep while the transfer runs. This change introduces a thin extern-C indirection — `arm_ethos_io_memcpy` — that the EthosU backend uses everywhere it currently calls memcpy for input/output scratch shuffling. The default (weak) implementation lives in a separate translation unit (EthosUBackend_IoMemcpy.cpp) and just calls std::memcpy, so behavior is unchanged for any consumer that doesn't override it. Firmware targets can supply a strong-symbol override (e.g. routing through a DMA engine) without touching the upstream backend code. Implementation notes: - The weak default lives in its own TU so the compiler in the call-site TUs cannot inline its body and bypass the link-time override. This is the same pattern bolt_arm_memcpy_external uses. - Three call sites updated: input scratch copy in EthosUBackend.cpp, the layout-adjustment chunk loop in EthosUBackend.cpp, and the output scratch copy in EthosUBackend_Cortex_M.cpp. 
bypass-github-export-checks bypass-github-pytorch-ci-checks bypass-github-executorch-ci-checks Reviewed By: rascani Differential Revision: D103455766 --- backends/arm/CMakeLists.txt | 6 ++++-- backends/arm/runtime/EthosUBackend.cpp | 14 +++++++++++--- .../arm/runtime/EthosUBackend_Cortex_M.cpp | 9 ++++++++- .../arm/runtime/EthosUBackend_IoMemcpy.cpp | 19 +++++++++++++++++++ backends/arm/runtime/targets.bzl | 1 + 5 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 backends/arm/runtime/EthosUBackend_IoMemcpy.cpp diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index 12c46107104..0c8b241522c 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -54,8 +54,10 @@ if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") - set(_arm_backend_sources backends/arm/runtime/EthosUBackend.cpp - backends/arm/runtime/VelaBinStream.cpp + set(_arm_backend_sources + backends/arm/runtime/EthosUBackend.cpp + backends/arm/runtime/EthosUBackend_IoMemcpy.cpp + backends/arm/runtime/VelaBinStream.cpp ) list(TRANSFORM _arm_backend_sources PREPEND "${EXECUTORCH_ROOT}/") diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index 2b17cf2c43d..4b78f9a7e28 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -26,6 +26,12 @@ #include #include +// Overridable memcpy used by the EthosU backend for input/output scratch +// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does +// std::memcpy. Firmware targets can supply a strong override (e.g. routing +// through a DMA engine) to reduce CPU memcpy load on the host MCU. 
+extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size); + using namespace std; using executorch::aten::ScalarType; @@ -237,8 +243,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { if (both_char || both_int || both_short || both_bool) { EXECUTORCH_PROF_SCOPE( event_tracer, "+EthosUBackend::execute()handles.input.memcpy()"); - // Sizes match and elt size matches so memcpy - memcpy( + // Sizes match and elt size matches so memcpy. + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. + arm_ethos_io_memcpy( scratch_addr, tensor_in.mutable_data_ptr(), tensor_in.nbytes()); @@ -389,7 +396,8 @@ Error copy_with_layout_adjustment( } const char* src_bytes = src; for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { - memcpy(dest, src_bytes, chunk_size); + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. + arm_ethos_io_memcpy(dest, src_bytes, chunk_size); src_bytes += vela_chunk_size; dest += chunk_size; } diff --git a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp index 7962ef846df..96398762302 100644 --- a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp +++ b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp @@ -42,6 +42,12 @@ extern "C" __attribute__((weak)) struct ethosu_driver* ethosu_reserve_driver_ex( return ethosu_reserve_driver(); } +// Overridable memcpy used by the EthosU backend for output scratch +// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does +// std::memcpy. Firmware targets can supply a strong override (e.g. routing +// through a DMA engine) to reduce CPU memcpy load on the host MCU. 
+extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size); + namespace executorch { namespace backends { namespace arm { @@ -136,7 +142,8 @@ Error platform_execute( } io_bytes_total += tensor_bytes; } else { - memcpy( + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. + arm_ethos_io_memcpy( tensor_out.mutable_data_ptr(), static_cast(output_addr), tensor_bytes); diff --git a/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp b/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp new file mode 100644 index 00000000000..1ef5b747b81 --- /dev/null +++ b/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp @@ -0,0 +1,19 @@ +/* + * Copyright 2026 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +// Weak default for arm_ethos_io_memcpy. Firmware targets can provide a +// strong-symbol override (e.g. routing through DMA on Cortex-M55) without +// touching the upstream EthosUBackend code. Lives in its own translation +// unit so the compiler in the call-site TUs cannot inline this body and +// bypass the link-time override (same trick as bolt_arm_memcpy_external). +extern "C" __attribute__((weak)) void +arm_ethos_io_memcpy(void* dst, const void* src, size_t size) { + std::memcpy(dst, src, size); +} diff --git a/backends/arm/runtime/targets.bzl b/backends/arm/runtime/targets.bzl index 42df03fb58b..51c0bf93f55 100644 --- a/backends/arm/runtime/targets.bzl +++ b/backends/arm/runtime/targets.bzl @@ -15,6 +15,7 @@ def define_common_targets(): srcs = [ "EthosUBackend.cpp", "EthosUBackend_Cortex_M.cpp", + "EthosUBackend_IoMemcpy.cpp", ], headers = ["EthosUBackend_Internal.h"], compatible_with = ["ovr_config//cpu:arm32-embedded", "ovr_config//cpu:arm32-embedded-fpu"],