diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index 12c46107104..0c8b241522c 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -54,8 +54,10 @@ if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") - set(_arm_backend_sources backends/arm/runtime/EthosUBackend.cpp - backends/arm/runtime/VelaBinStream.cpp + set(_arm_backend_sources + backends/arm/runtime/EthosUBackend.cpp + backends/arm/runtime/EthosUBackend_IoMemcpy.cpp + backends/arm/runtime/VelaBinStream.cpp ) list(TRANSFORM _arm_backend_sources PREPEND "${EXECUTORCH_ROOT}/") diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index 2b17cf2c43d..4b78f9a7e28 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -26,6 +26,12 @@ #include #include +// Overridable memcpy used by the EthosU backend for input/output scratch +// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does +// std::memcpy. Firmware targets can supply a strong override (e.g. routing +// through a DMA engine) to reduce CPU memcpy load on the host MCU. +extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size); + using namespace std; using executorch::aten::ScalarType; @@ -237,8 +243,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { if (both_char || both_int || both_short || both_bool) { EXECUTORCH_PROF_SCOPE( event_tracer, "+EthosUBackend::execute()handles.input.memcpy()"); - // Sizes match and elt size matches so memcpy - memcpy( + // Sizes match and elt size matches so memcpy. + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. 
+ arm_ethos_io_memcpy( scratch_addr, tensor_in.mutable_data_ptr(), tensor_in.nbytes()); @@ -389,7 +396,8 @@ Error copy_with_layout_adjustment( } const char* src_bytes = src; for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { - memcpy(dest, src_bytes, chunk_size); + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. + arm_ethos_io_memcpy(dest, src_bytes, chunk_size); src_bytes += vela_chunk_size; dest += chunk_size; } diff --git a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp index 7962ef846df..96398762302 100644 --- a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp +++ b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp @@ -42,6 +42,12 @@ extern "C" __attribute__((weak)) struct ethosu_driver* ethosu_reserve_driver_ex( return ethosu_reserve_driver(); } +// Overridable memcpy used by the EthosU backend for output scratch +// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does +// std::memcpy. Firmware targets can supply a strong override (e.g. routing +// through a DMA engine) to reduce CPU memcpy load on the host MCU. +extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size); + namespace executorch { namespace backends { namespace arm { @@ -136,7 +142,8 @@ Error platform_execute( } io_bytes_total += tensor_bytes; } else { - memcpy( + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. + arm_ethos_io_memcpy( tensor_out.mutable_data_ptr(), static_cast(output_addr), tensor_bytes); diff --git a/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp b/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp new file mode 100644 index 00000000000..1ef5b747b81 --- /dev/null +++ b/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp @@ -0,0 +1,19 @@ +/* + * Copyright 2026 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include <cstddef> +#include <cstring> + +// Weak default for arm_ethos_io_memcpy: a plain, synchronous std::memcpy. +// Firmware targets can link a strong override (e.g. DMA on Cortex-M55) +// without touching EthosUBackend code. Kept in its own translation unit so +// call-site TUs cannot inline this body and bypass the link-time override. +// NOTE(review): an override presumably must finish the copy before returning. +extern "C" __attribute__((weak)) void +arm_ethos_io_memcpy(void* dst, const void* src, size_t size) { + std::memcpy(dst, src, size); +} diff --git a/backends/arm/runtime/targets.bzl b/backends/arm/runtime/targets.bzl index 42df03fb58b..51c0bf93f55 100644 --- a/backends/arm/runtime/targets.bzl +++ b/backends/arm/runtime/targets.bzl @@ -15,6 +15,7 @@ def define_common_targets(): srcs = [ "EthosUBackend.cpp", "EthosUBackend_Cortex_M.cpp", + "EthosUBackend_IoMemcpy.cpp", ], headers = ["EthosUBackend_Internal.h"], compatible_with = ["ovr_config//cpu:arm32-embedded", "ovr_config//cpu:arm32-embedded-fpu"],