diff --git a/Bender.yml b/Bender.yml
index 2049f69e..7a5baae6 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -97,6 +97,18 @@ sources:
       # Level 2
       - src/frontend/desc64/idma_desc64_top.sv
 
+  # Snitch inst64 standalone harness; needs snitch_cluster for idma_inst64_top + opcode pkg
+  - target: all(snitch_cluster, idma_test)
+    files:
+      # Level 0
+      - systems/snitch/test/idma_inst64_tb_pkg.sv
+      # Level 1
+      - systems/snitch/test/idma_inst64_drv_if.sv
+      - systems/snitch/test/idma_inst64_base.sv
+      # Level 2
+      - systems/snitch/test/tb_idma_inst64_copy.sv
+      - systems/snitch/test/tb_idma_inst64_transpose.sv
+
   # Synthesis wrappers
   - target: synth
     files:
diff --git a/src/frontend/inst64/idma_inst64_top.sv b/src/frontend/inst64/idma_inst64_top.sv
index 4f93b55f..f4696e31 100644
--- a/src/frontend/inst64/idma_inst64_top.sv
+++ b/src/frontend/inst64/idma_inst64_top.sv
@@ -22,6 +22,13 @@ module idma_inst64_top #(
     parameter int unsigned NumChannels     = 32'd1,
     parameter bit          TCDMAliasEnable = 1'b0,
     parameter int unsigned DMATracing      = 32'd0,
+    /// Compile-time on-the-fly compute feature enables (e.g. transpose)
+    parameter idma_pkg::compute_enable_t ComputeEnable = '0,
+    /// Transpose via address generation (no FF engine) — for backends without
+    /// the engine, e.g. this multi-write OBI/TCDM variant
+    parameter bit          AddrGenTranspose = 1'b0,
+    /// Address-gen transpose: skew the dst pitch to avoid TCDM bank conflicts
+    parameter bit          BankSkew         = 1'b0,
     parameter type         axi_ar_chan_t   = logic,
     parameter type         axi_aw_chan_t   = logic,
     parameter type         axi_req_t       = logic,
@@ -70,7 +77,7 @@ module idma_inst64_top #(
     localparam int unsigned TfIdWidth    = 32'd32;
     localparam int unsigned TFLenWidth   = AxiAddrWidth;
     localparam int unsigned RepWidth     = 32'd32;
-    localparam int unsigned NumDim       = 32'd2;
+    localparam int unsigned NumDim       = ComputeEnable.transpose ? 32'd4 : 32'd2;
     localparam int unsigned BufferDepth  = 32'd3;
     localparam int unsigned NumRules     = 32'd5;
 
@@ -84,7 +91,8 @@ module idma_inst64_top #(
     localparam type id_t                 = logic[AxiIdWidth-1:0];
     localparam type tf_len_t             = logic[TFLenWidth-1:0];
     localparam type offset_t             = logic[OffsetWidth-1:0];
-    localparam type strides_t            = logic[RepWidth-1:0];
+    // strides must match addr_t: signed transpose deltas would not sign-extend if narrower
+    localparam type strides_t            = addr_t;
     localparam type reps_t               = logic[RepWidth-1:0];
     localparam type tf_id_t              = logic[TfIdWidth-1:0];
 
@@ -178,6 +186,7 @@ module idma_inst64_top #(
     logic [1:0] idma_fe_status;
     logic [2:0] idma_fe_sel_chan;
     logic       idma_fe_twod;
+    logic       idma_fe_tp_reject;
 
     // busy signals
     idma_pkg::idma_busy_t [NumChannels-1:0] idma_busy;
@@ -348,7 +357,7 @@ module idma_inst64_top #(
             .idma_req_t    ( idma_req_t    ),
             .idma_rsp_t    ( idma_rsp_t    ),
             .idma_nd_req_t ( idma_nd_req_t ),
-            .RepWidths     ( RepWidth      )
+            .RepWidths     ( {NumDim{RepWidth}} )
         ) i_idma_nd_midend (
             .clk_i,
             .rst_ni,
@@ -367,6 +376,33 @@ module idma_inst64_top #(
             .busy_o            ( idma_nd_busy      [c] )
         );
 
+        // FIFO output, before transpose expansion
+        idma_nd_req_t fifo_nd_req;
+        logic         fifo_nd_valid, fifo_nd_ready;
+
+        // expand transpose requests into the tiled ND walk
+        if (ComputeEnable.transpose) begin : gen_transpose
+            idma_transpose_midend #(
+                .NumDim           ( NumDim           ),
+                .AddrGenTranspose ( AddrGenTranspose ),
+                .BankSkew         ( BankSkew         ),
+                .StrbWidth        ( StrbWidth        ),
+                .addr_t           ( addr_t           ),
+                .idma_nd_req_t    ( idma_nd_req_t    )
+            ) i_idma_transpose_midend (
+                .nd_req_i ( fifo_nd_req           ),
+                .valid_i  ( fifo_nd_valid         ),
+                .ready_o  ( fifo_nd_ready         ),
+                .nd_req_o ( idma_nd_req       [c] ),
+                .valid_o  ( idma_nd_req_valid [c] ),
+                .ready_i  ( idma_nd_req_ready [c] )
+            );
+        end else begin : gen_no_transpose
+            assign idma_nd_req       [c] = fifo_nd_req;
+            assign idma_nd_req_valid [c] = fifo_nd_valid;
+            assign fifo_nd_ready         = idma_nd_req_ready [c];
+        end
+
         stream_fifo_optimal_wrap #(
             .Depth     ( DMAReqFifoDepth ),
             .type_t    ( idma_nd_req_t   ),
@@ -380,9 +416,9 @@ module idma_inst64_top #(
             .data_i     ( idma_fe_req           ),
             .valid_i    ( idma_fe_req_valid [c] ),
             .ready_o    ( idma_fe_req_ready [c] ),
-            .data_o     ( idma_nd_req       [c] ),
-            .valid_o    ( idma_nd_req_valid [c] ),
-            .ready_i    ( idma_nd_req_ready [c] )
+            .data_o     ( fifo_nd_req           ),
+            .valid_o    ( fifo_nd_valid         ),
+            .ready_i    ( fifo_nd_ready         )
         );
     end
 
@@ -519,10 +555,12 @@ module idma_inst64_top #(
         idma_fe_req_d.burst_req.opt.beo.src_reduce_len = 1'b0;
         idma_fe_req_d.burst_req.opt.beo.dst_reduce_len = 1'b0;
         idma_fe_req_d.burst_req.opt.last               = 1'b0;
+        idma_fe_req_d.burst_req.opt.compute            = '0;
 
         // frontend config
         idma_fe_cfg      = '0;
         idma_fe_status   = '0;
+        idma_fe_tp_reject = 1'b0;
         idma_fe_sel_chan = '0;
 
         // default handshaking
@@ -573,6 +611,28 @@ module idma_inst64_top #(
                         idma_inst64_snitch_pkg::DMCPY : begin
                             idma_fe_cfg      = acc_req_i.data_argb[1:0];
                             idma_fe_sel_chan = acc_req_i.data_argb[4:2];
+                            // transpose request (register form only): argb spare bits
+                            // carry {enable, mode, tensor_m, tensor_n}
+                            if (ComputeEnable.transpose && acc_req_i.data_argb[5]) begin
+                                idma_fe_req_d.burst_req.opt.compute.enable = 1'b1;
+                                idma_fe_req_d.burst_req.opt.compute.op     =
+                                    idma_pkg::COMPUTE_TRANSPOSE;
+                                idma_fe_req_d.burst_req.opt.compute.params.transpose.mode     =
+                                    acc_req_i.data_argb[7:6];
+                                idma_fe_req_d.burst_req.opt.compute.params.transpose.tensor_m =
+                                    acc_req_i.data_argb[19:8];
+                                idma_fe_req_d.burst_req.opt.compute.params.transpose.tensor_n =
+                                    acc_req_i.data_argb[31:20];
+                            end
+                            // reject malformed transpose requests: no hardware,
+                            // reserved mode, zero dims, unaligned dst
+                            if (acc_req_i.data_argb[5]) begin
+                                idma_fe_tp_reject = !ComputeEnable.transpose
+                                    | (acc_req_i.data_argb[7:6] == 2'd3)
+                                    | (acc_req_i.data_argb[19:8] == '0)
+                                    | (acc_req_i.data_argb[31:20] == '0)
+                                    | (idma_fe_req_d.burst_req.dst_addr[OffsetWidth-1:0] != '0);
+                            end
                         end
                         default:;
                     endcase
@@ -588,7 +648,15 @@ module idma_inst64_top #(
                     // 3. wait for twod transfer to be accepted (ready)
                     // 4. send acc response (pvalid)
                     // 5. acknowledge acc request (qready)
-                    if (acc_res_ready) begin
+                    // DMCPY launch; transpose requests reject malformed configs
+                    if (idma_fe_tp_reject) begin
+                        // error response; the transfer is not launched
+                        if (acc_res_ready) begin
+                            acc_res.id      = acc_req_i.id;
+                            acc_res_valid   = 1'b1;
+                            acc_req_ready_o = 1'b1;
+                        end
+                    end else if (acc_res_ready) begin
                         idma_fe_req_valid[idma_fe_sel_chan] = 1'b1;
                         if (idma_fe_req_ready[idma_fe_sel_chan]) begin
                             acc_res.id      = acc_req_i.id;
@@ -750,6 +818,12 @@ module idma_inst64_top #(
         if (!idma_fe_twod) begin
             idma_fe_req.d_req[0].reps = 'd1;
         end
+        // keep higher dims inert for plain requests (the transpose expander overwrites them)
+        for (int d = 1; d <= NumDim-2; d++) begin
+            idma_fe_req.d_req[d].reps        = 'd1;
+            idma_fe_req.d_req[d].src_strides = '0;
+            idma_fe_req.d_req[d].dst_strides = '0;
+        end
     end
 
     //--------------------------------------
@@ -763,6 +837,16 @@ module idma_inst64_top #(
     //--------------------------------------
     // only activate tracer if requested
 `ifndef SYNTHESIS
+    initial assert (idma_pkg::TransposeDimWidth == 32'd12) else
+        $fatal(1, "DMCPY argb transpose packing requires TransposeDimWidth == 12");
+`ifndef VERILATOR
+    // capability cross-check against the generated backend's baked compute set
+    // (engine route only; address-gen needs no compute-enabled backend)
+    if (ComputeEnable.transpose && !AddrGenTranspose) begin : gen_compute_check
+        initial assert (gen_backend[0].i_idma_backend_rw_axi.ComputeEnable.transpose) else
+            $fatal(1, "ComputeEnable.transpose requires a compute-enabled backend variant");
+    end
+`endif
     if (DMATracing) begin : gen_tracer
         for (genvar c = 0; c < NumChannels; c++) begin : gen_channels
             // derive the name of the trace file from the hart and channel IDs
diff --git a/src/midend/idma_transpose_midend.sv b/src/midend/idma_transpose_midend.sv
index b6f66325..4ebfc49a 100644
--- a/src/midend/idma_transpose_midend.sv
+++ b/src/midend/idma_transpose_midend.sv
@@ -5,12 +5,18 @@
 // Authors:
 // - Daniel Keller <dankeller@iis.ee.ethz.ch>
 
-/// Transpose geometry expander: expands an opt.compute=TRANSPOSE request into a
-/// NumDim=4 tiled ND walk for the generic idma_nd_midend. Non-transpose passes
-/// through. Combinational, quasi-static per request.
+/// Transpose geometry expander for the generic idma_nd_midend. Two modes:
+/// engine (NumDim=4 tiled walk feeding the FF transpose engine) and address-gen
+/// (element-granular swapped-stride walk, no engine, for backends without the
+/// engine e.g. multi-write OBI/TCDM). Non-transpose passes through.
 module idma_transpose_midend #(
-    /// Number of ND dimensions (must be >= 4 to express the tiled walk)
+    /// Number of ND dimensions (engine walk needs >= 4; address-gen needs >= 3)
     parameter int unsigned NumDim    = 32'd4,
+    /// Address-gen mode: element-granular swapped-stride transpose, no engine
+    parameter bit          AddrGenTranspose = 1'b0,
+    /// Address-gen: pad the dst row pitch by one bus-word when needed so the
+    /// per-column word stride is odd (conflict-free on power-of-2-bank TCDM)
+    parameter bit          BankSkew     = 1'b0,
     /// Write data-path width in bytes (tile side NE = StrbWidth / element bytes)
     parameter int unsigned StrbWidth = 32'd64,
     /// Address type
@@ -51,6 +57,7 @@ module idma_transpose_midend #(
         logic [TensorW-1:0]      tm, tn;
         logic signed [WorkW-1:0] m, n, log2ne, ne, yt, nt, nxe, mpe;
         logic signed [WorkW-1:0] strb_c;   // NE*E == StrbWidth (mode cancels)
+        logic signed [WorkW-1:0] e, me, pad;  // address-gen: E (=1<<mode), M*E, pitch pad
 
         nd_req_o = nd_req_i;   // passthrough
 
@@ -59,43 +66,74 @@ module idma_transpose_midend #(
             tm   = nd_req_i.burst_req.opt.compute.params.transpose.tensor_m;
             tn   = nd_req_i.burst_req.opt.compute.params.transpose.tensor_n;
             // zero-extend bounded dims into the signed working width
-            m      = $signed({{(WorkW-TensorW){1'b0}}, tm});   // M
-            n      = $signed({{(WorkW-TensorW){1'b0}}, tn});   // N
-            log2ne = $signed(WorkW'(Log2Strb)) - $signed({{(WorkW-ModeW){1'b0}}, mode});
-            ne     = $signed(WorkW'(1)) <<< log2ne;            // tile side (elements)
-            yt     = (m + ne - 1) >>> log2ne;                  // ceil(M/NE)
-            nt     = (n + ne - 1) >>> log2ne;                  // ceil(N/NE)
-            nxe    = n  <<< mode;                              // N*E  (E = 1<<mode)
-            mpe    = yt <<< Log2Strb;                          // MP*E = YT*NE*E = YT*StrbWidth
-            strb_c = $signed(WorkW'(StrbWidth));               // NE*E (one tile-row = StrbWidth B)
+            m = $signed({{(WorkW-TensorW){1'b0}}, tm});   // M
+            n = $signed({{(WorkW-TensorW){1'b0}}, tn});   // N
 
-            nd_req_o.burst_req.length     = LenW'(StrbWidth);
+            if (AddrGenTranspose) begin
+                // Element-granular swapped-stride walk (out_T[c][r] = in[r][c]),
+                // dst an N x M' transpose. No engine: compute is cleared so the
+                // backend runs a plain strided copy. Correct on any protocol
+                // (ideal on random-access OBI/TCDM; slow on burst AXI).
+                e   = $signed(WorkW'(1)) <<< mode;             // E = 1<<mode
+                me  = m <<< mode;                              // M*E
+                // BankSkew: when M*E is an even number of bus words the column
+                // stride hammers one TCDM bank; pad the pitch by one word (NE
+                // elements) to make the word stride odd => round-robin all banks.
+                ne  = $signed(WorkW'(StrbWidth)) >>> mode;     // NE = StrbWidth/E
+                pad = (BankSkew && (me[Log2Strb:0] == '0)) ? ne : '0;
+                me  = (m + pad) <<< mode;                      // M'*E (padded pitch)
+                nd_req_o.burst_req.opt.compute.enable = 1'b0;
+                nd_req_o.burst_req.length     = LenW'(e);
+                // d_req[0] = column walk (reps N): src +E, dst +M'*E
+                nd_req_o.d_req[0].reps        = n[RepW-1:0];
+                nd_req_o.d_req[0].src_strides = addr_t'(e);
+                nd_req_o.d_req[0].dst_strides = addr_t'(me);
+                // d_req[1] = row walk (reps M): src +E, dst +E - (N-1)*M'*E (rewind)
+                nd_req_o.d_req[1].reps        = m[RepW-1:0];
+                nd_req_o.d_req[1].src_strides = addr_t'(e);
+                nd_req_o.d_req[1].dst_strides = addr_t'(e - (n - 1) * me);
+                for (int unsigned d = 2; d < NumDim-1; d++) begin
+                    nd_req_o.d_req[d].reps        = RepW'(1);
+                    nd_req_o.d_req[d].src_strides = '0;
+                    nd_req_o.d_req[d].dst_strides = '0;
+                end
+            end else begin
+                log2ne = $signed(WorkW'(Log2Strb)) - $signed({{(WorkW-ModeW){1'b0}}, mode});
+                ne     = $signed(WorkW'(1)) <<< log2ne;            // tile side (elements)
+                yt     = (m + ne - 1) >>> log2ne;                  // ceil(M/NE)
+                nt     = (n + ne - 1) >>> log2ne;                  // ceil(N/NE)
+                nxe    = n  <<< mode;                              // N*E  (E = 1<<mode)
+                mpe    = yt <<< Log2Strb;                          // MP*E = YT*NE*E = YT*StrbWidth
+                strb_c = $signed(WorkW'(StrbWidth));               // NE*E (one tile-row = StrbWidth B)
 
-            // d_req[0] = local row within tile (reps NE)
-            nd_req_o.d_req[0].reps        = ne[RepW-1:0];
-            nd_req_o.d_req[0].src_strides = addr_t'(nxe);
-            nd_req_o.d_req[0].dst_strides = addr_t'(mpe);
-            // d_req[1] = row-tile (reps YT). (NE-1)*MPE = (MPE<<log2ne) - MPE.
-            nd_req_o.d_req[1].reps        = yt[RepW-1:0];
-            nd_req_o.d_req[1].src_strides = addr_t'(nxe);
-            nd_req_o.d_req[1].dst_strides = addr_t'(strb_c - (mpe <<< log2ne) + mpe);
-            // d_req[2] = col-tile (reps NT). (YT*NE-1)*NXE = ((YT*N)<<Log2Strb) - NXE;
-            //            the dst rewind MPE-(YT-1)*StrbWidth collapses to StrbWidth.
-            nd_req_o.d_req[2].reps        = nt[RepW-1:0];
-            nd_req_o.d_req[2].src_strides = addr_t'(strb_c - ((yt * n) <<< Log2Strb) + nxe);
-            nd_req_o.d_req[2].dst_strides = addr_t'(strb_c);
-            // the walk is exactly 4-D: neutralize any higher dims
-            for (int unsigned d = 3; d < NumDim-1; d++) begin
-                nd_req_o.d_req[d].reps        = RepW'(1);
-                nd_req_o.d_req[d].src_strides = '0;
-                nd_req_o.d_req[d].dst_strides = '0;
+                nd_req_o.burst_req.length     = LenW'(StrbWidth);
+
+                // d_req[0] = local row within tile (reps NE)
+                nd_req_o.d_req[0].reps        = ne[RepW-1:0];
+                nd_req_o.d_req[0].src_strides = addr_t'(nxe);
+                nd_req_o.d_req[0].dst_strides = addr_t'(mpe);
+                // d_req[1] = row-tile (reps YT). (NE-1)*MPE = (MPE<<log2ne) - MPE.
+                nd_req_o.d_req[1].reps        = yt[RepW-1:0];
+                nd_req_o.d_req[1].src_strides = addr_t'(nxe);
+                nd_req_o.d_req[1].dst_strides = addr_t'(strb_c - (mpe <<< log2ne) + mpe);
+                // d_req[2] = col-tile (reps NT). (YT*NE-1)*NXE = ((YT*N)<<Log2Strb) - NXE;
+                //            the dst rewind MPE-(YT-1)*StrbWidth collapses to StrbWidth.
+                nd_req_o.d_req[2].reps        = nt[RepW-1:0];
+                nd_req_o.d_req[2].src_strides = addr_t'(strb_c - ((yt * n) <<< Log2Strb) + nxe);
+                nd_req_o.d_req[2].dst_strides = addr_t'(strb_c);
+                // the walk is exactly 4-D: neutralize any higher dims
+                for (int unsigned d = 3; d < NumDim-1; d++) begin
+                    nd_req_o.d_req[d].reps        = RepW'(1);
+                    nd_req_o.d_req[d].src_strides = '0;
+                    nd_req_o.d_req[d].dst_strides = '0;
+                end
             end
         end
     end
 
 `ifndef SYNTHESIS
-    initial assert (NumDim >= 4) else
-        $fatal(1, "idma_transpose_midend requires NumDim >= 4 (got %0d)", NumDim);
+    initial assert (NumDim >= (AddrGenTranspose ? 32'd3 : 32'd4)) else
+        $fatal(1, "idma_transpose_midend: NumDim too small (got %0d)", NumDim);
     // mode 0..2 needs NE >= 1, i.e. log2(StrbWidth) >= 2
     initial assert (Log2Strb >= 2) else
         $fatal(1, "idma_transpose_midend requires StrbWidth >= 4 (got %0d)", StrbWidth);
diff --git a/systems/snitch/.gitignore b/systems/snitch/.gitignore
new file mode 100644
index 00000000..180edf75
--- /dev/null
+++ b/systems/snitch/.gitignore
@@ -0,0 +1,5 @@
+build/
+*.so
+modelsim.ini
+transcript
+work/
diff --git a/systems/snitch/Makefile b/systems/snitch/Makefile
new file mode 100644
index 00000000..59f539e5
--- /dev/null
+++ b/systems/snitch/Makefile
@@ -0,0 +1,50 @@
+# Copyright 2026 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# Authors:
+# - Daniel Keller <dankeller@iis.ee.ethz.ch>
+
+# Standalone build + sim flow for the Snitch inst64 integration. Elaborates the
+# upstream single-head idma_inst64_top + the recycled harness against iDMA's own
+# deps (axi, common_cells, common_verification). Uses the split_rtl compute-
+# enabled rw_axi backend (the bundled idma_generated.sv predates opt.compute).
+
+SNITCH_DIR := $(realpath $(dir $(lastword $(MAKEFILE_LIST))))
+IDMA_ROOT  := $(realpath $(SNITCH_DIR)/../..)
+BENDER     ?= bender
+VSIM       ?= questa-2023.4 vsim
+
+BUILD      := $(SNITCH_DIR)/build
+TARGETS    := -t rtl -t split_rtl -t snitch_cluster -t idma_test -t test
+TOP        ?= tb_idma_inst64_copy
+
+.PHONY: snitch_sim snitch_compile snitch_transpose snitch_clean
+snitch_sim: snitch_compile
+	cd $(BUILD) && $(VSIM) -c $(TOP)_opt -do "run -all; quit"
+
+# Transpose end-to-end: the TB sweeps the geometry list (M/N/EB) internally; one
+# run per structural config (BankSkew off/on).
+snitch_transpose: $(BUILD)/compile_snitch.tcl
+	@for sk in 0 1; do \
+	  cd $(BUILD) && $(VSIM) -c -do \
+	    "source compile_snitch.tcl; vopt +acc tb_idma_inst64_transpose -gBankSkew=$$sk -o tb_tp; quit" \
+	    >/dev/null 2>&1; \
+	  printf 'BankSkew=%s: ' $$sk; \
+	  $(VSIM) -c tb_tp -do "run -all; quit" 2>&1 | grep -E '\[TP\] (PASS|FAIL)' | sed 's/# //'; \
+	done
+
+snitch_compile: $(BUILD)/compile_snitch.tcl
+	cd $(BUILD) && $(VSIM) -c -do \
+	  "source compile_snitch.tcl; vopt +acc $(TOP) -o $(TOP)_opt; quit"
+
+$(BUILD)/compile_snitch.tcl: | $(BUILD)
+	$(MAKE) -C $(IDMA_ROOT) idma_hw_all
+	cd $(IDMA_ROOT) && $(BENDER) script vsim $(TARGETS) \
+	  --vlog-arg="-svinputport=compat" > $@
+
+$(BUILD):
+	mkdir -p $(BUILD)
+
+snitch_clean:
+	rm -rf $(BUILD)
diff --git a/systems/snitch/README.md b/systems/snitch/README.md
new file mode 100644
index 00000000..4fd6a5bb
--- /dev/null
+++ b/systems/snitch/README.md
@@ -0,0 +1,65 @@
+# Snitch (inst64) iDMA integration
+
+Standalone host for the **inst64** ISA-coupled frontend (`idma_inst64_top`) — the
+tightly-coupled Snitch DMA interface. iDMA already owns `idma_inst64_top`; this
+directory adds a cluster-free verification harness and (Stage 2) the on-the-fly
+transpose wired through the accelerator interface.
+
+## Recycled, not reinvented
+
+The harness is **recycled from the vidma fork's inst64 verification interface**
+(`idma_alu_vec/test/frontend/`), adapted only as the clean upstream single-head
+`idma_inst64_top` requires:
+
+| File | Provenance |
+|------|------------|
+| `test/idma_inst64_tb_pkg.sv` | faithful copy (8-line delta: `AxiDataWidth`/`NumAxInFlight` sizing + header) |
+| `test/idma_inst64_drv_if.sv` | faithful copy; dropped the 4 vidma-only tasks (`DMOPC`, multi-head copy, immediate `DMCPYI`) to match upstream |
+| `test/idma_inst64_base.sv` | adapted: single-head (`axi_req_o[NumChannels]`, no `NumHeads`/`enable_single_head_mode`) |
+| `test/tb_idma_inst64_copy.sv` | Stage-1 plain-copy regression |
+
+The accelerator interface (the 4-field `acc_req`/`acc_res` bus + the `DM*`
+instruction BFM) is exactly the vidma one — no reinvention.
+
+## Why split_rtl
+
+`idma_inst64_top` is gated behind the `snitch_cluster` Bender target. The build
+uses `-t split_rtl` (per-variant RTL) because the **bundled `idma_generated.sv`
+predates the typed `opt.compute` struct** (it still references the old flat
+`opt.transpose_en` fields) and won't elaborate against the current package. The
+split_rtl `idma_backend_rw_axi` is compute-enabled (`IDMA_VIDMA_IDS=rw_axi`).
+
+## Standalone simulation
+
+```bash
+make -C systems/snitch snitch_sim                 # plain-copy regression (Stage 1)
+make -C systems/snitch snitch_sim TOP=tb_idma_inst64_transpose   # transpose (Stage 2)
+```
+
+Drives `DMSRC`/`DMDST`/`DMCPY` (+ `DMSTR`/`DMREP` for 2D) over the accelerator
+bus and verifies the AXI sim memory. Requires `questa-2023.4`.
+
+## Status
+
+- **Stage 1 (done):** plain copy through the single-head frontend — 3 transfers pass.
+- **Stage 2 (done):** multi-tile on-the-fly transpose, end-to-end. A transpose is
+  programmed with the spare `DMCPY` argb bits (`[5]`=enable, `[7:6]`=mode,
+  `[19:8]`=M, `[31:20]`=N), populating the typed per-transfer `opt.compute`; the
+  dedicated `src/midend/idma_transpose_midend.sv` expands `(M,N,mode)` into the
+  `NumDim=4` tiled walk; the unmodified `idma_nd_midend` walks it into the
+  compute-enabled `rw_axi` backend. Gated by `idma_inst64_top`'s
+  `ComputeEnable.transpose` (off by default, so other snitch_cluster consumers
+  are unaffected). Verified across int8/fp16/fp32, single/multi-tile, edge tiles,
+  padding integrity, back-to-back, and cross-transfer no-leak:
+  `make -C systems/snitch snitch_transpose_sweep`. Full functionality at any
+  `NumAxInFlight` (down to the backend min) — the compute backend internally
+  buffers a tile of write descriptors (`ComputeFifoDepth = StrbWidth`), so there
+  is no `NumAxInFlight >= NE` constraint.
+
+## Transpose memory contract
+
+A transposed transfer reads the source up to the tile-padded bounds
+(`ceil(M/NE)*NE` rows of `N` elements, the last row tile reading past row `M-1`)
+and writes the full padded destination extent (`ceil(N/NE)*NE` rows at pitch
+`MP = ceil(M/NE)*NE`; padding is strobe-masked but addressed). Both regions must
+be mapped, side-effect-free memory.
diff --git a/systems/snitch/test/idma_inst64_base.sv b/systems/snitch/test/idma_inst64_base.sv
new file mode 100644
index 00000000..586bb2e1
--- /dev/null
+++ b/systems/snitch/test/idma_inst64_base.sv
@@ -0,0 +1,177 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+
+/// Base harness for the standalone single-head inst64 frontend.
+/// Clock/reset, the accelerator-bus driver, the upstream idma_inst64_top DUT,
+/// and one axi_sim_mem per channel. No Snitch cluster, no multi-head.
+module idma_inst64_base #(
+    parameter int unsigned DMATracing = idma_inst64_tb_pkg::DMATracing,
+    parameter idma_pkg::compute_enable_t ComputeEnable = '0,
+    parameter bit AddrGenTranspose = 1'b0,
+    parameter bit BankSkew         = 1'b0
+);
+  import idma_inst64_tb_pkg::*;
+  import idma_inst64_snitch_pkg::*;
+
+  logic clk;
+  logic rst_n;
+
+  clk_rst_gen #(
+    .ClkPeriod   ( Period      ),
+    .RstClkCycles( ResetCycles )
+  ) i_clock_reset_generator (
+    .clk_o ( clk   ),
+    .rst_no( rst_n )
+  );
+
+  idma_inst64_drv_if drv_if (
+    .clk  ( clk   ),
+    .rst_n( rst_n )
+  );
+
+  axi_req_t  [NumChannels-1:0] axi_req;
+  axi_resp_t [NumChannels-1:0] axi_res;
+  obi_req_t  [NumChannels-1:0] obi_req;
+  obi_res_t  [NumChannels-1:0] obi_res;
+  dma_events_t [NumChannels-1:0] events;
+  logic      [NumChannels-1:0] busy;
+
+  // route the test's AXI range via the default idx (ToSoC=AXI); the single rule
+  // maps an unused low TCDM range to OBI so the OBI port stays idle
+  addr_rule_t addr_map;
+  assign addr_map = '{
+    idx:        idma_pkg::TCDMDMA,
+    start_addr: 64'h0000_0000,
+    end_addr:   64'h1000_0000
+  };
+
+  idma_inst64_top #(
+    .AxiDataWidth    ( AxiDataWidth    ),
+    .AxiAddrWidth    ( AxiAddrWidth    ),
+    .AxiUserWidth    ( AxiUserWidth    ),
+    .AxiIdWidth      ( AxiIdWidth      ),
+    .NumAxInFlight   ( NumAxInFlight   ),
+    .DMAReqFifoDepth ( DMAReqFifoDepth ),
+    .NumChannels     ( NumChannels     ),
+    .DMATracing      ( DMATracing      ),
+    .ComputeEnable   ( ComputeEnable   ),
+    .AddrGenTranspose( AddrGenTranspose ),
+    .BankSkew        ( BankSkew         ),
+    .axi_ar_chan_t   ( axi_ar_chan_t   ),
+    .axi_aw_chan_t   ( axi_aw_chan_t   ),
+    .axi_req_t       ( axi_req_t       ),
+    .axi_res_t       ( axi_resp_t      ),
+    .init_req_chan_t ( init_req_chan_t ),
+    .init_rsp_chan_t ( init_rsp_chan_t ),
+    .init_req_t      ( init_req_t      ),
+    .init_rsp_t      ( init_rsp_t      ),
+    .obi_a_chan_t    ( obi_a_chan_t    ),
+    .obi_r_chan_t    ( obi_r_chan_t    ),
+    .obi_req_t       ( obi_req_t       ),
+    .obi_res_t       ( obi_res_t       ),
+    .acc_req_t       ( acc_req_t       ),
+    .acc_res_t       ( acc_res_t       ),
+    .dma_events_t    ( dma_events_t    ),
+    .addr_rule_t     ( addr_rule_t     )
+  ) i_dut (
+    .clk_i           ( clk                  ),
+    .rst_ni          ( rst_n                ),
+    .testmode_i      ( 1'b0                 ),
+    .axi_req_o       ( axi_req              ),
+    .axi_res_i       ( axi_res              ),
+    .obi_req_o       ( obi_req              ),
+    .obi_res_i       ( obi_res              ),
+    .busy_o          ( busy                 ),
+    .acc_req_i       ( drv_if.acc_req       ),
+    .acc_req_valid_i ( drv_if.acc_req_valid ),
+    .acc_req_ready_o ( drv_if.acc_req_ready ),
+    .acc_res_o       ( drv_if.acc_res       ),
+    .acc_res_valid_o ( drv_if.acc_res_valid ),
+    .acc_res_ready_i ( drv_if.acc_res_ready ),
+    .hart_id_i       ( 32'h0                ),
+    .events_o        ( events               ),
+    .addr_map_i      ( addr_map             )
+  );
+
+  for (genvar c = 0; c < NumChannels; c++) begin : gen_mem_ch
+    axi_sim_mem #(
+      .AddrWidth         ( AxiAddrWidth ),
+      .DataWidth         ( AxiDataWidth ),
+      .IdWidth           ( AxiIdWidth   ),
+      .UserWidth         ( AxiUserWidth ),
+      .axi_req_t         ( axi_req_t    ),
+      .axi_rsp_t         ( axi_resp_t   ),
+      .WarnUninitialized ( 1'b1         ),
+      .ClearErrOnAccess  ( 1'b1         ),
+      .ApplDelay         ( ApplDelay    ),
+      .AcqDelay          ( AcqDelay     )
+    ) i_axi_sim_mem (
+      .clk_i             ( clk          ),
+      .rst_ni            ( rst_n        ),
+      .axi_req_i         ( axi_req[c]   ),
+      .axi_rsp_o         ( axi_res[c]   ),
+      .mon_w_valid_o     (              ),
+      .mon_w_addr_o      (              ),
+      .mon_w_data_o      (              ),
+      .mon_w_id_o        (              ),
+      .mon_w_user_o      (              ),
+      .mon_w_beat_count_o(              ),
+      .mon_w_last_o      (              ),
+      .mon_r_valid_o     (              ),
+      .mon_r_addr_o      (              ),
+      .mon_r_data_o      (              ),
+      .mon_r_id_o        (              ),
+      .mon_r_user_o      (              ),
+      .mon_r_beat_count_o(              ),
+      .mon_r_last_o      (              )
+    );
+
+    // L1/TCDM model: connected but idle for the AXI-routed copy/transpose tests
+    obi_sim_mem #(
+      .ObiCfg            ( ObiCfg     ),
+      .obi_req_t         ( obi_req_t  ),
+      .obi_rsp_t         ( obi_res_t  ),
+      .obi_r_chan_t      ( obi_r_chan_t ),
+      .WarnUninitialized ( 1'b0       ),
+      .ClearErrOnAccess  ( 1'b1       ),
+      .ApplDelay         ( ApplDelay  ),
+      .AcqDelay          ( AcqDelay   )
+    ) i_obi_sim_mem (
+      .clk_i       ( clk        ),
+      .rst_ni      ( rst_n      ),
+      .obi_req_i   ( obi_req[c] ),
+      .obi_rsp_o   ( obi_res[c] ),
+      .mon_valid_o (            ),
+      .mon_we_o    (            ),
+      .mon_addr_o  (            ),
+      .mon_wdata_o (            ),
+      .mon_be_o    (            ),
+      .mon_id_o    (            )
+    );
+  end
+
+  // Memory helpers (channel 0)
+  task automatic mem_write_byte(input addr_t addr, input byte data);
+    gen_mem_ch[0].i_axi_sim_mem.mem[addr] = data;
+  endtask
+
+  function automatic logic [7:0] mem_read_byte(input addr_t addr);
+    if (gen_mem_ch[0].i_axi_sim_mem.mem.exists(addr)) return gen_mem_ch[0].i_axi_sim_mem.mem[addr];
+    else return 8'hXX;
+  endfunction
+
+  // L1/TCDM (OBI) backdoor helpers (channel 0)
+  task automatic obi_write_byte(input addr_t addr, input byte data);
+    gen_mem_ch[0].i_obi_sim_mem.mem[addr] = data;
+  endtask
+
+  function automatic logic [7:0] obi_read_byte(input addr_t addr);
+    if (gen_mem_ch[0].i_obi_sim_mem.mem.exists(addr)) return gen_mem_ch[0].i_obi_sim_mem.mem[addr];
+    else return 8'hXX;
+  endfunction
+
+endmodule
diff --git a/systems/snitch/test/idma_inst64_drv_if.sv b/systems/snitch/test/idma_inst64_drv_if.sv
new file mode 100644
index 00000000..8a7466a3
--- /dev/null
+++ b/systems/snitch/test/idma_inst64_drv_if.sv
@@ -0,0 +1,194 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+
+// Recycled from the vidma inst64 verification harness
+// (idma_alu_vec/test/frontend/idma_inst64_drv_if.sv). Faithful copy of the
+// copy/status tasks; the vidma-only DMOPC/multi-head/immediate tasks are
+// dropped to match the clean single-head upstream idma_inst64_top.
+//
+// One correctness fix vs the source: the handshake drops acc_req_valid the
+// cycle the request is accepted (sampling ready at AcqDelay) instead of holding
+// it one extra cycle. The source held valid past grant, which double-issues the
+// request to a still-ready frontend FIFO — harmless for idempotent copies but
+// corrupts non-idempotent transfers (transpose).
+
+interface idma_inst64_drv_if (
+    input logic clk,
+    input logic rst_n
+);
+    import idma_inst64_tb_pkg::*;
+    import idma_inst64_snitch_pkg::*;
+
+    // Accelerator Interface Signals
+    acc_req_t  acc_req;
+    logic      acc_req_valid;
+    logic      acc_req_ready;
+
+    acc_res_t  acc_res;
+    logic      acc_res_valid;
+    logic      acc_res_ready;
+
+    // Internal State for BFM
+    logic [31:0] req_id_counter;
+
+    // Performance Counters
+    longint unsigned dma_start_cycle;
+    longint unsigned dma_end_cycle;
+    longint unsigned dma_cycles;
+    longint unsigned cycle_counter;
+
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) cycle_counter <= 0;
+        else cycle_counter <= cycle_counter + 1;
+    end
+
+    // Initialization
+    initial begin
+        acc_req_valid = 1'b0;
+        acc_res_ready = 1'b1;
+        acc_req = '0;
+        req_id_counter = '0;
+        dma_start_cycle = 0;
+        dma_end_cycle = 0;
+        dma_cycles = 0;
+    end
+
+    // Drive one accelerator instruction; valid is asserted in the apply region
+    // and dropped the cycle the request is accepted (ready sampled at AcqDelay).
+    task automatic drive(input logic [31:0] op, input logic [63:0] arga, input logic [63:0] argb);
+        @(posedge clk);
+        #(ApplDelay);
+        acc_req.id        = req_id_counter++;
+        acc_req.data_op   = op;
+        acc_req.data_arga = arga;
+        acc_req.data_argb = argb;
+        acc_req_valid     = 1'b1;
+        do begin
+            @(posedge clk);
+            #(AcqDelay);
+        end while (!acc_req_ready);
+        acc_req_valid = 1'b0;
+    endtask
+
+    //--------------------------------------
+    // C-like API for DMA Programming
+    //--------------------------------------
+
+    task automatic dma_set_source(input addr_t addr);
+        drive(DMSRC, addr[31:0], {{(64-AxiAddrWidth){1'b0}}, addr[AxiAddrWidth-1:32]});
+    endtask
+
+    task automatic dma_set_dest(input addr_t addr);
+        drive(DMDST, addr[31:0], {{(64-AxiAddrWidth){1'b0}}, addr[AxiAddrWidth-1:32]});
+    endtask
+
+    task automatic dma_set_strides(input logic [31:0] src_stride, input logic [31:0] dst_stride);
+        drive(DMSTR, src_stride, dst_stride);
+    endtask
+
+    task automatic dma_set_reps(input logic [31:0] reps);
+        drive(DMREP, reps, '0);
+    endtask
+
+    // Launch a copy. cfg[1] = 2D enable; channel selects the AXI manager.
+    // Reads back the transfer id from the response.
+    task automatic dma_start_copy(
+        input  addr_t      length,
+        input  logic [1:0] cfg,
+        input  logic [2:0] channel,
+        output tf_id_t     transfer_id
+    );
+        drive(DMCPY, length, {59'b0, channel, cfg});
+        while (!acc_res_valid) @(posedge clk);
+        transfer_id = acc_res.data[31:0];
+    endtask
+
+    // Launch a transpose. Encodes {enable, mode, M, N} into the spare DMCPY argb
+    // bits (argb[1:0]=cfg, [4:2]=channel, [5]=transpose, [7:6]=mode,
+    // [19:8]=tensor_m, [31:20]=tensor_n). Length is derived by the midend.
+    task automatic dma_transpose(
+        input  addr_t       src,
+        input  addr_t       dst,
+        input  logic [11:0] tensor_m,
+        input  logic [11:0] tensor_n,
+        input  logic [1:0]  mode,
+        input  logic [2:0]  channel,
+        output tf_id_t      transfer_id
+    );
+        logic [63:0] argb;
+        dma_set_source(src);
+        dma_set_dest(dst);
+        argb          = '0;
+        argb[4:2]     = channel;
+        argb[5]       = 1'b1;
+        argb[7:6]     = mode;
+        argb[19:8]    = tensor_m;
+        argb[31:20]   = tensor_n;
+        drive(DMCPY, '0, argb);
+        while (!acc_res_valid) @(posedge clk);
+        transfer_id = acc_res.data[31:0];
+    endtask
+
+    // issue a transpose DMCPY and return the response error bit (negative tests)
+    task automatic dma_transpose_err(
+        input  addr_t       src,
+        input  addr_t       dst,
+        input  logic [11:0] tensor_m,
+        input  logic [11:0] tensor_n,
+        input  logic [1:0]  mode,
+        input  logic [2:0]  channel,
+        output logic        error
+    );
+        logic [63:0] argb;
+        dma_set_source(src);
+        dma_set_dest(dst);
+        argb        = '0;
+        argb[4:2]   = channel;
+        argb[5]     = 1'b1;
+        argb[7:6]   = mode;
+        argb[19:8]  = tensor_m;
+        argb[31:20] = tensor_n;
+        drive(DMCPY, '0, argb);
+        while (!acc_res_valid) @(posedge clk);
+        error = acc_res.error;
+    endtask
+
+    task automatic dma_poll_status(
+        input  logic [1:0]  status_idx,
+        input  logic [2:0]  channel,
+        output logic [63:0] status_value
+    );
+        drive(DMSTAT, '0, {59'b0, channel, status_idx});
+        while (!acc_res_valid) @(posedge clk);
+        status_value = acc_res.data;
+    endtask
+
+    task automatic dma_wait(input tf_id_t transfer_id, input logic [2:0] channel);
+        logic [63:0] completed_id;
+        $display("[%0t] dma_wait(ID=%0d, chan=%0d) - waiting...", $time, transfer_id, channel);
+        forever begin
+            dma_poll_status(2'b00, channel, completed_id);
+            if (completed_id >= transfer_id) begin
+                dma_end_cycle = cycle_counter;
+                dma_cycles = dma_end_cycle - dma_start_cycle;
+                break;
+            end
+            repeat(10) @(posedge clk);
+        end
+    endtask
+
+    task automatic dma_wait_idle(input logic [2:0] channel);
+        logic [63:0] busy_status;
+        $display("[%0t] dma_wait_idle(chan=%0d) - waiting...", $time, channel);
+        forever begin
+            dma_poll_status(2'b10, channel, busy_status);
+            if (busy_status[0] == 1'b0) break;
+            repeat(5) @(posedge clk);
+        end
+    endtask
+
+endinterface
diff --git a/systems/snitch/test/idma_inst64_tb_pkg.sv b/systems/snitch/test/idma_inst64_tb_pkg.sv
new file mode 100644
index 00000000..b668e9dd
--- /dev/null
+++ b/systems/snitch/test/idma_inst64_tb_pkg.sv
@@ -0,0 +1,166 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+
+// Recycled from the vidma inst64 verification harness
+// (idma_alu_vec/test/frontend/idma_inst64_tb_pkg.sv). Kept faithful. The
+// transpose imposes no NumAxInFlight>=NE constraint (the engine self-buffers a
+// tile); verified down to NumAxInFlight=3 at NE=64.
+
+`include "axi/typedef.svh"
+`include "obi/typedef.svh"
+
+package idma_inst64_tb_pkg;
+
+    localparam int unsigned AxiDataWidth    = 512;
+    localparam int unsigned AxiAddrWidth    = 64;
+    localparam int unsigned AxiUserWidth    = 1;
+    localparam int unsigned AxiIdWidth      = 3;
+    localparam int unsigned NumAxInFlight   = 3;    // default
+    localparam int unsigned DMAReqFifoDepth = 3;
+    localparam int unsigned NumChannels     = 1;
+    localparam int unsigned NumHeads        = 1;
+    localparam int unsigned DMATracing      = 0;
+    localparam int unsigned Seed            = 1337;
+
+    localparam time Period     = 10ns;
+    localparam time ApplDelay  = Period / 4;
+    localparam time AcqDelay   = Period * 3 / 4;
+    localparam integer ResetCycles = 10;
+
+    // Type definitions
+    typedef logic [AxiAddrWidth-1:0] addr_t;
+    typedef logic [AxiIdWidth-1:0]   axi_id_t;
+    typedef logic [31:0]             data_t;
+    typedef logic [31:0]             tf_id_t;
+
+    // AXI Types
+    typedef logic [AxiAddrWidth-1:0]     axi_addr_t;
+    typedef logic [AxiDataWidth-1:0]     axi_data_t;
+    typedef logic [AxiDataWidth/8-1:0]   axi_strb_t;
+    typedef logic [AxiUserWidth-1:0]     axi_user_t;
+
+    `AXI_TYPEDEF_AW_CHAN_T(axi_aw_chan_t, axi_addr_t, axi_id_t, axi_user_t)
+    `AXI_TYPEDEF_W_CHAN_T(axi_w_chan_t, axi_data_t, axi_strb_t, axi_user_t)
+    `AXI_TYPEDEF_B_CHAN_T(axi_b_chan_t, axi_id_t, axi_user_t)
+    `AXI_TYPEDEF_AR_CHAN_T(axi_ar_chan_t, axi_addr_t, axi_id_t, axi_user_t)
+    `AXI_TYPEDEF_R_CHAN_T(axi_r_chan_t, axi_data_t, axi_id_t, axi_user_t)
+    `AXI_TYPEDEF_REQ_T(axi_req_t, axi_aw_chan_t, axi_w_chan_t, axi_ar_chan_t)
+    `AXI_TYPEDEF_RESP_T(axi_resp_t, axi_b_chan_t, axi_r_chan_t)
+
+    // OBI L1/TCDM types (DataWidth=AxiDataWidth, AddrWidth=AxiAddrWidth, IdWidth=AxiIdWidth)
+    typedef logic [AxiDataWidth/8-1:0] obi_strb_t;
+    typedef logic [AxiIdWidth-1:0]     obi_id_t;
+    `OBI_TYPEDEF_MINIMAL_A_OPTIONAL(obi_a_optional_t)
+    `OBI_TYPEDEF_MINIMAL_R_OPTIONAL(obi_r_optional_t)
+    `OBI_TYPEDEF_TYPE_A_CHAN_T(obi_a_chan_t, axi_addr_t, axi_data_t, obi_strb_t, obi_id_t, obi_a_optional_t)
+    `OBI_TYPEDEF_TYPE_R_CHAN_T(obi_r_chan_t, axi_data_t, obi_id_t, obi_r_optional_t)
+    `OBI_TYPEDEF_REQ_T(obi_req_t, obi_a_chan_t)
+    `OBI_TYPEDEF_RSP_T(obi_res_t, obi_r_chan_t)
+
+    localparam obi_pkg::obi_cfg_t ObiCfg = '{
+        UseRReady:   1'b1,
+        CombGnt:     1'b0,
+        AddrWidth:   AxiAddrWidth,
+        DataWidth:   AxiDataWidth,
+        IdWidth:     AxiIdWidth,
+        Integrity:   1'b0,
+        BeFull:      1'b1,
+        OptionalCfg: obi_pkg::ObiMinimalOptionalConfig
+    };
+
+    // INIT meta-channel types (mirror src/db/idma_init.yml)
+    typedef struct packed {
+        logic [AxiAddrWidth-1:0]   cfg;
+        logic [AxiDataWidth-1:0]   term;
+        logic [AxiDataWidth/8-1:0] strb;
+        logic [AxiIdWidth-1:0]     id;
+    } init_req_chan_t;
+
+    typedef struct packed {
+        init_req_chan_t req_chan;
+        logic           req_valid;
+        logic           rsp_ready;
+    } init_req_t;
+
+    typedef struct packed {
+        logic [AxiDataWidth-1:0] init;
+    } init_rsp_chan_t;
+
+    typedef struct packed {
+        init_rsp_chan_t rsp_chan;
+        logic           rsp_valid;
+        logic           req_ready;
+    } init_rsp_t;
+
+    // address-decode rule type (DUT default)
+    typedef axi_pkg::xbar_rule_64_t addr_rule_t;
+
+    // Accelerator request/response types (simplified Snitch accelerator interface)
+    typedef struct packed {
+        logic [31:0] id;
+        logic [31:0] data_op;
+        logic [63:0] data_arga;
+        logic [63:0] data_argb;
+    } acc_req_t;
+
+    typedef struct packed {
+        logic [31:0] id;
+        logic [63:0] data;
+        logic        error;
+    } acc_res_t;
+
+    // DMA events (simplified)
+    typedef struct packed {
+        // aw
+        logic                aw_valid;
+        logic                aw_ready;
+        logic                aw_done;
+        logic                aw_stall;
+        axi_pkg::len_t       aw_len;
+        axi_pkg::size_t      aw_size;
+        // ar
+        logic                ar_valid;
+        logic                ar_ready;
+        logic                ar_done;
+        logic                ar_stall;
+        axi_pkg::len_t       ar_len;
+        axi_pkg::size_t      ar_size;
+        // r
+        logic                r_valid;
+        logic                r_ready;
+        logic                r_done;
+        logic                r_bw;
+        logic                r_stall;
+        // w
+        logic                w_valid;
+        logic                w_ready;
+        logic                w_done;
+        logic                w_stall;
+        logic [31:0]         num_bytes_written;
+        // b
+        logic                b_valid;
+        logic                b_ready;
+        logic                b_done;
+        // busy
+        logic                dma_busy;
+    } dma_events_t;
+
+    // Golden reference for validation
+    typedef struct {
+        addr_t   src_addr;
+        addr_t   dst_addr;
+        addr_t   length;
+        logic [5:0] alu_opcode;
+        logic [31:0] src_strides;
+        logic [31:0] dst_strides;
+        logic [31:0] reps;
+        logic        twod;
+        int unsigned channel;
+        tf_id_t expected_id;
+    } transfer_t;
+
+endpackage
diff --git a/systems/snitch/test/tb_idma_inst64_copy.sv b/systems/snitch/test/tb_idma_inst64_copy.sv
new file mode 100644
index 00000000..dbbe6bc9
--- /dev/null
+++ b/systems/snitch/test/tb_idma_inst64_copy.sv
@@ -0,0 +1,57 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+
+/// Stage-1 plain-copy regression for the standalone single-head inst64 frontend.
+/// Drives DMSRC/DMDST/DMCPY over the accelerator bus and verifies the copy in
+/// the AXI sim memory. No compute.
+module tb_idma_inst64_copy;
+  import idma_inst64_tb_pkg::*;
+
+  idma_inst64_base #(.DMATracing(0)) harness();
+
+  localparam int unsigned TimeoutCycles = 20000;
+  int unsigned errors = 0;
+
+  task automatic run_copy(input addr_t src, input addr_t dst, input int unsigned len,
+                          input byte start);
+    tf_id_t tid;
+    for (int i = 0; i < len; i++) harness.mem_write_byte(src + i, start + i[7:0]);
+    for (int i = 0; i < len; i++) harness.mem_write_byte(dst + i, 8'h00);
+    harness.drv_if.dma_set_source(src);
+    harness.drv_if.dma_set_dest(dst);
+    harness.drv_if.dma_start_copy(addr_t'(len), 2'b00, 3'd0, tid);
+    harness.drv_if.dma_wait(tid, 0);
+    for (int i = 0; i < len; i++) begin
+      automatic logic [7:0] exp = start + i[7:0];
+      automatic logic [7:0] got = harness.mem_read_byte(dst + i);
+      if (got !== exp) begin
+        if (errors < 10) $error("[COPY] mismatch at %0d: exp 0x%02x got 0x%02x", i, exp, got);
+        errors++;
+      end
+    end
+  endtask
+
+  initial begin
+    @(posedge harness.rst_n);
+    repeat (10) @(posedge harness.clk);
+
+    $display("=== inst64 plain-copy regression ===");
+    run_copy(64'h8000_0000, 64'h9000_0000, 256,  8'hA0);  // word-multiple
+    run_copy(64'h8001_0000, 64'h9001_0000, 4055, 8'h10);  // large, non-aligned length
+    run_copy(64'h8002_0000, 64'h9002_0000, 7,    8'h30);  // tiny, sub-beat
+
+    if (errors == 0) $display("[SV] inst64 copy: SUCCESS (3 transfers)");
+    else             $fatal(1, "[SV] inst64 copy: FAIL (%0d errors)", errors);
+    $finish;
+  end
+
+  initial begin
+    repeat (TimeoutCycles) @(posedge harness.clk);
+    $fatal(1, "[TIMEOUT] inst64 copy exceeded %0d cycles", TimeoutCycles);
+  end
+
+endmodule
diff --git a/systems/snitch/test/tb_idma_inst64_transpose.sv b/systems/snitch/test/tb_idma_inst64_transpose.sv
new file mode 100644
index 00000000..6ef2252d
--- /dev/null
+++ b/systems/snitch/test/tb_idma_inst64_transpose.sv
@@ -0,0 +1,172 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+
+/// End-to-end on-the-fly transpose through the inst64 frontend over the OBI/
+/// TCDM port (and AXI->OBI): DMCPY transpose decode -> opt.compute ->
+/// idma_transpose_midend (address-gen) -> idma_nd_midend -> backend -> memory.
+/// Sweeps a geometry list in one elaboration (one run per structural config,
+/// i.e. per BankSkew). Checks transposed data, bank-skew padding, back-to-back
+/// geometry leak (consecutive cases), cross-transfer compute leak, and reject.
+module tb_idma_inst64_transpose #(
+  parameter bit BankSkew = 1'b0
+);
+  import idma_inst64_tb_pkg::*;
+
+  localparam int unsigned StrbWidth = AxiDataWidth/8;
+
+  // TCDM/OBI region (addr_map routes < 0x1000_0000 to the OBI/TCDM port);
+  // ASRC is an external matrix in the AXI (ToSoC) region for the AXI->OBI case.
+  localparam addr_t TSRC = 64'h0000_1000;
+  localparam addr_t TDST = 64'h0040_0000;
+  localparam addr_t CPY  = 64'h0080_0000;
+  localparam addr_t ASRC = 64'h8000_0000;
+
+  // Geometry cases (M, N, EB) swept in one elaboration: int8/fp16/fp32, square/
+  // rectangular/odd; 32x8 EB4 and 64x4 EB2 trigger the BankSkew pitch pad.
+  localparam int unsigned NC = 8;
+  localparam int unsigned Cases [NC][3] = '{
+    '{ 8,  8, 1}, '{ 6,  5, 1}, '{16, 16, 1}, '{ 5,  7, 2},
+    '{10,  6, 2}, '{12,  8, 4}, '{32,  8, 4}, '{64,  4, 2}
+  };
+
+  idma_inst64_base #(.ComputeEnable('{transpose: 1'b1}), .AddrGenTranspose(1'b1),
+                     .BankSkew(BankSkew)) harness();
+
+  int unsigned errs = 0;
+
+  // backend burst counter (address-gen issues M*N one-element bursts)
+  longint unsigned burst_cnt = 0;
+  always @(posedge harness.clk)
+    if (harness.i_dut.idma_req_valid[0] && harness.i_dut.idma_req_ready[0]) burst_cnt++;
+
+  // Unique per-element fingerprint: byte b of element idx encodes (idx>>8b), so
+  // a mis-permutation cannot hide behind a value collision.
+  function automatic logic [7:0] fp(input int unsigned idx, input int unsigned b);
+    return 8'((idx >> (8*b)) & 32'hFF);
+  endfunction
+
+  // padded dst row pitch (matches idma_transpose_midend BankSkew rule): pad by
+  // one bus-word of elements when mm*eb is an even number of bus words
+  function automatic int unsigned skew_pitch(input int unsigned mm, input int unsigned eb);
+    if (BankSkew && ((mm*eb) % (2*StrbWidth) == 0)) return mm + StrbWidth/eb;
+    else return mm;
+  endfunction
+
+  // memory backdoors selected by protocol: OBI (TCDM) vs AXI (ToSoC)
+  task automatic seed_byte(input bit obi, input addr_t a, input logic [7:0] d);
+    if (obi) harness.obi_write_byte(a, d); else harness.mem_write_byte(a, d);
+  endtask
+  function automatic logic [7:0] peek_byte(input bit obi, input addr_t a);
+    return obi ? harness.obi_read_byte(a) : harness.mem_read_byte(a);
+  endfunction
+
+  // Run one mm x nn (eb-byte element) transpose src -> dst via address-gen.
+  // src_obi/dst_obi pick the TCDM(OBI) vs external(AXI) memory.
+  task automatic do_transpose(input int unsigned mm, input int unsigned nn, input int unsigned eb,
+                              input addr_t src, input addr_t dst,
+                              input bit src_obi, input bit dst_obi);
+    tf_id_t tid;
+    longint unsigned c0, cyc, b0;
+    int unsigned mp, mode;
+    mp   = skew_pitch(mm, eb);
+    mode = (eb == 4) ? 2 : (eb == 2) ? 1 : 0;
+    for (int unsigned idx = 0; idx < mm*nn; idx++)
+      for (int unsigned b = 0; b < eb; b++)
+        seed_byte(src_obi, src + idx*eb + b, fp(idx, b));
+    // dst is an N x mp transpose (mp == M unless bank-skew pads the pitch)
+    for (int unsigned k = 0; k < nn*mp*eb; k++)
+      seed_byte(dst_obi, dst + k, 8'hCC);
+
+    c0 = harness.drv_if.cycle_counter; b0 = burst_cnt;
+    harness.drv_if.dma_transpose(src, dst, 12'(mm), 12'(nn), 2'(mode), 3'd0, tid);
+    harness.drv_if.dma_wait(tid, 0);
+    harness.drv_if.dma_wait_idle(0);   // ensure all writes retired before reading
+    cyc = harness.drv_if.cycle_counter - c0;
+    $display("  %0dx%0d EB=%0d %s->%s pitch=%0d: bursts=%0d (exp %0d) cycles=%0d", mm, nn, eb,
+             src_obi ? "OBI" : "AXI", dst_obi ? "OBI" : "AXI", mp, burst_cnt-b0, mm*nn, cyc);
+
+    // out_T[c][r] == in[r][c] at dst row pitch mp
+    for (int unsigned c = 0; c < nn; c++)
+      for (int unsigned r = 0; r < mm; r++)
+        for (int unsigned b = 0; b < eb; b++) begin
+          automatic logic [7:0] got = peek_byte(dst_obi, dst + (c*mp + r)*eb + b);
+          automatic logic [7:0] exp = peek_byte(src_obi, src + (r*nn + c)*eb + b);
+          if (got !== exp) begin
+            errs++;
+            if (errs <= 12)
+              $display("[TP] data mismatch %0dx%0d out_T[%0d][%0d].b%0d=%02h exp %02h",
+                       mm, nn, c, r, b, got, exp);
+          end
+        end
+    // bank-skew padding (columns r in [mm, mp)) must stay sentinel
+    for (int unsigned c = 0; c < nn; c++)
+      for (int unsigned r = mm; r < mp; r++)
+        for (int unsigned b = 0; b < eb; b++)
+          if (peek_byte(dst_obi, dst + (c*mp + r)*eb + b) !== 8'hCC) begin
+            errs++;
+            if (errs <= 12) $display("[TP] skew padding clobbered %0dx%0d [%0d][%0d]", mm, nn, c, r);
+          end
+  endtask
+
+  initial begin
+    @(posedge harness.rst_n);
+    repeat (10) @(posedge harness.clk);
+    $display("=== inst64 transpose (BankSkew=%0d, StrbWidth=%0d) ===", BankSkew, StrbWidth);
+
+    // geometry sweep, OBI->OBI (consecutive cases also cover back-to-back leak)
+    for (int unsigned k = 0; k < NC; k++)
+      do_transpose(Cases[k][0], Cases[k][1], Cases[k][2], TSRC, TDST, 1'b1, 1'b1);
+
+    // AXI->OBI: load an external matrix into TCDM transposed
+    do_transpose(16, 12, 4, ASRC, TDST, 1'b0, 1'b1);
+
+    // cross-transfer compute leak: a plain OBI copy must NOT inherit opt.compute
+    begin
+      automatic int unsigned len = 128;
+      tf_id_t tid2;
+      for (int unsigned k = 0; k < len; k++) harness.obi_write_byte(TSRC + k, 8'hE0 + k[4:0]);
+      for (int unsigned k = 0; k < len; k++) harness.obi_write_byte(CPY  + k, 8'h00);
+      harness.drv_if.dma_set_source(TSRC);
+      harness.drv_if.dma_set_dest(CPY);
+      harness.drv_if.dma_start_copy(addr_t'(len), 2'b00, 3'd0, tid2);
+      harness.drv_if.dma_wait(tid2, 0);
+      harness.drv_if.dma_wait_idle(0);
+      for (int unsigned k = 0; k < len; k++)
+        if (harness.obi_read_byte(CPY + k) !== harness.obi_read_byte(TSRC + k)) begin
+          errs++;
+          if (errs <= 12) $display("[TP] leak: post-transpose copy wrong at %0d", k);
+        end
+    end
+
+    // malformed transpose requests: error response, nothing launched
+    begin
+      logic err;
+      longint unsigned b_rej;
+      b_rej = burst_cnt;
+      harness.drv_if.dma_transpose_err(TSRC, TDST, 12'd8, 12'd8, 2'd3, 3'd0, err);
+      if (!err) begin errs++; $display("[TP] reject fail: reserved mode 3"); end
+      harness.drv_if.dma_transpose_err(TSRC, TDST, 12'd0, 12'd8, 2'd0, 3'd0, err);
+      if (!err) begin errs++; $display("[TP] reject fail: M == 0"); end
+      harness.drv_if.dma_transpose_err(TSRC, TDST + 64'd1, 12'd8, 12'd8, 2'd0, 3'd0, err);
+      if (!err) begin errs++; $display("[TP] reject fail: unaligned dst"); end
+      repeat (50) @(posedge harness.clk);
+      if (burst_cnt != b_rej) begin
+        errs++; $display("[TP] reject fail: rejected request launched bursts");
+      end
+    end
+
+    if (errs == 0) $display("[TP] PASS: %0d-case sweep + AXI->OBI + no-leak + reject OK", NC);
+    else           $fatal(1, "[TP] FAIL: %0d mismatches", errs);
+    $finish;
+  end
+
+  initial begin
+    repeat (1_000_000) @(posedge harness.clk);
+    $fatal(1, "[TIMEOUT] inst64 transpose");
+  end
+
+endmodule