pulp-platform · DanielKellerM · Jun 10, 2026 · Jun 10, 2026 · Jun 23, 2026 · Jun 23, 2026
@@ -97,6 +97,18 @@ sources:
       # Level 2
       - src/frontend/desc64/idma_desc64_top.sv
 
+  # Snitch inst64 standalone harness; needs snitch_cluster for idma_inst64_top + opcode pkg
+  - target: all(snitch_cluster, idma_test)
+    files:
+      # Level 0
+      - systems/snitch/test/idma_inst64_tb_pkg.sv
+      # Level 1
+      - systems/snitch/test/idma_inst64_drv_if.sv
+      - systems/snitch/test/idma_inst64_base.sv
+      # Level 2
+      - systems/snitch/test/tb_idma_inst64_copy.sv
+      - systems/snitch/test/tb_idma_inst64_transpose.sv
+
   # Synthesis wrappers
   - target: synth
     files:

@@ -22,6 +22,13 @@ module idma_inst64_top #(
     parameter int unsigned NumChannels     = 32'd1,
     parameter bit          TCDMAliasEnable = 1'b0,
     parameter int unsigned DMATracing      = 32'd0,
+    /// Compile-time on-the-fly compute feature enables (e.g. transpose)
+    parameter idma_pkg::compute_enable_t ComputeEnable = '0,
+    /// Transpose via address generation (no FF engine) — for backends without
+    /// the engine, e.g. this multi-write OBI/TCDM variant
+    parameter bit          AddrGenTranspose = 1'b0,
+    /// Address-gen transpose: skew the dst pitch to avoid TCDM bank conflicts
+    parameter bit          BankSkew         = 1'b0,
     parameter type         axi_ar_chan_t   = logic,
     parameter type         axi_aw_chan_t   = logic,
     parameter type         axi_req_t       = logic,
@@ -70,7 +77,7 @@ module idma_inst64_top #(
     localparam int unsigned TfIdWidth    = 32'd32;
     localparam int unsigned TFLenWidth   = AxiAddrWidth;
     localparam int unsigned RepWidth     = 32'd32;
-    localparam int unsigned NumDim       = 32'd2;
+    localparam int unsigned NumDim       = ComputeEnable.transpose ? 32'd4 : 32'd2;
     localparam int unsigned BufferDepth  = 32'd3;
     localparam int unsigned NumRules     = 32'd5;
 
@@ -84,7 +91,8 @@ module idma_inst64_top #(
     localparam type id_t                 = logic[AxiIdWidth-1:0];
     localparam type tf_len_t             = logic[TFLenWidth-1:0];
     localparam type offset_t             = logic[OffsetWidth-1:0];
-    localparam type strides_t            = logic[RepWidth-1:0];
+    // strides must match addr_t: signed transpose deltas would not sign-extend if narrower
+    localparam type strides_t            = addr_t;
     localparam type reps_t               = logic[RepWidth-1:0];
     localparam type tf_id_t              = logic[TfIdWidth-1:0];
 
@@ -178,6 +186,7 @@ module idma_inst64_top #(
     logic [1:0] idma_fe_status;
     logic [2:0] idma_fe_sel_chan;
     logic       idma_fe_twod;
+    logic       idma_fe_tp_reject;
 
     // busy signals
     idma_pkg::idma_busy_t [NumChannels-1:0] idma_busy;
@@ -348,7 +357,7 @@ module idma_inst64_top #(
             .idma_req_t    ( idma_req_t    ),
             .idma_rsp_t    ( idma_rsp_t    ),
             .idma_nd_req_t ( idma_nd_req_t ),
-            .RepWidths     ( RepWidth      )
+            .RepWidths     ( {NumDim{RepWidth}} )
         ) i_idma_nd_midend (
             .clk_i,
             .rst_ni,
@@ -367,6 +376,33 @@ module idma_inst64_top #(
             .busy_o            ( idma_nd_busy      [c] )
         );
 
+        // FIFO output, before transpose expansion
+        idma_nd_req_t fifo_nd_req;
+        logic         fifo_nd_valid, fifo_nd_ready;
+
+        // expand transpose requests into the tiled ND walk
+        if (ComputeEnable.transpose) begin : gen_transpose
+            idma_transpose_midend #(
+                .NumDim           ( NumDim           ),
+                .AddrGenTranspose ( AddrGenTranspose ),
+                .BankSkew         ( BankSkew         ),
+                .StrbWidth        ( StrbWidth        ),
+                .addr_t           ( addr_t           ),
+                .idma_nd_req_t    ( idma_nd_req_t    )
+            ) i_idma_transpose_midend (
+                .nd_req_i ( fifo_nd_req           ),
+                .valid_i  ( fifo_nd_valid         ),
+                .ready_o  ( fifo_nd_ready         ),
+                .nd_req_o ( idma_nd_req       [c] ),
+                .valid_o  ( idma_nd_req_valid [c] ),
+                .ready_i  ( idma_nd_req_ready [c] )
+            );
+        end else begin : gen_no_transpose
+            assign idma_nd_req       [c] = fifo_nd_req;
+            assign idma_nd_req_valid [c] = fifo_nd_valid;
+            assign fifo_nd_ready         = idma_nd_req_ready [c];
+        end
+
         stream_fifo_optimal_wrap #(
             .Depth     ( DMAReqFifoDepth ),
             .type_t    ( idma_nd_req_t   ),
@@ -380,9 +416,9 @@ module idma_inst64_top #(
             .data_i     ( idma_fe_req           ),
             .valid_i    ( idma_fe_req_valid [c] ),
             .ready_o    ( idma_fe_req_ready [c] ),
-            .data_o     ( idma_nd_req       [c] ),
-            .valid_o    ( idma_nd_req_valid [c] ),
-            .ready_i    ( idma_nd_req_ready [c] )
+            .data_o     ( fifo_nd_req           ),
+            .valid_o    ( fifo_nd_valid         ),
+            .ready_i    ( fifo_nd_ready         )
         );
     end
 
@@ -519,10 +555,12 @@ module idma_inst64_top #(
         idma_fe_req_d.burst_req.opt.beo.src_reduce_len = 1'b0;
         idma_fe_req_d.burst_req.opt.beo.dst_reduce_len = 1'b0;
         idma_fe_req_d.burst_req.opt.last               = 1'b0;
+        idma_fe_req_d.burst_req.opt.compute            = '0;
 
         // frontend config
         idma_fe_cfg      = '0;
         idma_fe_status   = '0;
+        idma_fe_tp_reject = 1'b0;
         idma_fe_sel_chan = '0;
 
         // default handshaking
@@ -573,6 +611,28 @@ module idma_inst64_top #(
                         idma_inst64_snitch_pkg::DMCPY : begin
                             idma_fe_cfg      = acc_req_i.data_argb[1:0];
                             idma_fe_sel_chan = acc_req_i.data_argb[4:2];
+                            // transpose request (register form only): argb spare bits
+                            // carry {enable, mode, tensor_m, tensor_n}
+                            if (ComputeEnable.transpose && acc_req_i.data_argb[5]) begin
+                                idma_fe_req_d.burst_req.opt.compute.enable = 1'b1;
+                                idma_fe_req_d.burst_req.opt.compute.op     =
+                                    idma_pkg::COMPUTE_TRANSPOSE;
+                                idma_fe_req_d.burst_req.opt.compute.params.transpose.mode     =
+                                    acc_req_i.data_argb[7:6];
+                                idma_fe_req_d.burst_req.opt.compute.params.transpose.tensor_m =
+                                    acc_req_i.data_argb[19:8];
+                                idma_fe_req_d.burst_req.opt.compute.params.transpose.tensor_n =
+                                    acc_req_i.data_argb[31:20];
+                            end
+                            // reject malformed transpose requests: no hardware,
+                            // reserved mode, zero dims, unaligned dst
+                            if (acc_req_i.data_argb[5]) begin
+                                idma_fe_tp_reject = !ComputeEnable.transpose
+                                    | (acc_req_i.data_argb[7:6] == 2'd3)
+                                    | (acc_req_i.data_argb[19:8] == '0)
+                                    | (acc_req_i.data_argb[31:20] == '0)
+                                    | (idma_fe_req_d.burst_req.dst_addr[OffsetWidth-1:0] != '0);
+                            end
                         end
                         default:;
                     endcase
@@ -588,7 +648,15 @@ module idma_inst64_top #(
                     // 3. wait for twod transfer to be accepted (ready)
                     // 4. send acc response (pvalid)
                     // 5. acknowledge acc request (qready)
-                    if (acc_res_ready) begin
+                    // DMCPY launch; transpose requests reject malformed configs
+                    if (idma_fe_tp_reject) begin
+                        // error response; the transfer is not launched
+                        if (acc_res_ready) begin
+                            acc_res.id      = acc_req_i.id;
+                            acc_res_valid   = 1'b1;
+                            acc_req_ready_o = 1'b1;
+                        end
+                    end else if (acc_res_ready) begin
                         idma_fe_req_valid[idma_fe_sel_chan] = 1'b1;
                         if (idma_fe_req_ready[idma_fe_sel_chan]) begin
                             acc_res.id      = acc_req_i.id;
@@ -750,6 +818,12 @@ module idma_inst64_top #(
         if (!idma_fe_twod) begin
             idma_fe_req.d_req[0].reps = 'd1;
         end
+        // keep higher dims inert for plain requests (the transpose expander overwrites them)
+        for (int d = 1; d <= NumDim-2; d++) begin
+            idma_fe_req.d_req[d].reps        = 'd1;
+            idma_fe_req.d_req[d].src_strides = '0;
+            idma_fe_req.d_req[d].dst_strides = '0;
+        end
     end
 
     //--------------------------------------
@@ -763,6 +837,16 @@ module idma_inst64_top #(
     //--------------------------------------
     // only activate tracer if requested
 `ifndef SYNTHESIS
+    initial assert (idma_pkg::TransposeDimWidth == 32'd12) else
+        $fatal(1, "DMCPY argb transpose packing requires TransposeDimWidth == 12");
+`ifndef VERILATOR
+    // capability cross-check against the generated backend's baked compute set
+    // (engine route only; address-gen needs no compute-enabled backend)
+    if (ComputeEnable.transpose && !AddrGenTranspose) begin : gen_compute_check
+        initial assert (gen_backend[0].i_idma_backend_rw_axi.ComputeEnable.transpose) else
+            $fatal(1, "ComputeEnable.transpose requires a compute-enabled backend variant");
+    end
+`endif
     if (DMATracing) begin : gen_tracer
         for (genvar c = 0; c < NumChannels; c++) begin : gen_channels
             // derive the name of the trace file from the hart and channel IDs

@@ -5,12 +5,18 @@
 // Authors:
 // - Daniel Keller <dankeller@iis.ee.ethz.ch>
 
-/// Transpose geometry expander: expands an opt.compute=TRANSPOSE request into a
-/// NumDim=4 tiled ND walk for the generic idma_nd_midend. Non-transpose passes
-/// through. Combinational, quasi-static per request.
+/// Transpose geometry expander for the generic idma_nd_midend. Two modes:
+/// engine (NumDim=4 tiled walk feeding the FF transpose engine) and address-gen
+/// (element-granular swapped-stride walk, no engine, for backends without the
+/// engine e.g. multi-write OBI/TCDM). Non-transpose passes through.
 module idma_transpose_midend #(
-    /// Number of ND dimensions (must be >= 4 to express the tiled walk)
+    /// Number of ND dimensions (engine walk needs >= 4; address-gen needs >= 3)
     parameter int unsigned NumDim    = 32'd4,
+    /// Address-gen mode: element-granular swapped-stride transpose, no engine
+    parameter bit          AddrGenTranspose = 1'b0,
+    /// Address-gen: pad the dst row pitch by one bus-word when needed so the
+    /// per-column word stride is odd (conflict-free on power-of-2-bank TCDM)
+    parameter bit          BankSkew     = 1'b0,
     /// Write data-path width in bytes (tile side NE = StrbWidth / element bytes)
     parameter int unsigned StrbWidth = 32'd64,
     /// Address type
@@ -51,6 +57,7 @@
         logic [TensorW-1:0]      tm, tn;
         logic signed [WorkW-1:0] m, n, log2ne, ne, yt, nt, nxe, mpe;
         logic signed [WorkW-1:0] strb_c;   // NE*E == StrbWidth (mode cancels)
+        logic signed [WorkW-1:0] e, me, pad;  // address-gen: E (=1<<mode), M*E, pitch pad
 
         nd_req_o = nd_req_i;   // passthrough
 
@@ -59,43 +66,74 @@
             tm   = nd_req_i.burst_req.opt.compute.params.transpose.tensor_m;
             tn   = nd_req_i.burst_req.opt.compute.params.transpose.tensor_n;
             // zero-extend bounded dims into the signed working width
-            m      = $signed({{(WorkW-TensorW){1'b0}}, tm});   // M
-            n      = $signed({{(WorkW-TensorW){1'b0}}, tn});   // N
-            log2ne = $signed(WorkW'(Log2Strb)) - $signed({{(WorkW-ModeW){1'b0}}, mode});
-            ne     = $signed(WorkW'(1)) <<< log2ne;            // tile side (elements)
-            yt     = (m + ne - 1) >>> log2ne;                  // ceil(M/NE)
-            nt     = (n + ne - 1) >>> log2ne;                  // ceil(N/NE)
-            nxe    = n  <<< mode;                              // N*E  (E = 1<<mode)
-            mpe    = yt <<< Log2Strb;                          // MP*E = YT*NE*E = YT*StrbWidth
-            strb_c = $signed(WorkW'(StrbWidth));               // NE*E (one tile-row = StrbWidth B)
+            m = $signed({{(WorkW-TensorW){1'b0}}, tm});   // M
+            n = $signed({{(WorkW-TensorW){1'b0}}, tn});   // N
 
-            nd_req_o.burst_req.length     = LenW'(StrbWidth);
+            if (AddrGenTranspose) begin
+                // Element-granular swapped-stride walk (out_T[c][r] = in[r][c]),
+                // dst an N x M' transpose. No engine: compute is cleared so the
+                // backend runs a plain strided copy. Correct on any protocol
+                // (ideal on random-access OBI/TCDM; slow on burst AXI).
+                e   = $signed(WorkW'(1)) <<< mode;             // E = 1<<mode
+                me  = m <<< mode;                              // M*E
+                // BankSkew: when M*E is an even number of bus words the column
+                // stride hammers one TCDM bank; pad the pitch by one word (NE
+                // elements) to make the word stride odd => round-robin all banks.
+                ne  = $signed(WorkW'(StrbWidth)) >>> mode;     // NE = StrbWidth/E
+                pad = (BankSkew && (me[Log2Strb:0] == '0)) ? ne : '0;
+                me  = (m + pad) <<< mode;                      // M'*E (padded pitch)
+                nd_req_o.burst_req.opt.compute.enable = 1'b0;
+                nd_req_o.burst_req.length     = LenW'(e);
+                // d_req[0] = column walk (reps N): src +E, dst +M'*E
+                nd_req_o.d_req[0].reps        = n[RepW-1:0];
+                nd_req_o.d_req[0].src_strides = addr_t'(e);
+                nd_req_o.d_req[0].dst_strides = addr_t'(me);
+                // d_req[1] = row walk (reps M): src +E, dst +E - (N-1)*M'*E (rewind)
+                nd_req_o.d_req[1].reps        = m[RepW-1:0];
+                nd_req_o.d_req[1].src_strides = addr_t'(e);
+                nd_req_o.d_req[1].dst_strides = addr_t'(e - (n - 1) * me);
+                for (int unsigned d = 2; d < NumDim-1; d++) begin
+                    nd_req_o.d_req[d].reps        = RepW'(1);
+                    nd_req_o.d_req[d].src_strides = '0;
+                    nd_req_o.d_req[d].dst_strides = '0;
+                end
+            end else begin
+                log2ne = $signed(WorkW'(Log2Strb)) - $signed({{(WorkW-ModeW){1'b0}}, mode});
+                ne     = $signed(WorkW'(1)) <<< log2ne;            // tile side (elements)
+                yt     = (m + ne - 1) >>> log2ne;                  // ceil(M/NE)
+                nt     = (n + ne - 1) >>> log2ne;                  // ceil(N/NE)
+                nxe    = n  <<< mode;                              // N*E  (E = 1<<mode)
+                mpe    = yt <<< Log2Strb;                          // MP*E = YT*NE*E = YT*StrbWidth
+                strb_c = $signed(WorkW'(StrbWidth));               // NE*E (one tile-row = StrbWidth B)
 
-            // d_req[0] = local row within tile (reps NE)
-            nd_req_o.d_req[0].reps        = ne[RepW-1:0];
-            nd_req_o.d_req[0].src_strides = addr_t'(nxe);
-            nd_req_o.d_req[0].dst_strides = addr_t'(mpe);
-            // d_req[1] = row-tile (reps YT). (NE-1)*MPE = (MPE<<log2ne) - MPE.
-            nd_req_o.d_req[1].reps        = yt[RepW-1:0];
-            nd_req_o.d_req[1].src_strides = addr_t'(nxe);
-            nd_req_o.d_req[1].dst_strides = addr_t'(strb_c - (mpe <<< log2ne) + mpe);
-            // d_req[2] = col-tile (reps NT). (YT*NE-1)*NXE = ((YT*N)<<Log2Strb) - NXE;
-            //            the dst rewind MPE-(YT-1)*StrbWidth collapses to StrbWidth.
-            nd_req_o.d_req[2].reps        = nt[RepW-1:0];
-            nd_req_o.d_req[2].src_strides = addr_t'(strb_c - ((yt * n) <<< Log2Strb) + nxe);
-            nd_req_o.d_req[2].dst_strides = addr_t'(strb_c);
-            // the walk is exactly 4-D: neutralize any higher dims
-            for (int unsigned d = 3; d < NumDim-1; d++) begin
-                nd_req_o.d_req[d].reps        = RepW'(1);
-                nd_req_o.d_req[d].src_strides = '0;
-                nd_req_o.d_req[d].dst_strides = '0;
+                nd_req_o.burst_req.length     = LenW'(StrbWidth);
+
+                // d_req[0] = local row within tile (reps NE)
+                nd_req_o.d_req[0].reps        = ne[RepW-1:0];
+                nd_req_o.d_req[0].src_strides = addr_t'(nxe);
+                nd_req_o.d_req[0].dst_strides = addr_t'(mpe);
+                // d_req[1] = row-tile (reps YT). (NE-1)*MPE = (MPE<<log2ne) - MPE.
+                nd_req_o.d_req[1].reps        = yt[RepW-1:0];
+                nd_req_o.d_req[1].src_strides = addr_t'(nxe);
+                nd_req_o.d_req[1].dst_strides = addr_t'(strb_c - (mpe <<< log2ne) + mpe);
+                // d_req[2] = col-tile (reps NT). (YT*NE-1)*NXE = ((YT*N)<<Log2Strb) - NXE;
+                //            the dst rewind MPE-(YT-1)*StrbWidth collapses to StrbWidth.
+                nd_req_o.d_req[2].reps        = nt[RepW-1:0];
+                nd_req_o.d_req[2].src_strides = addr_t'(strb_c - ((yt * n) <<< Log2Strb) + nxe);
+                nd_req_o.d_req[2].dst_strides = addr_t'(strb_c);
+                // the walk is exactly 4-D: neutralize any higher dims
+                for (int unsigned d = 3; d < NumDim-1; d++) begin
+                    nd_req_o.d_req[d].reps        = RepW'(1);
+                    nd_req_o.d_req[d].src_strides = '0;
+                    nd_req_o.d_req[d].dst_strides = '0;
+                end
             end
         end
     end
 
 `ifndef SYNTHESIS
-    initial assert (NumDim >= 4) else
-        $fatal(1, "idma_transpose_midend requires NumDim >= 4 (got %0d)", NumDim);
+    initial assert (NumDim >= (AddrGenTranspose ? 32'd3 : 32'd4)) else
+        $fatal(1, "idma_transpose_midend: NumDim too small (got %0d)", NumDim);
     // mode 0..2 needs NE >= 1, i.e. log2(StrbWidth) >= 2
     initial assert (Log2Strb >= 2) else
         $fatal(1, "idma_transpose_midend requires StrbWidth >= 4 (got %0d)", StrbWidth);

@@ -0,0 +1,5 @@
+build/
+*.so
+modelsim.ini
+transcript
+work/