From 8c254fd65853415732372e8f44abaadbe571a755 Mon Sep 17 00:00:00 2001 From: Daniel Keller Date: Wed, 10 Jun 2026 16:04:02 +0200 Subject: [PATCH 1/7] frontend: Drive on-the-fly transpose through the inst64 frontend Decode the transpose from spare DMCPY argb bits into opt.compute, expand NumDim to 4 with addr-width strides and splice the transpose midend between the request FIFO and the nd_midend, gated by a ComputeEnable parameter. Malformed requests (no hardware, reserved mode, zero dims, unaligned dst) get an error response and the backend's baked compute set is cross-checked at elaboration. --- src/frontend/inst64/idma_inst64_top.sv | 90 ++++++++++++++++++++++++-- 1 file changed, 83 insertions(+), 7 deletions(-) diff --git a/src/frontend/inst64/idma_inst64_top.sv b/src/frontend/inst64/idma_inst64_top.sv index 4f93b55f..6c7586aa 100644 --- a/src/frontend/inst64/idma_inst64_top.sv +++ b/src/frontend/inst64/idma_inst64_top.sv @@ -22,6 +22,8 @@ module idma_inst64_top #( parameter int unsigned NumChannels = 32'd1, parameter bit TCDMAliasEnable = 1'b0, parameter int unsigned DMATracing = 32'd0, + /// Compile-time on-the-fly compute feature enables (e.g. transpose) + parameter idma_pkg::compute_enable_t ComputeEnable = '0, parameter type axi_ar_chan_t = logic, parameter type axi_aw_chan_t = logic, parameter type axi_req_t = logic, @@ -70,7 +72,7 @@ module idma_inst64_top #( localparam int unsigned TfIdWidth = 32'd32; localparam int unsigned TFLenWidth = AxiAddrWidth; localparam int unsigned RepWidth = 32'd32; - localparam int unsigned NumDim = 32'd2; + localparam int unsigned NumDim = ComputeEnable.transpose ? 32'd4 : 32'd2; localparam int unsigned BufferDepth = 32'd3; localparam int unsigned NumRules = 32'd5; @@ -84,7 +86,8 @@ module idma_inst64_top #( localparam type id_t = logic[AxiIdWidth-1:0]; localparam type tf_len_t = logic[TFLenWidth-1:0]; localparam type offset_t = logic[OffsetWidth-1:0]; - localparam type strides_t = logic[RepWidth-1:0]; + // strides must match addr_t: signed transpose deltas would not sign-extend if narrower + localparam type strides_t = addr_t; localparam type reps_t = logic[RepWidth-1:0]; localparam type tf_id_t = logic[TfIdWidth-1:0]; @@ -178,6 +181,7 @@ module idma_inst64_top #( logic [1:0] idma_fe_status; logic [2:0] idma_fe_sel_chan; logic idma_fe_twod; + logic idma_fe_tp_reject; // busy signals idma_pkg::idma_busy_t [NumChannels-1:0] idma_busy; @@ -348,7 +352,7 @@ module idma_inst64_top #( .idma_req_t ( idma_req_t ), .idma_rsp_t ( idma_rsp_t ), .idma_nd_req_t ( idma_nd_req_t ), - .RepWidths ( RepWidth ) + .RepWidths ( {NumDim{RepWidth}} ) ) i_idma_nd_midend ( .clk_i, .rst_ni, @@ -367,6 +371,31 @@ module idma_inst64_top #( .busy_o ( idma_nd_busy [c] ) ); + // FIFO output, before transpose expansion + idma_nd_req_t fifo_nd_req; + logic fifo_nd_valid, fifo_nd_ready; + + // expand transpose requests into the tiled ND walk + if (ComputeEnable.transpose) begin : gen_transpose + idma_transpose_midend #( + .NumDim ( NumDim ), + .StrbWidth ( StrbWidth ), + .addr_t ( addr_t ), + .idma_nd_req_t ( idma_nd_req_t ) + ) i_idma_transpose_midend ( + .nd_req_i ( fifo_nd_req ), + .valid_i ( fifo_nd_valid ), + .ready_o ( fifo_nd_ready ), + .nd_req_o ( idma_nd_req [c] ), + .valid_o ( idma_nd_req_valid [c] ), + .ready_i ( idma_nd_req_ready [c] ) + ); + end else begin : gen_no_transpose + assign idma_nd_req [c] = fifo_nd_req; + assign idma_nd_req_valid [c] = fifo_nd_valid; + assign fifo_nd_ready = idma_nd_req_ready [c]; + end + stream_fifo_optimal_wrap #( .Depth ( DMAReqFifoDepth ), .type_t ( idma_nd_req_t ), @@ -380,9 +409,9 @@ module idma_inst64_top #( .data_i ( idma_fe_req ), .valid_i ( idma_fe_req_valid [c] ), .ready_o ( idma_fe_req_ready [c] ), - .data_o ( idma_nd_req [c] ), - .valid_o ( idma_nd_req_valid [c] ), - .ready_i ( idma_nd_req_ready [c] ) + .data_o ( fifo_nd_req ), + .valid_o ( fifo_nd_valid ), + .ready_i ( fifo_nd_ready ) ); end @@ -519,10 +548,12 @@ module idma_inst64_top #( idma_fe_req_d.burst_req.opt.beo.src_reduce_len = 1'b0; idma_fe_req_d.burst_req.opt.beo.dst_reduce_len = 1'b0; idma_fe_req_d.burst_req.opt.last = 1'b0; + idma_fe_req_d.burst_req.opt.compute = '0; // frontend config idma_fe_cfg = '0; idma_fe_status = '0; + idma_fe_tp_reject = 1'b0; idma_fe_sel_chan = '0; // default handshaking @@ -573,6 +604,28 @@ module idma_inst64_top #( idma_inst64_snitch_pkg::DMCPY : begin idma_fe_cfg = acc_req_i.data_argb[1:0]; idma_fe_sel_chan = acc_req_i.data_argb[4:2]; + // transpose request (register form only): argb spare bits + // carry {enable, mode, tensor_m, tensor_n} + if (ComputeEnable.transpose && acc_req_i.data_argb[5]) begin + idma_fe_req_d.burst_req.opt.compute.enable = 1'b1; + idma_fe_req_d.burst_req.opt.compute.op = + idma_pkg::COMPUTE_TRANSPOSE; + idma_fe_req_d.burst_req.opt.compute.params.transpose.mode = + acc_req_i.data_argb[7:6]; + idma_fe_req_d.burst_req.opt.compute.params.transpose.tensor_m = + acc_req_i.data_argb[19:8]; + idma_fe_req_d.burst_req.opt.compute.params.transpose.tensor_n = + acc_req_i.data_argb[31:20]; + end + // reject malformed transpose requests: no hardware, + // reserved mode, zero dims, unaligned dst + if (acc_req_i.data_argb[5]) begin + idma_fe_tp_reject = !ComputeEnable.transpose + | (acc_req_i.data_argb[7:6] == 2'd3) + | (acc_req_i.data_argb[19:8] == '0) + | (acc_req_i.data_argb[31:20] == '0) + | (idma_fe_req_d.burst_req.dst_addr[OffsetWidth-1:0] != '0); + end end default:; endcase @@ -588,7 +641,15 @@ module idma_inst64_top #( // 3. wait for twod transfer to be accepted (ready) // 4. send acc response (pvalid) // 5. acknowledge acc request (qready) - if (acc_res_ready) begin + // DMCPY launch; transpose requests reject malformed configs + if (idma_fe_tp_reject) begin + // error response; the transfer is not launched + if (acc_res_ready) begin + acc_res.id = acc_req_i.id; + acc_res_valid = 1'b1; + acc_req_ready_o = 1'b1; + end + end else if (acc_res_ready) begin idma_fe_req_valid[idma_fe_sel_chan] = 1'b1; if (idma_fe_req_ready[idma_fe_sel_chan]) begin acc_res.id = acc_req_i.id; @@ -750,6 +811,12 @@ module idma_inst64_top #( if (!idma_fe_twod) begin idma_fe_req.d_req[0].reps = 'd1; end + // keep higher dims inert for plain requests (the transpose expander overwrites them) + for (int d = 1; d <= NumDim-2; d++) begin + idma_fe_req.d_req[d].reps = 'd1; + idma_fe_req.d_req[d].src_strides = '0; + idma_fe_req.d_req[d].dst_strides = '0; + end end //-------------------------------------- @@ -763,6 +830,15 @@ module idma_inst64_top #( //-------------------------------------- // only activate tracer if requested `ifndef SYNTHESIS + initial assert (idma_pkg::TransposeDimWidth == 32'd12) else + $fatal(1, "DMCPY argb transpose packing requires TransposeDimWidth == 12"); +`ifndef VERILATOR + // capability cross-check against the generated backend's baked compute set + if (ComputeEnable.transpose) begin : gen_compute_check + initial assert (gen_backend[0].i_idma_backend_rw_axi.ComputeEnable.transpose) else + $fatal(1, "ComputeEnable.transpose requires a compute-enabled backend variant"); + end +`endif if (DMATracing) begin : gen_tracer for (genvar c = 0; c < NumChannels; c++) begin : gen_channels // derive the name of the trace file from the hart and channel IDs From afe66454196def8ac832cf8fefb0def4cbddca1c Mon Sep 17 00:00:00 2001 From: Daniel Keller Date: Wed, 10 Jun 2026 16:04:02 +0200 Subject: [PATCH 2/7] test: Add the snitch inst64 integration harness Standalone BFM harness driving the accelerator port: copy and transpose testbenches and a sweep covering all element sizes, tiling, edge, back-to-back, leak and reject cases, registered behind the snitch_cluster target; the flow regenerates the RTL before compiling. --- Bender.yml | 12 ++ systems/snitch/.gitignore | 5 + systems/snitch/Makefile | 56 +++++ systems/snitch/README.md | 65 ++++++ systems/snitch/test/idma_inst64_base.sv | 162 +++++++++++++++ systems/snitch/test/idma_inst64_drv_if.sv | 191 ++++++++++++++++++ systems/snitch/test/idma_inst64_tb_pkg.sv | 163 +++++++++++++++ systems/snitch/test/tb_idma_inst64_copy.sv | 56 +++++ .../snitch/test/tb_idma_inst64_transpose.sv | 151 ++++++++++++++ 9 files changed, 861 insertions(+) create mode 100644 systems/snitch/.gitignore create mode 100644 systems/snitch/Makefile create mode 100644 systems/snitch/README.md create mode 100644 systems/snitch/test/idma_inst64_base.sv create mode 100644 systems/snitch/test/idma_inst64_drv_if.sv create mode 100644 systems/snitch/test/idma_inst64_tb_pkg.sv create mode 100644 systems/snitch/test/tb_idma_inst64_copy.sv create mode 100644 systems/snitch/test/tb_idma_inst64_transpose.sv diff --git a/Bender.yml b/Bender.yml index 2049f69e..7a5baae6 100644 --- a/Bender.yml +++ b/Bender.yml @@ -97,6 +97,18 @@ sources: # Level 2 - src/frontend/desc64/idma_desc64_top.sv + # Snitch inst64 standalone harness; needs snitch_cluster for idma_inst64_top + opcode pkg + - target: all(snitch_cluster, idma_test) + files: + # Level 0 + - systems/snitch/test/idma_inst64_tb_pkg.sv + # Level 1 + - systems/snitch/test/idma_inst64_drv_if.sv + - systems/snitch/test/idma_inst64_base.sv + # Level 2 + - systems/snitch/test/tb_idma_inst64_copy.sv + - systems/snitch/test/tb_idma_inst64_transpose.sv + # Synthesis wrappers - target: synth files: diff --git a/systems/snitch/.gitignore b/systems/snitch/.gitignore new file mode 100644 index 00000000..180edf75 --- /dev/null +++ b/systems/snitch/.gitignore @@ -0,0 +1,5 @@ +build/ +*.so +modelsim.ini +transcript +work/ diff --git a/systems/snitch/Makefile b/systems/snitch/Makefile new file mode 100644 index 00000000..351cd6bb --- /dev/null +++ b/systems/snitch/Makefile @@ -0,0 +1,56 @@ +# Copyright 2026 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 +# +# Author: Daniel Keller +# +# Standalone build + sim flow for the Snitch inst64 integration. Elaborates the +# upstream single-head idma_inst64_top + the recycled harness against iDMA's own +# deps (axi, common_cells, common_verification). Uses the split_rtl compute- +# enabled rw_axi backend (the bundled idma_generated.sv predates opt.compute). + +SNITCH_DIR := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))) +IDMA_ROOT := $(realpath $(SNITCH_DIR)/../..) +BENDER ?= bender +VSIM ?= questa-2023.4 vsim + +BUILD := $(SNITCH_DIR)/build +TARGETS := -t rtl -t split_rtl -t snitch_cluster -t idma_test -t test +TOP ?= tb_idma_inst64_copy + +# Transpose end-to-end sweep: shapes across all element sizes (int8/fp16/fp32), +# single/multi-tile, edge, exact-multiple (zero padding), and the NE=StrbWidth +# in-flight boundary. Each mode (EB=1/2/4) gets single-tile + aligned-multi-tile +# + edge-multi-tile coverage. +TP_SWEEP := 8,8,4 40,70,4 48,16,4 \ + 32,32,2 50,40,2 96,96,2 70,96,2 \ + 64,64,1 130,80,1 128,64,1 256,96,1 + +.PHONY: snitch_sim snitch_compile snitch_transpose_sweep snitch_clean +snitch_sim: snitch_compile + cd $(BUILD) && $(VSIM) -c $(TOP)_opt -do "run -all; quit" + +snitch_transpose_sweep: $(BUILD)/compile_snitch.tcl + @for cfg in $(TP_SWEEP); do \ + m=$${cfg%%,*}; r=$${cfg#*,}; n=$${r%%,*}; e=$${r##*,}; \ + cd $(BUILD) && $(VSIM) -c -do \ + "source compile_snitch.tcl; vopt +acc tb_idma_inst64_transpose -gM=$$m -gN=$$n -gEB=$$e -o tb_sw; quit" \ + >/dev/null 2>&1; \ + printf '%-14s ' "$$m x $$n EB=$$e:"; \ + $(VSIM) -c tb_sw -do "run -all; quit" 2>&1 | grep -E '\[TP\] (PASS|FAIL)' | sed 's/# //'; \ + done + +snitch_compile: $(BUILD)/compile_snitch.tcl + cd $(BUILD) && $(VSIM) -c -do \ + "source compile_snitch.tcl; vopt +acc $(TOP) -o $(TOP)_opt; quit" + +$(BUILD)/compile_snitch.tcl: | $(BUILD) + $(MAKE) -C $(IDMA_ROOT) idma_hw_all + cd $(IDMA_ROOT) && $(BENDER) script vsim $(TARGETS) \ + --vlog-arg="-svinputport=compat" > $@ + +$(BUILD): + mkdir -p $(BUILD) + +snitch_clean: + rm -rf $(BUILD) diff --git a/systems/snitch/README.md b/systems/snitch/README.md new file mode 100644 index 00000000..4fd6a5bb --- /dev/null +++ b/systems/snitch/README.md @@ -0,0 +1,65 @@ +# Snitch (inst64) iDMA integration + +Standalone host for the **inst64** ISA-coupled frontend (`idma_inst64_top`) — the +tightly-coupled Snitch DMA interface. iDMA already owns `idma_inst64_top`; this +directory adds a cluster-free verification harness and (Stage 2) the on-the-fly +transpose wired through the accelerator interface. + +## Recycled, not reinvented + +The harness is **recycled from the vidma fork's inst64 verification interface** +(`idma_alu_vec/test/frontend/`), adapted only as the clean upstream single-head +`idma_inst64_top` requires: + +| File | Provenance | +|------|------------| +| `test/idma_inst64_tb_pkg.sv` | faithful copy (8-line delta: `AxiDataWidth`/`NumAxInFlight` sizing + header) | +| `test/idma_inst64_drv_if.sv` | faithful copy; dropped the 4 vidma-only tasks (`DMOPC`, multi-head copy, immediate `DMCPYI`) to match upstream | +| `test/idma_inst64_base.sv` | adapted: single-head (`axi_req_o[NumChannels]`, no `NumHeads`/`enable_single_head_mode`) | +| `test/tb_idma_inst64_copy.sv` | Stage-1 plain-copy regression | + +The accelerator interface (the 4-field `acc_req`/`acc_res` bus + the `DM*` +instruction BFM) is exactly the vidma one — no reinvention. + +## Why split_rtl + +`idma_inst64_top` is gated behind the `snitch_cluster` Bender target. The build +uses `-t split_rtl` (per-variant RTL) because the **bundled `idma_generated.sv` +predates the typed `opt.compute` struct** (it still references the old flat +`opt.transpose_en` fields) and won't elaborate against the current package. The +split_rtl `idma_backend_rw_axi` is compute-enabled (`IDMA_VIDMA_IDS=rw_axi`). + +## Standalone simulation + +```bash +make -C systems/snitch snitch_sim # plain-copy regression (Stage 1) +make -C systems/snitch snitch_sim TOP=tb_idma_inst64_transpose # transpose (Stage 2) +``` + +Drives `DMSRC`/`DMDST`/`DMCPY` (+ `DMSTR`/`DMREP` for 2D) over the accelerator +bus and verifies the AXI sim memory. Requires `questa-2023.4`. + +## Status + +- **Stage 1 (done):** plain copy through the single-head frontend — 3 transfers pass. +- **Stage 2 (done):** multi-tile on-the-fly transpose, end-to-end. A transpose is + programmed with the spare `DMCPY` argb bits (`[5]`=enable, `[7:6]`=mode, + `[19:8]`=M, `[31:20]`=N), populating the typed per-transfer `opt.compute`; the + dedicated `src/midend/idma_transpose_midend.sv` expands `(M,N,mode)` into the + `NumDim=4` tiled walk; the unmodified `idma_nd_midend` walks it into the + compute-enabled `rw_axi` backend. Gated by `idma_inst64_top`'s + `ComputeEnable.transpose` (off by default, so other snitch_cluster consumers + are unaffected). Verified across int8/fp16/fp32, single/multi-tile, edge tiles, + padding integrity, back-to-back, and cross-transfer no-leak: + `make -C systems/snitch snitch_transpose_sweep`. Full functionality at any + `NumAxInFlight` (down to the backend min) — the compute backend internally + buffers a tile of write descriptors (`ComputeFifoDepth = StrbWidth`), so there + is no `NumAxInFlight >= NE` constraint. + +## Transpose memory contract + +A transposed transfer reads the source up to the tile-padded bounds +(`ceil(M/NE)*NE` rows of `N` elements, the last row tile reading past row `M-1`) +and writes the full padded destination extent (`ceil(N/NE)*NE` rows at pitch +`MP = ceil(M/NE)*NE`; padding is strobe-masked but addressed). Both regions must +be mapped, side-effect-free memory. diff --git a/systems/snitch/test/idma_inst64_base.sv b/systems/snitch/test/idma_inst64_base.sv new file mode 100644 index 00000000..5c9568e5 --- /dev/null +++ b/systems/snitch/test/idma_inst64_base.sv @@ -0,0 +1,162 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Daniel Keller + +/// Base harness for the standalone single-head inst64 frontend. +/// Clock/reset, the accelerator-bus driver, the upstream idma_inst64_top DUT, +/// and one axi_sim_mem per channel. No Snitch cluster, no multi-head. +module idma_inst64_base #( + parameter int unsigned DMATracing = idma_inst64_tb_pkg::DMATracing, + parameter idma_pkg::compute_enable_t ComputeEnable = '0 +); + import idma_inst64_tb_pkg::*; + import idma_inst64_snitch_pkg::*; + + logic clk; + logic rst_n; + + clk_rst_gen #( + .ClkPeriod ( Period ), + .RstClkCycles( ResetCycles ) + ) i_clock_reset_generator ( + .clk_o ( clk ), + .rst_no( rst_n ) + ); + + idma_inst64_drv_if drv_if ( + .clk ( clk ), + .rst_n( rst_n ) + ); + + axi_req_t [NumChannels-1:0] axi_req; + axi_resp_t [NumChannels-1:0] axi_res; + obi_req_t [NumChannels-1:0] obi_req; + obi_res_t [NumChannels-1:0] obi_res; + dma_events_t [NumChannels-1:0] events; + logic [NumChannels-1:0] busy; + + // route the test's AXI range via the default idx (ToSoC=AXI); the single rule + // maps an unused low TCDM range to OBI so the OBI port stays idle + addr_rule_t addr_map; + assign addr_map = '{ + idx: idma_pkg::TCDMDMA, + start_addr: 64'h0000_0000, + end_addr: 64'h1000_0000 + }; + + idma_inst64_top #( + .AxiDataWidth ( AxiDataWidth ), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiUserWidth ( AxiUserWidth ), + .AxiIdWidth ( AxiIdWidth ), + .NumAxInFlight ( NumAxInFlight ), + .DMAReqFifoDepth ( DMAReqFifoDepth ), + .NumChannels ( NumChannels ), + .DMATracing ( DMATracing ), + .ComputeEnable ( ComputeEnable ), + .axi_ar_chan_t ( axi_ar_chan_t ), + .axi_aw_chan_t ( axi_aw_chan_t ), + .axi_req_t ( axi_req_t ), + .axi_res_t ( axi_resp_t ), + .init_req_chan_t ( init_req_chan_t ), + .init_rsp_chan_t ( init_rsp_chan_t ), + .init_req_t ( init_req_t ), + .init_rsp_t ( init_rsp_t ), + .obi_a_chan_t ( obi_a_chan_t ), + .obi_r_chan_t ( obi_r_chan_t ), + .obi_req_t ( obi_req_t ), + .obi_res_t ( obi_res_t ), + .acc_req_t ( acc_req_t ), + .acc_res_t ( acc_res_t ), + .dma_events_t ( dma_events_t ), + .addr_rule_t ( addr_rule_t ) + ) i_dut ( + .clk_i ( clk ), + .rst_ni ( rst_n ), + .testmode_i ( 1'b0 ), + .axi_req_o ( axi_req ), + .axi_res_i ( axi_res ), + .obi_req_o ( obi_req ), + .obi_res_i ( obi_res ), + .busy_o ( busy ), + .acc_req_i ( drv_if.acc_req ), + .acc_req_valid_i ( drv_if.acc_req_valid ), + .acc_req_ready_o ( drv_if.acc_req_ready ), + .acc_res_o ( drv_if.acc_res ), + .acc_res_valid_o ( drv_if.acc_res_valid ), + .acc_res_ready_i ( drv_if.acc_res_ready ), + .hart_id_i ( 32'h0 ), + .events_o ( events ), + .addr_map_i ( addr_map ) + ); + + for (genvar c = 0; c < NumChannels; c++) begin : gen_mem_ch + axi_sim_mem #( + .AddrWidth ( AxiAddrWidth ), + .DataWidth ( AxiDataWidth ), + .IdWidth ( AxiIdWidth ), + .UserWidth ( AxiUserWidth ), + .axi_req_t ( axi_req_t ), + .axi_rsp_t ( axi_resp_t ), + .WarnUninitialized ( 1'b1 ), + .ClearErrOnAccess ( 1'b1 ), + .ApplDelay ( ApplDelay ), + .AcqDelay ( AcqDelay ) + ) i_axi_sim_mem ( + .clk_i ( clk ), + .rst_ni ( rst_n ), + .axi_req_i ( axi_req[c] ), + .axi_rsp_o ( axi_res[c] ), + .mon_w_valid_o ( ), + .mon_w_addr_o ( ), + .mon_w_data_o ( ), + .mon_w_id_o ( ), + .mon_w_user_o ( ), + .mon_w_beat_count_o( ), + .mon_w_last_o ( ), + .mon_r_valid_o ( ), + .mon_r_addr_o ( ), + .mon_r_data_o ( ), + .mon_r_id_o ( ), + .mon_r_user_o ( ), + .mon_r_beat_count_o( ), + .mon_r_last_o ( ) + ); + + // L1/TCDM model: connected but idle for the AXI-routed copy/transpose tests + obi_sim_mem #( + .ObiCfg ( ObiCfg ), + .obi_req_t ( obi_req_t ), + .obi_rsp_t ( obi_res_t ), + .obi_r_chan_t ( obi_r_chan_t ), + .WarnUninitialized ( 1'b0 ), + .ClearErrOnAccess ( 1'b1 ), + .ApplDelay ( ApplDelay ), + .AcqDelay ( AcqDelay ) + ) i_obi_sim_mem ( + .clk_i ( clk ), + .rst_ni ( rst_n ), + .obi_req_i ( obi_req[c] ), + .obi_rsp_o ( obi_res[c] ), + .mon_valid_o ( ), + .mon_we_o ( ), + .mon_addr_o ( ), + .mon_wdata_o ( ), + .mon_be_o ( ), + .mon_id_o ( ) + ); + end + + // Memory helpers (channel 0) + task automatic mem_write_byte(input addr_t addr, input byte data); + gen_mem_ch[0].i_axi_sim_mem.mem[addr] = data; + endtask + + function automatic logic [7:0] mem_read_byte(input addr_t addr); + if (gen_mem_ch[0].i_axi_sim_mem.mem.exists(addr)) return gen_mem_ch[0].i_axi_sim_mem.mem[addr]; + else return 8'hXX; + endfunction + +endmodule diff --git a/systems/snitch/test/idma_inst64_drv_if.sv b/systems/snitch/test/idma_inst64_drv_if.sv new file mode 100644 index 00000000..d4382a85 --- /dev/null +++ b/systems/snitch/test/idma_inst64_drv_if.sv @@ -0,0 +1,191 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Recycled from the vidma inst64 verification harness +// (idma_alu_vec/test/frontend/idma_inst64_drv_if.sv). Faithful copy of the +// copy/status tasks; the vidma-only DMOPC/multi-head/immediate tasks are +// dropped to match the clean single-head upstream idma_inst64_top. +// +// One correctness fix vs the source: the handshake drops acc_req_valid the +// cycle the request is accepted (sampling ready at AcqDelay) instead of holding +// it one extra cycle. The source held valid past grant, which double-issues the +// request to a still-ready frontend FIFO — harmless for idempotent copies but +// corrupts non-idempotent transfers (transpose). + +interface idma_inst64_drv_if ( + input logic clk, + input logic rst_n +); + import idma_inst64_tb_pkg::*; + import idma_inst64_snitch_pkg::*; + + // Accelerator Interface Signals + acc_req_t acc_req; + logic acc_req_valid; + logic acc_req_ready; + + acc_res_t acc_res; + logic acc_res_valid; + logic acc_res_ready; + + // Internal State for BFM + logic [31:0] req_id_counter; + + // Performance Counters + longint unsigned dma_start_cycle; + longint unsigned dma_end_cycle; + longint unsigned dma_cycles; + longint unsigned cycle_counter; + + always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) cycle_counter <= 0; + else cycle_counter <= cycle_counter + 1; + end + + // Initialization + initial begin + acc_req_valid = 1'b0; + acc_res_ready = 1'b1; + acc_req = '0; + req_id_counter = '0; + dma_start_cycle = 0; + dma_end_cycle = 0; + dma_cycles = 0; + end + + // Drive one accelerator instruction; valid is asserted in the apply region + // and dropped the cycle the request is accepted (ready sampled at AcqDelay). + task automatic drive(input logic [31:0] op, input logic [63:0] arga, input logic [63:0] argb); + @(posedge clk); + #(ApplDelay); + acc_req.id = req_id_counter++; + acc_req.data_op = op; + acc_req.data_arga = arga; + acc_req.data_argb = argb; + acc_req_valid = 1'b1; + do begin + @(posedge clk); + #(AcqDelay); + end while (!acc_req_ready); + acc_req_valid = 1'b0; + endtask + + //-------------------------------------- + // C-like API for DMA Programming + //-------------------------------------- + + task automatic dma_set_source(input addr_t addr); + drive(DMSRC, addr[31:0], {{(64-AxiAddrWidth){1'b0}}, addr[AxiAddrWidth-1:32]}); + endtask + + task automatic dma_set_dest(input addr_t addr); + drive(DMDST, addr[31:0], {{(64-AxiAddrWidth){1'b0}}, addr[AxiAddrWidth-1:32]}); + endtask + + task automatic dma_set_strides(input logic [31:0] src_stride, input logic [31:0] dst_stride); + drive(DMSTR, src_stride, dst_stride); + endtask + + task automatic dma_set_reps(input logic [31:0] reps); + drive(DMREP, reps, '0); + endtask + + // Launch a copy. cfg[1] = 2D enable; channel selects the AXI manager. + // Reads back the transfer id from the response. + task automatic dma_start_copy( + input addr_t length, + input logic [1:0] cfg, + input logic [2:0] channel, + output tf_id_t transfer_id + ); + drive(DMCPY, length, {59'b0, channel, cfg}); + while (!acc_res_valid) @(posedge clk); + transfer_id = acc_res.data[31:0]; + endtask + + // Launch a transpose. Encodes {enable, mode, M, N} into the spare DMCPY argb + // bits (argb[1:0]=cfg, [4:2]=channel, [5]=transpose, [7:6]=mode, + // [19:8]=tensor_m, [31:20]=tensor_n). Length is derived by the midend. + task automatic dma_transpose( + input addr_t src, + input addr_t dst, + input logic [11:0] tensor_m, + input logic [11:0] tensor_n, + input logic [1:0] mode, + input logic [2:0] channel, + output tf_id_t transfer_id + ); + logic [63:0] argb; + dma_set_source(src); + dma_set_dest(dst); + argb = '0; + argb[4:2] = channel; + argb[5] = 1'b1; + argb[7:6] = mode; + argb[19:8] = tensor_m; + argb[31:20] = tensor_n; + drive(DMCPY, '0, argb); + while (!acc_res_valid) @(posedge clk); + transfer_id = acc_res.data[31:0]; + endtask + + // issue a transpose DMCPY and return the response error bit (negative tests) + task automatic dma_transpose_err( + input addr_t src, + input addr_t dst, + input logic [11:0] tensor_m, + input logic [11:0] tensor_n, + input logic [1:0] mode, + input logic [2:0] channel, + output logic error + ); + logic [63:0] argb; + dma_set_source(src); + dma_set_dest(dst); + argb = '0; + argb[4:2] = channel; + argb[5] = 1'b1; + argb[7:6] = mode; + argb[19:8] = tensor_m; + argb[31:20] = tensor_n; + drive(DMCPY, '0, argb); + while (!acc_res_valid) @(posedge clk); + error = acc_res.error; + endtask + + task automatic dma_poll_status( + input logic [1:0] status_idx, + input logic [2:0] channel, + output logic [63:0] status_value + ); + drive(DMSTAT, '0, {59'b0, channel, status_idx}); + while (!acc_res_valid) @(posedge clk); + status_value = acc_res.data; + endtask + + task automatic dma_wait(input tf_id_t transfer_id, input logic [2:0] channel); + logic [63:0] completed_id; + $display("[%0t] dma_wait(ID=%0d, chan=%0d) - waiting...", $time, transfer_id, channel); + forever begin + dma_poll_status(2'b00, channel, completed_id); + if (completed_id >= transfer_id) begin + dma_end_cycle = cycle_counter; + dma_cycles = dma_end_cycle - dma_start_cycle; + break; + end + repeat(10) @(posedge clk); + end + endtask + + task automatic dma_wait_idle(input logic [2:0] channel); + logic [63:0] busy_status; + $display("[%0t] dma_wait_idle(chan=%0d) - waiting...", $time, channel); + forever begin + dma_poll_status(2'b10, channel, busy_status); + if (busy_status[0] == 1'b0) break; + repeat(5) @(posedge clk); + end + endtask + +endinterface diff --git a/systems/snitch/test/idma_inst64_tb_pkg.sv b/systems/snitch/test/idma_inst64_tb_pkg.sv new file mode 100644 index 00000000..b02377d7 --- /dev/null +++ b/systems/snitch/test/idma_inst64_tb_pkg.sv @@ -0,0 +1,163 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Recycled from the vidma inst64 verification harness +// (idma_alu_vec/test/frontend/idma_inst64_tb_pkg.sv). Kept faithful. The +// transpose imposes no NumAxInFlight>=NE constraint (the engine self-buffers a +// tile); verified down to NumAxInFlight=3 at NE=64. + +`include "axi/typedef.svh" +`include "obi/typedef.svh" + +package idma_inst64_tb_pkg; + + localparam int unsigned AxiDataWidth = 512; + localparam int unsigned AxiAddrWidth = 64; + localparam int unsigned AxiUserWidth = 1; + localparam int unsigned AxiIdWidth = 3; + localparam int unsigned NumAxInFlight = 3; // default + localparam int unsigned DMAReqFifoDepth = 3; + localparam int unsigned NumChannels = 1; + localparam int unsigned NumHeads = 1; + localparam int unsigned DMATracing = 0; + localparam int unsigned Seed = 1337; + + localparam time Period = 10ns; + localparam time ApplDelay = Period / 4; + localparam time AcqDelay = Period * 3 / 4; + localparam integer ResetCycles = 10; + + // Type definitions + typedef logic [AxiAddrWidth-1:0] addr_t; + typedef logic [AxiIdWidth-1:0] axi_id_t; + typedef logic [31:0] data_t; + typedef logic [31:0] tf_id_t; + + // AXI Types + typedef logic [AxiAddrWidth-1:0] axi_addr_t; + typedef logic [AxiDataWidth-1:0] axi_data_t; + typedef logic [AxiDataWidth/8-1:0] axi_strb_t; + typedef logic [AxiUserWidth-1:0] axi_user_t; + + `AXI_TYPEDEF_AW_CHAN_T(axi_aw_chan_t, axi_addr_t, axi_id_t, axi_user_t) + `AXI_TYPEDEF_W_CHAN_T(axi_w_chan_t, axi_data_t, axi_strb_t, axi_user_t) + `AXI_TYPEDEF_B_CHAN_T(axi_b_chan_t, axi_id_t, axi_user_t) + `AXI_TYPEDEF_AR_CHAN_T(axi_ar_chan_t, axi_addr_t, axi_id_t, axi_user_t) + `AXI_TYPEDEF_R_CHAN_T(axi_r_chan_t, axi_data_t, axi_id_t, axi_user_t) + `AXI_TYPEDEF_REQ_T(axi_req_t, axi_aw_chan_t, axi_w_chan_t, axi_ar_chan_t) + `AXI_TYPEDEF_RESP_T(axi_resp_t, axi_b_chan_t, axi_r_chan_t) + + // OBI L1/TCDM types (DataWidth=AxiDataWidth, AddrWidth=AxiAddrWidth, IdWidth=AxiIdWidth) + typedef logic [AxiDataWidth/8-1:0] obi_strb_t; + typedef logic [AxiIdWidth-1:0] obi_id_t; + `OBI_TYPEDEF_MINIMAL_A_OPTIONAL(obi_a_optional_t) + `OBI_TYPEDEF_MINIMAL_R_OPTIONAL(obi_r_optional_t) + `OBI_TYPEDEF_TYPE_A_CHAN_T(obi_a_chan_t, axi_addr_t, axi_data_t, obi_strb_t, obi_id_t, obi_a_optional_t) + `OBI_TYPEDEF_TYPE_R_CHAN_T(obi_r_chan_t, axi_data_t, obi_id_t, obi_r_optional_t) + `OBI_TYPEDEF_REQ_T(obi_req_t, obi_a_chan_t) + `OBI_TYPEDEF_RSP_T(obi_res_t, obi_r_chan_t) + + localparam obi_pkg::obi_cfg_t ObiCfg = '{ + UseRReady: 1'b1, + CombGnt: 1'b0, + AddrWidth: AxiAddrWidth, + DataWidth: AxiDataWidth, + IdWidth: AxiIdWidth, + Integrity: 1'b0, + BeFull: 1'b1, + OptionalCfg: obi_pkg::ObiMinimalOptionalConfig + }; + + // INIT meta-channel types (mirror src/db/idma_init.yml) + typedef struct packed { + logic [AxiAddrWidth-1:0] cfg; + logic [AxiDataWidth-1:0] term; + logic [AxiDataWidth/8-1:0] strb; + logic [AxiIdWidth-1:0] id; + } init_req_chan_t; + + typedef struct packed { + init_req_chan_t req_chan; + logic req_valid; + logic rsp_ready; + } init_req_t; + + typedef struct packed { + logic [AxiDataWidth-1:0] init; + } init_rsp_chan_t; + + typedef struct packed { + init_rsp_chan_t rsp_chan; + logic rsp_valid; + logic req_ready; + } init_rsp_t; + + // address-decode rule type (DUT default) + typedef axi_pkg::xbar_rule_64_t addr_rule_t; + + // Accelerator request/response types (simplified Snitch accelerator interface) + typedef struct packed { + logic [31:0] id; + logic [31:0] data_op; + logic [63:0] data_arga; + logic [63:0] data_argb; + } acc_req_t; + + typedef struct packed { + logic [31:0] id; + logic [63:0] data; + logic error; + } acc_res_t; + + // DMA events (simplified) + typedef struct packed { + // aw + logic aw_valid; + logic aw_ready; + logic aw_done; + logic aw_stall; + axi_pkg::len_t aw_len; + axi_pkg::size_t aw_size; + // ar + logic ar_valid; + logic ar_ready; + logic ar_done; + logic ar_stall; + axi_pkg::len_t ar_len; + axi_pkg::size_t ar_size; + // r + logic r_valid; + logic r_ready; + logic r_done; + logic r_bw; + logic r_stall; + // w + logic w_valid; + logic w_ready; + logic w_done; + logic w_stall; + logic [31:0] num_bytes_written; + // b + logic b_valid; + logic b_ready; + logic b_done; + // busy + logic dma_busy; + } dma_events_t; + + // Golden reference for validation + typedef struct { + addr_t src_addr; + addr_t dst_addr; + addr_t length; + logic [5:0] alu_opcode; + logic [31:0] src_strides; + logic [31:0] dst_strides; + logic [31:0] reps; + logic twod; + int unsigned channel; + tf_id_t expected_id; + } transfer_t; + +endpackage diff --git a/systems/snitch/test/tb_idma_inst64_copy.sv b/systems/snitch/test/tb_idma_inst64_copy.sv new file mode 100644 index 00000000..40108f4d --- /dev/null +++ b/systems/snitch/test/tb_idma_inst64_copy.sv @@ -0,0 +1,56 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Daniel Keller + +/// Stage-1 plain-copy regression for the standalone single-head inst64 frontend. +/// Drives DMSRC/DMDST/DMCPY over the accelerator bus and verifies the copy in +/// the AXI sim memory. No compute. +module tb_idma_inst64_copy; + import idma_inst64_tb_pkg::*; + + idma_inst64_base #(.DMATracing(0)) harness(); + + localparam int unsigned TimeoutCycles = 20000; + int unsigned errors = 0; + + task automatic run_copy(input addr_t src, input addr_t dst, input int unsigned len, + input byte start); + tf_id_t tid; + for (int i = 0; i < len; i++) harness.mem_write_byte(src + i, start + i[7:0]); + for (int i = 0; i < len; i++) harness.mem_write_byte(dst + i, 8'h00); + harness.drv_if.dma_set_source(src); + harness.drv_if.dma_set_dest(dst); + harness.drv_if.dma_start_copy(addr_t'(len), 2'b00, 3'd0, tid); + harness.drv_if.dma_wait(tid, 0); + for (int i = 0; i < len; i++) begin + automatic logic [7:0] exp = start + i[7:0]; + automatic logic [7:0] got = harness.mem_read_byte(dst + i); + if (got !== exp) begin + if (errors < 10) $error("[COPY] mismatch at %0d: exp 0x%02x got 0x%02x", i, exp, got); + errors++; + end + end + endtask + + initial begin + @(posedge harness.rst_n); + repeat (10) @(posedge harness.clk); + + $display("=== inst64 plain-copy regression ==="); + run_copy(64'h8000_0000, 64'h9000_0000, 256, 8'hA0); // word-multiple + run_copy(64'h8001_0000, 64'h9001_0000, 4055, 8'h10); // large, non-aligned length + run_copy(64'h8002_0000, 64'h9002_0000, 7, 8'h30); // tiny, sub-beat + + if (errors == 0) $display("[SV] inst64 copy: SUCCESS (3 transfers)"); + else $fatal(1, "[SV] inst64 copy: FAIL (%0d errors)", errors); + $finish; + end + + initial begin + repeat (TimeoutCycles) @(posedge harness.clk); + $fatal(1, "[TIMEOUT] inst64 copy exceeded %0d cycles", TimeoutCycles); + end + +endmodule diff --git a/systems/snitch/test/tb_idma_inst64_transpose.sv b/systems/snitch/test/tb_idma_inst64_transpose.sv new file mode 100644 index 00000000..e5d8f46e --- /dev/null +++ b/systems/snitch/test/tb_idma_inst64_transpose.sv @@ -0,0 +1,151 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Daniel Keller + +/// End-to-end on-the-fly transpose through the inst64 frontend: +/// accelerator bus -> DMCPY transpose decode -> opt.compute -> transpose_midend +/// -> nd_midend -> rw_axi+compute backend -> axi_sim_mem. Checks transposed +/// data, padding integrity, multi-tile geometry, back-to-back (geometry leak), +/// and cross-transfer compute leak. +module tb_idma_inst64_transpose #( + parameter int unsigned M = 40, // matrix rows (elements) + parameter int unsigned N = 70, // matrix cols (elements) + parameter int unsigned EB = 4 // element size in bytes (1/2/4) +); + import idma_inst64_tb_pkg::*; + + localparam int unsigned StrbWidth = AxiDataWidth/8; + localparam int unsigned NE = StrbWidth/EB; + localparam int unsigned MODE = (EB==4) ? 2 : (EB==2) ? 1 : 0; + + localparam addr_t SRC = 64'h8000_0000; + localparam addr_t DST = 64'h9000_0000; + localparam addr_t CPY = 64'hA000_0000; + + idma_inst64_base #(.ComputeEnable('{transpose: 1'b1})) harness(); + + int unsigned errs = 0; + + // backend burst counter (proves the full NumDim=4 walk: NE*YT*NT bursts/tile-rows) + longint unsigned burst_cnt = 0; + always @(posedge harness.clk) + if (harness.i_dut.idma_req_valid[0] && harness.i_dut.idma_req_ready[0]) burst_cnt++; + + // Unique per-element fingerprint: byte b of element idx encodes (idx>>8b). + // Distinguishes distinct source elements so a mis-permutation cannot hide + // behind a value collision (a plain byte ramp aliases mod 256). + function automatic logic [7:0] fp(input int unsigned idx, input int unsigned b); + return 8'((idx >> (8*b)) & 32'hFF); + endfunction + + // Run one transpose of an mm x nn matrix at src -> dst (padded pitch), + // verify transposed data and that padding stays sentinel. + task automatic do_transpose(input int unsigned mm, input int unsigned nn, + input addr_t src, input addr_t dst); + tf_id_t tid; + longint unsigned c0, cyc, b0; + int unsigned yt = (mm + NE - 1)/NE; + int unsigned nt = (nn + NE - 1)/NE; + int unsigned mp = yt*NE; + for (int unsigned idx = 0; idx < mm*nn; idx++) + for (int unsigned b = 0; b < EB; b++) + harness.mem_write_byte(src + idx*EB + b, fp(idx, b)); + for (int unsigned i = 0; i < nt*NE; i++) + for (int unsigned j = 0; j < mp*EB; j++) + harness.mem_write_byte(dst + i*mp*EB + j, 8'hCC); + + c0 = harness.drv_if.cycle_counter; b0 = burst_cnt; + harness.drv_if.dma_transpose(src, dst, 12'(mm), 12'(nn), 2'(MODE), 3'd0, tid); + harness.drv_if.dma_wait(tid, 0); + harness.drv_if.dma_wait_idle(0); // ensure all writes retired before reading + cyc = harness.drv_if.cycle_counter - c0; + $display(" transpose %0dx%0d (NE=%0d, %0dx%0d=%0d tiles, MP=%0d):", mm, nn, NE, yt, nt, + yt*nt, mp); + $display(" bursts=%0d (exp NE*YT*NT=%0d) cycles=%0d eff=%0d B/cyc (bus peak %0d)", + burst_cnt-b0, NE*yt*nt, cyc, (mm*nn*EB)/cyc, StrbWidth); + + for (int unsigned c = 0; c < nn; c++) + for (int unsigned r = 0; r < mm; r++) + for (int unsigned b = 0; b < EB; b++) begin + automatic logic [7:0] got = harness.mem_read_byte(dst + (c*mp + r)*EB + b); + automatic logic [7:0] exp = harness.mem_read_byte(src + (r*nn + c)*EB + b); + if (got !== exp) begin + errs++; + if (errs <= 12) + $display("[TP] data mismatch out_T[%0d][%0d].b%0d=%02h exp %02h", c, r, b, got, exp); + end + end + for (int unsigned i = 0; i < nt*NE; i++) + for (int unsigned j = 0; j < mp; j++) + if (i >= nn || j >= mm) + for (int unsigned b = 0; b < EB; b++) + if (harness.mem_read_byte(dst + (i*mp + j)*EB + b) !== 8'hCC) begin + errs++; + if (errs <= 12) $display("[TP] padding clobbered at [%0d][%0d].b%0d", i, j, b); + end + endtask + + initial begin + @(posedge harness.rst_n); + repeat (10) @(posedge harness.clk); + + $display("=== inst64 transpose EB=%0d ===", EB); + + // 1. the parameterized shape + do_transpose(M, N, SRC, DST); + + // 2. back-to-back: a different (swapped) shape right after, to a fresh dst. + // Catches geometry/state leak between consecutive transposes. + do_transpose(N, M, SRC + 64'h0010_0000, DST + 64'h0010_0000); + + // 3. cross-transfer compute leak: a plain copy after transposes must NOT + // inherit opt.compute (default-zeroed). Verify a 1:1 copy. + begin + automatic int unsigned len = 128; + tf_id_t tid2; + for (int unsigned k = 0; k < len; k++) harness.mem_write_byte(SRC + k, 8'hE0 + k[4:0]); + for (int unsigned k = 0; k < len; k++) harness.mem_write_byte(CPY + k, 8'h00); + harness.drv_if.dma_set_source(SRC); + harness.drv_if.dma_set_dest(CPY); + harness.drv_if.dma_start_copy(addr_t'(len), 2'b00, 3'd0, tid2); + harness.drv_if.dma_wait(tid2, 0); + harness.drv_if.dma_wait_idle(0); + for (int unsigned k = 0; k < len; k++) + if (harness.mem_read_byte(CPY + k) !== harness.mem_read_byte(SRC + k)) begin + errs++; + if (errs <= 12) $display("[TP] leak: post-transpose copy wrong at %0d", k); + end + end + + // 4. malformed transpose requests: error response, nothing launched + begin + logic err; + longint unsigned b_rej; + b_rej = burst_cnt; + harness.drv_if.dma_transpose_err(SRC, DST, 12'd8, 12'd8, 2'd3, 3'd0, err); + if (!err) begin errs++; $display("[TP] reject fail: reserved mode 3"); end + harness.drv_if.dma_transpose_err(SRC, DST, 12'd0, 12'd8, 2'd0, 3'd0, err); + if (!err) begin errs++; $display("[TP] reject fail: M == 0"); end + harness.drv_if.dma_transpose_err(SRC, DST + 64'd1, 12'd8, 12'd8, 2'd0, 3'd0, err); + if (!err) begin errs++; $display("[TP] reject fail: unaligned dst"); end + repeat (50) @(posedge harness.clk); + if (burst_cnt != b_rej) begin + errs++; $display("[TP] reject fail: rejected request launched bursts"); + end + // a valid transpose must still work after rejections + do_transpose(8, 8, SRC + 64'h0020_0000, DST + 64'h0020_0000); + end + + if (errs == 0) $display("[TP] PASS: transpose data + padding + back-to-back + no-leak OK"); + else $fatal(1, "[TP] FAIL: %0d mismatches", errs); + $finish; + end + + initial begin + repeat (400000) @(posedge harness.clk); + $fatal(1, "[TIMEOUT] inst64 transpose"); + end + +endmodule From 158309b7047110d1d455d811c9925f892dbd74b5 Mon Sep 17 00:00:00 2001 From: Daniel Keller Date: Tue, 23 Jun 2026 10:56:58 +0200 Subject: [PATCH 3/7] frontend: Transpose via address generation on engine-less backends inst64 is a multi-write backend (rw_axi_rw_init_rw_obi) that cannot host the #112 FF transpose engine. Add an AddrGenTranspose mode to idma_transpose_midend: instead of the NumDim=4 tiled engine walk, emit an element-granular NumDim=3 swapped-stride program (out_T[c][r]=in[r][c], contiguous N x M dst) and clear compute.enable so the backend runs a plain strided copy. Correct on any protocol (ideal on random-access OBI/TCDM). idma_inst64_top gains the AddrGenTranspose param, wires it to the expander, and gates the engine-only gen_compute_check. The inst64 transpose harness drives it end-to-end (int8/fp16/fp32, square/rect/ swapped, back-to-back, reject) -- it could not even elaborate before. --- src/frontend/inst64/idma_inst64_top.sv | 15 ++- src/midend/idma_transpose_midend.sv | 97 ++++++++++++------- systems/snitch/test/idma_inst64_base.sv | 4 +- .../snitch/test/tb_idma_inst64_transpose.sv | 28 ++---- 4 files changed, 84 insertions(+), 60 deletions(-) diff --git a/src/frontend/inst64/idma_inst64_top.sv b/src/frontend/inst64/idma_inst64_top.sv index 6c7586aa..cdb851d7 100644 --- a/src/frontend/inst64/idma_inst64_top.sv +++ b/src/frontend/inst64/idma_inst64_top.sv @@ -24,6 +24,9 @@ module idma_inst64_top #( parameter int unsigned DMATracing = 32'd0, /// Compile-time on-the-fly compute feature enables (e.g. transpose) parameter idma_pkg::compute_enable_t ComputeEnable = '0, + /// Transpose via address generation (no FF engine) — for backends without + /// the engine, e.g. this multi-write OBI/TCDM variant + parameter bit AddrGenTranspose = 1'b0, parameter type axi_ar_chan_t = logic, parameter type axi_aw_chan_t = logic, parameter type axi_req_t = logic, @@ -378,10 +381,11 @@ module idma_inst64_top #( // expand transpose requests into the tiled ND walk if (ComputeEnable.transpose) begin : gen_transpose idma_transpose_midend #( - .NumDim ( NumDim ), - .StrbWidth ( StrbWidth ), - .addr_t ( addr_t ), - .idma_nd_req_t ( idma_nd_req_t ) + .NumDim ( NumDim ), + .AddrGenTranspose ( AddrGenTranspose ), + .StrbWidth ( StrbWidth ), + .addr_t ( addr_t ), + .idma_nd_req_t ( idma_nd_req_t ) ) i_idma_transpose_midend ( .nd_req_i ( fifo_nd_req ), .valid_i ( fifo_nd_valid ), @@ -834,7 +838,8 @@ module idma_inst64_top #( $fatal(1, "DMCPY argb transpose packing requires TransposeDimWidth == 12"); `ifndef VERILATOR // capability cross-check against the generated backend's baked compute set - if (ComputeEnable.transpose) begin : gen_compute_check + // (engine route only; address-gen needs no compute-enabled backend) + if (ComputeEnable.transpose && !AddrGenTranspose) begin : gen_compute_check initial assert (gen_backend[0].i_idma_backend_rw_axi.ComputeEnable.transpose) else $fatal(1, "ComputeEnable.transpose requires a compute-enabled backend variant"); end diff --git a/src/midend/idma_transpose_midend.sv b/src/midend/idma_transpose_midend.sv index b6f66325..e2d726d2 100644 --- a/src/midend/idma_transpose_midend.sv +++ b/src/midend/idma_transpose_midend.sv @@ -5,12 +5,15 @@ // Authors: // - Daniel Keller -/// Transpose geometry expander: expands an opt.compute=TRANSPOSE request into a -/// NumDim=4 tiled ND walk for the generic idma_nd_midend. Non-transpose passes -/// through. Combinational, quasi-static per request. +/// Transpose geometry expander for the generic idma_nd_midend. Two modes: +/// engine (NumDim=4 tiled walk feeding the FF transpose engine) and address-gen +/// (element-granular swapped-stride walk, no engine, for backends without the +/// engine e.g. multi-write OBI/TCDM). Non-transpose passes through. module idma_transpose_midend #( - /// Number of ND dimensions (must be >= 4 to express the tiled walk) + /// Number of ND dimensions (engine walk needs >= 4; address-gen needs >= 3) parameter int unsigned NumDim = 32'd4, + /// Address-gen mode: element-granular swapped-stride transpose, no engine + parameter bit AddrGenTranspose = 1'b0, /// Write data-path width in bytes (tile side NE = StrbWidth / element bytes) parameter int unsigned StrbWidth = 32'd64, /// Address type @@ -51,6 +54,7 @@ module idma_transpose_midend #( logic [TensorW-1:0] tm, tn; logic signed [WorkW-1:0] m, n, log2ne, ne, yt, nt, nxe, mpe; logic signed [WorkW-1:0] strb_c; // NE*E == StrbWidth (mode cancels) + logic signed [WorkW-1:0] e, me; // address-gen: E (=1<>> log2ne; // ceil(M/NE) - nt = (n + ne - 1) >>> log2ne; // ceil(N/NE) - nxe = n <<< mode; // N*E (E = 1<>> log2ne; // ceil(M/NE) + nt = (n + ne - 1) >>> log2ne; // ceil(N/NE) + nxe = n <<< mode; // N*E (E = 1<= 4) else - $fatal(1, "idma_transpose_midend requires NumDim >= 4 (got %0d)", NumDim); + initial assert (NumDim >= (AddrGenTranspose ? 32'd3 : 32'd4)) else + $fatal(1, "idma_transpose_midend: NumDim too small (got %0d)", NumDim); // mode 0..2 needs NE >= 1, i.e. log2(StrbWidth) >= 2 initial assert (Log2Strb >= 2) else $fatal(1, "idma_transpose_midend requires StrbWidth >= 4 (got %0d)", StrbWidth); diff --git a/systems/snitch/test/idma_inst64_base.sv b/systems/snitch/test/idma_inst64_base.sv index 5c9568e5..d68d3970 100644 --- a/systems/snitch/test/idma_inst64_base.sv +++ b/systems/snitch/test/idma_inst64_base.sv @@ -9,7 +9,8 @@ /// and one axi_sim_mem per channel. No Snitch cluster, no multi-head. module idma_inst64_base #( parameter int unsigned DMATracing = idma_inst64_tb_pkg::DMATracing, - parameter idma_pkg::compute_enable_t ComputeEnable = '0 + parameter idma_pkg::compute_enable_t ComputeEnable = '0, + parameter bit AddrGenTranspose = 1'b0 ); import idma_inst64_tb_pkg::*; import idma_inst64_snitch_pkg::*; @@ -56,6 +57,7 @@ module idma_inst64_base #( .NumChannels ( NumChannels ), .DMATracing ( DMATracing ), .ComputeEnable ( ComputeEnable ), + .AddrGenTranspose( AddrGenTranspose ), .axi_ar_chan_t ( axi_ar_chan_t ), .axi_aw_chan_t ( axi_aw_chan_t ), .axi_req_t ( axi_req_t ), diff --git a/systems/snitch/test/tb_idma_inst64_transpose.sv b/systems/snitch/test/tb_idma_inst64_transpose.sv index e5d8f46e..3624c73e 100644 --- a/systems/snitch/test/tb_idma_inst64_transpose.sv +++ b/systems/snitch/test/tb_idma_inst64_transpose.sv @@ -24,7 +24,7 @@ module tb_idma_inst64_transpose #( localparam addr_t DST = 64'h9000_0000; localparam addr_t CPY = 64'hA000_0000; - idma_inst64_base #(.ComputeEnable('{transpose: 1'b1})) harness(); + idma_inst64_base #(.ComputeEnable('{transpose: 1'b1}), .AddrGenTranspose(1'b1)) harness(); int unsigned errs = 0; @@ -46,30 +46,26 @@ module tb_idma_inst64_transpose #( input addr_t src, input addr_t dst); tf_id_t tid; longint unsigned c0, cyc, b0; - int unsigned yt = (mm + NE - 1)/NE; - int unsigned nt = (nn + NE - 1)/NE; - int unsigned mp = yt*NE; for (int unsigned idx = 0; idx < mm*nn; idx++) for (int unsigned b = 0; b < EB; b++) harness.mem_write_byte(src + idx*EB + b, fp(idx, b)); - for (int unsigned i = 0; i < nt*NE; i++) - for (int unsigned j = 0; j < mp*EB; j++) - harness.mem_write_byte(dst + i*mp*EB + j, 8'hCC); + // address-gen output is a contiguous N x M transpose (pitch M, no padding) + for (int unsigned k = 0; k < nn*mm*EB; k++) + harness.mem_write_byte(dst + k, 8'hCC); c0 = harness.drv_if.cycle_counter; b0 = burst_cnt; harness.drv_if.dma_transpose(src, dst, 12'(mm), 12'(nn), 2'(MODE), 3'd0, tid); harness.drv_if.dma_wait(tid, 0); harness.drv_if.dma_wait_idle(0); // ensure all writes retired before reading cyc = harness.drv_if.cycle_counter - c0; - $display(" transpose %0dx%0d (NE=%0d, %0dx%0d=%0d tiles, MP=%0d):", mm, nn, NE, yt, nt, - yt*nt, mp); - $display(" bursts=%0d (exp NE*YT*NT=%0d) cycles=%0d eff=%0d B/cyc (bus peak %0d)", - burst_cnt-b0, NE*yt*nt, cyc, (mm*nn*EB)/cyc, StrbWidth); + $display(" transpose %0dx%0d: bursts=%0d (exp M*N=%0d) cycles=%0d", mm, nn, + burst_cnt-b0, mm*nn, cyc); + // out_T[c][r] == in[r][c], dst contiguous N x M (pitch M) for (int unsigned c = 0; c < nn; c++) for (int unsigned r = 0; r < mm; r++) for (int unsigned b = 0; b < EB; b++) begin - automatic logic [7:0] got = harness.mem_read_byte(dst + (c*mp + r)*EB + b); + automatic logic [7:0] got = harness.mem_read_byte(dst + (c*mm + r)*EB + b); automatic logic [7:0] exp = harness.mem_read_byte(src + (r*nn + c)*EB + b); if (got !== exp) begin errs++; @@ -77,14 +73,6 @@ module tb_idma_inst64_transpose #( $display("[TP] data mismatch out_T[%0d][%0d].b%0d=%02h exp %02h", c, r, b, got, exp); end end - for (int unsigned i = 0; i < nt*NE; i++) - for (int unsigned j = 0; j < mp; j++) - if (i >= nn || j >= mm) - for (int unsigned b = 0; b < EB; b++) - if (harness.mem_read_byte(dst + (i*mp + j)*EB + b) !== 8'hCC) begin - errs++; - if (errs <= 12) $display("[TP] padding clobbered at [%0d][%0d].b%0d", i, j, b); - end endtask initial begin From 6f551504beeea24e7af0169881fc0086b3818a41 Mon Sep 17 00:00:00 2001 From: Daniel Keller Date: Tue, 23 Jun 2026 11:05:42 +0200 Subject: [PATCH 4/7] test: Exercise inst64 transpose over the OBI/TCDM port The harness gains an obi_sim_mem backdoor; the transpose TB now drives the real OBI/TCDM port instead of AXI-range addresses: OBI->OBI (transpose a tile within L1/TCDM, the Snitch DMA case), AXI->OBI (load an external matrix into TCDM transposed), back-to-back, no-leak OBI copy, and reject. PASS for int8/ fp16/fp32. Closes the end-to-end gap -- previously the inst64 TB only hit the AXI path; the OBI read+write ports are now covered through the frontend. --- systems/snitch/test/idma_inst64_base.sv | 10 +++ .../snitch/test/tb_idma_inst64_transpose.sv | 70 +++++++++++-------- 2 files changed, 52 insertions(+), 28 deletions(-) diff --git a/systems/snitch/test/idma_inst64_base.sv b/systems/snitch/test/idma_inst64_base.sv index d68d3970..4010c1a9 100644 --- a/systems/snitch/test/idma_inst64_base.sv +++ b/systems/snitch/test/idma_inst64_base.sv @@ -161,4 +161,14 @@ module idma_inst64_base #( else return 8'hXX; endfunction + // L1/TCDM (OBI) backdoor helpers (channel 0) + task automatic obi_write_byte(input addr_t addr, input byte data); + gen_mem_ch[0].i_obi_sim_mem.mem[addr] = data; + endtask + + function automatic logic [7:0] obi_read_byte(input addr_t addr); + if (gen_mem_ch[0].i_obi_sim_mem.mem.exists(addr)) return gen_mem_ch[0].i_obi_sim_mem.mem[addr]; + else return 8'hXX; + endfunction + endmodule diff --git a/systems/snitch/test/tb_idma_inst64_transpose.sv b/systems/snitch/test/tb_idma_inst64_transpose.sv index 3624c73e..89ae9e7f 100644 --- a/systems/snitch/test/tb_idma_inst64_transpose.sv +++ b/systems/snitch/test/tb_idma_inst64_transpose.sv @@ -20,9 +20,12 @@ module tb_idma_inst64_transpose #( localparam int unsigned NE = StrbWidth/EB; localparam int unsigned MODE = (EB==4) ? 2 : (EB==2) ? 1 : 0; - localparam addr_t SRC = 64'h8000_0000; - localparam addr_t DST = 64'h9000_0000; - localparam addr_t CPY = 64'hA000_0000; + // TCDM/OBI region (addr_map routes < 0x1000_0000 to the OBI/TCDM port); + // ASRC is an external matrix in the AXI (ToSoC) region for the AXI->OBI case. + localparam addr_t TSRC = 64'h0000_1000; + localparam addr_t TDST = 64'h0040_0000; + localparam addr_t CPY = 64'h0080_0000; + localparam addr_t ASRC = 64'h8000_0000; idma_inst64_base #(.ComputeEnable('{transpose: 1'b1}), .AddrGenTranspose(1'b1)) harness(); @@ -40,33 +43,42 @@ module tb_idma_inst64_transpose #( return 8'((idx >> (8*b)) & 32'hFF); endfunction - // Run one transpose of an mm x nn matrix at src -> dst (padded pitch), - // verify transposed data and that padding stays sentinel. + // memory backdoors selected by protocol: OBI (TCDM) vs AXI (ToSoC) + task automatic seed_byte(input bit obi, input addr_t a, input logic [7:0] d); + if (obi) harness.obi_write_byte(a, d); else harness.mem_write_byte(a, d); + endtask + function automatic logic [7:0] peek_byte(input bit obi, input addr_t a); + return obi ? harness.obi_read_byte(a) : harness.mem_read_byte(a); + endfunction + + // Run one transpose of an mm x nn matrix src -> dst (address-gen, contiguous + // N x M output). src_obi/dst_obi pick the TCDM(OBI) vs external(AXI) memory. task automatic do_transpose(input int unsigned mm, input int unsigned nn, - input addr_t src, input addr_t dst); + input addr_t src, input addr_t dst, + input bit src_obi, input bit dst_obi); tf_id_t tid; longint unsigned c0, cyc, b0; for (int unsigned idx = 0; idx < mm*nn; idx++) for (int unsigned b = 0; b < EB; b++) - harness.mem_write_byte(src + idx*EB + b, fp(idx, b)); + seed_byte(src_obi, src + idx*EB + b, fp(idx, b)); // address-gen output is a contiguous N x M transpose (pitch M, no padding) for (int unsigned k = 0; k < nn*mm*EB; k++) - harness.mem_write_byte(dst + k, 8'hCC); + seed_byte(dst_obi, dst + k, 8'hCC); c0 = harness.drv_if.cycle_counter; b0 = burst_cnt; harness.drv_if.dma_transpose(src, dst, 12'(mm), 12'(nn), 2'(MODE), 3'd0, tid); harness.drv_if.dma_wait(tid, 0); harness.drv_if.dma_wait_idle(0); // ensure all writes retired before reading cyc = harness.drv_if.cycle_counter - c0; - $display(" transpose %0dx%0d: bursts=%0d (exp M*N=%0d) cycles=%0d", mm, nn, - burst_cnt-b0, mm*nn, cyc); + $display(" transpose %0dx%0d %s->%s: bursts=%0d (exp M*N=%0d) cycles=%0d", mm, nn, + src_obi ? "OBI" : "AXI", dst_obi ? "OBI" : "AXI", burst_cnt-b0, mm*nn, cyc); // out_T[c][r] == in[r][c], dst contiguous N x M (pitch M) for (int unsigned c = 0; c < nn; c++) for (int unsigned r = 0; r < mm; r++) for (int unsigned b = 0; b < EB; b++) begin - automatic logic [7:0] got = harness.mem_read_byte(dst + (c*mm + r)*EB + b); - automatic logic [7:0] exp = harness.mem_read_byte(src + (r*nn + c)*EB + b); + automatic logic [7:0] got = peek_byte(dst_obi, dst + (c*mm + r)*EB + b); + automatic logic [7:0] exp = peek_byte(src_obi, src + (r*nn + c)*EB + b); if (got !== exp) begin errs++; if (errs <= 12) @@ -81,49 +93,51 @@ module tb_idma_inst64_transpose #( $display("=== inst64 transpose EB=%0d ===", EB); - // 1. the parameterized shape - do_transpose(M, N, SRC, DST); + // 1. OBI->OBI: transpose a tile within L1/TCDM (the Snitch DMA case) + do_transpose(M, N, TSRC, TDST, 1'b1, 1'b1); + + // 2. AXI->OBI: load an external matrix into TCDM transposed + do_transpose(M, N, ASRC, TDST + 64'h0010_0000, 1'b0, 1'b1); - // 2. back-to-back: a different (swapped) shape right after, to a fresh dst. - // Catches geometry/state leak between consecutive transposes. - do_transpose(N, M, SRC + 64'h0010_0000, DST + 64'h0010_0000); + // 3. back-to-back (swapped shape, fresh dst): catch geometry/state leak + do_transpose(N, M, TSRC + 64'h0010_0000, TDST + 64'h0020_0000, 1'b1, 1'b1); - // 3. cross-transfer compute leak: a plain copy after transposes must NOT - // inherit opt.compute (default-zeroed). Verify a 1:1 copy. + // 4. cross-transfer compute leak: a plain copy after transposes must NOT + // inherit opt.compute (default-zeroed). Verify a 1:1 OBI copy. begin automatic int unsigned len = 128; tf_id_t tid2; - for (int unsigned k = 0; k < len; k++) harness.mem_write_byte(SRC + k, 8'hE0 + k[4:0]); - for (int unsigned k = 0; k < len; k++) harness.mem_write_byte(CPY + k, 8'h00); - harness.drv_if.dma_set_source(SRC); + for (int unsigned k = 0; k < len; k++) harness.obi_write_byte(TSRC + k, 8'hE0 + k[4:0]); + for (int unsigned k = 0; k < len; k++) harness.obi_write_byte(CPY + k, 8'h00); + harness.drv_if.dma_set_source(TSRC); harness.drv_if.dma_set_dest(CPY); harness.drv_if.dma_start_copy(addr_t'(len), 2'b00, 3'd0, tid2); harness.drv_if.dma_wait(tid2, 0); harness.drv_if.dma_wait_idle(0); for (int unsigned k = 0; k < len; k++) - if (harness.mem_read_byte(CPY + k) !== harness.mem_read_byte(SRC + k)) begin + if (harness.obi_read_byte(CPY + k) !== harness.obi_read_byte(TSRC + k)) begin errs++; if (errs <= 12) $display("[TP] leak: post-transpose copy wrong at %0d", k); end end - // 4. malformed transpose requests: error response, nothing launched + // 5. malformed transpose requests: error response, nothing launched begin logic err; longint unsigned b_rej; b_rej = burst_cnt; - harness.drv_if.dma_transpose_err(SRC, DST, 12'd8, 12'd8, 2'd3, 3'd0, err); + harness.drv_if.dma_transpose_err(TSRC, TDST, 12'd8, 12'd8, 2'd3, 3'd0, err); if (!err) begin errs++; $display("[TP] reject fail: reserved mode 3"); end - harness.drv_if.dma_transpose_err(SRC, DST, 12'd0, 12'd8, 2'd0, 3'd0, err); + harness.drv_if.dma_transpose_err(TSRC, TDST, 12'd0, 12'd8, 2'd0, 3'd0, err); if (!err) begin errs++; $display("[TP] reject fail: M == 0"); end - harness.drv_if.dma_transpose_err(SRC, DST + 64'd1, 12'd8, 12'd8, 2'd0, 3'd0, err); + harness.drv_if.dma_transpose_err(TSRC, TDST + 64'd1, 12'd8, 12'd8, 2'd0, 3'd0, err); if (!err) begin errs++; $display("[TP] reject fail: unaligned dst"); end repeat (50) @(posedge harness.clk); if (burst_cnt != b_rej) begin errs++; $display("[TP] reject fail: rejected request launched bursts"); end // a valid transpose must still work after rejections - do_transpose(8, 8, SRC + 64'h0020_0000, DST + 64'h0020_0000); + do_transpose(8, 8, TSRC + 64'h0030_0000, TDST + 64'h0030_0000, 1'b1, 1'b1); end if (errs == 0) $display("[TP] PASS: transpose data + padding + back-to-back + no-leak OK"); From d71b8d1bdc3a755312e92c3a4022e758cf4abbec Mon Sep 17 00:00:00 2001 From: Daniel Keller Date: Tue, 23 Jun 2026 11:17:46 +0200 Subject: [PATCH 5/7] midend: Add bank-conflict skew to address-gen transpose A transposed write walks the dst with stride M*E; when M*E is an even number of bus words this hammers a single TCDM bank (1/B bandwidth on a B-bank L1). New BankSkew param (default off) pads the dst row pitch by one bus-word (NE elements) in that case, making the per-column word stride odd -> round-robins all banks on any power-of-2-bank TCDM, at <=1 word/row cost. Plumbed through idma_inst64_top; the harness/TB drive it and check the N x M' padded layout (padding columns stay sentinel). PASS skew-on (32x8 EB4 -> pitch 48) and skew-off (contiguous). Default off keeps the contiguous N x M output. --- src/frontend/inst64/idma_inst64_top.sv | 3 ++ src/midend/idma_transpose_midend.sv | 25 ++++++++----- systems/snitch/test/idma_inst64_base.sv | 4 ++- .../snitch/test/tb_idma_inst64_transpose.sv | 35 ++++++++++++++----- 4 files changed, 50 insertions(+), 17 deletions(-) diff --git a/src/frontend/inst64/idma_inst64_top.sv b/src/frontend/inst64/idma_inst64_top.sv index cdb851d7..f4696e31 100644 --- a/src/frontend/inst64/idma_inst64_top.sv +++ b/src/frontend/inst64/idma_inst64_top.sv @@ -27,6 +27,8 @@ module idma_inst64_top #( /// Transpose via address generation (no FF engine) — for backends without /// the engine, e.g. this multi-write OBI/TCDM variant parameter bit AddrGenTranspose = 1'b0, + /// Address-gen transpose: skew the dst pitch to avoid TCDM bank conflicts + parameter bit BankSkew = 1'b0, parameter type axi_ar_chan_t = logic, parameter type axi_aw_chan_t = logic, parameter type axi_req_t = logic, @@ -383,6 +385,7 @@ module idma_inst64_top #( idma_transpose_midend #( .NumDim ( NumDim ), .AddrGenTranspose ( AddrGenTranspose ), + .BankSkew ( BankSkew ), .StrbWidth ( StrbWidth ), .addr_t ( addr_t ), .idma_nd_req_t ( idma_nd_req_t ) diff --git a/src/midend/idma_transpose_midend.sv b/src/midend/idma_transpose_midend.sv index e2d726d2..4ebfc49a 100644 --- a/src/midend/idma_transpose_midend.sv +++ b/src/midend/idma_transpose_midend.sv @@ -14,6 +14,9 @@ module idma_transpose_midend #( parameter int unsigned NumDim = 32'd4, /// Address-gen mode: element-granular swapped-stride transpose, no engine parameter bit AddrGenTranspose = 1'b0, + /// Address-gen: pad the dst row pitch by one bus-word when needed so the + /// per-column word stride is odd (conflict-free on power-of-2-bank TCDM) + parameter bit BankSkew = 1'b0, /// Write data-path width in bytes (tile side NE = StrbWidth / element bytes) parameter int unsigned StrbWidth = 32'd64, /// Address type @@ -54,7 +57,7 @@ module idma_transpose_midend #( logic [TensorW-1:0] tm, tn; logic signed [WorkW-1:0] m, n, log2ne, ne, yt, nt, nxe, mpe; logic signed [WorkW-1:0] strb_c; // NE*E == StrbWidth (mode cancels) - logic signed [WorkW-1:0] e, me; // address-gen: E (=1< round-robin all banks. + ne = $signed(WorkW'(StrbWidth)) >>> mode; // NE = StrbWidth/E + pad = (BankSkew && (me[Log2Strb:0] == '0)) ? ne : '0; + me = (m + pad) <<< mode; // M'*E (padded pitch) nd_req_o.burst_req.opt.compute.enable = 1'b0; nd_req_o.burst_req.length = LenW'(e); - // d_req[0] = column walk (reps N): src +E, dst +M*E + // d_req[0] = column walk (reps N): src +E, dst +M'*E nd_req_o.d_req[0].reps = n[RepW-1:0]; nd_req_o.d_req[0].src_strides = addr_t'(e); nd_req_o.d_req[0].dst_strides = addr_t'(me); - // d_req[1] = row walk (reps M): src +E, dst +E - (N-1)*M*E (rewind) + // d_req[1] = row walk (reps M): src +E, dst +E - (N-1)*M'*E (rewind) nd_req_o.d_req[1].reps = m[RepW-1:0]; nd_req_o.d_req[1].src_strides = addr_t'(e); nd_req_o.d_req[1].dst_strides = addr_t'(e - (n - 1) * me); diff --git a/systems/snitch/test/idma_inst64_base.sv b/systems/snitch/test/idma_inst64_base.sv index 4010c1a9..0326e502 100644 --- a/systems/snitch/test/idma_inst64_base.sv +++ b/systems/snitch/test/idma_inst64_base.sv @@ -10,7 +10,8 @@ module idma_inst64_base #( parameter int unsigned DMATracing = idma_inst64_tb_pkg::DMATracing, parameter idma_pkg::compute_enable_t ComputeEnable = '0, - parameter bit AddrGenTranspose = 1'b0 + parameter bit AddrGenTranspose = 1'b0, + parameter bit BankSkew = 1'b0 ); import idma_inst64_tb_pkg::*; import idma_inst64_snitch_pkg::*; @@ -58,6 +59,7 @@ module idma_inst64_base #( .DMATracing ( DMATracing ), .ComputeEnable ( ComputeEnable ), .AddrGenTranspose( AddrGenTranspose ), + .BankSkew ( BankSkew ), .axi_ar_chan_t ( axi_ar_chan_t ), .axi_aw_chan_t ( axi_aw_chan_t ), .axi_req_t ( axi_req_t ), diff --git a/systems/snitch/test/tb_idma_inst64_transpose.sv b/systems/snitch/test/tb_idma_inst64_transpose.sv index 89ae9e7f..e6b04684 100644 --- a/systems/snitch/test/tb_idma_inst64_transpose.sv +++ b/systems/snitch/test/tb_idma_inst64_transpose.sv @@ -12,7 +12,8 @@ module tb_idma_inst64_transpose #( parameter int unsigned M = 40, // matrix rows (elements) parameter int unsigned N = 70, // matrix cols (elements) - parameter int unsigned EB = 4 // element size in bytes (1/2/4) + parameter int unsigned EB = 4, // element size in bytes (1/2/4) + parameter bit BankSkew = 1'b0 ); import idma_inst64_tb_pkg::*; @@ -27,7 +28,8 @@ module tb_idma_inst64_transpose #( localparam addr_t CPY = 64'h0080_0000; localparam addr_t ASRC = 64'h8000_0000; - idma_inst64_base #(.ComputeEnable('{transpose: 1'b1}), .AddrGenTranspose(1'b1)) harness(); + idma_inst64_base #(.ComputeEnable('{transpose: 1'b1}), .AddrGenTranspose(1'b1), + .BankSkew(BankSkew)) harness(); int unsigned errs = 0; @@ -43,6 +45,13 @@ module tb_idma_inst64_transpose #( return 8'((idx >> (8*b)) & 32'hFF); endfunction + // padded dst row pitch (matches idma_transpose_midend BankSkew rule): pad by + // one bus-word of elements when mm*EB is an even number of bus words + function automatic int unsigned skew_pitch(input int unsigned mm); + if (BankSkew && ((mm*EB) % (2*StrbWidth) == 0)) return mm + StrbWidth/EB; + else return mm; + endfunction + // memory backdoors selected by protocol: OBI (TCDM) vs AXI (ToSoC) task automatic seed_byte(input bit obi, input addr_t a, input logic [7:0] d); if (obi) harness.obi_write_byte(a, d); else harness.mem_write_byte(a, d); @@ -58,11 +67,13 @@ module tb_idma_inst64_transpose #( input bit src_obi, input bit dst_obi); tf_id_t tid; longint unsigned c0, cyc, b0; + int unsigned mp; + mp = skew_pitch(mm); // dst row pitch (>= mm when BankSkew pads it) for (int unsigned idx = 0; idx < mm*nn; idx++) for (int unsigned b = 0; b < EB; b++) seed_byte(src_obi, src + idx*EB + b, fp(idx, b)); - // address-gen output is a contiguous N x M transpose (pitch M, no padding) - for (int unsigned k = 0; k < nn*mm*EB; k++) + // dst is an N x mp transpose (mp == M unless bank-skew pads the pitch) + for (int unsigned k = 0; k < nn*mp*EB; k++) seed_byte(dst_obi, dst + k, 8'hCC); c0 = harness.drv_if.cycle_counter; b0 = burst_cnt; @@ -70,14 +81,14 @@ module tb_idma_inst64_transpose #( harness.drv_if.dma_wait(tid, 0); harness.drv_if.dma_wait_idle(0); // ensure all writes retired before reading cyc = harness.drv_if.cycle_counter - c0; - $display(" transpose %0dx%0d %s->%s: bursts=%0d (exp M*N=%0d) cycles=%0d", mm, nn, - src_obi ? "OBI" : "AXI", dst_obi ? "OBI" : "AXI", burst_cnt-b0, mm*nn, cyc); + $display(" transpose %0dx%0d %s->%s pitch=%0d: bursts=%0d (exp M*N=%0d) cycles=%0d", mm, nn, + src_obi ? "OBI" : "AXI", dst_obi ? "OBI" : "AXI", mp, burst_cnt-b0, mm*nn, cyc); - // out_T[c][r] == in[r][c], dst contiguous N x M (pitch M) + // out_T[c][r] == in[r][c] at dst row pitch mp for (int unsigned c = 0; c < nn; c++) for (int unsigned r = 0; r < mm; r++) for (int unsigned b = 0; b < EB; b++) begin - automatic logic [7:0] got = peek_byte(dst_obi, dst + (c*mm + r)*EB + b); + automatic logic [7:0] got = peek_byte(dst_obi, dst + (c*mp + r)*EB + b); automatic logic [7:0] exp = peek_byte(src_obi, src + (r*nn + c)*EB + b); if (got !== exp) begin errs++; @@ -85,6 +96,14 @@ module tb_idma_inst64_transpose #( $display("[TP] data mismatch out_T[%0d][%0d].b%0d=%02h exp %02h", c, r, b, got, exp); end end + // skew padding (columns r in [mm, mp)) must stay sentinel + for (int unsigned c = 0; c < nn; c++) + for (int unsigned r = mm; r < mp; r++) + for (int unsigned b = 0; b < EB; b++) + if (peek_byte(dst_obi, dst + (c*mp + r)*EB + b) !== 8'hCC) begin + errs++; + if (errs <= 12) $display("[TP] skew padding clobbered at [%0d][%0d].b%0d", c, r, b); + end endtask initial begin From 29ddc741cc49d08f29ac87b3449bda16d619d335 Mon Sep 17 00:00:00 2001 From: Daniel Keller Date: Tue, 23 Jun 2026 11:24:25 +0200 Subject: [PATCH 6/7] test: Internalize the inst64 transpose geometry sweep Per the iDMA TB convention, a self-checking TB drives its own stimulus in one elaboration. M/N/EB are runtime (the DMCPY carries them), so the transpose TB now loops a localparam geometry list (int8/fp16/fp32, square/rect/odd, incl. the bank-skew-triggering shapes) instead of taking M/N/EB as elaboration params swept from the Makefile. Consecutive cases also cover back-to-back leak. Only BankSkew stays structural: the make target runs one vsim per BankSkew config. Drops the external TP_SWEEP loop. PASS BankSkew off and on. --- systems/snitch/Makefile | 23 ++-- .../snitch/test/tb_idma_inst64_transpose.sv | 105 +++++++++--------- 2 files changed, 60 insertions(+), 68 deletions(-) diff --git a/systems/snitch/Makefile b/systems/snitch/Makefile index 351cd6bb..a2c415f5 100644 --- a/systems/snitch/Makefile +++ b/systems/snitch/Makefile @@ -18,26 +18,19 @@ BUILD := $(SNITCH_DIR)/build TARGETS := -t rtl -t split_rtl -t snitch_cluster -t idma_test -t test TOP ?= tb_idma_inst64_copy -# Transpose end-to-end sweep: shapes across all element sizes (int8/fp16/fp32), -# single/multi-tile, edge, exact-multiple (zero padding), and the NE=StrbWidth -# in-flight boundary. Each mode (EB=1/2/4) gets single-tile + aligned-multi-tile -# + edge-multi-tile coverage. -TP_SWEEP := 8,8,4 40,70,4 48,16,4 \ - 32,32,2 50,40,2 96,96,2 70,96,2 \ - 64,64,1 130,80,1 128,64,1 256,96,1 - -.PHONY: snitch_sim snitch_compile snitch_transpose_sweep snitch_clean +.PHONY: snitch_sim snitch_compile snitch_transpose snitch_clean snitch_sim: snitch_compile cd $(BUILD) && $(VSIM) -c $(TOP)_opt -do "run -all; quit" -snitch_transpose_sweep: $(BUILD)/compile_snitch.tcl - @for cfg in $(TP_SWEEP); do \ - m=$${cfg%%,*}; r=$${cfg#*,}; n=$${r%%,*}; e=$${r##*,}; \ +# Transpose end-to-end: the TB sweeps the geometry list (M/N/EB) internally; one +# run per structural config (BankSkew off/on). +snitch_transpose: $(BUILD)/compile_snitch.tcl + @for sk in 0 1; do \ cd $(BUILD) && $(VSIM) -c -do \ - "source compile_snitch.tcl; vopt +acc tb_idma_inst64_transpose -gM=$$m -gN=$$n -gEB=$$e -o tb_sw; quit" \ + "source compile_snitch.tcl; vopt +acc tb_idma_inst64_transpose -gBankSkew=$$sk -o tb_tp; quit" \ >/dev/null 2>&1; \ - printf '%-14s ' "$$m x $$n EB=$$e:"; \ - $(VSIM) -c tb_sw -do "run -all; quit" 2>&1 | grep -E '\[TP\] (PASS|FAIL)' | sed 's/# //'; \ + printf 'BankSkew=%s: ' $$sk; \ + $(VSIM) -c tb_tp -do "run -all; quit" 2>&1 | grep -E '\[TP\] (PASS|FAIL)' | sed 's/# //'; \ done snitch_compile: $(BUILD)/compile_snitch.tcl diff --git a/systems/snitch/test/tb_idma_inst64_transpose.sv b/systems/snitch/test/tb_idma_inst64_transpose.sv index e6b04684..c46fc7ee 100644 --- a/systems/snitch/test/tb_idma_inst64_transpose.sv +++ b/systems/snitch/test/tb_idma_inst64_transpose.sv @@ -4,22 +4,18 @@ // // Author: Daniel Keller -/// End-to-end on-the-fly transpose through the inst64 frontend: -/// accelerator bus -> DMCPY transpose decode -> opt.compute -> transpose_midend -/// -> nd_midend -> rw_axi+compute backend -> axi_sim_mem. Checks transposed -/// data, padding integrity, multi-tile geometry, back-to-back (geometry leak), -/// and cross-transfer compute leak. +/// End-to-end on-the-fly transpose through the inst64 frontend over the OBI/ +/// TCDM port (and AXI->OBI): DMCPY transpose decode -> opt.compute -> +/// idma_transpose_midend (address-gen) -> idma_nd_midend -> backend -> memory. +/// Sweeps a geometry list in one elaboration (one run per structural config, +/// i.e. per BankSkew). Checks transposed data, bank-skew padding, back-to-back +/// geometry leak (consecutive cases), cross-transfer compute leak, and reject. module tb_idma_inst64_transpose #( - parameter int unsigned M = 40, // matrix rows (elements) - parameter int unsigned N = 70, // matrix cols (elements) - parameter int unsigned EB = 4, // element size in bytes (1/2/4) - parameter bit BankSkew = 1'b0 + parameter bit BankSkew = 1'b0 ); import idma_inst64_tb_pkg::*; localparam int unsigned StrbWidth = AxiDataWidth/8; - localparam int unsigned NE = StrbWidth/EB; - localparam int unsigned MODE = (EB==4) ? 2 : (EB==2) ? 1 : 0; // TCDM/OBI region (addr_map routes < 0x1000_0000 to the OBI/TCDM port); // ASRC is an external matrix in the AXI (ToSoC) region for the AXI->OBI case. @@ -28,27 +24,34 @@ module tb_idma_inst64_transpose #( localparam addr_t CPY = 64'h0080_0000; localparam addr_t ASRC = 64'h8000_0000; + // Geometry cases (M, N, EB) swept in one elaboration: int8/fp16/fp32, square/ + // rectangular/odd; 32x8 EB4 and 64x4 EB2 trigger the BankSkew pitch pad. + localparam int unsigned NC = 8; + localparam int unsigned Cases [NC][3] = '{ + '{ 8, 8, 1}, '{ 6, 5, 1}, '{16, 16, 1}, '{ 5, 7, 2}, + '{10, 6, 2}, '{12, 8, 4}, '{32, 8, 4}, '{64, 4, 2} + }; + idma_inst64_base #(.ComputeEnable('{transpose: 1'b1}), .AddrGenTranspose(1'b1), .BankSkew(BankSkew)) harness(); int unsigned errs = 0; - // backend burst counter (proves the full NumDim=4 walk: NE*YT*NT bursts/tile-rows) + // backend burst counter (address-gen issues M*N one-element bursts) longint unsigned burst_cnt = 0; always @(posedge harness.clk) if (harness.i_dut.idma_req_valid[0] && harness.i_dut.idma_req_ready[0]) burst_cnt++; - // Unique per-element fingerprint: byte b of element idx encodes (idx>>8b). - // Distinguishes distinct source elements so a mis-permutation cannot hide - // behind a value collision (a plain byte ramp aliases mod 256). + // Unique per-element fingerprint: byte b of element idx encodes (idx>>8b), so + // a mis-permutation cannot hide behind a value collision. function automatic logic [7:0] fp(input int unsigned idx, input int unsigned b); return 8'((idx >> (8*b)) & 32'hFF); endfunction // padded dst row pitch (matches idma_transpose_midend BankSkew rule): pad by - // one bus-word of elements when mm*EB is an even number of bus words - function automatic int unsigned skew_pitch(input int unsigned mm); - if (BankSkew && ((mm*EB) % (2*StrbWidth) == 0)) return mm + StrbWidth/EB; + // one bus-word of elements when mm*eb is an even number of bus words + function automatic int unsigned skew_pitch(input int unsigned mm, input int unsigned eb); + if (BankSkew && ((mm*eb) % (2*StrbWidth) == 0)) return mm + StrbWidth/eb; else return mm; endfunction @@ -60,74 +63,72 @@ module tb_idma_inst64_transpose #( return obi ? harness.obi_read_byte(a) : harness.mem_read_byte(a); endfunction - // Run one transpose of an mm x nn matrix src -> dst (address-gen, contiguous - // N x M output). src_obi/dst_obi pick the TCDM(OBI) vs external(AXI) memory. - task automatic do_transpose(input int unsigned mm, input int unsigned nn, + // Run one mm x nn (eb-byte element) transpose src -> dst via address-gen. + // src_obi/dst_obi pick the TCDM(OBI) vs external(AXI) memory. + task automatic do_transpose(input int unsigned mm, input int unsigned nn, input int unsigned eb, input addr_t src, input addr_t dst, input bit src_obi, input bit dst_obi); tf_id_t tid; longint unsigned c0, cyc, b0; - int unsigned mp; - mp = skew_pitch(mm); // dst row pitch (>= mm when BankSkew pads it) + int unsigned mp, mode; + mp = skew_pitch(mm, eb); + mode = (eb == 4) ? 2 : (eb == 2) ? 1 : 0; for (int unsigned idx = 0; idx < mm*nn; idx++) - for (int unsigned b = 0; b < EB; b++) - seed_byte(src_obi, src + idx*EB + b, fp(idx, b)); + for (int unsigned b = 0; b < eb; b++) + seed_byte(src_obi, src + idx*eb + b, fp(idx, b)); // dst is an N x mp transpose (mp == M unless bank-skew pads the pitch) - for (int unsigned k = 0; k < nn*mp*EB; k++) + for (int unsigned k = 0; k < nn*mp*eb; k++) seed_byte(dst_obi, dst + k, 8'hCC); c0 = harness.drv_if.cycle_counter; b0 = burst_cnt; - harness.drv_if.dma_transpose(src, dst, 12'(mm), 12'(nn), 2'(MODE), 3'd0, tid); + harness.drv_if.dma_transpose(src, dst, 12'(mm), 12'(nn), 2'(mode), 3'd0, tid); harness.drv_if.dma_wait(tid, 0); harness.drv_if.dma_wait_idle(0); // ensure all writes retired before reading cyc = harness.drv_if.cycle_counter - c0; - $display(" transpose %0dx%0d %s->%s pitch=%0d: bursts=%0d (exp M*N=%0d) cycles=%0d", mm, nn, + $display(" %0dx%0d EB=%0d %s->%s pitch=%0d: bursts=%0d (exp %0d) cycles=%0d", mm, nn, eb, src_obi ? "OBI" : "AXI", dst_obi ? "OBI" : "AXI", mp, burst_cnt-b0, mm*nn, cyc); // out_T[c][r] == in[r][c] at dst row pitch mp for (int unsigned c = 0; c < nn; c++) for (int unsigned r = 0; r < mm; r++) - for (int unsigned b = 0; b < EB; b++) begin - automatic logic [7:0] got = peek_byte(dst_obi, dst + (c*mp + r)*EB + b); - automatic logic [7:0] exp = peek_byte(src_obi, src + (r*nn + c)*EB + b); + for (int unsigned b = 0; b < eb; b++) begin + automatic logic [7:0] got = peek_byte(dst_obi, dst + (c*mp + r)*eb + b); + automatic logic [7:0] exp = peek_byte(src_obi, src + (r*nn + c)*eb + b); if (got !== exp) begin errs++; if (errs <= 12) - $display("[TP] data mismatch out_T[%0d][%0d].b%0d=%02h exp %02h", c, r, b, got, exp); + $display("[TP] data mismatch %0dx%0d out_T[%0d][%0d].b%0d=%02h exp %02h", + mm, nn, c, r, b, got, exp); end end - // skew padding (columns r in [mm, mp)) must stay sentinel + // bank-skew padding (columns r in [mm, mp)) must stay sentinel for (int unsigned c = 0; c < nn; c++) for (int unsigned r = mm; r < mp; r++) - for (int unsigned b = 0; b < EB; b++) - if (peek_byte(dst_obi, dst + (c*mp + r)*EB + b) !== 8'hCC) begin + for (int unsigned b = 0; b < eb; b++) + if (peek_byte(dst_obi, dst + (c*mp + r)*eb + b) !== 8'hCC) begin errs++; - if (errs <= 12) $display("[TP] skew padding clobbered at [%0d][%0d].b%0d", c, r, b); + if (errs <= 12) $display("[TP] skew padding clobbered %0dx%0d [%0d][%0d]", mm, nn, c, r); end endtask initial begin @(posedge harness.rst_n); repeat (10) @(posedge harness.clk); + $display("=== inst64 transpose (BankSkew=%0d, StrbWidth=%0d) ===", BankSkew, StrbWidth); - $display("=== inst64 transpose EB=%0d ===", EB); - - // 1. OBI->OBI: transpose a tile within L1/TCDM (the Snitch DMA case) - do_transpose(M, N, TSRC, TDST, 1'b1, 1'b1); - - // 2. AXI->OBI: load an external matrix into TCDM transposed - do_transpose(M, N, ASRC, TDST + 64'h0010_0000, 1'b0, 1'b1); + // geometry sweep, OBI->OBI (consecutive cases also cover back-to-back leak) + for (int unsigned k = 0; k < NC; k++) + do_transpose(Cases[k][0], Cases[k][1], Cases[k][2], TSRC, TDST, 1'b1, 1'b1); - // 3. back-to-back (swapped shape, fresh dst): catch geometry/state leak - do_transpose(N, M, TSRC + 64'h0010_0000, TDST + 64'h0020_0000, 1'b1, 1'b1); + // AXI->OBI: load an external matrix into TCDM transposed + do_transpose(16, 12, 4, ASRC, TDST, 1'b0, 1'b1); - // 4. cross-transfer compute leak: a plain copy after transposes must NOT - // inherit opt.compute (default-zeroed). Verify a 1:1 OBI copy. + // cross-transfer compute leak: a plain OBI copy must NOT inherit opt.compute begin automatic int unsigned len = 128; tf_id_t tid2; for (int unsigned k = 0; k < len; k++) harness.obi_write_byte(TSRC + k, 8'hE0 + k[4:0]); - for (int unsigned k = 0; k < len; k++) harness.obi_write_byte(CPY + k, 8'h00); + for (int unsigned k = 0; k < len; k++) harness.obi_write_byte(CPY + k, 8'h00); harness.drv_if.dma_set_source(TSRC); harness.drv_if.dma_set_dest(CPY); harness.drv_if.dma_start_copy(addr_t'(len), 2'b00, 3'd0, tid2); @@ -140,7 +141,7 @@ module tb_idma_inst64_transpose #( end end - // 5. malformed transpose requests: error response, nothing launched + // malformed transpose requests: error response, nothing launched begin logic err; longint unsigned b_rej; @@ -155,17 +156,15 @@ module tb_idma_inst64_transpose #( if (burst_cnt != b_rej) begin errs++; $display("[TP] reject fail: rejected request launched bursts"); end - // a valid transpose must still work after rejections - do_transpose(8, 8, TSRC + 64'h0030_0000, TDST + 64'h0030_0000, 1'b1, 1'b1); end - if (errs == 0) $display("[TP] PASS: transpose data + padding + back-to-back + no-leak OK"); + if (errs == 0) $display("[TP] PASS: %0d-case sweep + AXI->OBI + no-leak + reject OK", NC); else $fatal(1, "[TP] FAIL: %0d mismatches", errs); $finish; end initial begin - repeat (400000) @(posedge harness.clk); + repeat (1_000_000) @(posedge harness.clk); $fatal(1, "[TIMEOUT] inst64 transpose"); end From 76c0de1079d2783f24aac966ea0e5b0e8d8bc3bc Mon Sep 17 00:00:00 2001 From: Daniel Keller Date: Tue, 23 Jun 2026 11:43:04 +0200 Subject: [PATCH 7/7] ci: Fix file-header format for lint-author The snitch harness files used the singular '// Author:' header (and the Makefile a trailing '#' line); lint-authors requires a blank line after SPDX, plural '// Authors:', a '// - Name ' bullet, and a blank line after the author block. Normalize all six. --- systems/snitch/Makefile | 7 ++++--- systems/snitch/test/idma_inst64_base.sv | 5 +++-- systems/snitch/test/idma_inst64_drv_if.sv | 3 +++ systems/snitch/test/idma_inst64_tb_pkg.sv | 3 +++ systems/snitch/test/tb_idma_inst64_copy.sv | 5 +++-- systems/snitch/test/tb_idma_inst64_transpose.sv | 5 +++-- 6 files changed, 19 insertions(+), 9 deletions(-) diff --git a/systems/snitch/Makefile b/systems/snitch/Makefile index a2c415f5..59f539e5 100644 --- a/systems/snitch/Makefile +++ b/systems/snitch/Makefile @@ -1,9 +1,10 @@ # Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE for details. # SPDX-License-Identifier: SHL-0.51 -# -# Author: Daniel Keller -# + +# Authors: +# - Daniel Keller + # Standalone build + sim flow for the Snitch inst64 integration. Elaborates the # upstream single-head idma_inst64_top + the recycled harness against iDMA's own # deps (axi, common_cells, common_verification). Uses the split_rtl compute- diff --git a/systems/snitch/test/idma_inst64_base.sv b/systems/snitch/test/idma_inst64_base.sv index 0326e502..586bb2e1 100644 --- a/systems/snitch/test/idma_inst64_base.sv +++ b/systems/snitch/test/idma_inst64_base.sv @@ -1,8 +1,9 @@ // Copyright 2026 ETH Zurich and University of Bologna. // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 -// -// Author: Daniel Keller + +// Authors: +// - Daniel Keller /// Base harness for the standalone single-head inst64 frontend. /// Clock/reset, the accelerator-bus driver, the upstream idma_inst64_top DUT, diff --git a/systems/snitch/test/idma_inst64_drv_if.sv b/systems/snitch/test/idma_inst64_drv_if.sv index d4382a85..8a7466a3 100644 --- a/systems/snitch/test/idma_inst64_drv_if.sv +++ b/systems/snitch/test/idma_inst64_drv_if.sv @@ -2,6 +2,9 @@ // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 +// Authors: +// - Daniel Keller + // Recycled from the vidma inst64 verification harness // (idma_alu_vec/test/frontend/idma_inst64_drv_if.sv). Faithful copy of the // copy/status tasks; the vidma-only DMOPC/multi-head/immediate tasks are diff --git a/systems/snitch/test/idma_inst64_tb_pkg.sv b/systems/snitch/test/idma_inst64_tb_pkg.sv index b02377d7..b668e9dd 100644 --- a/systems/snitch/test/idma_inst64_tb_pkg.sv +++ b/systems/snitch/test/idma_inst64_tb_pkg.sv @@ -2,6 +2,9 @@ // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 +// Authors: +// - Daniel Keller + // Recycled from the vidma inst64 verification harness // (idma_alu_vec/test/frontend/idma_inst64_tb_pkg.sv). Kept faithful. The // transpose imposes no NumAxInFlight>=NE constraint (the engine self-buffers a diff --git a/systems/snitch/test/tb_idma_inst64_copy.sv b/systems/snitch/test/tb_idma_inst64_copy.sv index 40108f4d..dbbe6bc9 100644 --- a/systems/snitch/test/tb_idma_inst64_copy.sv +++ b/systems/snitch/test/tb_idma_inst64_copy.sv @@ -1,8 +1,9 @@ // Copyright 2026 ETH Zurich and University of Bologna. // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 -// -// Author: Daniel Keller + +// Authors: +// - Daniel Keller /// Stage-1 plain-copy regression for the standalone single-head inst64 frontend. /// Drives DMSRC/DMDST/DMCPY over the accelerator bus and verifies the copy in diff --git a/systems/snitch/test/tb_idma_inst64_transpose.sv b/systems/snitch/test/tb_idma_inst64_transpose.sv index c46fc7ee..6ef2252d 100644 --- a/systems/snitch/test/tb_idma_inst64_transpose.sv +++ b/systems/snitch/test/tb_idma_inst64_transpose.sv @@ -1,8 +1,9 @@ // Copyright 2026 ETH Zurich and University of Bologna. // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 -// -// Author: Daniel Keller + +// Authors: +// - Daniel Keller /// End-to-end on-the-fly transpose through the inst64 frontend over the OBI/ /// TCDM port (and AXI->OBI): DMCPY transpose decode -> opt.compute ->