diff --git a/Bender.yml b/Bender.yml index 2049f69e..7a5baae6 100644 --- a/Bender.yml +++ b/Bender.yml @@ -97,6 +97,18 @@ sources: # Level 2 - src/frontend/desc64/idma_desc64_top.sv + # Snitch inst64 standalone harness; needs snitch_cluster for idma_inst64_top + opcode pkg + - target: all(snitch_cluster, idma_test) + files: + # Level 0 + - systems/snitch/test/idma_inst64_tb_pkg.sv + # Level 1 + - systems/snitch/test/idma_inst64_drv_if.sv + - systems/snitch/test/idma_inst64_base.sv + # Level 2 + - systems/snitch/test/tb_idma_inst64_copy.sv + - systems/snitch/test/tb_idma_inst64_transpose.sv + # Synthesis wrappers - target: synth files: diff --git a/src/frontend/inst64/idma_inst64_top.sv b/src/frontend/inst64/idma_inst64_top.sv index 4f93b55f..f4696e31 100644 --- a/src/frontend/inst64/idma_inst64_top.sv +++ b/src/frontend/inst64/idma_inst64_top.sv @@ -22,6 +22,13 @@ module idma_inst64_top #( parameter int unsigned NumChannels = 32'd1, parameter bit TCDMAliasEnable = 1'b0, parameter int unsigned DMATracing = 32'd0, + /// Compile-time on-the-fly compute feature enables (e.g. transpose) + parameter idma_pkg::compute_enable_t ComputeEnable = '0, + /// Transpose via address generation (no FF engine) — for backends without + /// the engine, e.g. this multi-write OBI/TCDM variant + parameter bit AddrGenTranspose = 1'b0, + /// Address-gen transpose: skew the dst pitch to avoid TCDM bank conflicts + parameter bit BankSkew = 1'b0, parameter type axi_ar_chan_t = logic, parameter type axi_aw_chan_t = logic, parameter type axi_req_t = logic, @@ -70,7 +77,7 @@ module idma_inst64_top #( localparam int unsigned TfIdWidth = 32'd32; localparam int unsigned TFLenWidth = AxiAddrWidth; localparam int unsigned RepWidth = 32'd32; - localparam int unsigned NumDim = 32'd2; + localparam int unsigned NumDim = ComputeEnable.transpose ? 32'd4 : 32'd2; localparam int unsigned BufferDepth = 32'd3; localparam int unsigned NumRules = 32'd5; @@ -84,7 +91,8 @@ module idma_inst64_top #( localparam type id_t = logic[AxiIdWidth-1:0]; localparam type tf_len_t = logic[TFLenWidth-1:0]; localparam type offset_t = logic[OffsetWidth-1:0]; - localparam type strides_t = logic[RepWidth-1:0]; + // strides must match addr_t: signed transpose deltas would not sign-extend if narrower + localparam type strides_t = addr_t; localparam type reps_t = logic[RepWidth-1:0]; localparam type tf_id_t = logic[TfIdWidth-1:0]; @@ -178,6 +186,7 @@ module idma_inst64_top #( logic [1:0] idma_fe_status; logic [2:0] idma_fe_sel_chan; logic idma_fe_twod; + logic idma_fe_tp_reject; // busy signals idma_pkg::idma_busy_t [NumChannels-1:0] idma_busy; @@ -348,7 +357,7 @@ module idma_inst64_top #( .idma_req_t ( idma_req_t ), .idma_rsp_t ( idma_rsp_t ), .idma_nd_req_t ( idma_nd_req_t ), - .RepWidths ( RepWidth ) + .RepWidths ( {NumDim{RepWidth}} ) ) i_idma_nd_midend ( .clk_i, .rst_ni, @@ -367,6 +376,33 @@ module idma_inst64_top #( .busy_o ( idma_nd_busy [c] ) ); + // FIFO output, before transpose expansion + idma_nd_req_t fifo_nd_req; + logic fifo_nd_valid, fifo_nd_ready; + + // expand transpose requests into the tiled ND walk + if (ComputeEnable.transpose) begin : gen_transpose + idma_transpose_midend #( + .NumDim ( NumDim ), + .AddrGenTranspose ( AddrGenTranspose ), + .BankSkew ( BankSkew ), + .StrbWidth ( StrbWidth ), + .addr_t ( addr_t ), + .idma_nd_req_t ( idma_nd_req_t ) + ) i_idma_transpose_midend ( + .nd_req_i ( fifo_nd_req ), + .valid_i ( fifo_nd_valid ), + .ready_o ( fifo_nd_ready ), + .nd_req_o ( idma_nd_req [c] ), + .valid_o ( idma_nd_req_valid [c] ), + .ready_i ( idma_nd_req_ready [c] ) + ); + end else begin : gen_no_transpose + assign idma_nd_req [c] = fifo_nd_req; + assign idma_nd_req_valid [c] = fifo_nd_valid; + assign fifo_nd_ready = idma_nd_req_ready [c]; + end + stream_fifo_optimal_wrap #( .Depth ( DMAReqFifoDepth ), .type_t ( idma_nd_req_t ), @@ -380,9 +416,9 @@ module idma_inst64_top #( .data_i ( idma_fe_req ), .valid_i ( idma_fe_req_valid [c] ), .ready_o ( idma_fe_req_ready [c] ), - .data_o ( idma_nd_req [c] ), - .valid_o ( idma_nd_req_valid [c] ), - .ready_i ( idma_nd_req_ready [c] ) + .data_o ( fifo_nd_req ), + .valid_o ( fifo_nd_valid ), + .ready_i ( fifo_nd_ready ) ); end @@ -519,10 +555,12 @@ module idma_inst64_top #( idma_fe_req_d.burst_req.opt.beo.src_reduce_len = 1'b0; idma_fe_req_d.burst_req.opt.beo.dst_reduce_len = 1'b0; idma_fe_req_d.burst_req.opt.last = 1'b0; + idma_fe_req_d.burst_req.opt.compute = '0; // frontend config idma_fe_cfg = '0; idma_fe_status = '0; + idma_fe_tp_reject = 1'b0; idma_fe_sel_chan = '0; // default handshaking @@ -573,6 +611,28 @@ module idma_inst64_top #( idma_inst64_snitch_pkg::DMCPY : begin idma_fe_cfg = acc_req_i.data_argb[1:0]; idma_fe_sel_chan = acc_req_i.data_argb[4:2]; + // transpose request (register form only): argb spare bits + // carry {enable, mode, tensor_m, tensor_n} + if (ComputeEnable.transpose && acc_req_i.data_argb[5]) begin + idma_fe_req_d.burst_req.opt.compute.enable = 1'b1; + idma_fe_req_d.burst_req.opt.compute.op = + idma_pkg::COMPUTE_TRANSPOSE; + idma_fe_req_d.burst_req.opt.compute.params.transpose.mode = + acc_req_i.data_argb[7:6]; + idma_fe_req_d.burst_req.opt.compute.params.transpose.tensor_m = + acc_req_i.data_argb[19:8]; + idma_fe_req_d.burst_req.opt.compute.params.transpose.tensor_n = + acc_req_i.data_argb[31:20]; + end + // reject malformed transpose requests: no hardware, + // reserved mode, zero dims, unaligned dst + if (acc_req_i.data_argb[5]) begin + idma_fe_tp_reject = !ComputeEnable.transpose + | (acc_req_i.data_argb[7:6] == 2'd3) + | (acc_req_i.data_argb[19:8] == '0) + | (acc_req_i.data_argb[31:20] == '0) + | (idma_fe_req_d.burst_req.dst_addr[OffsetWidth-1:0] != '0); + end end default:; endcase @@ -588,7 +648,15 @@ module idma_inst64_top #( // 3. wait for twod transfer to be accepted (ready) // 4. send acc response (pvalid) // 5. acknowledge acc request (qready) - if (acc_res_ready) begin + // DMCPY launch; transpose requests reject malformed configs + if (idma_fe_tp_reject) begin + // error response; the transfer is not launched + if (acc_res_ready) begin + acc_res.id = acc_req_i.id; + acc_res_valid = 1'b1; + acc_req_ready_o = 1'b1; + end + end else if (acc_res_ready) begin idma_fe_req_valid[idma_fe_sel_chan] = 1'b1; if (idma_fe_req_ready[idma_fe_sel_chan]) begin acc_res.id = acc_req_i.id; @@ -750,6 +818,12 @@ module idma_inst64_top #( if (!idma_fe_twod) begin idma_fe_req.d_req[0].reps = 'd1; end + // keep higher dims inert for plain requests (the transpose expander overwrites them) + for (int d = 1; d <= NumDim-2; d++) begin + idma_fe_req.d_req[d].reps = 'd1; + idma_fe_req.d_req[d].src_strides = '0; + idma_fe_req.d_req[d].dst_strides = '0; + end end //-------------------------------------- @@ -763,6 +837,16 @@ module idma_inst64_top #( //-------------------------------------- // only activate tracer if requested `ifndef SYNTHESIS + initial assert (idma_pkg::TransposeDimWidth == 32'd12) else + $fatal(1, "DMCPY argb transpose packing requires TransposeDimWidth == 12"); +`ifndef VERILATOR + // capability cross-check against the generated backend's baked compute set + // (engine route only; address-gen needs no compute-enabled backend) + if (ComputeEnable.transpose && !AddrGenTranspose) begin : gen_compute_check + initial assert (gen_backend[0].i_idma_backend_rw_axi.ComputeEnable.transpose) else + $fatal(1, "ComputeEnable.transpose requires a compute-enabled backend variant"); + end +`endif if (DMATracing) begin : gen_tracer for (genvar c = 0; c < NumChannels; c++) begin : gen_channels // derive the name of the trace file from the hart and channel IDs diff --git a/src/midend/idma_transpose_midend.sv b/src/midend/idma_transpose_midend.sv index b6f66325..4ebfc49a 100644 --- a/src/midend/idma_transpose_midend.sv +++ b/src/midend/idma_transpose_midend.sv @@ -5,12 +5,18 @@ // Authors: // - Daniel Keller -/// Transpose geometry expander: expands an opt.compute=TRANSPOSE request into a -/// NumDim=4 tiled ND walk for the generic idma_nd_midend. Non-transpose passes -/// through. Combinational, quasi-static per request. +/// Transpose geometry expander for the generic idma_nd_midend. Two modes: +/// engine (NumDim=4 tiled walk feeding the FF transpose engine) and address-gen +/// (element-granular swapped-stride walk, no engine, for backends without the +/// engine e.g. multi-write OBI/TCDM). Non-transpose passes through. module idma_transpose_midend #( - /// Number of ND dimensions (must be >= 4 to express the tiled walk) + /// Number of ND dimensions (engine walk needs >= 4; address-gen needs >= 3) parameter int unsigned NumDim = 32'd4, + /// Address-gen mode: element-granular swapped-stride transpose, no engine + parameter bit AddrGenTranspose = 1'b0, + /// Address-gen: pad the dst row pitch by one bus-word when needed so the + /// per-column word stride is odd (conflict-free on power-of-2-bank TCDM) + parameter bit BankSkew = 1'b0, /// Write data-path width in bytes (tile side NE = StrbWidth / element bytes) parameter int unsigned StrbWidth = 32'd64, /// Address type @@ -51,6 +57,7 @@ module idma_transpose_midend #( logic [TensorW-1:0] tm, tn; logic signed [WorkW-1:0] m, n, log2ne, ne, yt, nt, nxe, mpe; logic signed [WorkW-1:0] strb_c; // NE*E == StrbWidth (mode cancels) + logic signed [WorkW-1:0] e, me, pad; // address-gen: E (=1<>> log2ne; // ceil(M/NE) - nt = (n + ne - 1) >>> log2ne; // ceil(N/NE) - nxe = n <<< mode; // N*E (E = 1< round-robin all banks. + ne = $signed(WorkW'(StrbWidth)) >>> mode; // NE = StrbWidth/E + pad = (BankSkew && (me[Log2Strb:0] == '0)) ? ne : '0; + me = (m + pad) <<< mode; // M'*E (padded pitch) + nd_req_o.burst_req.opt.compute.enable = 1'b0; + nd_req_o.burst_req.length = LenW'(e); + // d_req[0] = column walk (reps N): src +E, dst +M'*E + nd_req_o.d_req[0].reps = n[RepW-1:0]; + nd_req_o.d_req[0].src_strides = addr_t'(e); + nd_req_o.d_req[0].dst_strides = addr_t'(me); + // d_req[1] = row walk (reps M): src +E, dst +E - (N-1)*M'*E (rewind) + nd_req_o.d_req[1].reps = m[RepW-1:0]; + nd_req_o.d_req[1].src_strides = addr_t'(e); + nd_req_o.d_req[1].dst_strides = addr_t'(e - (n - 1) * me); + for (int unsigned d = 2; d < NumDim-1; d++) begin + nd_req_o.d_req[d].reps = RepW'(1); + nd_req_o.d_req[d].src_strides = '0; + nd_req_o.d_req[d].dst_strides = '0; + end + end else begin + log2ne = $signed(WorkW'(Log2Strb)) - $signed({{(WorkW-ModeW){1'b0}}, mode}); + ne = $signed(WorkW'(1)) <<< log2ne; // tile side (elements) + yt = (m + ne - 1) >>> log2ne; // ceil(M/NE) + nt = (n + ne - 1) >>> log2ne; // ceil(N/NE) + nxe = n <<< mode; // N*E (E = 1<= 4) else - $fatal(1, "idma_transpose_midend requires NumDim >= 4 (got %0d)", NumDim); + initial assert (NumDim >= (AddrGenTranspose ? 32'd3 : 32'd4)) else + $fatal(1, "idma_transpose_midend: NumDim too small (got %0d)", NumDim); // mode 0..2 needs NE >= 1, i.e. log2(StrbWidth) >= 2 initial assert (Log2Strb >= 2) else $fatal(1, "idma_transpose_midend requires StrbWidth >= 4 (got %0d)", StrbWidth); diff --git a/systems/snitch/.gitignore b/systems/snitch/.gitignore new file mode 100644 index 00000000..180edf75 --- /dev/null +++ b/systems/snitch/.gitignore @@ -0,0 +1,5 @@ +build/ +*.so +modelsim.ini +transcript +work/ diff --git a/systems/snitch/Makefile b/systems/snitch/Makefile new file mode 100644 index 00000000..59f539e5 --- /dev/null +++ b/systems/snitch/Makefile @@ -0,0 +1,50 @@ +# Copyright 2026 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 + +# Authors: +# - Daniel Keller + +# Standalone build + sim flow for the Snitch inst64 integration. Elaborates the +# upstream single-head idma_inst64_top + the recycled harness against iDMA's own +# deps (axi, common_cells, common_verification). Uses the split_rtl compute- +# enabled rw_axi backend (the bundled idma_generated.sv predates opt.compute). + +SNITCH_DIR := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))) +IDMA_ROOT := $(realpath $(SNITCH_DIR)/../..) +BENDER ?= bender +VSIM ?= questa-2023.4 vsim + +BUILD := $(SNITCH_DIR)/build +TARGETS := -t rtl -t split_rtl -t snitch_cluster -t idma_test -t test +TOP ?= tb_idma_inst64_copy + +.PHONY: snitch_sim snitch_compile snitch_transpose snitch_clean +snitch_sim: snitch_compile + cd $(BUILD) && $(VSIM) -c $(TOP)_opt -do "run -all; quit" + +# Transpose end-to-end: the TB sweeps the geometry list (M/N/EB) internally; one +# run per structural config (BankSkew off/on). +snitch_transpose: $(BUILD)/compile_snitch.tcl + @for sk in 0 1; do \ + cd $(BUILD) && $(VSIM) -c -do \ + "source compile_snitch.tcl; vopt +acc tb_idma_inst64_transpose -gBankSkew=$$sk -o tb_tp; quit" \ + >/dev/null 2>&1; \ + printf 'BankSkew=%s: ' $$sk; \ + $(VSIM) -c tb_tp -do "run -all; quit" 2>&1 | grep -E '\[TP\] (PASS|FAIL)' | sed 's/# //'; \ + done + +snitch_compile: $(BUILD)/compile_snitch.tcl + cd $(BUILD) && $(VSIM) -c -do \ + "source compile_snitch.tcl; vopt +acc $(TOP) -o $(TOP)_opt; quit" + +$(BUILD)/compile_snitch.tcl: | $(BUILD) + $(MAKE) -C $(IDMA_ROOT) idma_hw_all + cd $(IDMA_ROOT) && $(BENDER) script vsim $(TARGETS) \ + --vlog-arg="-svinputport=compat" > $@ + +$(BUILD): + mkdir -p $(BUILD) + +snitch_clean: + rm -rf $(BUILD) diff --git a/systems/snitch/README.md b/systems/snitch/README.md new file mode 100644 index 00000000..4fd6a5bb --- /dev/null +++ b/systems/snitch/README.md @@ -0,0 +1,65 @@ +# Snitch (inst64) iDMA integration + +Standalone host for the **inst64** ISA-coupled frontend (`idma_inst64_top`) — the +tightly-coupled Snitch DMA interface. iDMA already owns `idma_inst64_top`; this +directory adds a cluster-free verification harness and (Stage 2) the on-the-fly +transpose wired through the accelerator interface. + +## Recycled, not reinvented + +The harness is **recycled from the vidma fork's inst64 verification interface** +(`idma_alu_vec/test/frontend/`), adapted only as the clean upstream single-head +`idma_inst64_top` requires: + +| File | Provenance | +|------|------------| +| `test/idma_inst64_tb_pkg.sv` | faithful copy (8-line delta: `AxiDataWidth`/`NumAxInFlight` sizing + header) | +| `test/idma_inst64_drv_if.sv` | faithful copy; dropped the 4 vidma-only tasks (`DMOPC`, multi-head copy, immediate `DMCPYI`) to match upstream | +| `test/idma_inst64_base.sv` | adapted: single-head (`axi_req_o[NumChannels]`, no `NumHeads`/`enable_single_head_mode`) | +| `test/tb_idma_inst64_copy.sv` | Stage-1 plain-copy regression | + +The accelerator interface (the 4-field `acc_req`/`acc_res` bus + the `DM*` +instruction BFM) is exactly the vidma one — no reinvention. + +## Why split_rtl + +`idma_inst64_top` is gated behind the `snitch_cluster` Bender target. The build +uses `-t split_rtl` (per-variant RTL) because the **bundled `idma_generated.sv` +predates the typed `opt.compute` struct** (it still references the old flat +`opt.transpose_en` fields) and won't elaborate against the current package. The +split_rtl `idma_backend_rw_axi` is compute-enabled (`IDMA_VIDMA_IDS=rw_axi`). + +## Standalone simulation + +```bash +make -C systems/snitch snitch_sim # plain-copy regression (Stage 1) +make -C systems/snitch snitch_sim TOP=tb_idma_inst64_transpose # transpose (Stage 2) +``` + +Drives `DMSRC`/`DMDST`/`DMCPY` (+ `DMSTR`/`DMREP` for 2D) over the accelerator +bus and verifies the AXI sim memory. Requires `questa-2023.4`. + +## Status + +- **Stage 1 (done):** plain copy through the single-head frontend — 3 transfers pass. +- **Stage 2 (done):** multi-tile on-the-fly transpose, end-to-end. A transpose is + programmed with the spare `DMCPY` argb bits (`[5]`=enable, `[7:6]`=mode, + `[19:8]`=M, `[31:20]`=N), populating the typed per-transfer `opt.compute`; the + dedicated `src/midend/idma_transpose_midend.sv` expands `(M,N,mode)` into the + `NumDim=4` tiled walk; the unmodified `idma_nd_midend` walks it into the + compute-enabled `rw_axi` backend. Gated by `idma_inst64_top`'s + `ComputeEnable.transpose` (off by default, so other snitch_cluster consumers + are unaffected). Verified across int8/fp16/fp32, single/multi-tile, edge tiles, + padding integrity, back-to-back, and cross-transfer no-leak: + `make -C systems/snitch snitch_transpose_sweep`. Full functionality at any + `NumAxInFlight` (down to the backend min) — the compute backend internally + buffers a tile of write descriptors (`ComputeFifoDepth = StrbWidth`), so there + is no `NumAxInFlight >= NE` constraint. + +## Transpose memory contract + +A transposed transfer reads the source up to the tile-padded bounds +(`ceil(M/NE)*NE` rows of `N` elements, the last row tile reading past row `M-1`) +and writes the full padded destination extent (`ceil(N/NE)*NE` rows at pitch +`MP = ceil(M/NE)*NE`; padding is strobe-masked but addressed). Both regions must +be mapped, side-effect-free memory. diff --git a/systems/snitch/test/idma_inst64_base.sv b/systems/snitch/test/idma_inst64_base.sv new file mode 100644 index 00000000..586bb2e1 --- /dev/null +++ b/systems/snitch/test/idma_inst64_base.sv @@ -0,0 +1,177 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Authors: +// - Daniel Keller + +/// Base harness for the standalone single-head inst64 frontend. +/// Clock/reset, the accelerator-bus driver, the upstream idma_inst64_top DUT, +/// and one axi_sim_mem per channel. No Snitch cluster, no multi-head. +module idma_inst64_base #( + parameter int unsigned DMATracing = idma_inst64_tb_pkg::DMATracing, + parameter idma_pkg::compute_enable_t ComputeEnable = '0, + parameter bit AddrGenTranspose = 1'b0, + parameter bit BankSkew = 1'b0 +); + import idma_inst64_tb_pkg::*; + import idma_inst64_snitch_pkg::*; + + logic clk; + logic rst_n; + + clk_rst_gen #( + .ClkPeriod ( Period ), + .RstClkCycles( ResetCycles ) + ) i_clock_reset_generator ( + .clk_o ( clk ), + .rst_no( rst_n ) + ); + + idma_inst64_drv_if drv_if ( + .clk ( clk ), + .rst_n( rst_n ) + ); + + axi_req_t [NumChannels-1:0] axi_req; + axi_resp_t [NumChannels-1:0] axi_res; + obi_req_t [NumChannels-1:0] obi_req; + obi_res_t [NumChannels-1:0] obi_res; + dma_events_t [NumChannels-1:0] events; + logic [NumChannels-1:0] busy; + + // route the test's AXI range via the default idx (ToSoC=AXI); the single rule + // maps an unused low TCDM range to OBI so the OBI port stays idle + addr_rule_t addr_map; + assign addr_map = '{ + idx: idma_pkg::TCDMDMA, + start_addr: 64'h0000_0000, + end_addr: 64'h1000_0000 + }; + + idma_inst64_top #( + .AxiDataWidth ( AxiDataWidth ), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiUserWidth ( AxiUserWidth ), + .AxiIdWidth ( AxiIdWidth ), + .NumAxInFlight ( NumAxInFlight ), + .DMAReqFifoDepth ( DMAReqFifoDepth ), + .NumChannels ( NumChannels ), + .DMATracing ( DMATracing ), + .ComputeEnable ( ComputeEnable ), + .AddrGenTranspose( AddrGenTranspose ), + .BankSkew ( BankSkew ), + .axi_ar_chan_t ( axi_ar_chan_t ), + .axi_aw_chan_t ( axi_aw_chan_t ), + .axi_req_t ( axi_req_t ), + .axi_res_t ( axi_resp_t ), + .init_req_chan_t ( init_req_chan_t ), + .init_rsp_chan_t ( init_rsp_chan_t ), + .init_req_t ( init_req_t ), + .init_rsp_t ( init_rsp_t ), + .obi_a_chan_t ( obi_a_chan_t ), + .obi_r_chan_t ( obi_r_chan_t ), + .obi_req_t ( obi_req_t ), + .obi_res_t ( obi_res_t ), + .acc_req_t ( acc_req_t ), + .acc_res_t ( acc_res_t ), + .dma_events_t ( dma_events_t ), + .addr_rule_t ( addr_rule_t ) + ) i_dut ( + .clk_i ( clk ), + .rst_ni ( rst_n ), + .testmode_i ( 1'b0 ), + .axi_req_o ( axi_req ), + .axi_res_i ( axi_res ), + .obi_req_o ( obi_req ), + .obi_res_i ( obi_res ), + .busy_o ( busy ), + .acc_req_i ( drv_if.acc_req ), + .acc_req_valid_i ( drv_if.acc_req_valid ), + .acc_req_ready_o ( drv_if.acc_req_ready ), + .acc_res_o ( drv_if.acc_res ), + .acc_res_valid_o ( drv_if.acc_res_valid ), + .acc_res_ready_i ( drv_if.acc_res_ready ), + .hart_id_i ( 32'h0 ), + .events_o ( events ), + .addr_map_i ( addr_map ) + ); + + for (genvar c = 0; c < NumChannels; c++) begin : gen_mem_ch + axi_sim_mem #( + .AddrWidth ( AxiAddrWidth ), + .DataWidth ( AxiDataWidth ), + .IdWidth ( AxiIdWidth ), + .UserWidth ( AxiUserWidth ), + .axi_req_t ( axi_req_t ), + .axi_rsp_t ( axi_resp_t ), + .WarnUninitialized ( 1'b1 ), + .ClearErrOnAccess ( 1'b1 ), + .ApplDelay ( ApplDelay ), + .AcqDelay ( AcqDelay ) + ) i_axi_sim_mem ( + .clk_i ( clk ), + .rst_ni ( rst_n ), + .axi_req_i ( axi_req[c] ), + .axi_rsp_o ( axi_res[c] ), + .mon_w_valid_o ( ), + .mon_w_addr_o ( ), + .mon_w_data_o ( ), + .mon_w_id_o ( ), + .mon_w_user_o ( ), + .mon_w_beat_count_o( ), + .mon_w_last_o ( ), + .mon_r_valid_o ( ), + .mon_r_addr_o ( ), + .mon_r_data_o ( ), + .mon_r_id_o ( ), + .mon_r_user_o ( ), + .mon_r_beat_count_o( ), + .mon_r_last_o ( ) + ); + + // L1/TCDM model: connected but idle for the AXI-routed copy/transpose tests + obi_sim_mem #( + .ObiCfg ( ObiCfg ), + .obi_req_t ( obi_req_t ), + .obi_rsp_t ( obi_res_t ), + .obi_r_chan_t ( obi_r_chan_t ), + .WarnUninitialized ( 1'b0 ), + .ClearErrOnAccess ( 1'b1 ), + .ApplDelay ( ApplDelay ), + .AcqDelay ( AcqDelay ) + ) i_obi_sim_mem ( + .clk_i ( clk ), + .rst_ni ( rst_n ), + .obi_req_i ( obi_req[c] ), + .obi_rsp_o ( obi_res[c] ), + .mon_valid_o ( ), + .mon_we_o ( ), + .mon_addr_o ( ), + .mon_wdata_o ( ), + .mon_be_o ( ), + .mon_id_o ( ) + ); + end + + // Memory helpers (channel 0) + task automatic mem_write_byte(input addr_t addr, input byte data); + gen_mem_ch[0].i_axi_sim_mem.mem[addr] = data; + endtask + + function automatic logic [7:0] mem_read_byte(input addr_t addr); + if (gen_mem_ch[0].i_axi_sim_mem.mem.exists(addr)) return gen_mem_ch[0].i_axi_sim_mem.mem[addr]; + else return 8'hXX; + endfunction + + // L1/TCDM (OBI) backdoor helpers (channel 0) + task automatic obi_write_byte(input addr_t addr, input byte data); + gen_mem_ch[0].i_obi_sim_mem.mem[addr] = data; + endtask + + function automatic logic [7:0] obi_read_byte(input addr_t addr); + if (gen_mem_ch[0].i_obi_sim_mem.mem.exists(addr)) return gen_mem_ch[0].i_obi_sim_mem.mem[addr]; + else return 8'hXX; + endfunction + +endmodule diff --git a/systems/snitch/test/idma_inst64_drv_if.sv b/systems/snitch/test/idma_inst64_drv_if.sv new file mode 100644 index 00000000..8a7466a3 --- /dev/null +++ b/systems/snitch/test/idma_inst64_drv_if.sv @@ -0,0 +1,194 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Authors: +// - Daniel Keller + +// Recycled from the vidma inst64 verification harness +// (idma_alu_vec/test/frontend/idma_inst64_drv_if.sv). Faithful copy of the +// copy/status tasks; the vidma-only DMOPC/multi-head/immediate tasks are +// dropped to match the clean single-head upstream idma_inst64_top. +// +// One correctness fix vs the source: the handshake drops acc_req_valid the +// cycle the request is accepted (sampling ready at AcqDelay) instead of holding +// it one extra cycle. The source held valid past grant, which double-issues the +// request to a still-ready frontend FIFO — harmless for idempotent copies but +// corrupts non-idempotent transfers (transpose). + +interface idma_inst64_drv_if ( + input logic clk, + input logic rst_n +); + import idma_inst64_tb_pkg::*; + import idma_inst64_snitch_pkg::*; + + // Accelerator Interface Signals + acc_req_t acc_req; + logic acc_req_valid; + logic acc_req_ready; + + acc_res_t acc_res; + logic acc_res_valid; + logic acc_res_ready; + + // Internal State for BFM + logic [31:0] req_id_counter; + + // Performance Counters + longint unsigned dma_start_cycle; + longint unsigned dma_end_cycle; + longint unsigned dma_cycles; + longint unsigned cycle_counter; + + always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) cycle_counter <= 0; + else cycle_counter <= cycle_counter + 1; + end + + // Initialization + initial begin + acc_req_valid = 1'b0; + acc_res_ready = 1'b1; + acc_req = '0; + req_id_counter = '0; + dma_start_cycle = 0; + dma_end_cycle = 0; + dma_cycles = 0; + end + + // Drive one accelerator instruction; valid is asserted in the apply region + // and dropped the cycle the request is accepted (ready sampled at AcqDelay). + task automatic drive(input logic [31:0] op, input logic [63:0] arga, input logic [63:0] argb); + @(posedge clk); + #(ApplDelay); + acc_req.id = req_id_counter++; + acc_req.data_op = op; + acc_req.data_arga = arga; + acc_req.data_argb = argb; + acc_req_valid = 1'b1; + do begin + @(posedge clk); + #(AcqDelay); + end while (!acc_req_ready); + acc_req_valid = 1'b0; + endtask + + //-------------------------------------- + // C-like API for DMA Programming + //-------------------------------------- + + task automatic dma_set_source(input addr_t addr); + drive(DMSRC, addr[31:0], {{(64-AxiAddrWidth){1'b0}}, addr[AxiAddrWidth-1:32]}); + endtask + + task automatic dma_set_dest(input addr_t addr); + drive(DMDST, addr[31:0], {{(64-AxiAddrWidth){1'b0}}, addr[AxiAddrWidth-1:32]}); + endtask + + task automatic dma_set_strides(input logic [31:0] src_stride, input logic [31:0] dst_stride); + drive(DMSTR, src_stride, dst_stride); + endtask + + task automatic dma_set_reps(input logic [31:0] reps); + drive(DMREP, reps, '0); + endtask + + // Launch a copy. cfg[1] = 2D enable; channel selects the AXI manager. + // Reads back the transfer id from the response. + task automatic dma_start_copy( + input addr_t length, + input logic [1:0] cfg, + input logic [2:0] channel, + output tf_id_t transfer_id + ); + drive(DMCPY, length, {59'b0, channel, cfg}); + while (!acc_res_valid) @(posedge clk); + transfer_id = acc_res.data[31:0]; + endtask + + // Launch a transpose. Encodes {enable, mode, M, N} into the spare DMCPY argb + // bits (argb[1:0]=cfg, [4:2]=channel, [5]=transpose, [7:6]=mode, + // [19:8]=tensor_m, [31:20]=tensor_n). Length is derived by the midend. + task automatic dma_transpose( + input addr_t src, + input addr_t dst, + input logic [11:0] tensor_m, + input logic [11:0] tensor_n, + input logic [1:0] mode, + input logic [2:0] channel, + output tf_id_t transfer_id + ); + logic [63:0] argb; + dma_set_source(src); + dma_set_dest(dst); + argb = '0; + argb[4:2] = channel; + argb[5] = 1'b1; + argb[7:6] = mode; + argb[19:8] = tensor_m; + argb[31:20] = tensor_n; + drive(DMCPY, '0, argb); + while (!acc_res_valid) @(posedge clk); + transfer_id = acc_res.data[31:0]; + endtask + + // issue a transpose DMCPY and return the response error bit (negative tests) + task automatic dma_transpose_err( + input addr_t src, + input addr_t dst, + input logic [11:0] tensor_m, + input logic [11:0] tensor_n, + input logic [1:0] mode, + input logic [2:0] channel, + output logic error + ); + logic [63:0] argb; + dma_set_source(src); + dma_set_dest(dst); + argb = '0; + argb[4:2] = channel; + argb[5] = 1'b1; + argb[7:6] = mode; + argb[19:8] = tensor_m; + argb[31:20] = tensor_n; + drive(DMCPY, '0, argb); + while (!acc_res_valid) @(posedge clk); + error = acc_res.error; + endtask + + task automatic dma_poll_status( + input logic [1:0] status_idx, + input logic [2:0] channel, + output logic [63:0] status_value + ); + drive(DMSTAT, '0, {59'b0, channel, status_idx}); + while (!acc_res_valid) @(posedge clk); + status_value = acc_res.data; + endtask + + task automatic dma_wait(input tf_id_t transfer_id, input logic [2:0] channel); + logic [63:0] completed_id; + $display("[%0t] dma_wait(ID=%0d, chan=%0d) - waiting...", $time, transfer_id, channel); + forever begin + dma_poll_status(2'b00, channel, completed_id); + if (completed_id >= transfer_id) begin + dma_end_cycle = cycle_counter; + dma_cycles = dma_end_cycle - dma_start_cycle; + break; + end + repeat(10) @(posedge clk); + end + endtask + + task automatic dma_wait_idle(input logic [2:0] channel); + logic [63:0] busy_status; + $display("[%0t] dma_wait_idle(chan=%0d) - waiting...", $time, channel); + forever begin + dma_poll_status(2'b10, channel, busy_status); + if (busy_status[0] == 1'b0) break; + repeat(5) @(posedge clk); + end + endtask + +endinterface diff --git a/systems/snitch/test/idma_inst64_tb_pkg.sv b/systems/snitch/test/idma_inst64_tb_pkg.sv new file mode 100644 index 00000000..b668e9dd --- /dev/null +++ b/systems/snitch/test/idma_inst64_tb_pkg.sv @@ -0,0 +1,166 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Authors: +// - Daniel Keller + +// Recycled from the vidma inst64 verification harness +// (idma_alu_vec/test/frontend/idma_inst64_tb_pkg.sv). Kept faithful. The +// transpose imposes no NumAxInFlight>=NE constraint (the engine self-buffers a +// tile); verified down to NumAxInFlight=3 at NE=64. + +`include "axi/typedef.svh" +`include "obi/typedef.svh" + +package idma_inst64_tb_pkg; + + localparam int unsigned AxiDataWidth = 512; + localparam int unsigned AxiAddrWidth = 64; + localparam int unsigned AxiUserWidth = 1; + localparam int unsigned AxiIdWidth = 3; + localparam int unsigned NumAxInFlight = 3; // default + localparam int unsigned DMAReqFifoDepth = 3; + localparam int unsigned NumChannels = 1; + localparam int unsigned NumHeads = 1; + localparam int unsigned DMATracing = 0; + localparam int unsigned Seed = 1337; + + localparam time Period = 10ns; + localparam time ApplDelay = Period / 4; + localparam time AcqDelay = Period * 3 / 4; + localparam integer ResetCycles = 10; + + // Type definitions + typedef logic [AxiAddrWidth-1:0] addr_t; + typedef logic [AxiIdWidth-1:0] axi_id_t; + typedef logic [31:0] data_t; + typedef logic [31:0] tf_id_t; + + // AXI Types + typedef logic [AxiAddrWidth-1:0] axi_addr_t; + typedef logic [AxiDataWidth-1:0] axi_data_t; + typedef logic [AxiDataWidth/8-1:0] axi_strb_t; + typedef logic [AxiUserWidth-1:0] axi_user_t; + + `AXI_TYPEDEF_AW_CHAN_T(axi_aw_chan_t, axi_addr_t, axi_id_t, axi_user_t) + `AXI_TYPEDEF_W_CHAN_T(axi_w_chan_t, axi_data_t, axi_strb_t, axi_user_t) + `AXI_TYPEDEF_B_CHAN_T(axi_b_chan_t, axi_id_t, axi_user_t) + `AXI_TYPEDEF_AR_CHAN_T(axi_ar_chan_t, axi_addr_t, axi_id_t, axi_user_t) + `AXI_TYPEDEF_R_CHAN_T(axi_r_chan_t, axi_data_t, axi_id_t, axi_user_t) + `AXI_TYPEDEF_REQ_T(axi_req_t, axi_aw_chan_t, axi_w_chan_t, axi_ar_chan_t) + `AXI_TYPEDEF_RESP_T(axi_resp_t, axi_b_chan_t, axi_r_chan_t) + + // OBI L1/TCDM types (DataWidth=AxiDataWidth, AddrWidth=AxiAddrWidth, IdWidth=AxiIdWidth) + typedef logic [AxiDataWidth/8-1:0] obi_strb_t; + typedef logic [AxiIdWidth-1:0] obi_id_t; + `OBI_TYPEDEF_MINIMAL_A_OPTIONAL(obi_a_optional_t) + `OBI_TYPEDEF_MINIMAL_R_OPTIONAL(obi_r_optional_t) + `OBI_TYPEDEF_TYPE_A_CHAN_T(obi_a_chan_t, axi_addr_t, axi_data_t, obi_strb_t, obi_id_t, obi_a_optional_t) + `OBI_TYPEDEF_TYPE_R_CHAN_T(obi_r_chan_t, axi_data_t, obi_id_t, obi_r_optional_t) + `OBI_TYPEDEF_REQ_T(obi_req_t, obi_a_chan_t) + `OBI_TYPEDEF_RSP_T(obi_res_t, obi_r_chan_t) + + localparam obi_pkg::obi_cfg_t ObiCfg = '{ + UseRReady: 1'b1, + CombGnt: 1'b0, + AddrWidth: AxiAddrWidth, + DataWidth: AxiDataWidth, + IdWidth: AxiIdWidth, + Integrity: 1'b0, + BeFull: 1'b1, + OptionalCfg: obi_pkg::ObiMinimalOptionalConfig + }; + + // INIT meta-channel types (mirror src/db/idma_init.yml) + typedef struct packed { + logic [AxiAddrWidth-1:0] cfg; + logic [AxiDataWidth-1:0] term; + logic [AxiDataWidth/8-1:0] strb; + logic [AxiIdWidth-1:0] id; + } init_req_chan_t; + + typedef struct packed { + init_req_chan_t req_chan; + logic req_valid; + logic rsp_ready; + } init_req_t; + + typedef struct packed { + logic [AxiDataWidth-1:0] init; + } init_rsp_chan_t; + + typedef struct packed { + init_rsp_chan_t rsp_chan; + logic rsp_valid; + logic req_ready; + } init_rsp_t; + + // address-decode rule type (DUT default) + typedef axi_pkg::xbar_rule_64_t addr_rule_t; + + // Accelerator request/response types (simplified Snitch accelerator interface) + typedef struct packed { + logic [31:0] id; + logic [31:0] data_op; + logic [63:0] data_arga; + logic [63:0] data_argb; + } acc_req_t; + + typedef struct packed { + logic [31:0] id; + logic [63:0] data; + logic error; + } acc_res_t; + + // DMA events (simplified) + typedef struct packed { + // aw + logic aw_valid; + logic aw_ready; + logic aw_done; + logic aw_stall; + axi_pkg::len_t aw_len; + axi_pkg::size_t aw_size; + // ar + logic ar_valid; + logic ar_ready; + logic ar_done; + logic ar_stall; + axi_pkg::len_t ar_len; + axi_pkg::size_t ar_size; + // r + logic r_valid; + logic r_ready; + logic r_done; + logic r_bw; + logic r_stall; + // w + logic w_valid; + logic w_ready; + logic w_done; + logic w_stall; + logic [31:0] num_bytes_written; + // b + logic b_valid; + logic b_ready; + logic b_done; + // busy + logic dma_busy; + } dma_events_t; + + // Golden reference for validation + typedef struct { + addr_t src_addr; + addr_t dst_addr; + addr_t length; + logic [5:0] alu_opcode; + logic [31:0] src_strides; + logic [31:0] dst_strides; + logic [31:0] reps; + logic twod; + int unsigned channel; + tf_id_t expected_id; + } transfer_t; + +endpackage diff --git a/systems/snitch/test/tb_idma_inst64_copy.sv b/systems/snitch/test/tb_idma_inst64_copy.sv new file mode 100644 index 00000000..dbbe6bc9 --- /dev/null +++ b/systems/snitch/test/tb_idma_inst64_copy.sv @@ -0,0 +1,57 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Authors: +// - Daniel Keller + +/// Stage-1 plain-copy regression for the standalone single-head inst64 frontend. +/// Drives DMSRC/DMDST/DMCPY over the accelerator bus and verifies the copy in +/// the AXI sim memory. No compute. +module tb_idma_inst64_copy; + import idma_inst64_tb_pkg::*; + + idma_inst64_base #(.DMATracing(0)) harness(); + + localparam int unsigned TimeoutCycles = 20000; + int unsigned errors = 0; + + task automatic run_copy(input addr_t src, input addr_t dst, input int unsigned len, + input byte start); + tf_id_t tid; + for (int i = 0; i < len; i++) harness.mem_write_byte(src + i, start + i[7:0]); + for (int i = 0; i < len; i++) harness.mem_write_byte(dst + i, 8'h00); + harness.drv_if.dma_set_source(src); + harness.drv_if.dma_set_dest(dst); + harness.drv_if.dma_start_copy(addr_t'(len), 2'b00, 3'd0, tid); + harness.drv_if.dma_wait(tid, 0); + for (int i = 0; i < len; i++) begin + automatic logic [7:0] exp = start + i[7:0]; + automatic logic [7:0] got = harness.mem_read_byte(dst + i); + if (got !== exp) begin + if (errors < 10) $error("[COPY] mismatch at %0d: exp 0x%02x got 0x%02x", i, exp, got); + errors++; + end + end + endtask + + initial begin + @(posedge harness.rst_n); + repeat (10) @(posedge harness.clk); + + $display("=== inst64 plain-copy regression ==="); + run_copy(64'h8000_0000, 64'h9000_0000, 256, 8'hA0); // word-multiple + run_copy(64'h8001_0000, 64'h9001_0000, 4055, 8'h10); // large, non-aligned length + run_copy(64'h8002_0000, 64'h9002_0000, 7, 8'h30); // tiny, sub-beat + + if (errors == 0) $display("[SV] inst64 copy: SUCCESS (3 transfers)"); + else $fatal(1, "[SV] inst64 copy: FAIL (%0d errors)", errors); + $finish; + end + + initial begin + repeat (TimeoutCycles) @(posedge harness.clk); + $fatal(1, "[TIMEOUT] inst64 copy exceeded %0d cycles", TimeoutCycles); + end + +endmodule diff --git a/systems/snitch/test/tb_idma_inst64_transpose.sv b/systems/snitch/test/tb_idma_inst64_transpose.sv new file mode 100644 index 00000000..6ef2252d --- /dev/null +++ b/systems/snitch/test/tb_idma_inst64_transpose.sv @@ -0,0 +1,172 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Authors: +// - Daniel Keller + +/// End-to-end on-the-fly transpose through the inst64 frontend over the OBI/ +/// TCDM port (and AXI->OBI): DMCPY transpose decode -> opt.compute -> +/// idma_transpose_midend (address-gen) -> idma_nd_midend -> backend -> memory. +/// Sweeps a geometry list in one elaboration (one run per structural config, +/// i.e. per BankSkew). Checks transposed data, bank-skew padding, back-to-back +/// geometry leak (consecutive cases), cross-transfer compute leak, and reject. +module tb_idma_inst64_transpose #( + parameter bit BankSkew = 1'b0 +); + import idma_inst64_tb_pkg::*; + + localparam int unsigned StrbWidth = AxiDataWidth/8; + + // TCDM/OBI region (addr_map routes < 0x1000_0000 to the OBI/TCDM port); + // ASRC is an external matrix in the AXI (ToSoC) region for the AXI->OBI case. + localparam addr_t TSRC = 64'h0000_1000; + localparam addr_t TDST = 64'h0040_0000; + localparam addr_t CPY = 64'h0080_0000; + localparam addr_t ASRC = 64'h8000_0000; + + // Geometry cases (M, N, EB) swept in one elaboration: int8/fp16/fp32, square/ + // rectangular/odd; 32x8 EB4 and 64x4 EB2 trigger the BankSkew pitch pad. + localparam int unsigned NC = 8; + localparam int unsigned Cases [NC][3] = '{ + '{ 8, 8, 1}, '{ 6, 5, 1}, '{16, 16, 1}, '{ 5, 7, 2}, + '{10, 6, 2}, '{12, 8, 4}, '{32, 8, 4}, '{64, 4, 2} + }; + + idma_inst64_base #(.ComputeEnable('{transpose: 1'b1}), .AddrGenTranspose(1'b1), + .BankSkew(BankSkew)) harness(); + + int unsigned errs = 0; + + // backend burst counter (address-gen issues M*N one-element bursts) + longint unsigned burst_cnt = 0; + always @(posedge harness.clk) + if (harness.i_dut.idma_req_valid[0] && harness.i_dut.idma_req_ready[0]) burst_cnt++; + + // Unique per-element fingerprint: byte b of element idx encodes (idx>>8b), so + // a mis-permutation cannot hide behind a value collision. + function automatic logic [7:0] fp(input int unsigned idx, input int unsigned b); + return 8'((idx >> (8*b)) & 32'hFF); + endfunction + + // padded dst row pitch (matches idma_transpose_midend BankSkew rule): pad by + // one bus-word of elements when mm*eb is an even number of bus words + function automatic int unsigned skew_pitch(input int unsigned mm, input int unsigned eb); + if (BankSkew && ((mm*eb) % (2*StrbWidth) == 0)) return mm + StrbWidth/eb; + else return mm; + endfunction + + // memory backdoors selected by protocol: OBI (TCDM) vs AXI (ToSoC) + task automatic seed_byte(input bit obi, input addr_t a, input logic [7:0] d); + if (obi) harness.obi_write_byte(a, d); else harness.mem_write_byte(a, d); + endtask + function automatic logic [7:0] peek_byte(input bit obi, input addr_t a); + return obi ? harness.obi_read_byte(a) : harness.mem_read_byte(a); + endfunction + + // Run one mm x nn (eb-byte element) transpose src -> dst via address-gen. + // src_obi/dst_obi pick the TCDM(OBI) vs external(AXI) memory. + task automatic do_transpose(input int unsigned mm, input int unsigned nn, input int unsigned eb, + input addr_t src, input addr_t dst, + input bit src_obi, input bit dst_obi); + tf_id_t tid; + longint unsigned c0, cyc, b0; + int unsigned mp, mode; + mp = skew_pitch(mm, eb); + mode = (eb == 4) ? 2 : (eb == 2) ? 1 : 0; + for (int unsigned idx = 0; idx < mm*nn; idx++) + for (int unsigned b = 0; b < eb; b++) + seed_byte(src_obi, src + idx*eb + b, fp(idx, b)); + // dst is an N x mp transpose (mp == M unless bank-skew pads the pitch) + for (int unsigned k = 0; k < nn*mp*eb; k++) + seed_byte(dst_obi, dst + k, 8'hCC); + + c0 = harness.drv_if.cycle_counter; b0 = burst_cnt; + harness.drv_if.dma_transpose(src, dst, 12'(mm), 12'(nn), 2'(mode), 3'd0, tid); + harness.drv_if.dma_wait(tid, 0); + harness.drv_if.dma_wait_idle(0); // ensure all writes retired before reading + cyc = harness.drv_if.cycle_counter - c0; + $display(" %0dx%0d EB=%0d %s->%s pitch=%0d: bursts=%0d (exp %0d) cycles=%0d", mm, nn, eb, + src_obi ? "OBI" : "AXI", dst_obi ? "OBI" : "AXI", mp, burst_cnt-b0, mm*nn, cyc); + + // out_T[c][r] == in[r][c] at dst row pitch mp + for (int unsigned c = 0; c < nn; c++) + for (int unsigned r = 0; r < mm; r++) + for (int unsigned b = 0; b < eb; b++) begin + automatic logic [7:0] got = peek_byte(dst_obi, dst + (c*mp + r)*eb + b); + automatic logic [7:0] exp = peek_byte(src_obi, src + (r*nn + c)*eb + b); + if (got !== exp) begin + errs++; + if (errs <= 12) + $display("[TP] data mismatch %0dx%0d out_T[%0d][%0d].b%0d=%02h exp %02h", + mm, nn, c, r, b, got, exp); + end + end + // bank-skew padding (columns r in [mm, mp)) must stay sentinel + for (int unsigned c = 0; c < nn; c++) + for (int unsigned r = mm; r < mp; r++) + for (int unsigned b = 0; b < eb; b++) + if (peek_byte(dst_obi, dst + (c*mp + r)*eb + b) !== 8'hCC) begin + errs++; + if (errs <= 12) $display("[TP] skew padding clobbered %0dx%0d [%0d][%0d]", mm, nn, c, r); + end + endtask + + initial begin + @(posedge harness.rst_n); + repeat (10) @(posedge harness.clk); + $display("=== inst64 transpose (BankSkew=%0d, StrbWidth=%0d) ===", BankSkew, StrbWidth); + + // geometry sweep, OBI->OBI (consecutive cases also cover back-to-back leak) + for (int unsigned k = 0; k < NC; k++) + do_transpose(Cases[k][0], Cases[k][1], Cases[k][2], TSRC, TDST, 1'b1, 1'b1); + + // AXI->OBI: load an external matrix into TCDM transposed + do_transpose(16, 12, 4, ASRC, TDST, 1'b0, 1'b1); + + // cross-transfer compute leak: a plain OBI copy must NOT inherit opt.compute + begin + automatic int unsigned len = 128; + tf_id_t tid2; + for (int unsigned k = 0; k < len; k++) harness.obi_write_byte(TSRC + k, 8'hE0 + k[4:0]); + for (int unsigned k = 0; k < len; k++) harness.obi_write_byte(CPY + k, 8'h00); + harness.drv_if.dma_set_source(TSRC); + harness.drv_if.dma_set_dest(CPY); + harness.drv_if.dma_start_copy(addr_t'(len), 2'b00, 3'd0, tid2); + harness.drv_if.dma_wait(tid2, 0); + harness.drv_if.dma_wait_idle(0); + for (int unsigned k = 0; k < len; k++) + if (harness.obi_read_byte(CPY + k) !== harness.obi_read_byte(TSRC + k)) begin + errs++; + if (errs <= 12) $display("[TP] leak: post-transpose copy wrong at %0d", k); + end + end + + // malformed transpose requests: error response, nothing launched + begin + logic err; + longint unsigned b_rej; + b_rej = burst_cnt; + harness.drv_if.dma_transpose_err(TSRC, TDST, 12'd8, 12'd8, 2'd3, 3'd0, err); + if (!err) begin errs++; $display("[TP] reject fail: reserved mode 3"); end + harness.drv_if.dma_transpose_err(TSRC, TDST, 12'd0, 12'd8, 2'd0, 3'd0, err); + if (!err) begin errs++; $display("[TP] reject fail: M == 0"); end + harness.drv_if.dma_transpose_err(TSRC, TDST + 64'd1, 12'd8, 12'd8, 2'd0, 3'd0, err); + if (!err) begin errs++; $display("[TP] reject fail: unaligned dst"); end + repeat (50) @(posedge harness.clk); + if (burst_cnt != b_rej) begin + errs++; $display("[TP] reject fail: rejected request launched bursts"); + end + end + + if (errs == 0) $display("[TP] PASS: %0d-case sweep + AXI->OBI + no-leak + reject OK", NC); + else $fatal(1, "[TP] FAIL: %0d mismatches", errs); + $finish; + end + + initial begin + repeat (1_000_000) @(posedge harness.clk); + $fatal(1, "[TIMEOUT] inst64 transpose"); + end + +endmodule