Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Bender.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,18 @@ sources:
# Level 2
- src/frontend/desc64/idma_desc64_top.sv

# Snitch inst64 standalone harness; needs snitch_cluster for idma_inst64_top + opcode pkg
- target: all(snitch_cluster, idma_test)
files:
# Level 0
- systems/snitch/test/idma_inst64_tb_pkg.sv
# Level 1
- systems/snitch/test/idma_inst64_drv_if.sv
- systems/snitch/test/idma_inst64_base.sv
# Level 2
- systems/snitch/test/tb_idma_inst64_copy.sv
- systems/snitch/test/tb_idma_inst64_transpose.sv

# Synthesis wrappers
- target: synth
files:
Expand Down
98 changes: 91 additions & 7 deletions src/frontend/inst64/idma_inst64_top.sv
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ module idma_inst64_top #(
parameter int unsigned NumChannels = 32'd1,
parameter bit TCDMAliasEnable = 1'b0,
parameter int unsigned DMATracing = 32'd0,
/// Compile-time on-the-fly compute feature enables (e.g. transpose)
parameter idma_pkg::compute_enable_t ComputeEnable = '0,
/// Transpose via address generation (no FF engine) — for backends without
/// the engine, e.g. this multi-write OBI/TCDM variant
parameter bit AddrGenTranspose = 1'b0,
/// Address-gen transpose: skew the dst pitch to avoid TCDM bank conflicts
parameter bit BankSkew = 1'b0,
parameter type axi_ar_chan_t = logic,
parameter type axi_aw_chan_t = logic,
parameter type axi_req_t = logic,
Expand Down Expand Up @@ -70,7 +77,7 @@ module idma_inst64_top #(
localparam int unsigned TfIdWidth = 32'd32;
localparam int unsigned TFLenWidth = AxiAddrWidth;
localparam int unsigned RepWidth = 32'd32;
localparam int unsigned NumDim = 32'd2;
localparam int unsigned NumDim = ComputeEnable.transpose ? 32'd4 : 32'd2;
localparam int unsigned BufferDepth = 32'd3;
localparam int unsigned NumRules = 32'd5;

Expand All @@ -84,7 +91,8 @@ module idma_inst64_top #(
localparam type id_t = logic[AxiIdWidth-1:0];
localparam type tf_len_t = logic[TFLenWidth-1:0];
localparam type offset_t = logic[OffsetWidth-1:0];
localparam type strides_t = logic[RepWidth-1:0];
// strides must match addr_t: signed transpose deltas would not sign-extend if narrower
localparam type strides_t = addr_t;
localparam type reps_t = logic[RepWidth-1:0];
localparam type tf_id_t = logic[TfIdWidth-1:0];

Expand Down Expand Up @@ -178,6 +186,7 @@ module idma_inst64_top #(
logic [1:0] idma_fe_status;
logic [2:0] idma_fe_sel_chan;
logic idma_fe_twod;
logic idma_fe_tp_reject;

// busy signals
idma_pkg::idma_busy_t [NumChannels-1:0] idma_busy;
Expand Down Expand Up @@ -348,7 +357,7 @@ module idma_inst64_top #(
.idma_req_t ( idma_req_t ),
.idma_rsp_t ( idma_rsp_t ),
.idma_nd_req_t ( idma_nd_req_t ),
.RepWidths ( RepWidth )
.RepWidths ( {NumDim{RepWidth}} )
) i_idma_nd_midend (
.clk_i,
.rst_ni,
Expand All @@ -367,6 +376,33 @@ module idma_inst64_top #(
.busy_o ( idma_nd_busy [c] )
);

// FIFO output, before transpose expansion
idma_nd_req_t fifo_nd_req;
logic fifo_nd_valid, fifo_nd_ready;

// expand transpose requests into the tiled ND walk
if (ComputeEnable.transpose) begin : gen_transpose
idma_transpose_midend #(
.NumDim ( NumDim ),
.AddrGenTranspose ( AddrGenTranspose ),
.BankSkew ( BankSkew ),
.StrbWidth ( StrbWidth ),
.addr_t ( addr_t ),
.idma_nd_req_t ( idma_nd_req_t )
) i_idma_transpose_midend (
.nd_req_i ( fifo_nd_req ),
.valid_i ( fifo_nd_valid ),
.ready_o ( fifo_nd_ready ),
.nd_req_o ( idma_nd_req [c] ),
.valid_o ( idma_nd_req_valid [c] ),
.ready_i ( idma_nd_req_ready [c] )
);
end else begin : gen_no_transpose
assign idma_nd_req [c] = fifo_nd_req;
assign idma_nd_req_valid [c] = fifo_nd_valid;
assign fifo_nd_ready = idma_nd_req_ready [c];
end

stream_fifo_optimal_wrap #(
.Depth ( DMAReqFifoDepth ),
.type_t ( idma_nd_req_t ),
Expand All @@ -380,9 +416,9 @@ module idma_inst64_top #(
.data_i ( idma_fe_req ),
.valid_i ( idma_fe_req_valid [c] ),
.ready_o ( idma_fe_req_ready [c] ),
.data_o ( idma_nd_req [c] ),
.valid_o ( idma_nd_req_valid [c] ),
.ready_i ( idma_nd_req_ready [c] )
.data_o ( fifo_nd_req ),
.valid_o ( fifo_nd_valid ),
.ready_i ( fifo_nd_ready )
);
end

Expand Down Expand Up @@ -519,10 +555,12 @@ module idma_inst64_top #(
idma_fe_req_d.burst_req.opt.beo.src_reduce_len = 1'b0;
idma_fe_req_d.burst_req.opt.beo.dst_reduce_len = 1'b0;
idma_fe_req_d.burst_req.opt.last = 1'b0;
idma_fe_req_d.burst_req.opt.compute = '0;

// frontend config
idma_fe_cfg = '0;
idma_fe_status = '0;
idma_fe_tp_reject = 1'b0;
idma_fe_sel_chan = '0;

// default handshaking
Expand Down Expand Up @@ -573,6 +611,28 @@ module idma_inst64_top #(
idma_inst64_snitch_pkg::DMCPY : begin
idma_fe_cfg = acc_req_i.data_argb[1:0];
idma_fe_sel_chan = acc_req_i.data_argb[4:2];
// transpose request (register form only): argb spare bits
// carry {enable, mode, tensor_m, tensor_n}
if (ComputeEnable.transpose && acc_req_i.data_argb[5]) begin
idma_fe_req_d.burst_req.opt.compute.enable = 1'b1;
idma_fe_req_d.burst_req.opt.compute.op =
idma_pkg::COMPUTE_TRANSPOSE;
idma_fe_req_d.burst_req.opt.compute.params.transpose.mode =
acc_req_i.data_argb[7:6];
idma_fe_req_d.burst_req.opt.compute.params.transpose.tensor_m =
acc_req_i.data_argb[19:8];
idma_fe_req_d.burst_req.opt.compute.params.transpose.tensor_n =
acc_req_i.data_argb[31:20];
end
// reject malformed transpose requests: no hardware,
// reserved mode, zero dims, unaligned dst
if (acc_req_i.data_argb[5]) begin
idma_fe_tp_reject = !ComputeEnable.transpose
| (acc_req_i.data_argb[7:6] == 2'd3)
| (acc_req_i.data_argb[19:8] == '0)
| (acc_req_i.data_argb[31:20] == '0)
| (idma_fe_req_d.burst_req.dst_addr[OffsetWidth-1:0] != '0);
end
end
default:;
endcase
Expand All @@ -588,7 +648,15 @@ module idma_inst64_top #(
// 3. wait for twod transfer to be accepted (ready)
// 4. send acc response (pvalid)
// 5. acknowledge acc request (qready)
if (acc_res_ready) begin
// DMCPY launch; transpose requests reject malformed configs
if (idma_fe_tp_reject) begin
// error response; the transfer is not launched
if (acc_res_ready) begin
acc_res.id = acc_req_i.id;
acc_res_valid = 1'b1;
acc_req_ready_o = 1'b1;
end
end else if (acc_res_ready) begin
idma_fe_req_valid[idma_fe_sel_chan] = 1'b1;
if (idma_fe_req_ready[idma_fe_sel_chan]) begin
acc_res.id = acc_req_i.id;
Expand Down Expand Up @@ -750,6 +818,12 @@ module idma_inst64_top #(
if (!idma_fe_twod) begin
idma_fe_req.d_req[0].reps = 'd1;
end
// keep higher dims inert for plain requests (the transpose expander overwrites them)
for (int d = 1; d <= NumDim-2; d++) begin
idma_fe_req.d_req[d].reps = 'd1;
idma_fe_req.d_req[d].src_strides = '0;
idma_fe_req.d_req[d].dst_strides = '0;
end
end

//--------------------------------------
Expand All @@ -763,6 +837,16 @@ module idma_inst64_top #(
//--------------------------------------
// only activate tracer if requested
`ifndef SYNTHESIS
initial assert (idma_pkg::TransposeDimWidth == 32'd12) else
$fatal(1, "DMCPY argb transpose packing requires TransposeDimWidth == 12");
`ifndef VERILATOR
// capability cross-check against the generated backend's baked compute set
// (engine route only; address-gen needs no compute-enabled backend)
if (ComputeEnable.transpose && !AddrGenTranspose) begin : gen_compute_check
initial assert (gen_backend[0].i_idma_backend_rw_axi.ComputeEnable.transpose) else
$fatal(1, "ComputeEnable.transpose requires a compute-enabled backend variant");
end
`endif
if (DMATracing) begin : gen_tracer
for (genvar c = 0; c < NumChannels; c++) begin : gen_channels
// derive the name of the trace file from the hart and channel IDs
Expand Down
106 changes: 72 additions & 34 deletions src/midend/idma_transpose_midend.sv
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,18 @@
// Authors:
// - Daniel Keller <dankeller@iis.ee.ethz.ch>

/// Transpose geometry expander: expands an opt.compute=TRANSPOSE request into a
/// NumDim=4 tiled ND walk for the generic idma_nd_midend. Non-transpose passes
/// through. Combinational, quasi-static per request.
/// Transpose geometry expander for the generic idma_nd_midend. Two modes:
/// engine (NumDim=4 tiled walk feeding the FF transpose engine) and address-gen
/// (element-granular swapped-stride walk, no engine, for backends without the
/// engine e.g. multi-write OBI/TCDM). Non-transpose passes through.
module idma_transpose_midend #(
/// Number of ND dimensions (must be >= 4 to express the tiled walk)
/// Number of ND dimensions (engine walk needs >= 4; address-gen needs >= 3)
parameter int unsigned NumDim = 32'd4,
/// Address-gen mode: element-granular swapped-stride transpose, no engine
parameter bit AddrGenTranspose = 1'b0,
/// Address-gen: pad the dst row pitch by one bus-word when needed so the
/// per-column word stride is odd (conflict-free on power-of-2-bank TCDM)
parameter bit BankSkew = 1'b0,
/// Write data-path width in bytes (tile side NE = StrbWidth / element bytes)
parameter int unsigned StrbWidth = 32'd64,
/// Address type
Expand Down Expand Up @@ -51,6 +57,7 @@
logic [TensorW-1:0] tm, tn;
logic signed [WorkW-1:0] m, n, log2ne, ne, yt, nt, nxe, mpe;
logic signed [WorkW-1:0] strb_c; // NE*E == StrbWidth (mode cancels)
logic signed [WorkW-1:0] e, me, pad; // address-gen: E (=1<<mode), M*E, pitch pad

nd_req_o = nd_req_i; // passthrough

Expand All @@ -59,43 +66,74 @@
tm = nd_req_i.burst_req.opt.compute.params.transpose.tensor_m;
tn = nd_req_i.burst_req.opt.compute.params.transpose.tensor_n;
// zero-extend bounded dims into the signed working width
m = $signed({{(WorkW-TensorW){1'b0}}, tm}); // M
n = $signed({{(WorkW-TensorW){1'b0}}, tn}); // N
log2ne = $signed(WorkW'(Log2Strb)) - $signed({{(WorkW-ModeW){1'b0}}, mode});
ne = $signed(WorkW'(1)) <<< log2ne; // tile side (elements)
yt = (m + ne - 1) >>> log2ne; // ceil(M/NE)
nt = (n + ne - 1) >>> log2ne; // ceil(N/NE)
nxe = n <<< mode; // N*E (E = 1<<mode)
mpe = yt <<< Log2Strb; // MP*E = YT*NE*E = YT*StrbWidth
strb_c = $signed(WorkW'(StrbWidth)); // NE*E (one tile-row = StrbWidth B)
m = $signed({{(WorkW-TensorW){1'b0}}, tm}); // M
n = $signed({{(WorkW-TensorW){1'b0}}, tn}); // N

nd_req_o.burst_req.length = LenW'(StrbWidth);
if (AddrGenTranspose) begin
// Element-granular swapped-stride walk (out_T[c][r] = in[r][c]),
// dst an N x M' transpose. No engine: compute is cleared so the
// backend runs a plain strided copy. Correct on any protocol
// (ideal on random-access OBI/TCDM; slow on burst AXI).
e = $signed(WorkW'(1)) <<< mode; // E = 1<<mode
me = m <<< mode; // M*E
// BankSkew: when M*E is an even number of bus words the column
// stride hammers one TCDM bank; pad the pitch by one word (NE
// elements) to make the word stride odd => round-robin all banks.
ne = $signed(WorkW'(StrbWidth)) >>> mode; // NE = StrbWidth/E
pad = (BankSkew && (me[Log2Strb:0] == '0)) ? ne : '0;
me = (m + pad) <<< mode; // M'*E (padded pitch)
nd_req_o.burst_req.opt.compute.enable = 1'b0;
nd_req_o.burst_req.length = LenW'(e);
// d_req[0] = column walk (reps N): src +E, dst +M'*E
nd_req_o.d_req[0].reps = n[RepW-1:0];
nd_req_o.d_req[0].src_strides = addr_t'(e);
nd_req_o.d_req[0].dst_strides = addr_t'(me);
// d_req[1] = row walk (reps M): src +E, dst +E - (N-1)*M'*E (rewind)
nd_req_o.d_req[1].reps = m[RepW-1:0];
nd_req_o.d_req[1].src_strides = addr_t'(e);
nd_req_o.d_req[1].dst_strides = addr_t'(e - (n - 1) * me);
for (int unsigned d = 2; d < NumDim-1; d++) begin
nd_req_o.d_req[d].reps = RepW'(1);
nd_req_o.d_req[d].src_strides = '0;
nd_req_o.d_req[d].dst_strides = '0;
end
end else begin
log2ne = $signed(WorkW'(Log2Strb)) - $signed({{(WorkW-ModeW){1'b0}}, mode});
ne = $signed(WorkW'(1)) <<< log2ne; // tile side (elements)
yt = (m + ne - 1) >>> log2ne; // ceil(M/NE)
nt = (n + ne - 1) >>> log2ne; // ceil(N/NE)
nxe = n <<< mode; // N*E (E = 1<<mode)
mpe = yt <<< Log2Strb; // MP*E = YT*NE*E = YT*StrbWidth
strb_c = $signed(WorkW'(StrbWidth)); // NE*E (one tile-row = StrbWidth B)

Check warning on line 107 in src/midend/idma_transpose_midend.sv

View workflow job for this annotation

GitHub Actions / lint / lint-sv

[verible-verilog-lint] reported by reviewdog 🐶 Line length exceeds max: 100; is: 103 [Style: line-length] [line-length] Raw Output: message:"Line length exceeds max: 100; is: 103 [Style: line-length] [line-length]" location:{path:"src/midend/idma_transpose_midend.sv" range:{start:{line:107 column:101}}} severity:WARNING source:{name:"verible-verilog-lint" url:"https://github.com/chipsalliance/verible"}

Check warning on line 107 in src/midend/idma_transpose_midend.sv

View workflow job for this annotation

GitHub Actions / verible-verilog-lint

[verible-verilog-lint] src/midend/idma_transpose_midend.sv#L107

Line length exceeds max: 100; is: 103 [Style: line-length] [line-length]
Raw output
message:"Line length exceeds max: 100; is: 103 [Style: line-length] [line-length]"  location:{path:"src/midend/idma_transpose_midend.sv"  range:{start:{line:107  column:101}}}  severity:WARNING  source:{name:"verible-verilog-lint"  url:"https://github.com/chipsalliance/verible"}

// d_req[0] = local row within tile (reps NE)
nd_req_o.d_req[0].reps = ne[RepW-1:0];
nd_req_o.d_req[0].src_strides = addr_t'(nxe);
nd_req_o.d_req[0].dst_strides = addr_t'(mpe);
// d_req[1] = row-tile (reps YT). (NE-1)*MPE = (MPE<<log2ne) - MPE.
nd_req_o.d_req[1].reps = yt[RepW-1:0];
nd_req_o.d_req[1].src_strides = addr_t'(nxe);
nd_req_o.d_req[1].dst_strides = addr_t'(strb_c - (mpe <<< log2ne) + mpe);
// d_req[2] = col-tile (reps NT). (YT*NE-1)*NXE = ((YT*N)<<Log2Strb) - NXE;
// the dst rewind MPE-(YT-1)*StrbWidth collapses to StrbWidth.
nd_req_o.d_req[2].reps = nt[RepW-1:0];
nd_req_o.d_req[2].src_strides = addr_t'(strb_c - ((yt * n) <<< Log2Strb) + nxe);
nd_req_o.d_req[2].dst_strides = addr_t'(strb_c);
// the walk is exactly 4-D: neutralize any higher dims
for (int unsigned d = 3; d < NumDim-1; d++) begin
nd_req_o.d_req[d].reps = RepW'(1);
nd_req_o.d_req[d].src_strides = '0;
nd_req_o.d_req[d].dst_strides = '0;
nd_req_o.burst_req.length = LenW'(StrbWidth);

// d_req[0] = local row within tile (reps NE)
nd_req_o.d_req[0].reps = ne[RepW-1:0];
nd_req_o.d_req[0].src_strides = addr_t'(nxe);
nd_req_o.d_req[0].dst_strides = addr_t'(mpe);
// d_req[1] = row-tile (reps YT). (NE-1)*MPE = (MPE<<log2ne) - MPE.
nd_req_o.d_req[1].reps = yt[RepW-1:0];
nd_req_o.d_req[1].src_strides = addr_t'(nxe);
nd_req_o.d_req[1].dst_strides = addr_t'(strb_c - (mpe <<< log2ne) + mpe);
// d_req[2] = col-tile (reps NT). (YT*NE-1)*NXE = ((YT*N)<<Log2Strb) - NXE;
// the dst rewind MPE-(YT-1)*StrbWidth collapses to StrbWidth.
nd_req_o.d_req[2].reps = nt[RepW-1:0];
nd_req_o.d_req[2].src_strides = addr_t'(strb_c - ((yt * n) <<< Log2Strb) + nxe);
nd_req_o.d_req[2].dst_strides = addr_t'(strb_c);
// the walk is exactly 4-D: neutralize any higher dims
for (int unsigned d = 3; d < NumDim-1; d++) begin
nd_req_o.d_req[d].reps = RepW'(1);
nd_req_o.d_req[d].src_strides = '0;
nd_req_o.d_req[d].dst_strides = '0;
end
end
end
end

`ifndef SYNTHESIS
initial assert (NumDim >= 4) else
$fatal(1, "idma_transpose_midend requires NumDim >= 4 (got %0d)", NumDim);
initial assert (NumDim >= (AddrGenTranspose ? 32'd3 : 32'd4)) else
$fatal(1, "idma_transpose_midend: NumDim too small (got %0d)", NumDim);
// mode 0..2 needs NE >= 1, i.e. log2(StrbWidth) >= 2
initial assert (Log2Strb >= 2) else
$fatal(1, "idma_transpose_midend requires StrbWidth >= 4 (got %0d)", StrbWidth);
Expand Down
5 changes: 5 additions & 0 deletions systems/snitch/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
build/
*.so
modelsim.ini
transcript
work/
Loading
Loading