diff --git a/Bender.yml b/Bender.yml index 9a0972b2..2c259b31 100644 --- a/Bender.yml +++ b/Bender.yml @@ -43,6 +43,8 @@ sources: - src/backend/idma_axis_write.sv - src/backend/idma_channel_coupler.sv - src/backend/idma_dataflow_element.sv + - src/backend/idma_otf_transpose.sv + - src/backend/idma_otf_compute.sv - src/backend/idma_error_handler.sv - src/backend/idma_init_read.sv - src/backend/idma_init_write.sv @@ -53,11 +55,58 @@ sources: - src/backend/idma_tilelink_read.sv - src/backend/idma_tilelink_write.sv - # Generated content - - target: rtl + # Generated content (bundled single-file; default flow) + - target: all(rtl, not(split_rtl)) files: - target/rtl/idma_generated.sv + # Generated content (per-variant files; opt-in via `-t split_rtl`). + # Used for the transpose-engine prototype where the rw_axi backend/transport/ + # legalizer are hand-edited in place before being ported back to templates. + # Mutually exclusive with idma_generated.sv to avoid duplicate module defs. + - target: all(rtl, split_rtl) + files: + - target/rtl/idma_transport_layer_rw_axi.sv + - target/rtl/idma_transport_layer_r_obi_w_axi.sv + - target/rtl/idma_transport_layer_r_axi_w_obi.sv + - target/rtl/idma_transport_layer_rw_axi_rw_axis.sv + - target/rtl/idma_transport_layer_rw_obi.sv + - target/rtl/idma_transport_layer_r_obi_rw_init_w_axi.sv + - target/rtl/idma_transport_layer_r_axi_rw_init_rw_obi.sv + - target/rtl/idma_legalizer_rw_axi.sv + - target/rtl/idma_legalizer_r_obi_w_axi.sv + - target/rtl/idma_legalizer_r_axi_w_obi.sv + - target/rtl/idma_legalizer_rw_axi_rw_axis.sv + - target/rtl/idma_legalizer_rw_obi.sv + - target/rtl/idma_legalizer_r_obi_rw_init_w_axi.sv + - target/rtl/idma_legalizer_r_axi_rw_init_rw_obi.sv + - target/rtl/idma_backend_rw_axi.sv + - target/rtl/idma_backend_r_obi_w_axi.sv + - target/rtl/idma_backend_r_axi_w_obi.sv + - target/rtl/idma_backend_rw_axi_rw_axis.sv + - target/rtl/idma_backend_rw_obi.sv + - target/rtl/idma_backend_r_obi_rw_init_w_axi.sv + - target/rtl/idma_backend_r_axi_rw_init_rw_obi.sv + - target/rtl/idma_backend_synth_rw_axi.sv + - target/rtl/idma_backend_synth_r_obi_w_axi.sv + - target/rtl/idma_backend_synth_r_axi_w_obi.sv + - target/rtl/idma_backend_synth_rw_axi_rw_axis.sv + - target/rtl/idma_backend_synth_rw_obi.sv + - target/rtl/idma_backend_synth_r_obi_rw_init_w_axi.sv + - target/rtl/idma_backend_synth_r_axi_rw_init_rw_obi.sv + - target/rtl/idma_desc64_reg_pkg.sv + - target/rtl/idma_reg32_3d_reg_pkg.sv + - target/rtl/idma_reg64_2d_reg_pkg.sv + - target/rtl/idma_reg64_1d_reg_pkg.sv + - target/rtl/idma_desc64_reg_top.sv + - target/rtl/idma_reg32_3d_reg_top.sv + - target/rtl/idma_reg64_2d_reg_top.sv + - target/rtl/idma_reg64_1d_reg_top.sv + - target/rtl/idma_desc64_top.sv + - target/rtl/idma_reg32_3d_top.sv + - target/rtl/idma_reg64_2d_top.sv + - target/rtl/idma_reg64_1d_top.sv + # Midends - target: rtl files: @@ -66,6 +115,7 @@ sources: - src/midend/idma_mp_split_midend.sv - src/midend/idma_nd_midend.sv - src/midend/idma_rt_midend.sv + - src/midend/idma_transpose_midend.sv # RISC-V opcode package for ooc use of inst64 - target: all(rtl,snitch_cluster) @@ -94,6 +144,7 @@ sources: # Level 2 - src/frontend/desc64/idma_desc64_top.sv + # Synthesis wrappers - target: synth files: @@ -119,7 +170,9 @@ sources: - test/future/idma_tb_per2axi.sv - test/future/TLToAXI4.v - test/midend/tb_idma_nd_midend.sv + - test/midend/tb_idma_nd_midend_b2b.sv - test/midend/tb_idma_rt_midend.sv + - test/midend/tb_idma_transpose_midend.sv # Level 2 - test/future/idma_obi2axi_bridge.sv - test/future/idma_tilelink2axi_bridge.sv @@ -128,3 +181,8 @@ sources: - target: idma_test files: - target/rtl/tb_idma_generated.sv + # Multi-tile transpose path: ND-midend (transposed strides) -> backend + - target: all(idma_test, split_rtl) + files: + - test/tb_idma_transpose_nd.sv + - test/tb_idma_transpose_b2b.sv diff --git a/doc/transpose-engine-routing-plan.md b/doc/transpose-engine-routing-plan.md new file mode 100644 index 00000000..1428d5aa --- /dev/null +++ b/doc/transpose-engine-routing-plan.md @@ -0,0 +1,461 @@ +# Transpose Engine — Design Routing & Signaling Plan + +Status: design doc (no frontend/midend RTL yet). The transpose **engine** +(`src/backend/idma_otf_transpose.sv`) is implemented and standalone-verified +(full-duplex ping-pong, runtime int8/fp16/fp32, DPI golden). This document plans +how it is **routed through the whole design** — frontend → midend → legalizer → +backend → transport → engine, and the response path back. + +It is grounded in a re-read of the viDMA fork (the prior OTF work) and is +deliberately **adversarial about viDMA's signaling approach** (§3). Anchors are +`file:line` against this repo unless prefixed `vidma:` +(`/home/dankeller/Projects/pulp_rs/vidma_rtl/src/`). + +--- + +## 1. Three-plane framing + +The engine reorders the **data stream** only — it has **no address ports**. +Routing therefore splits into three planes of very uneven difficulty: + +| Plane | Difficulty | What it needs | +|-------|-----------|---------------| +| **Control** | easy | deliver `{transpose_en, E, M, N}` per transfer to the engine *and* the address generator | +| **Data** | mostly done | splice the engine `valid/ready/strb_o` into the transport `buffer_out → write-shifter` seam behind `EnableTranspose`; engine self-drains | +| **Address** | the crux | generate read + write address streams so the bytes land transposed, with `strb_o` masking edge tiles | + +The central insight that makes the address plane tractable: + +> **The engine transposes *inside* each `NE×NE` tile. Read and write therefore +> walk the *same* tile-visitation order — they differ only in stride +> *magnitude*, not iteration *order*.** A full matrix transpose is expressed as +> an ND transfer whose `dst_strides` encode the N×M output pitch while +> `src_strides` encode the M×N input pitch, over one shared iteration space. + +This matters because the ND midend (`src/midend/idma_nd_midend.sv`) advances +`src_addr` and `dst_addr` from a **single shared iteration counter** +(`stride_sel_q`, `burst_sent_q`, `:153-186`) — it *cannot* make read and write +visit a different order, but it *can* give them independent strides. The +engine-internal transpose is exactly what lets us live within that constraint. + +``` + CONTROL: {transpose_en, E, M, N} as typed, request-scoped idma_req_t.opt fields + ADDRESS: NumDim=4 ND descriptor; src_strides (M×N) + transposed dst_strides (N×M) + ┌──────────┐ idma_nd_req_t ┌─────────┐ idma_req_t ┌───────────┐ + │ FRONTEND │ ───────────────► │ ND │ ────────────► │ LEGALIZER │ + │ reg/desc │ burst_req.opt.* │ MIDEND │ src_addr / │ opt_tf_q │ + │ /inst64 │ + d_req[].reps, │ shared │ dst_addr │ r_tf_q │ AR/AW + │ │ src/dst_strides │ counter │ (per-dim │ w_tf_q ├──────► + └──────────┘ │ walks │ strides) └─────┬─────┘ meta + ▲ │ both │ │ r_dp_req / w_dp_req + │ done_id (idma_rsp_t) │ ptrs │ │ (+ transp_*) + │ on WRITE of last tile └────┬────┘ ▼ + │ │ nd_rsp_o (on .last) ┌────────────────────┐ + │ ▼ │ BACKEND TOP │ + └──────────────────────────────────────────────── │ legalizer↔transport │ + └─────────┬───────────┘ + │ w_dp_req.transp_*, shift + DATA: ┌────────┐ data ┌───────────┐ data ┌──────────┐ data ┌──────────┐ + │ buffer │ ───► │ TRANSPOSE │ ───► │ write │ ───► │ axi_write│ ─► W.strb = + │ _out │ ◄─── │ ENGINE │ ◄─── │ barrel │strb_o│ mask_out │ align_mask + └────────┘ rdy │(EnTranspose)│rdy │ shift │ && │ & strb │ & engine_strb + └───────────┘ └──────────┘ └──────────┘ + strb_o ─────────────────(shift by w_dp_req.shift)──────────► (new AND term) +``` + +Response semantics are unchanged: **one transfer = one `done_id`** — but it must +fire on **write** completion of the last tile (§5.3). + +--- + +## 2. Control plane — which fields travel where + +Four control items: `transpose_en` (1b), `transp_mode`/E (2b), `M` (12b, rows in +elements), `N` (12b, cols in elements). Decision: carry the small per-transfer +items as **typed fields in `options_t`**; `M`/`N` ride the ND geometry but are +*also* surfaced as explicit `opt` fields because the engine needs them at the +transport seam. + +### 2.1 The struct of record + +`IDMA_TYPEDEF_OPTIONS_T(options_t, axi_id_t)` (`src/include/idma/typedef.svh:21-30`) +is the request-scoped options struct embedded in every `idma_req_t` (`:37-44`, +the `opt` field). Add: + +```systemverilog +logic transpose_en; // engine on/off for this transfer +logic [1:0] transp_mode; // E selector: 00=1B, 01=2B, 10=4B +logic [11:0] tensor_m; // rows in ELEMENTS +logic [11:0] tensor_n; // cols in ELEMENTS +``` + +It rides through the ND midend bypass for free — `idma_nd_midend.sv:194` copies +`burst_req_o = nd_req_i.burst_req` wholesale, overwriting only `src_addr`/ +`dst_addr`/`opt.last` (`:196-198`) — and through every request FIFO/fork. It is +then mirrored into the legalizer's mutable options and into `w_dp_req_t` to reach +the engine (§5). + +### 2.2 Per-frontend insertion of `transpose_en` + `E` + +| Frontend | Carrier | Anchor | +|---|---|---| +| **reg** | new `conf` CSR bits (bits 0:16 used; **17:31 free**): `transpose_en`=17, `transp_mode`=19:18 | fields `target/rtl/idma_reg64_2d.hjson:31-66`; template `src/frontend/reg/tpl/idma_reg.hjson.tpl:23-64`; wire in `proc_hw_req_conv`, `src/frontend/reg/tpl/idma_reg.sv.tpl` (~124-141) | +| **desc64** | descriptor `flags[31:24]` reserved → `flags[24]=transpose_en`, `flags[26:25]=transp_mode` | `src/frontend/desc64/idma_desc64_top.sv:114-115`; assign in reshaper opt block `src/frontend/desc64/idma_desc64_reshaper.sv:27-60` | +| **inst64** | new instruction `DMTRANSP` in the `casez (acc_req_i.data_op)`, **request-scoped** write to `burst_req.opt.transpose_en/.transp_mode` (contrast viDMA's sticky `otf_opcode_q`) | casez `src/frontend/inst64/idma_inst64_top.sv:386`; opt defaults `:343-364` | + +### 2.3 Matrix dims M, N + +M, N are **redundant with the ND geometry** (they equal the descriptor's +element extents — see §4) but the engine still needs them as explicit ports +(`tensor_size_m_i`/`tensor_size_n_i`) at the transport seam to drive its tile +counters and edge `strb_o`. Decision: surface them as explicit `opt.tensor_m`/ +`opt.tensor_n` (24 b of flops), forwarded to `w_dp_req`, rather than +reconstructing them from byte counts at the seam. + +- **reg / inst64**: drivers already program the dims/strides CSRs (length, reps, + strides); add two small writes for `tensor_m/tensor_n` (or compute them in the + reg HWIF from `length`/`reps`). +- **desc64**: `flags` has no spare 24-bit budget — desc64 should **derive** M/N + from `length`/`reps`/strides in the reshaper rather than carry them literally. + +--- + +## 3. Signaling decision — adversarial critique of viDMA, chosen alternative + +### 3.1 What viDMA does + +viDMA signals OTF compute with a **single sticky 8-bit `otf_opcode_q` register**, +written by a custom RISC-V instruction `DMOPC` (`vidma:idma_inst64_top.sv:571-602`, +`otf_opcode_q <= acc_req_i.data_arga[7:0]`, reset to passthrough `0x08`), passed +as a **side-band port** `otf_opcode_i` into the backend +(`vidma:idma_backend_rw_axi.sv:89`) — parallel to `idma_req_i`, bypassing the +request descriptor and the ND midend — latched per-transfer into +`opt_tf_q.otf_opcode` at legalizer refill (`vidma:idma_legalizer_rw_axi.sv:403-442`), +copied into `r_dp_req`/`w_dp_req.otf_opcode`, and consumed in the transport layer +via a **same-cycle "effective opcode" bypass** plus MX/FP **drain FSMs** +(`vidma:idma_transport_layer_rw_axi.sv:231-243`). Element sizes and the write +length are **derived from the opcode** by hard-coded tables +(`vidma:vidma_otf_pkg.sv`: `otf_element_sizes`, `otf_write_length = r_len/in*out`). + +### 3.2 Critique — reject this model for transpose + +1. **Sticky + global desyncs config from the transfer.** One register feeds every + backend channel, but each channel has its own `nd_req` FIFO. `DMOPC` then + `DMCPY` is non-atomic; an interrupt or a second `DMOPC` re-tags a pending + transfer. The legalizer accept-time latch only *partially* masks this. For a + transpose interleaved with plain copies this is a correctness hazard. +2. **8 bits cannot carry geometry.** The opcode is a pure op selector with + hard-coded element sizes and a scalar length ratio. Transpose needs runtime + `E ∈ {1,2,4}` plus 12-bit `M` and `N` (≥26 b). Extending viDMA means adding + sticky `DMSIZE`-class registers — reintroducing hazard (1) plus multi-register + atomicity. +3. **viDMA never remaps addresses — the disqualifying gap.** Its `w_addr` is + monotonic; `otf_write_length`/`otf_coordinated_r_bytes` assume a *scalar + length ratio on a linear stream* (`vidma:vidma_otf_pkg.sv`). Transpose is 1:1 + in bytes (ratio = identity) but needs a **tile-strided write address program**. + The opcode/ratio model has no concept of it. +4. **Transport bloat is avoidable.** The same-cycle effective-opcode bypass is a + timing smell; the MX/FP drain FSMs and expansion bypass are viDMA-op-specific. + The transpose engine **self-drains via `exec_done`**, so integration needs + zero drain FSMs, zero opcode mux, zero bypass. + +### 3.3 Chosen alternative + +**Carry `{transpose_en, transp_mode, tensor_m, tensor_n}` as typed, +request-scoped `idma_req_t.opt` fields, and express the address remap as an ND +transfer (transposed `dst_strides`), not a legalizer ratio function.** + +| Layer | viDMA cost | Chosen cost | Why better | +|---|---|---|---| +| frontend | sticky `otf_opcode_q` + `DMOPC` | a few `opt` assigns | request-scoped; no cross-transfer leak | +| port | side-band `otf_opcode_i` hand-synced | none — rides `opt` through existing FIFOs/forks | no global wire, no manual sync | +| midend | bypassed | bypassed; **remap reuses `src/dst_strides`** | remap is native, not bolted on | +| legalizer | latch + `otf_write_length` ratio | latch + forward; **no ratio math** (W==R) | identity length is correct as-is | +| transport | effective-opcode bypass + drain FSMs | splice `valid/ready/strb_o`, pulse `clear_i` | engine self-drains | + +**Decision: adopt it.** Strictly cheaper and strictly more composable than the +viDMA side-band. + +--- + +## 4. Address plane — the corrected model + +### 4.1 Routing model + +Both `AR` and `AW` walk the **same** tile order (matching the engine's identical +fill/drain walkers, `idma_otf_transpose.sv:172-201`). The transpose lives +**entirely inside the engine** (intra-tile element swap, the readout at `:224-231`). +The destination **strides** encode the N×M output pitch. This is realizable with +the ND midend's existing logic **unchanged** — it already advances independent +`src_addr`/`dst_addr` per dimension (`:169` src, `:181` dst) over one shared +iteration order. The only cost is a **NumDim bump** (synth-time config), not new +midend logic. + +> ⚠️ This corrects an earlier framing that described "transposed *write +> sequence* / scattered tiles." The midend **cannot** reorder tiles between read +> and write (single `stride_sel_q`/`burst_sent_q`, `:153-186`). It does not need +> to: same tile order, **transposed dst strides**, engine does the element swap. + +### 4.2 The descriptor (NumDim = 4) and the exact stride program + +Let `E` = element bytes, `NE = StrbWidth/E` (tile side in elements, += beat in elements), `SW = StrbWidth` (bytes/beat), `YT = ceil(M/NE)`, +`NT = ceil(N/NE)`, and `MP = YT·NE` — the **padded** output row pitch (M rounded +up to a tile/`SW` boundary; `MP = M` when `M % NE == 0`). Iteration order +(outer→inner): `nt` (col-tile) → `rt` (row-tile) → `j` (row/beat within tile). +Each 1D run = one tile-row = `NE` elements = `SW` bytes, contiguous on both src +and dst. + +Source `A` is M×N row-major at `src`; transposed `Aᵀ` is stored **N×MP** +row-major at `dst` (real data in columns `[0, M)`, columns `[M, MP)` are padding). +For iteration `(nt, rt, j)` the absolute addresses are: + +``` +src_addr = src + (rt·NE + j)·(N·E) + nt·(NE·E) // reads A[rt·NE+j][nt·NE .. +NE) +dst_addr = dst + (nt·NE + j)·(MP·E) + rt·(NE·E) // writes Aᵀ[nt·NE+j][rt·NE .. +NE) +``` + +> ⚠️ **The ND midend strides are INCREMENTAL deltas, not these absolute per-dim +> steps.** `idma_nd_midend.sv:164-186` adds exactly **one** stride per burst — +> `addr += d_req[stride_sel].strides`, where `stride_sel = popcount(rolled-over +> dims)`. So a dimension's `*_strides` must encode *"undo the inner loops and step +> the outer dim once"*, i.e. the delta from the last inner position to the next +> outer position — not the absolute pitch. The table below is the delta form +> (verified in `test/tb_idma_transpose_nd.sv`); an absolute-stride table would +> corrupt a real driver. + +| Level | reps | `src_strides` (Δ) | `dst_strides` (Δ) | +|------|------|---------------|---------------| +| 1D burst | `length = NE·E (= SW B)` | contiguous | contiguous | +| `d_req[0]` = j (row in tile) | `NE` | `N·E` | `MP·E` | +| `d_req[1]` = rt (row-tile) | `YT` | `N·E` | `NE·E − (NE−1)·MP·E` | +| `d_req[2]` = nt (col-tile) | `NT` | `NE·E − (YT·NE−1)·N·E` | `MP·E − (YT−1)·NE·E` | + +⇒ **NumDim = 4** (1D + 3 `d_req`). The driver computes this program from +`(src, dst, M, N, E)`; no new midend RTL. Pairing is positional: the engine +fills with read beats `j=0..NE-1` of a tile, then drains output beats +`j=0..NE-1` (columns); write iteration `j` consumes output beat `j`. `decouple_rw` +lets the read machine run a tile ahead while the engine buffers (§5.2). + +**Two asymmetries that the deltas encode (both load-bearing for edge tiles):** + +1. **dst uses the padded pitch `MP·E`, src uses the real pitch `N·E`.** The engine + drains exactly one beat per `Aᵀ` row and the write path has **no buffer after + the engine** to coalesce a *misaligned* (word-crossing, hence 2-beat) write. If + `M·E` is not a multiple of `SW` (i.e. `M % NE != 0`), an unpadded row pitch + makes every `Aᵀ` row start at a sub-word offset → the legalizer splits each + row into 2 beats → the engine retires its single beat after the first and the + second beat starves → **write-channel deadlock**. Padding the pitch to `MP·E` + keeps every row `SW`-aligned (single-beat). Reads need no such padding: + misaligned source reads coalesce in the dataflow buffer, which sits *before* + the engine (verified: e.g. `8×6 EB=1` has misaligned reads and passes). + +2. **`d_req[2].src` rewinds by `YT·NE−1` rows, not `M−1`.** The read walks `NE` + rows per tile *including* the padding rows of the last row-tile (rows + `[M, YT·NE)`), so the column-tile rewind must clear the full *padded* column + height. (For aligned `M = YT·NE` the two forms coincide.) + +**Allocation / liveness obligations for a real driver (sim satisfies these):** +- **`dst` MUST be allocated for the FULL tile-padded extent `NT·NE × MP` elements** + (not just `N × MP`). This is a hard requirement, not an optimization. When + `N % NE != 0` the engine drains `NT·NE` output-row beats and the all-padding + rows `[N, NT·NE)` are issued as real `AW`s (correct addr+len) carrying + `wstrb = 0` (§4.3). A permissive slave (e.g. `axi_sim_mem`) writes nothing for + them, but **a strict slave decodes the `AW` address *before* the strobes — an + under-allocated `dst` therefore risks `DECERR` on those rows**, surfaced as a + transfer error. The real data is the `[0, N) × [0, M)` sub-block of the padded + buffer. (A future hardening could suppress `AW` issuance for all-padding rows + so under-allocation is safe; until then, over-allocation is mandatory.) +- `src` reads extend up to row `YT·NE−1`, col `NT·NE−1` (tile padding); the + source buffer must cover that or the reads must be benign (masked on write). +- **`NumAxInFlight ≥ NE`.** The engine must buffer a whole `NE`-beat tile before + its first output beat, and the backend holds one transaction slot per single-beat + burst from `AR` until `B`; so `NE−1` bursts sit read-done/write-pending while the + `NE`-th completes. The tight deadlock boundary is therefore `NumAxInFlight ≥ NE−1` + (measured: `NE=8`→`≥7`, `NE=4`→`≥3`); the bound scales with `NE`, not buffer + depth (`BufferDepth` does not move it). Use `≥ NE` for a one-slot margin and a + clean rule. With runtime `E`, `NE = SW` in the worst case (`E=1`), so a backend + that must serve int8 transpose needs **synth-time `NumAxInFlight ≥ StrbWidth`**. + Below the threshold the write channel deadlocks *loudly* (W watchdog), never + silently. Verified by sweep in `test/tb_idma_transpose_nd.sv` (param + `NumAxInFlight`, auto-sized to `StrbWidth`). + +### 4.3 `strb_o → wstrb` — the load-bearing RTL change (IMPLEMENTED) + +`wstrb` was born **solely** from address-alignment masks in +`src/backend/idma_axi_write.sv`: `w_first_mask = '1 << offset`, `w_last_mask = +'1 >> (StrbWidth - tailer)`, combined into `mask_out` (`proc_out_mask_generator`), +driven onto `write_req_o.w.strb` **and** used to pop the buffer +`buffer_out_ready_o = mask_out`. There was no input port for an engine mask. + +Implemented (shared `idma_axi_write.sv`, backward-compatible): +1. New input **`mask_ext_i`** (strb-wide), **AND**ed into `mask_out` + (`mask_out = alignment_masks & mask_ext_i`). Other backends tie it to `'1` + (no change). This is the deciding modification: it both narrows `w.strb` to + the real bytes of an edge tile **and** narrows the buffer pop so partial/zero + beats don't require the full beat. +2. New output **`w_beat_done_o = write_happening`** — a strobe-*independent* + "a W beat was accepted" pulse. The transpose handshake retires the engine + output on `w_beat_done` (not on `|buffer_out_ready`), so an **all-padding + beat** (`strb_o = 0`, e.g. the non-existent `Aᵀ` rows of a `N % NE != 0` + edge) still issues one `wstrb=0` W beat and drains — instead of stalling on + `buffer_out_valid_i != 0`. +3. In the transport (`idma_transport_layer_rw_axi.sv`), the transpose path drives + **presence = all-ones** (`buffer_out_valid_i = {StrbWidth{tp_out_valid}}`) and + carries the engine strobe separately as `wr_strb → mask_ext_i`, shifted by + `w_dp_req_i.shift` **identically** to the data (parallel `mask_ext_shifted`). + Presence-as-all-ones is what lets a zero-strobe beat satisfy `ready_to_write` + (`(presence & mask_out) == mask_out` holds for `mask_out = 0`) while the + `buffer_out_valid_i != 0` guard still protects the non-transpose path. + +`strb_o` is **already byte-granular** — `strb_int[p] = em[p >> transp_mode_i]` +(`idma_otf_transpose.sv`) expands the per-element mask to one bit per byte — so it +maps directly onto byte-granular `wstrb` for E ∈ {1,2,4}. No expansion needed. + +> The original plan listed only step 1. Steps 2–3 are the part it missed: edge +> tiles produce **both** partially-masked beats (`leftover_rows`, within a row) +> **and** all-zero beats (`leftover_cols`, whole `Aᵀ` rows that don't exist). +> Masking alone deadlocks on the all-zero beats; the strobe-independent retire + +> all-ones presence is what makes them drain. Both are proven load-bearing by the +> padding-sentinel negative control in `tb_idma_transpose_nd.sv` (disable the +> strobe → padding is clobbered with real transposed data → test fails). + +### 4.4 INCR-only — respected + +The legalizer hard-asserts `BURST_INCR` (`target/rtl/idma_legalizer_rw_axi.sv:417-419`) +and a 1D run is one contiguous burst. Model §4.1 respects this: every 1D run is a +contiguous tile-row; all tiling/scatter is expressed **above** the legalizer as +ND per-dimension steps. The legalizer never sees a tile — it only page-splits a +contiguous INCR run. Transpose is 1:1 bytes, so the legalizer's identical src/dst +`length` (`:262, :271`, both `req_i.length`) is correct **as-is** — no +`otf_write_length` hack. + +--- + +## 5. Data plane — transport wiring + +### 5.1 Engine placement + +Behind `EnableTranspose`, route `buffer_out` (`idma_transport_layer_rw_axi.sv:201-203`) +→ engine `data_i`; engine `data_o` → write barrel-shifter input (`:210-213`). +Passthrough keeps today's direct path. Selection = static `EnableTranspose` +**AND** per-transfer `w_dp_req_i.transpose_en` (mirrors viDMA's passthrough mux +but **without** the opcode machinery). Geometry ports come from +`w_dp_req_i.transp_mode/tensor_m/tensor_n`; `clear_i` is pulsed on the first beat +of a transpose transfer. + +### 5.2 decouple_rw / decouple_aw + +Transpose **requires** `decouple_rw = 1`: the engine's ping-pong overlaps the +write of tile k-1 with the read of tile k (`idma_otf_transpose.sv:19-23`). +`decouple_aw = 1` so AWs trail the first R by the one-tile fill latency. Set them +**per-transfer** (only when `transpose_en`) via `opt.beo.decouple_rw/decouple_aw` +— do **not** copy viDMA's blanket unconditional `decouple_rw` workaround +(`vidma:vidma_otf_pkg.sv` `needs_force_decouple_rw`). Keep the coupled path for +plain copies. The package warns `decouple_rw` "can cause deadlocks" +(`src/idma_pkg.sv:76`) — see §6 risk. + +### 5.3 Response / meta path + +No new fields: one transfer = one `done_id`. `idma_rsp_t` (`typedef.svh:45-50`) +flows backend → error handler → ND midend (`nd_rsp_o` valid on `burst_rsp_i.last`, +`idma_nd_midend.sv:208`) → frontend counters (inst64 `completed_id`/`next_id` +`:140-141`; reg `done_id`/`next_id`; desc64 IRQ from `flags[0]` +`idma_desc64_reshaper.sv:22`). **Must-fix:** with `decouple_rw` and a tile drained +*after* its read, the last response must be gated on **write** completion of the +last tile, not read completion — otherwise `done_id` rises before the transposed +data is committed. The engine's ping-pong/drain busy must also feed the existing +busy aggregation (`src/idma_pkg.sv` `idma_busy_t`) so frontend polling stays +correct. + +--- + +## 6. Per-layer change checklist + +- **typedef** — add `transpose_en`(1b), `transp_mode`(2b), `tensor_m`(12b), + `tensor_n`(12b) to `IDMA_TYPEDEF_OPTIONS_T` (`src/include/idma/typedef.svh:21-30`). + Req/ND/response macros unchanged. +- **frontend/reg** — `conf` bits 17/19:18; wire in `proc_hw_req_conv` + (`src/frontend/reg/tpl/idma_reg.{hjson,sv}.tpl`). M/N via existing length/reps. +- **frontend/desc64** — `flags[24]`/`flags[26:25]` + (`idma_desc64_top.sv:114`, reshaper `:27-60`); derive M/N. +- **frontend/inst64** — `DMTRANSP` arm (`idma_inst64_top.sv:386`); default new + opt fields to 0 (`:343-364`); reuse DMREP/DMSTR for reps/strides. +- **midend** — **no RTL logic change**; needs **NumDim = 4** backend config + (`RepWidth`/`NumDim` are synth params, `idma_nd_midend.sv:16,26`). Independent + walk (`:164-186`) + bypass (`:194-198`) carry it. +- **legalizer** — mirror the new opt fields into `idma_mut_tf_opt_t`, latch in the + `opt_tf_d` literal (`target/rtl/idma_legalizer_rw_axi.sv:267-301`; template + `src/backend/tpl/idma_legalizer.sv.tpl`), forward into `w_dp_req`. **No** + `otf_write_length` math. INCR asserts (`:417-419`) stay valid. +- **backend** — add `transpose_en/transp_mode/tensor_m/tensor_n` to `w_dp_req_t`; + add `EnableTranspose` param gating the transport instantiation; populate fields + from `opt_tf_q`. +- **transport** — instantiate engine at the `buffer_out → shifter` seam behind + `EnableTranspose && w_dp_req_i.transpose_en` (`target/rtl/idma_transport_layer_rw_axi.sv:201-213`); + wire `clear_i`, geometry, `valid/ready`; shift `strb_o` by `w_dp_req_i.shift`. +- **backend/axi_write** — **the one load-bearing change**: AND engine `strb_o` + into `mask_out` (`src/backend/idma_axi_write.sv:135-144`); flows to `w.strb` + (`:197`) and pop (`:174`). Define fully-masked-beat behavior (§7.1). +- **pkg** — ensure engine busy contributes to `busy_o` (`src/idma_pkg.sv`). + +--- + +## 7. Open questions & risks (be explicit — no silent gaps) + +1. **Partial-tile edge reads + phantom write beats (highest).** For M,N not + multiples of `NE` the engine still consumes/produces **full padded NE-beat + tiles** (`idma_otf_transpose.sv` contract). Two consequences: + - the last partial **row**-tile read may touch **out-of-bounds source** + addresses (writes are safe — masked by `strb_o`); + - the last partial **col**-tile produces **fully-masked phantom write beats** + (`strb_o == 0`). In `idma_axi_write.sv`, `mask_out == 0` makes + `ready_to_write` vacuously true (`:159`) and would emit a zero-strobe `W` + or stall the pop (`:174`) — **undefined today**. + Decide explicitly, do **not** let edge tiles read OOB or emit junk W: + (a) require NE-aligned M,N for the first cut (no OOB, no phantom); or + (b) add a transport-seam pad/drop block: zero-pad reads at the source edge and + drop fully-masked W beats (and shorten the AW length accordingly); or + (c) rely on an SoC contract that source/dest are allocated tile-padded. + Recommended: (a) first, (b) as the productized path. +2. **`strb_o → wstrb` merge must be exact.** Shift by `w_dp_req_i.shift` identically + to data; **AND** (not replace) with the alignment masks so it composes with + address misalignment. Verify a fully-masked beat does not deadlock the + buffer-pop path (`idma_axi_write.sv:174`). +3. **Last-response timing.** Gate `nd_rsp_o.last` / `done_id` on **write** + completion of the final tile under `decouple_rw` (§5.3), not read completion. + Guaranteed to bite given the ping-pong latency — a resolved design point, not + a "verify later". +4. **Deadlock.** `decouple_rw = 1` is required but the package warns it can + deadlock (`idma_pkg.sv:76`; cf. `doc/TODO-rw-decoupled-deadlock.md`). Set it + **per-transfer**, keep coupled path for copies, and write a directed test that + the engine's full-tile buffer filling while the write side backpressures does + **not** form a buffer-full ↔ engine-stall ↔ write-stall cycle. +5. **AW/AR id ordering.** All bursts use one `opt_tf_q.axi_id` + (`idma_legalizer_rw_axi.sv:314, 340`). With `NE` fragmented short bursts per + tile and `decouple_rw`, same-ID ordering forces correctness but the fragmented + multi-burst-per-tile W-to-AW pairing under the engine's drain is unanalyzed — + confirm before relying on it. +6. **NumDim = 4 cost.** A NumDim bump widens the ND CSR/descriptor footprint + (reps/strides per dimension) across reg/desc64/inst64. Quantify the CSR-map and + descriptor-size impact per frontend; small SoCs may want a transpose-only + reduced-width ND variant. +7. **Burst efficiency.** Each 1D run is one tile-row (`SW` bytes ⇒ one beat), so + AR/AW issue at one (short) burst per tile-row. This is inherent to the tiled + walk; if AR/AW issue-rate bottlenecks, coalesce contiguous tile-rows when + `N == NE` (single col-tile makes a tile's rows contiguous) as a special case. + +--- + +## 8. Net + +- **Control**: typed, request-scoped `idma_req_t.opt` fields — reject viDMA's + sticky global opcode side-band. +- **Address**: same tile-visitation order on AR and AW; transpose lives in the + engine; **transposed `dst_strides`** over a NumDim=4 ND program (midden logic + untouched, config bump only). Stride table in §4.2. +- **Data**: engine spliced at the `buffer_out` seam behind `EnableTranspose`, + self-draining; **one load-bearing new RTL primitive**: AND the shifted engine + `strb_o` into `mask_out` in `idma_axi_write.sv`. +- **Largest unresolved items**: partial-tile edge reads / phantom write beats + (§7.1), last-response-on-write timing (§7.3), `decouple_rw` deadlock test (§7.4). diff --git a/idma.mk b/idma.mk index 06d6c4b8..0034c62a 100644 --- a/idma.mk +++ b/idma.mk @@ -16,6 +16,9 @@ SPHINXBUILD ?= sphinx-build VCS ?= vcs VERILATOR ?= verilator VLOGAN ?= vlogan +VSIM ?= vsim +VLOG ?= vlog +VLIB ?= vlib # Shell SHELL := /bin/bash @@ -32,6 +35,8 @@ IDMA_OCCAMY_IDS := \ r_axi_rw_init_rw_obi IDMA_ADD_IDS ?= IDMA_BACKEND_IDS := $(IDMA_BASE_IDS) $(IDMA_OCCAMY_IDS) $(IDMA_ADD_IDS) +# Backend variants that host the on-the-fly compute dispatcher (single AXI write) +IDMA_VIDMA_IDS ?= rw_axi # generated frontends IDMA_BASE_FE_IDS := reg32_3d reg64_2d reg64_1d @@ -110,17 +115,17 @@ IDMA_RTL_FILES := $(IDMA_RTL_DIR)/idma_transport_layer \ IDMA_VSIM_DIR := $(IDMA_ROOT)/target/sim/vsim define idma_gen - $(PYTHON) $(IDMA_GEN) --entity $1 --tpl $2 --db $3 --ids $4 --fids $5 > $6 + $(PYTHON) $(IDMA_GEN) --entity $1 --tpl $2 --db $3 --ids $4 --fids $5 $(if $7,--compute-ids $7) > $6 endef $(IDMA_RTL_DIR)/idma_transport_layer_%.sv: $(IDMA_GEN) $(IDMA_GEN_SRC) $(IDMA_ROOT)/src/backend/tpl/idma_transport_layer.sv.tpl $(IDMA_DB_FILES) - $(call idma_gen,transport,$(IDMA_ROOT)/src/backend/tpl/idma_transport_layer.sv.tpl,$(IDMA_DB_FILES),$*,,$@) + $(call idma_gen,transport,$(IDMA_ROOT)/src/backend/tpl/idma_transport_layer.sv.tpl,$(IDMA_DB_FILES),$*,,$@,$(IDMA_VIDMA_IDS)) $(IDMA_RTL_DIR)/idma_legalizer_%.sv: $(IDMA_GEN) $(IDMA_GEN_SRC) $(IDMA_ROOT)/src/backend/tpl/idma_legalizer.sv.tpl $(IDMA_DB_FILES) - $(call idma_gen,legalizer,$(IDMA_ROOT)/src/backend/tpl/idma_legalizer.sv.tpl,$(IDMA_DB_FILES),$*,,$@) + $(call idma_gen,legalizer,$(IDMA_ROOT)/src/backend/tpl/idma_legalizer.sv.tpl,$(IDMA_DB_FILES),$*,,$@,$(IDMA_VIDMA_IDS)) $(IDMA_RTL_DIR)/idma_backend_%.sv: $(IDMA_GEN) $(IDMA_GEN_SRC) $(IDMA_RTL_DIR)/idma_legalizer_%.sv $(IDMA_RTL_DIR)/idma_transport_layer_%.sv $(IDMA_ROOT)/src/backend/tpl/idma_backend.sv.tpl $(IDMA_DB_FILES) - $(call idma_gen,backend,$(IDMA_ROOT)/src/backend/tpl/idma_backend.sv.tpl,$(IDMA_DB_FILES),$*,,$@) + $(call idma_gen,backend,$(IDMA_ROOT)/src/backend/tpl/idma_backend.sv.tpl,$(IDMA_DB_FILES),$*,,$@,$(IDMA_VIDMA_IDS)) $(IDMA_RTL_DIR)/idma_backend_synth_%.sv: $(IDMA_GEN) $(IDMA_GEN_SRC) $(IDMA_RTL_DIR)/idma_backend_%.sv $(IDMA_ROOT)/src/backend/tpl/idma_backend_synth.sv.tpl $(IDMA_DB_FILES) $(call idma_gen,synth_wrapper,$(IDMA_ROOT)/src/backend/tpl/idma_backend_synth.sv.tpl,$(IDMA_DB_FILES),$*,,$@) @@ -306,9 +311,84 @@ define idma_generate_vsim endef $(IDMA_VSIM_DIR)/compile.tcl: $(IDMA_BENDER_FILES) $(IDMA_FULL_TB) $(IDMA_FULL_RTL) $(IDMA_INCLUDE_ALL) $(IDMA_WAVE_ALL) - $(call idma_generate_vsim, $@, -t sim -t test -t idma_test -t synth -t rtl -t asic -t snitch_cluster,../../..) + $(call idma_generate_vsim, $@, -t sim -t test -t idma_test -t synth -t rtl -t asic -t snitch_cluster -t split_rtl,../../..) + +.PHONY: idma_sim_tb_idma_rt_midend + +idma_sim_tb_idma_rt_midend: $(IDMA_VSIM_DIR)/compile.tcl + cd $(IDMA_VSIM_DIR); $(VSIM) -c -do "source compile.tcl; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc \ + tb_idma_rt_midend -do "run -all; quit" + +# Standalone self-checking transpose-engine regression (DPI-C golden, no backend deps). +# Run with the Questa SEPP wrapper, e.g.: +# make idma_sim_tb_idma_otf_transpose VSIM="questa-2023.4 vsim" VLOG="questa-2023.4 vlog" VLIB="questa-2023.4 vlib" +IDMA_OTF_TP_RTL := $(abspath $(IDMA_ROOT)/src/backend/idma_otf_transpose.sv) +IDMA_OTF_TP_TB := $(abspath $(IDMA_ROOT)/test/tb_idma_otf_transpose.sv) +IDMA_OTF_TP_DPI := $(abspath $(IDMA_ROOT)/test/idma_transpose_dpi.c) +IDMA_OTF_TP_DIR := $(abspath $(IDMA_VSIM_DIR))/otf_transpose + +.PHONY: idma_sim_tb_idma_otf_transpose +idma_sim_tb_idma_otf_transpose: + mkdir -p $(IDMA_OTF_TP_DIR) + cd $(IDMA_OTF_TP_DIR); $(VLIB) work + cd $(IDMA_OTF_TP_DIR); $(VLOG) -sv $(IDMA_OTF_TP_DPI) + cd $(IDMA_OTF_TP_DIR); $(VLOG) -sv -svinputport=compat -timescale "1ns/1fs" $(IDMA_OTF_TP_RTL) $(IDMA_OTF_TP_TB) + cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gStrbWidth=8 -gM=13 -gN=19 -gEB=1 tb_idma_otf_transpose +BP -do "run -all; quit" + cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gStrbWidth=8 -gM=7 -gN=5 -gEB=2 tb_idma_otf_transpose +BP -do "run -all; quit" + cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gStrbWidth=8 -gM=5 -gN=3 -gEB=4 tb_idma_otf_transpose +BP -do "run -all; quit" + cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gStrbWidth=64 -gM=130 -gN=70 -gEB=1 tb_idma_otf_transpose +BP -do "run -all; quit" + cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gFullDuplex=0 -gStrbWidth=8 -gM=13 -gN=19 -gEB=1 tb_idma_otf_transpose +BP -do "run -all; quit" + cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gFullDuplex=0 -gStrbWidth=8 -gM=7 -gN=5 -gEB=2 tb_idma_otf_transpose +BP -do "run -all; quit" + cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gFullDuplex=0 -gStrbWidth=64 -gM=130 -gN=70 -gEB=1 tb_idma_otf_transpose +BP -do "run -all; quit" + +# Multi-tile transpose via the ND midend (transposed strides) -> rw_axi backend +# (engine spliced at the write seam) -> axi_sim_mem. Covers aligned and edge +# (M or N not a multiple of NE) geometries for int8/fp16/fp32. Needs the +# split_rtl flow (per-variant routing). Run with the Questa SEPP wrapper: +# make idma_sim_tb_idma_transpose_nd VSIM="questa-2023.4 vsim" +.PHONY: idma_sim_tb_idma_transpose_nd +idma_sim_tb_idma_transpose_nd: $(IDMA_VSIM_DIR)/compile.tcl + cd $(IDMA_VSIM_DIR); $(VSIM) -c -do "source compile.tcl; quit" + # ── aligned (regression: M,N multiples of NE) ── + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=8 -gN=8 -gEB=1 tb_idma_transpose_nd -do "run -all; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=16 -gN=16 -gEB=1 tb_idma_transpose_nd -do "run -all; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=16 -gN=8 -gEB=1 tb_idma_transpose_nd -do "run -all; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=64 -gM=32 -gN=24 -gEB=1 tb_idma_transpose_nd -do "run -all; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=8 -gN=8 -gEB=2 tb_idma_transpose_nd -do "run -all; quit" + # ── edge: partial output cols only (M%NE!=0, N%NE==0; within-beat wstrb) ── + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=6 -gN=8 -gEB=1 tb_idma_transpose_nd -do "run -all; quit" + # ── edge: partial output rows only (N%NE!=0; zero-strobe drain beats) ── + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=8 -gN=6 -gEB=1 tb_idma_transpose_nd -do "run -all; quit" + # ── edge: both partial (int8) ── + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=6 -gN=6 -gEB=1 tb_idma_transpose_nd -do "run -all; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=5 -gN=7 -gEB=1 tb_idma_transpose_nd -do "run -all; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=10 -gN=6 -gEB=1 tb_idma_transpose_nd -do "run -all; quit" + # ── edge: fp16 (EB=2) and fp32 (EB=4) ── + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=5 -gN=5 -gEB=2 tb_idma_transpose_nd -do "run -all; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=64 -gM=9 -gN=5 -gEB=4 tb_idma_transpose_nd -do "run -all; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=64 -gM=13 -gN=19 -gEB=1 tb_idma_transpose_nd -do "run -all; quit" + +# Back-to-back regressions: the ND midend must reload each new transfer's base +# address (it does, for a protocol-compliant producer that drops nd_req_valid on +# accept). tb_idma_nd_midend_b2b checks the midend's burst-address sequence under +# backpressure; tb_idma_transpose_b2b checks two end-to-end transposes to distinct +# destinations. Run with the Questa SEPP wrapper. +.PHONY: idma_sim_tb_idma_nd_midend_b2b +idma_sim_tb_idma_nd_midend_b2b: $(IDMA_VSIM_DIR)/compile.tcl + cd $(IDMA_VSIM_DIR); $(VSIM) -c -do "source compile.tcl; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc tb_idma_nd_midend_b2b -do "run -all; quit" + +.PHONY: idma_sim_tb_idma_transpose_b2b +idma_sim_tb_idma_transpose_b2b: $(IDMA_VSIM_DIR)/compile.tcl + cd $(IDMA_VSIM_DIR); $(VSIM) -c -do "source compile.tcl; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=6 -gN=8 -gEB=1 tb_idma_transpose_b2b -do "run -all; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=8 -gN=8 -gEB=1 tb_idma_transpose_b2b -do "run -all; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=64 -gM=13 -gN=19 -gEB=1 tb_idma_transpose_b2b -do "run -all; quit" + cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=5 -gN=5 -gEB=2 tb_idma_transpose_b2b -do "run -all; quit" idma_sim_clean: + rm -rf $(IDMA_OTF_TP_DIR) rm -rf $(IDMA_VSIM_DIR)/compile.tcl rm -rf $(IDMA_VSIM_DIR)/work rm -f $(IDMA_VSIM_DIR)/dma_trace_* diff --git a/jobs/backend_rw_axi/transpose_none.txt b/jobs/backend_rw_axi/transpose_none.txt new file mode 100644 index 00000000..e69de29b diff --git a/src/backend/idma_axi_write.sv b/src/backend/idma_axi_write.sv index ee27349d..faa309db 100644 --- a/src/backend/idma_axi_write.sv +++ b/src/backend/idma_axi_write.sv @@ -73,7 +73,11 @@ module idma_axi_write #( /// Valid from buffer input strb_t buffer_out_valid_i, /// Ready to buffer - output strb_t buffer_out_ready_o + output strb_t buffer_out_ready_o, + /// External write-strobe mask (ANDed into wstrb); tie to '1 when unused + input strb_t mask_ext_i, + /// Pulses when a write beat is accepted on the bus (strobe-independent) + output logic w_beat_done_o ); // offsets needed for masks to empty buffer strb_t w_first_mask; @@ -141,6 +145,8 @@ module idma_axi_write #( if (w_dp_req_i.tailer != '0 & last_w) begin mask_out = mask_out & w_last_mask; end + // external mask: some bytes may be masked by an OTF engine + mask_out = mask_out & mask_ext_i; end @@ -170,6 +176,8 @@ module idma_axi_write #( // write happening: both the bus (w_ready) and the buffer (ready_to_write) is high assign write_happening = ready_to_write & write_rsp_i.w_ready; + assign w_beat_done_o = write_happening; + // the main buffer is conditionally to the write mask popped assign buffer_out_ready_o = write_happening ? mask_out : '0; diff --git a/src/backend/idma_otf_compute.sv b/src/backend/idma_otf_compute.sv new file mode 100644 index 00000000..88bfff32 --- /dev/null +++ b/src/backend/idma_otf_compute.sv @@ -0,0 +1,100 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Authors: +// - Daniel Keller + +/// On-the-fly compute dispatcher at the transport write seam: latches the +/// per-transfer compute options and dispatches one op per transfer to its sub-unit. +module idma_otf_compute #( + /// Byte lanes per beat (= DataWidth/8) + parameter int unsigned StrbWidth = 32'd8, + /// Compile-time per-op feature enables (value rendered by the generator) + parameter idma_pkg::compute_enable_t ComputeEnable = '0, + /// Transpose engine duplex (1: two banks full rate, 0: one bank half area) + parameter bit TransposeFullDuplex = 1'b1 +) ( + input logic clk_i, + input logic rst_ni, + + /// Per-transfer compute config; valid only while `cfg_valid_i` + input idma_pkg::compute_options_t compute_i, + input logic cfg_valid_i, + /// A supported compute op is armed for this transfer + output logic active_o, + + /// Input beat stream (from the dataflow buffer) + input logic [StrbWidth-1:0][7:0] data_i, + input logic valid_i, + output logic in_ready_o, + + /// Output beat stream (computed) with per-byte strobe for edge masking + output logic [StrbWidth-1:0][7:0] data_o, + output logic [StrbWidth-1:0] strb_o, + output logic valid_o, + input logic ready_i +); + + // config latch with first-beat bypass + idma_pkg::compute_options_t latched_q, eff_compute; + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) latched_q <= '0; + else if (cfg_valid_i) latched_q <= compute_i; + end + assign eff_compute = cfg_valid_i ? compute_i : latched_q; + + // per-op select + logic sel_transpose; + assign sel_transpose = eff_compute.enable & + (eff_compute.op == idma_pkg::COMPUTE_TRANSPOSE) & ComputeEnable.transpose; + + assign active_o = sel_transpose; + + // transpose sub-unit + logic [StrbWidth-1:0][7:0] tp_data; + logic [StrbWidth-1:0] tp_strb; + logic tp_valid, tp_in_ready; + + if (ComputeEnable.transpose) begin : gen_transpose + idma_otf_transpose #( + .StrbWidth ( StrbWidth ), + .DimWidth ( idma_pkg::TransposeDimWidth ), + .FullDuplex ( TransposeFullDuplex ) + ) i_idma_otf_transpose ( + .clk_i, + .rst_ni, + .clear_i ( ~sel_transpose ), + .transp_mode_i ( eff_compute.params.transpose.mode ), + .tensor_size_m_i ( eff_compute.params.transpose.tensor_m ), + .tensor_size_n_i ( eff_compute.params.transpose.tensor_n ), + .data_i ( data_i ), + .valid_i ( valid_i & sel_transpose ), + .ready_o ( tp_in_ready ), + .data_o ( tp_data ), + .strb_o ( tp_strb ), + .valid_o ( tp_valid ), + .ready_i ( ready_i & sel_transpose ) + ); + end else begin : gen_no_transpose + assign tp_data = '0; assign tp_strb = '0; assign tp_valid = 1'b0; assign tp_in_ready = 1'b0; + end + + // output dispatch + always_comb begin + data_o = '0; + strb_o = '0; + valid_o = 1'b0; + in_ready_o = 1'b0; + unique case (1'b1) + sel_transpose: begin + data_o = tp_data; + strb_o = tp_strb; + valid_o = tp_valid; + in_ready_o = tp_in_ready; + end + default: ; + endcase + end + +endmodule : idma_otf_compute diff --git a/src/backend/idma_otf_transpose.sv b/src/backend/idma_otf_transpose.sv new file mode 100644 index 00000000..5c543f99 --- /dev/null +++ b/src/backend/idma_otf_transpose.sv @@ -0,0 +1,234 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Authors: +// - Daniel Keller +// +// On-the-fly matrix transpose engine (ping-pong tile banks) for the iDMA +// transport write seam. Adapted from the datamover (Ratha) HWPE: +// pulp-platform/datamover@d58a985, rtl/datamover_engine.sv. +// +// Contract: input padded to full tiles, fed (col-tile, row-tile, row) order; +// out_T[nt*NE+k][rt*NE+r] = in[rt*NE+r][nt*NE+k]. strb_o masks partial edge tiles. +// Throughput: 1 + 1/NE cycles per NE-beat tile (one handoff bubble per tile). + +module idma_otf_transpose #( + /// Byte lanes per beat (= DataWidth/8) + parameter int unsigned StrbWidth = 32'd8, + /// Tensor dimension width in elements (matches idma_pkg::TransposeDimWidth) + parameter int unsigned DimWidth = 32'd12, + /// 1: two tile banks, fill while draining; 0: one bank (half area, half rate) + parameter bit FullDuplex = 1'b1, + localparam int unsigned NumBanks = FullDuplex ? 32'd2 : 32'd1, + localparam int unsigned LaneW = $clog2(StrbWidth) +) ( + input logic clk_i, + input logic rst_ni, + input logic clear_i, + + /// Element size select: 0->1B, 1->2B, 2->4B (E = 1<StrbWidth) degrades to NE=1 + assign eff_mode = (transp_mode_i > LaneW) ? LaneW[1:0] : transp_mode_i; + assign ne_m1 = (1 << (LaneW - eff_mode)) - 1; // NE-1 + assign log2_ne = LaneW - eff_mode; + // Widen the ceil-div add by one bit so it cannot wrap at the dim range. + assign y_tiles = DimWidth'(((DimWidth+1)'(tensor_size_m_i) + ne_m1) >> log2_ne); + assign n_tiles = DimWidth'(((DimWidth+1)'(tensor_size_n_i) + ne_m1) >> log2_ne); + assign leftover_rows = tensor_size_m_i & ne_m1; + assign leftover_cols = tensor_size_n_i & ne_m1; + + // FF tile banks (ping-pong when FullDuplex), E=1 worst case (StrbWidth x StrbWidth B) + logic [StrbWidth-1:0][7:0] tile_q [NumBanks][StrbWidth]; + + // internal output + handshakes + logic [StrbWidth-1:0][7:0] data_int; + logic [StrbWidth-1:0] strb_int; + logic valid_int, ready_int; + logic in_hs, out_hs; + assign in_hs = valid_i & ready_o; + assign out_hs = valid_int & ready_int; + + // full_q[b]: bank b holds a complete tile. Producer sets on fill-complete, + // consumer clears on drain-complete. + logic [NumBanks-1:0] full_q; + logic wr_bank, rd_bank; + logic [LaneW-1:0] wr_cnt, rd_cnt; // intra-tile beat index (write / read) + logic wr_last, rd_last; + + assign wr_last = (wr_cnt == ne_m1[LaneW-1:0]); + assign rd_last = (rd_cnt == ne_m1[LaneW-1:0]); + + assign ready_o = ~full_q[wr_bank]; + assign valid_int = full_q[rd_bank]; + + // tile walkers (col-tile outer, row-tile inner); drain trails fill by up to one tile + logic [DimWidth-1:0] rtw, ntw; // write walker: row-tile, col-tile + logic [DimWidth-1:0] rtr, ntr; // read walker: row-tile, col-tile + + logic last_y_tile_w, last_n_tile_w; // edge flags of the tile being filled + assign last_y_tile_w = (rtw == y_tiles - 1); + assign last_n_tile_w = (ntw == n_tiles - 1); + + logic last_y_tile_r, last_n_tile_r; // edge flags of the tile being drained + assign last_y_tile_r = (rtr == y_tiles - 1); + assign last_n_tile_r = (ntr == n_tiles - 1); + + // per-bank edge flags, captured at fill-complete, consumed by the drain strobe + logic shadow_last_y [NumBanks]; + logic shadow_last_n [NumBanks]; + + // fill-/drain-complete events + logic fill_done, drain_done, exec_done; + assign fill_done = in_hs & wr_last; + assign drain_done = out_hs & rd_last; + // transfer done once the final tile drains + assign exec_done = drain_done & last_y_tile_r & last_n_tile_r; + + // producer (input) side + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + for (int b = 0; b < NumBanks; b++) + for (int r = 0; r < StrbWidth; r++) + tile_q[b][r] <= '0; + wr_cnt <= '0; + wr_bank <= 1'b0; + rtw <= '0; + ntw <= '0; + for (int b = 0; b < NumBanks; b++) begin + shadow_last_y[b] <= 1'b0; + shadow_last_n[b] <= 1'b0; + end + end else if (clear_i || exec_done) begin + wr_cnt <= '0; + wr_bank <= 1'b0; + rtw <= '0; + ntw <= '0; + for (int b = 0; b < NumBanks; b++) begin + shadow_last_y[b] <= 1'b0; + shadow_last_n[b] <= 1'b0; + end + end else begin + if (in_hs) begin + tile_q[wr_bank][wr_cnt] <= data_i; + wr_cnt <= wr_last ? '0 : (wr_cnt + 1'b1); + end + if (fill_done) begin + shadow_last_y[wr_bank] <= last_y_tile_w; + shadow_last_n[wr_bank] <= last_n_tile_w; + wr_bank <= FullDuplex ? ~wr_bank : 1'b0; + if (rtw == y_tiles - 1) begin + rtw <= '0; + ntw <= ntw + 1'b1; + end else begin + rtw <= rtw + 1'b1; + end + end + end + end + + // consumer (output) side + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni || clear_i || exec_done) begin + rd_cnt <= '0; + rd_bank <= 1'b0; + rtr <= '0; + ntr <= '0; + end else begin + if (out_hs) begin + rd_cnt <= rd_last ? '0 : (rd_cnt + 1'b1); + end + if (drain_done) begin + rd_bank <= FullDuplex ? ~rd_bank : 1'b0; + if (rtr == y_tiles - 1) begin + rtr <= '0; + ntr <= ntr + 1'b1; + end else begin + rtr <= rtr + 1'b1; + end + end + end + end + + // full/empty token + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni || clear_i || exec_done) begin + full_q <= 2'b00; + end else begin + if (fill_done) full_q[wr_bank] <= 1'b1; + if (drain_done) full_q[rd_bank] <= 1'b0; + end + end + + // transposed readout: byte p (element e=p>>logE, byte b=p&(E-1)) reads + // tile_q[rd_bank][e][rd_cnt*E + b] + always_comb begin + for (int p = 0; p < StrbWidth; p++) begin + automatic int unsigned e = p >> eff_mode; + automatic int unsigned b = p & ((1 << eff_mode) - 1); + automatic int unsigned col = (rd_cnt << eff_mode) | b; + data_int[p] = tile_q[rd_bank][e][col]; + end + end + + // output strobe: element-granular edge masking from the drain-side shadow flags + always_comb begin + logic [StrbWidth-1:0] em; // per-element valid (only low NE bits meaningful) + logic ly, ln; + ly = shadow_last_y[rd_bank]; + ln = shadow_last_n[rd_bank]; + for (int e = 0; e < StrbWidth; e++) begin + logic v; + if ((ly && leftover_rows != 0) && (ln && leftover_cols != 0)) + v = (rd_cnt < leftover_cols) && (e < leftover_rows); + else if (ly && leftover_rows != 0) + v = (e < leftover_rows); + else if (ln && leftover_cols != 0) + v = (rd_cnt < leftover_cols); + else + v = 1'b1; + em[e] = v; + end + for (int p = 0; p < StrbWidth; p++) + strb_int[p] = em[p >> eff_mode]; + end + + // output register; not cleared by exec_done so the final beat is held until accepted + assign ready_int = ~valid_o | ready_i; + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni || clear_i) begin + valid_o <= 1'b0; + data_o <= '0; + strb_o <= '0; + end else if (ready_int) begin + valid_o <= valid_int; + data_o <= data_int; + strb_o <= strb_int; + end + end + +endmodule : idma_otf_transpose diff --git a/src/backend/tpl/idma_backend.sv.tpl b/src/backend/tpl/idma_backend.sv.tpl index c5a2e164..a76120e9 100644 --- a/src/backend/tpl/idma_backend.sv.tpl +++ b/src/backend/tpl/idma_backend.sv.tpl @@ -170,9 +170,23 @@ _rsp_t ${protocol}_write_rsp_i, output idma_busy_t busy_o ); + /// Extra write-descriptor slots covering the compute (transpose) tile-fill latency + localparam int unsigned ComputeFifoDepth = ${"StrbWidth" if enable_compute else "32'd0"}; +% if enable_compute: + + /// Per-op compute set baked into this variant (frontends may cross-check) + localparam idma_pkg::compute_enable_t ComputeEnable = + '{${', '.join("%s: 1'b1" % op for op in compute_ops)}}; +`ifndef SYNTHESIS + // no engine flush on abort: compute is incompatible with error handling + initial assert (ErrorCap == idma_pkg::NO_ERROR_HANDLING) else + $fatal(1, "compute requires ErrorCap == NO_ERROR_HANDLING"); +`endif +% endif + /// The localparam MetaFifoDepth holds the maximum number of transfers that can be /// in-flight under any circumstances. - localparam int unsigned MetaFifoDepth = BufferDepth + NumAxInFlight + MemSysDepth; + localparam int unsigned MetaFifoDepth = BufferDepth + NumAxInFlight + MemSysDepth + ComputeFifoDepth; /// Address type typedef logic [AddrWidth-1:0] addr_t; @@ -231,6 +245,7 @@ _rsp_t ${protocol}_write_rsp_i, offset_t shift; axi_pkg::len_t num_beats; logic is_single; + idma_pkg::compute_options_t compute; } w_dp_req_t; /// The datapath write response type provides feedback from the write part of the datapath: @@ -290,6 +305,7 @@ _rsp_t ${protocol}_write_rsp_i, idma_pkg::axi_options_t src_axi_opt; idma_pkg::axi_options_t dst_axi_opt; logic super_last; + idma_pkg::compute_options_t compute; } idma_mut_tf_opt_t; /// The mutable transfer type holds important information that is mutated by the @@ -483,7 +499,8 @@ _rsp_t ${protocol}_write_rsp_i, tailer: OffsetWidth'(idma_req_i.length + idma_req_i.dst_addr[OffsetWidth-1:0]), shift: OffsetWidth'(- idma_req_i.dst_addr[OffsetWidth-1:0]), num_beats: len, - is_single: len == '0 + is_single: len == '0, + compute: idma_req_i.opt.compute }; // if the legalizer is bypassed; every burst is the last of the 1D transfer @@ -610,7 +627,7 @@ _rsp_t ${protocol}_write_rsp_i, ); stream_fifo_optimal_wrap #( - .Depth ( NumAxInFlight ), + .Depth ( NumAxInFlight + ComputeFifoDepth ), .type_t ( w_dp_req_t ), .PrintInfo ( PrintFifoInfo ) ) i_w_dp_req ( diff --git a/src/backend/tpl/idma_legalizer.sv.tpl b/src/backend/tpl/idma_legalizer.sv.tpl index 777ee1bc..c3643f69 100644 --- a/src/backend/tpl/idma_legalizer.sv.tpl +++ b/src/backend/tpl/idma_legalizer.sv.tpl @@ -478,8 +478,13 @@ w_num_bytes_to_pb = w_page_num_bytes_to_pb; dst_protocol: req_i.opt.dst_protocol, read_shift: '0, write_shift: '0, +% if enable_compute: + decouple_rw: req_i.opt.beo.decouple_rw | req_i.opt.compute.enable, + decouple_aw: req_i.opt.beo.decouple_aw | req_i.opt.compute.enable, +% else: decouple_rw: req_i.opt.beo.decouple_rw, decouple_aw: req_i.opt.beo.decouple_aw, +% endif src_max_llen: req_i.opt.beo.src_max_llen, dst_max_llen: req_i.opt.beo.dst_max_llen, src_reduce_len: req_i.opt.beo.src_reduce_len, @@ -487,7 +492,8 @@ w_num_bytes_to_pb = w_page_num_bytes_to_pb; axi_id: req_i.opt.axi_id, src_axi_opt: req_i.opt.src, dst_axi_opt: req_i.opt.dst, - super_last: req_i.opt.last + super_last: req_i.opt.last, + compute: req_i.opt.compute }; // determine shift amount if (CombinedShifter) begin @@ -549,7 +555,8 @@ ${database[used_write_protocols[0]]['legalizer_write_data_path']} tailer: OffsetWidth'(w_num_bytes + w_addr_offset), shift: opt_tf_q.write_shift, num_beats: 'd0, - is_single: 1'b1 + is_single: 1'b1, + compute: opt_tf_q.compute }; % endif end @@ -583,7 +590,8 @@ ${database[protocol]['legalizer_write_data_path']} tailer: OffsetWidth'(w_num_bytes + w_addr_offset), shift: opt_tf_q.write_shift, num_beats: 'd0, - is_single: 1'b1 + is_single: 1'b1, + compute: opt_tf_q.compute }; endcase end diff --git a/src/backend/tpl/idma_transport_layer.sv.tpl b/src/backend/tpl/idma_transport_layer.sv.tpl index 91d27131..019864e0 100644 --- a/src/backend/tpl/idma_transport_layer.sv.tpl +++ b/src/backend/tpl/idma_transport_layer.sv.tpl @@ -228,6 +228,10 @@ _rsp_t ${protocol}_write_rsp_i, // aligned and coalesced data leaving the buffer byte_t [2*StrbWidth-1:0] buffer_out_tmp; byte_t [StrbWidth-1:0] buffer_out, buffer_out_shifted; + // compute write seam (passthrough when compute is off) + byte_t [StrbWidth-1:0] wr_data; + strb_t wr_valid, wr_strb, mask_ext_shifted, dataflow_ready_in; + logic w_beat_done; % if not one_read_port: // Read multiplexed signals @@ -439,16 +443,61 @@ ${rendered_read_ports[read_port]} .ready_o ( buffer_in_ready ), .data_o ( buffer_out ), .valid_o ( buffer_out_valid ), - .ready_i ( buffer_out_ready_shifted ) + .ready_i ( dataflow_ready_in ) ); + //-------------------------------------- + // On-the-fly compute (write seam) + //-------------------------------------- + +% if enable_compute: + logic cmp_active; + logic cmp_in_ready, cmp_out_valid; + byte_t [StrbWidth-1:0] cmp_data_o; + strb_t cmp_strb_o; + + // beats retire on w_beat_done (strobe-independent) + idma_otf_compute #( + .StrbWidth ( StrbWidth ), + .ComputeEnable ( '{${', '.join("%s: 1'b1" % op for op in compute_ops)}} ), + .TransposeFullDuplex ( 1'b${'1' if compute_full_duplex else '0'} ) + ) i_idma_otf_compute ( + .clk_i, + .rst_ni, + .compute_i ( w_dp_req_i.compute ), + .cfg_valid_i ( w_dp_valid_i ), + .active_o ( cmp_active ), + .data_i ( buffer_out ), + .valid_i ( &buffer_out_valid ), + .in_ready_o ( cmp_in_ready ), + .data_o ( cmp_data_o ), + .strb_o ( cmp_strb_o ), + .valid_o ( cmp_out_valid ), + .ready_i ( w_beat_done ) + ); + + // whole-beat valid; edge masking carried on wr_strb + assign wr_data = cmp_active ? cmp_data_o : buffer_out; + assign wr_valid = cmp_active ? {StrbWidth{cmp_out_valid}} : buffer_out_valid; + assign wr_strb = cmp_active ? cmp_strb_o : '1; + // pop the buffer only on a compute input handshake + assign dataflow_ready_in = cmp_active ? {StrbWidth{(&buffer_out_valid) & cmp_in_ready}} + : buffer_out_ready_shifted; +% else: + assign wr_data = buffer_out; + assign wr_valid = buffer_out_valid; + assign wr_strb = '1; + assign dataflow_ready_in = buffer_out_ready_shifted; +% endif + //-------------------------------------- // Write Barrel shifter //-------------------------------------- - assign buffer_out_tmp = {buffer_out, buffer_out} >> (w_dp_req_i.shift*8); + assign buffer_out_tmp = {wr_data, wr_data} >> (w_dp_req_i.shift*8); assign buffer_out_shifted = buffer_out_tmp[$bits(buffer_out_shifted)/8-1:0]; - assign buffer_out_valid_shifted = strb_t'({buffer_out_valid, buffer_out_valid} >> w_dp_req_i.shift); + assign buffer_out_valid_shifted = strb_t'({wr_valid, wr_valid} >> w_dp_req_i.shift); + assign mask_ext_shifted = strb_t'({wr_strb, wr_strb} >> w_dp_req_i.shift); assign buffer_out_ready_shifted = strb_t'({buffer_out_ready, buffer_out_ready} >> - w_dp_req_i.shift); % if not one_write_port: diff --git a/src/db/idma_axi.yml b/src/db/idma_axi.yml index 02c7f884..497b1abd 100644 --- a/src/db/idma_axi.yml +++ b/src/db/idma_axi.yml @@ -66,7 +66,8 @@ legalizer_write_data_path: | tailer: OffsetWidth'(w_num_bytes + w_addr_offset), shift: opt_tf_q.write_shift, num_beats: w_req_o.aw_req.axi.aw_chan.len, - is_single: w_req_o.aw_req.axi.aw_chan.len == '0 + is_single: w_req_o.aw_req.axi.aw_chan.len == '0, + compute: opt_tf_q.compute }; read_template: | idma_axi_read #( @@ -127,7 +128,9 @@ write_template: | .write_rsp_i ( ${write_response} ), .buffer_out_i ( buffer_out_shifted ), .buffer_out_valid_i ( buffer_out_valid_shifted ), - .buffer_out_ready_o ( ${buffer_out_ready} ) + .buffer_out_ready_o ( ${buffer_out_ready} ), + .mask_ext_i ( mask_ext_shifted ), + .w_beat_done_o ( w_beat_done ) ); synth_wrapper_ports_write: | output id_t axi_aw_id_o, diff --git a/src/db/idma_tilelink.yml b/src/db/idma_tilelink.yml index 652a6caf..06771253 100644 --- a/src/db/idma_tilelink.yml +++ b/src/db/idma_tilelink.yml @@ -84,7 +84,8 @@ legalizer_write_data_path: | tailer: OffsetWidth'(w_num_bytes + w_addr_offset), shift: opt_tf_q.write_shift, num_beats: 'd0, - is_single: w_num_bytes <= StrbWidth + is_single: w_num_bytes <= StrbWidth, + compute: opt_tf_q.compute }; read_template: | idma_tilelink_read #( diff --git a/src/idma_pkg.sv b/src/idma_pkg.sv index f3023499..3eb2f95c 100644 --- a/src/idma_pkg.sv +++ b/src/idma_pkg.sv @@ -81,6 +81,39 @@ package idma_pkg; logic dst_reduce_len; } backend_options_t; + /// On-the-fly compute operation selector + typedef enum logic [3:0] { + COMPUTE_NONE = 4'd0, + COMPUTE_TRANSPOSE = 4'd1 + } compute_op_e; + + /// Transpose tensor dimension width in elements (bound by the inst64 DMCPY argb encoding) + localparam int unsigned TransposeDimWidth = 32'd12; + + /// Transpose option type: E = 1< 1/2/4 B); tensor in elements + typedef struct packed { + logic [1:0] mode; + logic [TransposeDimWidth-1:0] tensor_m; + logic [TransposeDimWidth-1:0] tensor_n; + } transpose_options_t; + + /// Per-op compute parameter union (members must be equal width) + typedef union packed { + transpose_options_t transpose; + } compute_params_t; + + /// Compute option type: per-transfer on-the-fly compute selection + typedef struct packed { + logic enable; + compute_op_e op; + compute_params_t params; + } compute_options_t; + + /// Compile-time per-op compute feature enables + typedef struct packed { + logic transpose; + } compute_enable_t; + /// Supported Protocols /// - `AXI`: Full AXI /// - `AXILITE`: AXI Lite diff --git a/src/include/idma/typedef.svh b/src/include/idma/typedef.svh index d250ec10..bceb89c9 100644 --- a/src/include/idma/typedef.svh +++ b/src/include/idma/typedef.svh @@ -25,8 +25,9 @@ axi_id_t axi_id; \ idma_pkg::axi_options_t src; \ idma_pkg::axi_options_t dst; \ - idma_pkg::backend_options_t beo; \ - logic last; \ + idma_pkg::backend_options_t beo; \ + idma_pkg::compute_options_t compute; \ + logic last; \ } options_t; `define IDMA_TYPEDEF_ERR_PAYLOAD_T(err_payload_t, axi_addr_t) \ typedef struct packed { \ diff --git a/src/midend/idma_nd_midend.sv b/src/midend/idma_nd_midend.sv index 07267c97..fd3adaee 100644 --- a/src/midend/idma_nd_midend.sv +++ b/src/midend/idma_nd_midend.sv @@ -65,6 +65,13 @@ module idma_nd_midend #( /// How many bits are required to index the counters localparam int unsigned StrideSelWidth = $clog2(NumDim-1) + 'd1; +`ifndef SYNTHESIS + // strides are added with same-width arithmetic; narrower strides would not sign-extend + initial assert ($bits(nd_req_i.d_req[0].src_strides) == $bits(nd_req_i.burst_req.src_addr)) + else $fatal(1, "idma_nd_midend: stride width (%0d) != address width (%0d)", + $bits(nd_req_i.d_req[0].src_strides), $bits(nd_req_i.burst_req.src_addr)); +`endif + // The counter currently active (this is added to the address) logic [StrideSelWidth-1:0] stride_sel_d, stride_sel_q; diff --git a/src/midend/idma_transpose_midend.sv b/src/midend/idma_transpose_midend.sv new file mode 100644 index 00000000..9d79d551 --- /dev/null +++ b/src/midend/idma_transpose_midend.sv @@ -0,0 +1,123 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Daniel Keller + +/// Transpose geometry expander. For a request carrying opt.compute = TRANSPOSE, +/// derives the NumDim=4 tiled ND walk (row / row-tile / col-tile) from the tensor +/// shape (M, N, element mode) and the bus StrbWidth, leaving the generic +/// idma_nd_midend to walk it. Non-transpose requests pass through untouched. +/// The walk reads the source up to the tile-padded bounds (ceil to NE in both +/// dims) and writes the full padded dst extent (writes strb-masked, reads not). +/// Combinational; outputs are quasi-static per request (multicycle-safe in STA). +module idma_transpose_midend #( + /// Number of ND dimensions (must be >= 4 to express the tiled walk) + parameter int unsigned NumDim = 32'd4, + /// Write data-path width in bytes (tile side NE = StrbWidth / element bytes) + parameter int unsigned StrbWidth = 32'd64, + /// Address type + parameter type addr_t = logic, + /// ND request type + parameter type idma_nd_req_t = logic +)( + input idma_nd_req_t nd_req_i, + input logic valid_i, + output logic ready_o, + output idma_nd_req_t nd_req_o, + output logic valid_o, + input logic ready_i +); + + localparam int unsigned Log2Strb = $clog2(StrbWidth); + localparam int unsigned LenW = $bits(nd_req_o.burst_req.length); + localparam int unsigned RepW = $bits(nd_req_o.d_req[0].reps); + localparam int unsigned ModeW = $bits(nd_req_o.burst_req.opt.compute.params.transpose.mode); + localparam int unsigned TensorW = + $bits(nd_req_o.burst_req.opt.compute.params.transpose.tensor_m); + localparam int unsigned AddrW = $bits(addr_t); + // working width: largest term (YT*N)< AddrW) ? ProdW : AddrW; + + assign valid_o = valid_i; + assign ready_o = ready_i; + + logic is_transpose; + assign is_transpose = nd_req_i.burst_req.opt.compute.enable & + (nd_req_i.burst_req.opt.compute.op == idma_pkg::COMPUTE_TRANSPOSE); + + // NE and E are powers of two: all geometry folds to shifts except the YT*N + // stride product. + always_comb begin : proc_expand + logic [ModeW-1:0] mode; + logic [TensorW-1:0] tm, tn; + logic signed [WorkW-1:0] m, n, log2ne, ne, yt, nt, nxe, mpe; + logic signed [WorkW-1:0] strb_c; // NE*E == StrbWidth (mode cancels) + + nd_req_o = nd_req_i; // passthrough + + if (is_transpose) begin + mode = nd_req_i.burst_req.opt.compute.params.transpose.mode; + tm = nd_req_i.burst_req.opt.compute.params.transpose.tensor_m; + tn = nd_req_i.burst_req.opt.compute.params.transpose.tensor_n; + // zero-extend bounded dims into the signed working width + m = $signed({{(WorkW-TensorW){1'b0}}, tm}); // M + n = $signed({{(WorkW-TensorW){1'b0}}, tn}); // N + log2ne = $signed(WorkW'(Log2Strb)) - $signed({{(WorkW-ModeW){1'b0}}, mode}); + ne = $signed(WorkW'(1)) <<< log2ne; // tile side (elements) + yt = (m + ne - 1) >>> log2ne; // ceil(M/NE) + nt = (n + ne - 1) >>> log2ne; // ceil(N/NE) + nxe = n <<< mode; // N*E (E = 1<= 4) else + $fatal(1, "idma_transpose_midend requires NumDim >= 4 (got %0d)", NumDim); + // mode 0..2 needs NE >= 1, i.e. log2(StrbWidth) >= 2 + initial assert (Log2Strb >= 2) else + $fatal(1, "idma_transpose_midend requires StrbWidth >= 4 (got %0d)", StrbWidth); + // reps must hold tile counts (<= 2^TensorW) and ne (<= StrbWidth); length StrbWidth. + initial assert (RepW >= TensorW && RepW > Log2Strb) else + $fatal(1, "idma_transpose_midend: reps field %0d b too narrow (need >= %0d)", + RepW, (TensorW > Log2Strb+1) ? TensorW : Log2Strb+1); + initial assert (LenW > Log2Strb) else + $fatal(1, "idma_transpose_midend: length field %0d b cannot hold StrbWidth", LenW); + // reserved mode 3 (EB=8) and zero-size tensors are out of contract + always_comb begin : check_domain + if (is_transpose) begin + assert (nd_req_i.burst_req.opt.compute.params.transpose.mode != 2'd3) else + $error("idma_transpose_midend: reserved element mode 3 (EB=8)"); + assert (nd_req_i.burst_req.opt.compute.params.transpose.tensor_m != '0 && + nd_req_i.burst_req.opt.compute.params.transpose.tensor_n != '0) else + $error("idma_transpose_midend: zero-size tensor (M or N == 0)"); + end + end +`endif + +endmodule diff --git a/test/idma_test.sv b/test/idma_test.sv index ece009f6..05a8ae8c 100644 --- a/test/idma_test.sv +++ b/test/idma_test.sv @@ -687,7 +687,11 @@ package idma_test; input logic [2:0] dst_max_llen, input logic src_reduce_len, input logic dst_reduce_len, - input id_t id + input id_t id, + input logic transpose_en = 1'b0, + input logic [1:0] transp_mode = '0, + input logic [11:0] tensor_m = '0, + input logic [11:0] tensor_n = '0 ); idma.req.length <= #TA length; idma.req.src_addr <= #TA src_addr; @@ -702,6 +706,12 @@ package idma_test; idma.req.opt.beo.dst_max_llen <= #TA dst_max_llen; idma.req.opt.beo.src_reduce_len <= #TA src_reduce_len; idma.req.opt.beo.dst_reduce_len <= #TA dst_reduce_len; + idma.req.opt.compute.enable <= #TA transpose_en; + idma.req.opt.compute.op <= #TA transpose_en ? idma_pkg::COMPUTE_TRANSPOSE + : idma_pkg::COMPUTE_NONE; + idma.req.opt.compute.params.transpose.mode <= #TA transp_mode; + idma.req.opt.compute.params.transpose.tensor_m <= #TA tensor_m; + idma.req.opt.compute.params.transpose.tensor_n <= #TA tensor_n; idma.req_valid <= #TA 1; cycle_start(); while (idma.req_ready != 1) begin cycle_end(); cycle_start(); end diff --git a/test/idma_transpose_dpi.c b/test/idma_transpose_dpi.c new file mode 100644 index 00000000..0f7faf3f --- /dev/null +++ b/test/idma_transpose_dpi.c @@ -0,0 +1,39 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Authors: +// - Daniel Keller +// +// DPI-C golden model for idma_otf_transpose: an element-granular matrix +// transpose. Element size E in {1,2,4} bytes (int8/fp16/fp32); each E-byte +// element is kept intact while the M x N element grid is transposed to N x M. +// Accessor-based (no open-array marshalling): the testbench loads the row-major +// input byte by byte, calls gm_transpose(m,n,e), then reads back the transposed +// output bytes. Reference: out_elem[c][r] = in_elem[r][c]. + +#include + +#define GM_MAX_BYTES (1 << 24) // 16 MiB + +static uint8_t gm_in[GM_MAX_BYTES]; +static uint8_t gm_out[GM_MAX_BYTES]; + +// Load one input byte at flat byte index. +void gm_load(int idx, int val) { + if (idx >= 0 && idx < GM_MAX_BYTES) gm_in[idx] = (uint8_t)val; +} + +// Transpose an m x n matrix of e-byte elements (row-major) into n x m. +void gm_transpose(int m, int n, int e) { + for (int r = 0; r < m; r++) + for (int c = 0; c < n; c++) + for (int b = 0; b < e; b++) + gm_out[((long)c * m + r) * e + b] = gm_in[((long)r * n + c) * e + b]; +} + +// Read one transposed output byte at flat byte index. +int gm_get(int idx) { + if (idx >= 0 && idx < GM_MAX_BYTES) return (int)gm_out[idx]; + return -1; +} diff --git a/test/midend/tb_idma_nd_midend_b2b.sv b/test/midend/tb_idma_nd_midend_b2b.sv new file mode 100644 index 00000000..aaba907f --- /dev/null +++ b/test/midend/tb_idma_nd_midend_b2b.sv @@ -0,0 +1,146 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Authors: +// - Daniel Keller +// +// Back-to-back ND regression for idma_nd_midend. Drives the midend directly and +// golden-checks the burst_req address sequence: two back-to-back transfers (no +// gap) plus one after an idle gap must each walk from their own base. Catches a +// stale base-address reuse across transfers. + +`include "idma/typedef.svh" + +module tb_idma_nd_midend_b2b; + + localparam time TCK = 10ns; + localparam int unsigned AddrWidth = 32; + localparam int unsigned NumDim = 4; // 1D burst + 3 strided dims + localparam logic [NumDim-1:0][31:0] RepWidths = '{default: 32'd16}; + + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [31:0] tf_len_t; + typedef logic [11:0] id_t; + typedef logic [31:0] reps_t; + + `IDMA_TYPEDEF_FULL_REQ_T(idma_req_t, id_t, addr_t, tf_len_t) + `IDMA_TYPEDEF_FULL_RSP_T(idma_rsp_t, addr_t) + `IDMA_TYPEDEF_FULL_ND_REQ_T(idma_nd_req_t, idma_req_t, reps_t, addr_t) + + // ── Program (shared by all three transfers; only the base addresses differ) ── + localparam int unsigned R0 = 3, R1 = 2, R2 = 2; + localparam int unsigned NB = R0 * R1 * R2; // bursts per transfer + localparam addr_t SS0 = 'h10, DS0 = 'h100; + localparam addr_t SS1 = 'h40, DS1 = 'h400; + localparam addr_t SS2 = 'h1000, DS2 = 'h4000; + localparam addr_t S1 = 'h0000_1000, D1 = 'h0001_0000; // transfer 1 base + localparam addr_t S2 = 'h0000_2000, D2 = 'h0002_0000; // transfer 2 base (back-to-back) + localparam addr_t S3 = 'h0000_3000, D3 = 'h0003_0000; // transfer 3 base (after idle gap) + + logic clk, rst_n; + idma_nd_req_t nd_req; logic nd_req_valid, nd_req_ready; + idma_rsp_t nd_rsp; logic nd_rsp_valid, nd_rsp_ready; + idma_req_t burst_req; logic burst_req_valid, burst_req_ready; + idma_rsp_t burst_rsp; logic burst_rsp_valid, burst_rsp_ready; + logic busy; + + clk_rst_gen #(.ClkPeriod(TCK), .RstClkCycles(1)) i_clk_rst_gen (.clk_o(clk), .rst_no(rst_n)); + + idma_nd_midend #( + .NumDim(NumDim), .addr_t(addr_t), .idma_req_t(idma_req_t), + .idma_rsp_t(idma_rsp_t), .idma_nd_req_t(idma_nd_req_t), .RepWidths(RepWidths) + ) i_dut ( + .clk_i(clk), .rst_ni(rst_n), + .nd_req_i(nd_req), .nd_req_valid_i(nd_req_valid), .nd_req_ready_o(nd_req_ready), + .nd_rsp_o(nd_rsp), .nd_rsp_valid_o(nd_rsp_valid), .nd_rsp_ready_i(nd_rsp_ready), + .burst_req_o(burst_req), .burst_req_valid_o(burst_req_valid), .burst_req_ready_i(burst_req_ready), + .burst_rsp_i(burst_rsp), .burst_rsp_valid_i(burst_rsp_valid), .burst_rsp_ready_o(burst_rsp_ready), + .busy_o(busy) + ); + + // Backpressure on burst_req_ready is essential: during a stall stride_sel_q + // collapses toward 0, which is what can defeat the base reload. ready always-1 hides it. + logic [2:0] bp_lfsr; + always @(posedge clk or negedge rst_n) + if (!rst_n) bp_lfsr <= 3'b101; + else bp_lfsr <= {bp_lfsr[1:0], bp_lfsr[2] ^ bp_lfsr[1]}; + assign burst_req_ready = bp_lfsr[0]; // stalls ~half the cycles, incl. boundaries + assign burst_rsp = '0; + assign burst_rsp_valid = 1'b0; + assign nd_rsp_ready = 1'b1; + + // ── Capture every issued burst's src/dst address in order ── + addr_t cap_src [$]; + addr_t cap_dst [$]; + always @(posedge clk) if (rst_n && burst_req_valid && burst_req_ready) begin + cap_src.push_back(burst_req.src_addr); + cap_dst.push_back(burst_req.dst_addr); + end + + // build a NumDim=4 ND program with a given base + function automatic idma_nd_req_t mk_req(input addr_t s, input addr_t d); + idma_nd_req_t r = '0; + r.burst_req.length = tf_len_t'('h8); + r.burst_req.src_addr = s; + r.burst_req.dst_addr = d; + r.burst_req.opt.src_protocol = idma_pkg::AXI; + r.burst_req.opt.dst_protocol = idma_pkg::AXI; + r.burst_req.opt.src.burst = axi_pkg::BURST_INCR; + r.burst_req.opt.dst.burst = axi_pkg::BURST_INCR; + r.d_req[0].reps = reps_t'(R0); r.d_req[0].src_strides = SS0; r.d_req[0].dst_strides = DS0; + r.d_req[1].reps = reps_t'(R1); r.d_req[1].src_strides = SS1; r.d_req[1].dst_strides = DS1; + r.d_req[2].reps = reps_t'(R2); r.d_req[2].src_strides = SS2; r.d_req[2].dst_strides = DS2; + return r; + endfunction + + initial begin + automatic int unsigned errs = 0; + nd_req = '0; nd_req_valid = 1'b0; + @(posedge rst_n); + repeat (3) @(posedge clk); + + // ── transfer 1 ── + nd_req = mk_req(S1, D1); nd_req_valid = 1'b1; + @(posedge clk); + while (!nd_req_ready) @(posedge clk); + // ── transfer 2 : BACK-TO-BACK (keep valid high, swap payload the cycle after accept) ── + nd_req = mk_req(S2, D2); + @(posedge clk); + while (!nd_req_ready) @(posedge clk); + nd_req_valid = 1'b0; + nd_req = '0; + // ── idle gap ── + repeat (5) @(posedge clk); + // ── transfer 3 : after the gap ── + nd_req = mk_req(S3, D3); nd_req_valid = 1'b1; + @(posedge clk); + while (!nd_req_ready) @(posedge clk); + nd_req_valid = 1'b0; + repeat (3) @(posedge clk); + + // ── checks ── + if (cap_src.size() != 3*NB) + begin errs++; $display("[B2B] burst count %0d != %0d", cap_src.size(), 3*NB); end + else begin + // first burst of each transfer must equal its OWN base (reload happened) + if (cap_src[0] !== S1 || cap_dst[0] !== D1) begin errs++; $display("[B2B] T1[0]=(%0h,%0h) exp (%0h,%0h)", cap_src[0], cap_dst[0], S1, D1); end + if (cap_src[NB] !== S2 || cap_dst[NB] !== D2) begin errs++; $display("[B2B] T2[0]=(%0h,%0h) exp (%0h,%0h) -- back-to-back base NOT reloaded", cap_src[NB], cap_dst[NB], S2, D2); end + if (cap_src[2*NB] !== S3 || cap_dst[2*NB] !== D3) begin errs++; $display("[B2B] T3[0]=(%0h,%0h) exp (%0h,%0h)", cap_src[2*NB], cap_dst[2*NB], S3, D3); end + // full-sequence independence: T2 and T3 must be T1 shifted by their base delta + for (int unsigned i = 0; i < NB; i++) begin + if ((cap_src[NB+i] - cap_src[i]) !== (S2 - S1) || (cap_dst[NB+i] - cap_dst[i]) !== (D2 - D1)) begin + errs++; if (errs <= 8) $display("[B2B] T2[%0d] not T1+delta: src %0h vs %0h (Δexp %0h)", i, cap_src[NB+i], cap_src[i], S2-S1); end + if ((cap_src[2*NB+i] - cap_src[i]) !== (S3 - S1) || (cap_dst[2*NB+i] - cap_dst[i]) !== (D3 - D1)) begin + errs++; if (errs <= 8) $display("[B2B] T3[%0d] not T1+delta: src %0h vs %0h (Δexp %0h)", i, cap_src[2*NB+i], cap_src[i], S3-S1); end + end + end + + if (errs == 0) $display("[B2B] PASS: %0d back-to-back + gapped ND transfers each walked from their own base", 3*NB); + else $fatal(1, "[B2B] FAIL: %0d errors (back-to-back ND base-address reuse)", errs); + $finish(); + end + + initial begin #500_000; $fatal(1, "[B2B] timeout"); end + +endmodule diff --git a/test/midend/tb_idma_transpose_midend.sv b/test/midend/tb_idma_transpose_midend.sv new file mode 100644 index 00000000..4f7440e2 --- /dev/null +++ b/test/midend/tb_idma_transpose_midend.sv @@ -0,0 +1,102 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Daniel Keller + +`include "idma/typedef.svh" + +/// Unit check for idma_transpose_midend: the expanded NumDim=4 ND request must +/// match the golden geometry that tb_idma_transpose_nd hand-builds, and a +/// non-transpose request must pass through unchanged. +module tb_idma_transpose_midend #( + parameter int unsigned DataWidth = 512, + parameter int unsigned AddrWidth = 64, + parameter int unsigned M = 40, + parameter int unsigned N = 24, + parameter int unsigned EB = 1 +); + import idma_pkg::*; + + localparam int unsigned StrbWidth = DataWidth/8; + localparam int unsigned NE = StrbWidth/EB; + localparam int unsigned MODE = (EB==4) ? 2 : (EB==2) ? 1 : 0; + localparam int unsigned YT = (M + NE - 1)/NE; + localparam int unsigned NT = (N + NE - 1)/NE; + localparam int unsigned MP = YT*NE; + localparam int unsigned NumDim = 4; + + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [31:0] tf_len_t; + typedef logic [2:0] id_t; + typedef logic [31:0] reps_t; + + `IDMA_TYPEDEF_FULL_REQ_T(idma_req_t, id_t, addr_t, tf_len_t) + `IDMA_TYPEDEF_FULL_ND_REQ_T(idma_nd_req_t, idma_req_t, reps_t, addr_t) + + idma_nd_req_t nd_in, nd_out; + logic vi, ro, vo, ri; + assign vi = 1'b1; + assign ri = 1'b1; + + idma_transpose_midend #( + .NumDim(NumDim), .StrbWidth(StrbWidth), .addr_t(addr_t), .idma_nd_req_t(idma_nd_req_t) + ) i_dut ( + .nd_req_i(nd_in), .valid_i(vi), .ready_o(ro), + .nd_req_o(nd_out), .valid_o(vo), .ready_i(ri) + ); + + int errs = 0; + task automatic chk(input string name, input logic [63:0] got, input logic [63:0] exp); + if (got !== exp) begin + errs++; + $display("[MID] %s: got %0d exp %0d", name, $signed(got), $signed(exp)); + end + endtask + + initial begin + // --- passthrough case --- + nd_in = '0; + nd_in.burst_req.src_addr = 64'hDEAD; + nd_in.burst_req.dst_addr = 64'hBEEF; + nd_in.d_req[0].reps = 7; + #1; + if (nd_out !== nd_in) begin + errs++; + $display("[MID] passthrough altered a non-transpose request"); + end + + // --- transpose case --- + nd_in = '0; + nd_in.burst_req.src_addr = 64'h1000; + nd_in.burst_req.dst_addr = 64'h2000; + nd_in.burst_req.opt.compute.enable = 1'b1; + nd_in.burst_req.opt.compute.op = COMPUTE_TRANSPOSE; + nd_in.burst_req.opt.compute.params.transpose.mode = 2'(MODE); + nd_in.burst_req.opt.compute.params.transpose.tensor_m = 12'(M); + nd_in.burst_req.opt.compute.params.transpose.tensor_n = 12'(N); + #1; + + // golden geometry (same formulas as tb_idma_transpose_nd) + chk("length", nd_out.burst_req.length, NE*EB); + chk("d0.reps", nd_out.d_req[0].reps, NE); + chk("d0.src", nd_out.d_req[0].src_strides, addr_t'(N*EB)); + chk("d0.dst", nd_out.d_req[0].dst_strides, addr_t'(MP*EB)); + chk("d1.reps", nd_out.d_req[1].reps, YT); + chk("d1.src", nd_out.d_req[1].src_strides, addr_t'(N*EB)); + chk("d1.dst", nd_out.d_req[1].dst_strides, addr_t'(int'(NE*EB) - int'((NE-1)*MP*EB))); + chk("d2.reps", nd_out.d_req[2].reps, NT); + chk("d2.src", nd_out.d_req[2].src_strides, addr_t'(int'(NE*EB) - int'((YT*NE-1)*N*EB))); + chk("d2.dst", nd_out.d_req[2].dst_strides, addr_t'(int'(MP*EB) - int'((YT-1)*NE*EB))); + // addresses + compute must survive untouched + chk("src_addr", nd_out.burst_req.src_addr, 64'h1000); + chk("dst_addr", nd_out.burst_req.dst_addr, 64'h2000); + chk("cmp_en", nd_out.burst_req.opt.compute.enable, 1); + + if (errs == 0) + $display("[MID] PASS: %0dx%0d EB=%0d golden (NE=%0d YT=%0d NT=%0d MP=%0d)", + M, N, EB, NE, YT, NT, MP); + else $fatal(1, "[MID] FAIL: %0d mismatches", errs); + $finish; + end +endmodule diff --git a/test/tb_idma_otf_transpose.sv b/test/tb_idma_otf_transpose.sv new file mode 100644 index 00000000..23e29a3d --- /dev/null +++ b/test/tb_idma_otf_transpose.sv @@ -0,0 +1,167 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Authors: +// - Daniel Keller +// +// Standalone self-checking testbench for idma_otf_transpose (element transpose, +// no iDMA backend / no protocol deps). Verifies a full M x N transpose of +// EB-byte elements (EB in {1,2,4} = int8/fp16/fp32), split into NE-square tiles +// (NE = StrbWidth/EB): input padded to full tiles, fed in (col-tile, row-tile, +// row) order; output collected through the engine's per-byte strb_o edge mask. +// Expected result comes from the DPI-C golden (idma_transpose_dpi.c). Checks +// correctness vs golden, full coverage, and no strobe asserted on padding. +// Optional two-sided backpressure via +BP. Override with -gM= -gN= -gEB=. + +`timescale 1ns/1ps + +module tb_idma_otf_transpose #( + parameter int unsigned StrbWidth = 32'd8, + parameter bit FullDuplex = 1'b1, + parameter int unsigned M = 32'd8, // matrix rows (elements) + parameter int unsigned N = 32'd8, // matrix cols (elements) + parameter int unsigned EB = 32'd1 // element size in bytes (1/2/4) +); + + import "DPI-C" function void gm_load(input int idx, input int val); + import "DPI-C" function void gm_transpose(input int m, input int n, input int e); + import "DPI-C" function int gm_get(input int idx); + + localparam int unsigned MODE = (EB == 4) ? 2 : (EB == 2) ? 1 : 0; // log2(EB) + localparam int unsigned NE = StrbWidth / EB; // elements/beat = tile side + localparam int unsigned YT = (M + NE - 1) / NE; // row-tiles + localparam int unsigned NT = (N + NE - 1) / NE; // col-tiles + localparam int unsigned LR = M % NE; + localparam int unsigned LC = N % NE; + localparam logic [7:0] PAD = 8'hFF; + + logic clk = 1'b0, rst_n = 1'b0, clear = 1'b0; + always #5 clk = ~clk; + + logic [StrbWidth-1:0][7:0] din_data; + logic din_valid, din_ready; + logic [StrbWidth-1:0][7:0] dout_data; + logic [StrbWidth-1:0] dout_strb; + logic dout_valid, dout_ready; + + idma_otf_transpose #( + .StrbWidth (StrbWidth), + .FullDuplex (FullDuplex) + ) i_dut ( + .clk_i (clk), + .rst_ni (rst_n), + .clear_i (clear), + .transp_mode_i (2'(MODE)), + .tensor_size_m_i (12'(M)), + .tensor_size_n_i (12'(N)), + .data_i (din_data), + .valid_i (din_valid), + .ready_o (din_ready), + .data_o (dout_data), + .strb_o (dout_strb), + .valid_o (dout_valid), + .ready_i (dout_ready) + ); + + logic [7:0] inb [M*N*EB]; // row-major input bytes + bit wrote [N*M]; // per transposed-element coverage + int unsigned errors = 0; + bit backpressure = 1'b0; + + task automatic drive_inputs(); + int unsigned beat = 0; + din_valid = 1'b0; din_data = '0; + @(posedge clk); + for (int nt = 0; nt < NT; nt++) + for (int rt = 0; rt < YT; rt++) + for (int row = 0; row < NE; row++) begin + if (backpressure) begin din_valid = 1'b0; repeat (beat % 3) @(posedge clk); end + for (int c = 0; c < NE; c++) begin + int gr = rt*NE + row; + int gc = nt*NE + c; + for (int b = 0; b < EB; b++) + din_data[c*EB + b] = (gr < M && gc < N) ? inb[(gr*N + gc)*EB + b] : PAD; + end + din_valid = 1'b1; + do @(posedge clk); while (!din_ready); + beat++; + end + din_valid = 1'b0; + endtask + + task automatic capture_outputs(); + int unsigned beat = 0; + dout_ready = 1'b0; + for (int nt = 0; nt < NT; nt++) + for (int rt = 0; rt < YT; rt++) + for (int k = 0; k < NE; k++) begin + if (backpressure) begin dout_ready = 1'b0; repeat (beat % 4) @(posedge clk); end + dout_ready = 1'b1; + do @(posedge clk); while (!dout_valid); + for (int e = 0; e < NE; e++) begin + if (dout_strb[e*EB]) begin // element e valid (element-granular mask) + int tr = nt*NE + k; // transposed row (= original col, 0..N-1) + int tc = rt*NE + e; // transposed col (= original row, 0..M-1) + if (tr >= N || tc >= M) begin + errors++; + if (errors <= 16) $display("STRB-ON-PAD beat(nt%0d rt%0d k%0d) elem %0d -> (%0d,%0d) OOB", nt, rt, k, e, tr, tc); + end else begin + for (int b = 0; b < EB; b++) begin + automatic int gold = gm_get((tr*M + tc)*EB + b); + if (int'(dout_data[e*EB + b]) !== gold) begin + errors++; + if (errors <= 16) $display("MISMATCH T(%0d,%0d).b%0d=%0d golden=%0d", tr, tc, b, dout_data[e*EB+b], gold); + end + end + wrote[tr*M + tc] = 1'b1; + end + end + end + beat++; + end + @(posedge clk); + dout_ready = 1'b0; + endtask + + initial begin + for (int i = 0; i < M*N*EB; i++) inb[i] = 8'((i * 7 + 3) & 8'hFF); // varied stimulus + for (int i = 0; i < M*N*EB; i++) gm_load(i, int'(inb[i])); + gm_transpose(M, N, EB); + for (int i = 0; i < N*M; i++) wrote[i] = 1'b0; + + din_valid = 1'b0; dout_ready = 1'b0; + if ($test$plusargs("BP")) backpressure = 1'b1; + $display("[TB] idma_otf_transpose M=%0d N=%0d EB=%0d (tile=%0d elems, %0dx%0d tiles, LR=%0d LC=%0d) BP=%0d", + M, N, EB, NE, YT, NT, LR, LC, backpressure); + + rst_n = 1'b0; clear = 1'b1; + repeat (4) @(posedge clk); + rst_n = 1'b1; + repeat (2) @(posedge clk); + clear = 1'b0; + @(posedge clk); + + fork drive_inputs(); capture_outputs(); join + + for (int tr = 0; tr < N; tr++) + for (int tc = 0; tc < M; tc++) + if (!wrote[tr*M + tc]) begin + errors++; + if (errors <= 16) $display("MISSING transposed elem (%0d,%0d)", tr, tc); + end + + if (errors == 0) $display("[TB] PASS: %0dx%0d EB=%0d transpose matches DPI golden", M, N, EB); + else $display("[TB] FAIL: %0d errors", errors); + + repeat (5) @(posedge clk); + $finish; + end + + initial begin + #5000000; + $display("[TB] FAIL: timeout"); + $finish; + end + +endmodule diff --git a/test/tb_idma_transpose_b2b.sv b/test/tb_idma_transpose_b2b.sv new file mode 100644 index 00000000..4337dce1 --- /dev/null +++ b/test/tb_idma_transpose_b2b.sv @@ -0,0 +1,214 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Authors: +// - Daniel Keller +// +// End-to-end back-to-back transpose regression: two transposes of one source to +// DIFFERENT dst bases through the ND midend -> rw_axi backend -> axi_sim_mem. A +// stale base across transfers would leave the second dst untouched. Both checked. + +`include "axi/typedef.svh" +`include "idma/typedef.svh" + +module tb_idma_transpose_b2b + import idma_pkg::*; +#( + parameter int unsigned DataWidth = 32, + parameter int unsigned AddrWidth = 32, + parameter int unsigned UserWidth = 1, + parameter int unsigned AxiIdWidth = 12, + parameter int unsigned TFLenWidth = 32, + parameter int unsigned M = 6, + parameter int unsigned N = 8, + parameter int unsigned EB = 1 +); + + localparam time TA = 1ns, TT = 9ns, TCK = 10ns; + localparam int unsigned StrbWidth = DataWidth / 8; + localparam int unsigned NE = StrbWidth / EB; + localparam int unsigned MODE = (EB == 4) ? 2 : (EB == 2) ? 1 : 0; + localparam int unsigned YT = (M + NE - 1) / NE; + localparam int unsigned NT = (N + NE - 1) / NE; + localparam int unsigned MP = YT * NE; + localparam int unsigned NumDim = 4; + localparam logic [NumDim-1:0][31:0] RepWidths = '{default: 32'd16}; + + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [DataWidth-1:0] data_t; + typedef logic [StrbWidth-1:0] strb_t; + typedef logic [AxiIdWidth-1:0] id_t; + typedef logic [UserWidth-1:0] user_t; + typedef logic [TFLenWidth-1:0] tf_len_t; + typedef logic [31:0] reps_t; + + `AXI_TYPEDEF_AW_CHAN_T(axi_aw_chan_t, addr_t, id_t, user_t) + `AXI_TYPEDEF_W_CHAN_T(axi_w_chan_t, data_t, strb_t, user_t) + `AXI_TYPEDEF_B_CHAN_T(axi_b_chan_t, id_t, user_t) + `AXI_TYPEDEF_AR_CHAN_T(axi_ar_chan_t, addr_t, id_t, user_t) + `AXI_TYPEDEF_R_CHAN_T(axi_r_chan_t, data_t, id_t, user_t) + `AXI_TYPEDEF_REQ_T(axi_req_t, axi_aw_chan_t, axi_w_chan_t, axi_ar_chan_t) + `AXI_TYPEDEF_RESP_T(axi_rsp_t, axi_b_chan_t, axi_r_chan_t) + + `IDMA_TYPEDEF_FULL_REQ_T(idma_req_t, id_t, addr_t, tf_len_t) + `IDMA_TYPEDEF_FULL_RSP_T(idma_rsp_t, addr_t) + `IDMA_TYPEDEF_FULL_ND_REQ_T(idma_nd_req_t, idma_req_t, reps_t, addr_t) + + typedef struct packed { axi_ar_chan_t ar_chan; } axi_read_meta_channel_t; + typedef struct packed { axi_read_meta_channel_t axi; } read_meta_channel_t; + typedef struct packed { axi_aw_chan_t aw_chan; } axi_write_meta_channel_t; + typedef struct packed { axi_write_meta_channel_t axi; } write_meta_channel_t; + + logic clk, rst_n; + idma_req_t idma_req; logic req_valid, req_ready; + idma_rsp_t idma_rsp; logic rsp_valid, rsp_ready; + idma_eh_req_t idma_eh_req; logic eh_req_valid, eh_req_ready; + idma_nd_req_t nd_req; logic nd_req_valid, nd_req_ready; + idma_rsp_t nd_rsp; logic nd_rsp_valid, nd_rsp_ready; + axi_req_t axi_read_req, axi_write_req, axi_req, axi_req_mem; + axi_rsp_t axi_read_rsp, axi_write_rsp, axi_rsp, axi_rsp_mem; + idma_busy_t busy; logic nd_busy; + + assign idma_eh_req = '0; + assign eh_req_valid = 1'b0; + + clk_rst_gen #(.ClkPeriod(TCK), .RstClkCycles(1)) i_clk_rst_gen (.clk_o(clk), .rst_no(rst_n)); + + axi_rw_join #(.axi_req_t(axi_req_t), .axi_resp_t(axi_rsp_t)) i_axi_rw_join ( + .clk_i(clk), .rst_ni(rst_n), + .slv_read_req_i(axi_read_req), .slv_read_resp_o(axi_read_rsp), + .slv_write_req_i(axi_write_req), .slv_write_resp_o(axi_write_rsp), + .mst_req_o(axi_req), .mst_resp_i(axi_rsp) + ); + assign axi_req_mem = axi_req; + assign axi_rsp = axi_rsp_mem; + + axi_sim_mem #( + .AddrWidth(AddrWidth), .DataWidth(DataWidth), .IdWidth(AxiIdWidth), .UserWidth(UserWidth), + .axi_req_t(axi_req_t), .axi_rsp_t(axi_rsp_t), + .WarnUninitialized(1'b0), .ClearErrOnAccess(1'b1), .ApplDelay(TA), .AcqDelay(TT) + ) i_axi_sim_mem ( + .clk_i(clk), .rst_ni(rst_n), .axi_req_i(axi_req_mem), .axi_rsp_o(axi_rsp_mem), + .mon_r_last_o(), .mon_r_beat_count_o(), .mon_r_user_o(), .mon_r_id_o(), + .mon_r_data_o(), .mon_r_addr_o(), .mon_r_valid_o(), + .mon_w_last_o(), .mon_w_beat_count_o(), .mon_w_user_o(), .mon_w_id_o(), + .mon_w_data_o(), .mon_w_addr_o(), .mon_w_valid_o() + ); + + idma_nd_midend #( + .NumDim(NumDim), .addr_t(addr_t), .idma_req_t(idma_req_t), + .idma_rsp_t(idma_rsp_t), .idma_nd_req_t(idma_nd_req_t), .RepWidths(RepWidths) + ) i_nd_midend ( + .clk_i(clk), .rst_ni(rst_n), + .nd_req_i(nd_req), .nd_req_valid_i(nd_req_valid), .nd_req_ready_o(nd_req_ready), + .nd_rsp_o(nd_rsp), .nd_rsp_valid_o(nd_rsp_valid), .nd_rsp_ready_i(nd_rsp_ready), + .burst_req_o(idma_req), .burst_req_valid_o(req_valid), .burst_req_ready_i(req_ready), + .burst_rsp_i(idma_rsp), .burst_rsp_valid_i(rsp_valid), .burst_rsp_ready_o(rsp_ready), + .busy_o(nd_busy) + ); + + idma_backend_rw_axi #( + .CombinedShifter(1'b0), .DataWidth(DataWidth), .AddrWidth(AddrWidth), .AxiIdWidth(AxiIdWidth), + .UserWidth(UserWidth), .TFLenWidth(TFLenWidth), .MaskInvalidData(1'b1), .BufferDepth(3), + .RAWCouplingAvail(1'b1), .HardwareLegalizer(1'b1), .RejectZeroTransfers(1'b1), + .ErrorCap(idma_pkg::NO_ERROR_HANDLING), .PrintFifoInfo(1'b0), .NumAxInFlight(StrbWidth), .MemSysDepth(0), + .idma_req_t(idma_req_t), .idma_rsp_t(idma_rsp_t), .idma_eh_req_t(idma_eh_req_t), + .idma_busy_t(idma_busy_t), .axi_req_t(axi_req_t), .axi_rsp_t(axi_rsp_t), + .write_meta_channel_t(write_meta_channel_t), .read_meta_channel_t(read_meta_channel_t) + ) i_idma_backend ( + .clk_i(clk), .rst_ni(rst_n), .testmode_i(1'b0), + .idma_req_i(idma_req), .req_valid_i(req_valid), .req_ready_o(req_ready), + .idma_rsp_o(idma_rsp), .rsp_valid_o(rsp_valid), .rsp_ready_i(rsp_ready), + .idma_eh_req_i(idma_eh_req), .eh_req_valid_i(eh_req_valid), .eh_req_ready_o(eh_req_ready), + .axi_read_req_o(axi_read_req), .axi_read_rsp_i(axi_read_rsp), + .axi_write_req_o(axi_write_req), .axi_write_rsp_i(axi_write_rsp), .busy_o(busy) + ); + + stream_watchdog #(.NumCycles(4000)) i_r_wd (.clk_i(clk), .rst_ni(rst_n), .valid_i(axi_rsp.r_valid), .ready_i(axi_req.r_ready)); + stream_watchdog #(.NumCycles(4000)) i_w_wd (.clk_i(clk), .rst_ni(rst_n), .valid_i(axi_req.w_valid), .ready_i(axi_rsp.w_ready)); + + addr_t sb = 'h0000_1000; + + task automatic wr_mem(input addr_t a, input logic [7:0] d); i_axi_sim_mem.mem[a] = d; endtask + function automatic logic [7:0] rd_mem(input addr_t a); + return i_axi_sim_mem.mem.exists(a) ? i_axi_sim_mem.mem[a] : 8'hxx; + endfunction + + // one transpose of the source at sb -> dst base `db`; returns error count + task automatic do_transpose(input addr_t db, output int unsigned errs); + errs = 0; + // pre-fill full padded dst extent with sentinel + for (int unsigned i = 0; i < NT*NE; i++) + for (int unsigned j = 0; j < MP; j++) + for (int unsigned b = 0; b < EB; b++) + wr_mem(db + (i*MP + j)*EB + b, 8'hCC); + nd_req = '0; + nd_req.burst_req.length = tf_len_t'(NE*EB); + nd_req.burst_req.src_addr = sb; + nd_req.burst_req.dst_addr = db; + nd_req.burst_req.opt.src_protocol = idma_pkg::AXI; + nd_req.burst_req.opt.dst_protocol = idma_pkg::AXI; + nd_req.burst_req.opt.src.burst = axi_pkg::BURST_INCR; + nd_req.burst_req.opt.dst.burst = axi_pkg::BURST_INCR; + nd_req.burst_req.opt.beo.decouple_rw = 1'b1; + nd_req.burst_req.opt.beo.decouple_aw = 1'b1; + nd_req.burst_req.opt.compute.enable = 1'b1; + nd_req.burst_req.opt.compute.op = idma_pkg::COMPUTE_TRANSPOSE; + nd_req.burst_req.opt.compute.params.transpose.mode = 2'(MODE); + nd_req.burst_req.opt.compute.params.transpose.tensor_m = 12'(M); + nd_req.burst_req.opt.compute.params.transpose.tensor_n = 12'(N); + nd_req.burst_req.opt.last = 1'b1; + nd_req.d_req[0].reps = reps_t'(NE); nd_req.d_req[0].src_strides = addr_t'(int'(N*EB)); nd_req.d_req[0].dst_strides = addr_t'(int'(MP*EB)); + nd_req.d_req[1].reps = reps_t'(YT); nd_req.d_req[1].src_strides = addr_t'(int'(N*EB)); nd_req.d_req[1].dst_strides = addr_t'(int'(NE*EB) - int'((NE-1)*MP*EB)); + nd_req.d_req[2].reps = reps_t'(NT); nd_req.d_req[2].src_strides = addr_t'(int'(NE*EB) - int'((YT*NE-1)*N*EB)); nd_req.d_req[2].dst_strides = addr_t'(int'(MP*EB) - int'((YT-1)*NE*EB)); + nd_req_valid = 1'b1; + do @(posedge clk); while (!nd_req_ready); // drop valid the cycle accept is seen (compliant) + nd_req_valid = 1'b0; + nd_req = '0; + while (!(nd_rsp_valid && nd_rsp_ready)) @(posedge clk); + repeat (20) @(posedge clk); + // data + padding checks + for (int unsigned c = 0; c < N; c++) + for (int unsigned r = 0; r < M; r++) + for (int unsigned b = 0; b < EB; b++) + if (rd_mem(db + (c*MP + r)*EB + b) !== rd_mem(sb + (r*N + c)*EB + b)) begin + errs++; if (errs <= 8) $display("[B2BT] @db=%0h MISMATCH out_T[%0d][%0d].b%0d=%02h exp %02h", db, c, r, b, rd_mem(db+(c*MP+r)*EB+b), rd_mem(sb+(r*N+c)*EB+b)); + end + for (int unsigned i = 0; i < NT*NE; i++) + for (int unsigned j = 0; j < MP; j++) + if (i >= N || j >= M) + for (int unsigned b = 0; b < EB; b++) + if (rd_mem(db + (i*MP + j)*EB + b) !== 8'hCC) begin + errs++; if (errs <= 8) $display("[B2BT] @db=%0h PADDING CLOBBERED row=%0d col=%0d", db, i, j); + end + endtask + + initial begin + automatic int unsigned e1, e2; + automatic addr_t db1 = 'h0000_4000; + automatic addr_t db2 = 'h0000_8000; // DIFFERENT base — a stale-addr bug misplaces xfer 2 + nd_req_valid = 1'b0; nd_rsp_ready = 1'b1; nd_req = '0; + @(posedge rst_n); + repeat (5) @(posedge clk); + for (int unsigned r = 0; r < M; r++) + for (int unsigned c = 0; c < N; c++) + for (int unsigned b = 0; b < EB; b++) + wr_mem(sb + (r*N + c)*EB + b, 8'((( (r*N+c)*EB + b )*7 + 3) & 8'hFF)); + + $display("[B2BT] transfer 1 -> db=%0h", db1); + do_transpose(db1, e1); + $display("[B2BT] transfer 2 (back-to-back) -> db=%0h", db2); + do_transpose(db2, e2); + + if (e1 == 0 && e2 == 0) + $display("[B2BT] PASS: two back-to-back %0dx%0d EB=%0d transposes both correct (xfer2 landed at its own dst)", M, N, EB); + else + $fatal(1, "[B2BT] FAIL: xfer1 errs=%0d xfer2 errs=%0d", e1, e2); + repeat (5) @(posedge clk); + $finish(); + end + + initial begin #5_000_000; $fatal(1, "[B2BT] timeout"); end + +endmodule diff --git a/test/tb_idma_transpose_nd.sv b/test/tb_idma_transpose_nd.sv new file mode 100644 index 00000000..1fa3e3ba --- /dev/null +++ b/test/tb_idma_transpose_nd.sv @@ -0,0 +1,263 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Authors: +// - Daniel Keller +// +// Self-checking multi-tile transpose testbench: idma_nd_midend (NumDim=4, +// transposed-stride program) -> idma_backend_rw_axi (EnableTranspose) -> +// axi_sim_mem. The ND midend generates the tiled read order (col-tile, +// row-tile, local-row) and the transposed destination placement the engine +// requires, so a full M x N transpose works end-to-end (not just one tile). +// Reference: out_T[c][r] = in[r][c] over E-byte elements. + +`include "axi/typedef.svh" +`include "idma/typedef.svh" + +module tb_idma_transpose_nd + import idma_pkg::*; +#( + parameter int unsigned DataWidth = 32, + parameter int unsigned AddrWidth = 32, + parameter int unsigned UserWidth = 1, + parameter int unsigned AxiIdWidth = 12, + parameter int unsigned TFLenWidth = 32, + parameter int unsigned M = 8, // matrix rows (elements) + parameter int unsigned N = 8, // matrix cols (elements) + parameter int unsigned EB = 1, // element size in bytes (1/2/4) + // 0 = auto: NumAxInFlight = StrbWidth (transpose needs >= NE in-flight; NE = StrbWidth at E=1) + parameter int unsigned NumAxInFlight = 0, + parameter int unsigned BufferDepth = 3 +); + + localparam time TA = 1ns; + localparam time TT = 9ns; + localparam time TCK = 10ns; + + localparam int unsigned StrbWidth = DataWidth / 8; + localparam int unsigned NE = StrbWidth / EB; // tile side (elements) + localparam int unsigned AxIF = (NumAxInFlight == 0) ? StrbWidth : NumAxInFlight; + localparam int unsigned MODE = (EB == 4) ? 2 : (EB == 2) ? 1 : 0; + localparam int unsigned YT = (M + NE - 1) / NE; // row-tiles + localparam int unsigned NT = (N + NE - 1) / NE; // col-tiles + // padded Aᵀ row pitch: M rounded up to a tile so every row is StrbWidth-aligned + localparam int unsigned MP = YT * NE; + localparam int unsigned NumDim = 4; // 1D + {row, row-tile, col-tile} + localparam logic [NumDim-1:0][31:0] RepWidths = '{default: 32'd16}; + + // ── Types ── + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [DataWidth-1:0] data_t; + typedef logic [StrbWidth-1:0] strb_t; + typedef logic [AxiIdWidth-1:0] id_t; + typedef logic [UserWidth-1:0] user_t; + typedef logic [TFLenWidth-1:0] tf_len_t; + typedef logic [31:0] reps_t; + + `AXI_TYPEDEF_AW_CHAN_T(axi_aw_chan_t, addr_t, id_t, user_t) + `AXI_TYPEDEF_W_CHAN_T(axi_w_chan_t, data_t, strb_t, user_t) + `AXI_TYPEDEF_B_CHAN_T(axi_b_chan_t, id_t, user_t) + `AXI_TYPEDEF_AR_CHAN_T(axi_ar_chan_t, addr_t, id_t, user_t) + `AXI_TYPEDEF_R_CHAN_T(axi_r_chan_t, data_t, id_t, user_t) + `AXI_TYPEDEF_REQ_T(axi_req_t, axi_aw_chan_t, axi_w_chan_t, axi_ar_chan_t) + `AXI_TYPEDEF_RESP_T(axi_rsp_t, axi_b_chan_t, axi_r_chan_t) + + `IDMA_TYPEDEF_FULL_REQ_T(idma_req_t, id_t, addr_t, tf_len_t) + `IDMA_TYPEDEF_FULL_RSP_T(idma_rsp_t, addr_t) + `IDMA_TYPEDEF_FULL_ND_REQ_T(idma_nd_req_t, idma_req_t, reps_t, addr_t) + + typedef struct packed { axi_ar_chan_t ar_chan; } axi_read_meta_channel_t; + typedef struct packed { axi_read_meta_channel_t axi; } read_meta_channel_t; + typedef struct packed { axi_aw_chan_t aw_chan; } axi_write_meta_channel_t; + typedef struct packed { axi_write_meta_channel_t axi; } write_meta_channel_t; + + // ── Signals ── + logic clk, rst_n; + idma_req_t idma_req; logic req_valid, req_ready; + idma_rsp_t idma_rsp; logic rsp_valid, rsp_ready; + idma_eh_req_t idma_eh_req; logic eh_req_valid, eh_req_ready; + idma_nd_req_t nd_req; logic nd_req_valid, nd_req_ready; + idma_rsp_t nd_rsp; logic nd_rsp_valid, nd_rsp_ready; + axi_req_t axi_read_req, axi_write_req, axi_req, axi_req_mem; + axi_rsp_t axi_read_rsp, axi_write_rsp, axi_rsp, axi_rsp_mem; + idma_busy_t busy; logic nd_busy; + + assign idma_eh_req = '0; + assign eh_req_valid = 1'b0; + + // ── Clock / reset ── + clk_rst_gen #(.ClkPeriod(TCK), .RstClkCycles(1)) i_clk_rst_gen (.clk_o(clk), .rst_no(rst_n)); + + // ── AXI sim memory (read+write joined) ── + axi_rw_join #(.axi_req_t(axi_req_t), .axi_resp_t(axi_rsp_t)) i_axi_rw_join ( + .clk_i(clk), .rst_ni(rst_n), + .slv_read_req_i(axi_read_req), .slv_read_resp_o(axi_read_rsp), + .slv_write_req_i(axi_write_req), .slv_write_resp_o(axi_write_rsp), + .mst_req_o(axi_req), .mst_resp_i(axi_rsp) + ); + assign axi_req_mem = axi_req; + assign axi_rsp = axi_rsp_mem; + + axi_sim_mem #( + .AddrWidth(AddrWidth), .DataWidth(DataWidth), .IdWidth(AxiIdWidth), .UserWidth(UserWidth), + .axi_req_t(axi_req_t), .axi_rsp_t(axi_rsp_t), + .WarnUninitialized(1'b0), .ClearErrOnAccess(1'b1), .ApplDelay(TA), .AcqDelay(TT) + ) i_axi_sim_mem ( + .clk_i(clk), .rst_ni(rst_n), .axi_req_i(axi_req_mem), .axi_rsp_o(axi_rsp_mem), + .mon_r_last_o(), .mon_r_beat_count_o(), .mon_r_user_o(), .mon_r_id_o(), + .mon_r_data_o(), .mon_r_addr_o(), .mon_r_valid_o(), + .mon_w_last_o(), .mon_w_beat_count_o(), .mon_w_user_o(), .mon_w_id_o(), + .mon_w_data_o(), .mon_w_addr_o(), .mon_w_valid_o() + ); + + // ── ND midend: ND transpose descriptor -> 1D bursts ── + idma_nd_midend #( + .NumDim(NumDim), .addr_t(addr_t), .idma_req_t(idma_req_t), + .idma_rsp_t(idma_rsp_t), .idma_nd_req_t(idma_nd_req_t), .RepWidths(RepWidths) + ) i_nd_midend ( + .clk_i(clk), .rst_ni(rst_n), + .nd_req_i(nd_req), .nd_req_valid_i(nd_req_valid), .nd_req_ready_o(nd_req_ready), + .nd_rsp_o(nd_rsp), .nd_rsp_valid_o(nd_rsp_valid), .nd_rsp_ready_i(nd_rsp_ready), + .burst_req_o(idma_req), .burst_req_valid_o(req_valid), .burst_req_ready_i(req_ready), + .burst_rsp_i(idma_rsp), .burst_rsp_valid_i(rsp_valid), .burst_rsp_ready_o(rsp_ready), + .busy_o(nd_busy) + ); + + // ── Backend (rw_axi) with transpose engine ── + idma_backend_rw_axi #( + .CombinedShifter(1'b0), .DataWidth(DataWidth), .AddrWidth(AddrWidth), .AxiIdWidth(AxiIdWidth), + .UserWidth(UserWidth), .TFLenWidth(TFLenWidth), .MaskInvalidData(1'b1), .BufferDepth(BufferDepth), + .RAWCouplingAvail(1'b1), .HardwareLegalizer(1'b1), .RejectZeroTransfers(1'b1), + .ErrorCap(idma_pkg::NO_ERROR_HANDLING), .PrintFifoInfo(1'b0), .NumAxInFlight(AxIF), .MemSysDepth(0), + .idma_req_t(idma_req_t), .idma_rsp_t(idma_rsp_t), .idma_eh_req_t(idma_eh_req_t), + .idma_busy_t(idma_busy_t), .axi_req_t(axi_req_t), .axi_rsp_t(axi_rsp_t), + .write_meta_channel_t(write_meta_channel_t), .read_meta_channel_t(read_meta_channel_t) + ) i_idma_backend ( + .clk_i(clk), .rst_ni(rst_n), .testmode_i(1'b0), + .idma_req_i(idma_req), .req_valid_i(req_valid), .req_ready_o(req_ready), + .idma_rsp_o(idma_rsp), .rsp_valid_o(rsp_valid), .rsp_ready_i(rsp_ready), + .idma_eh_req_i(idma_eh_req), .eh_req_valid_i(eh_req_valid), .eh_req_ready_o(eh_req_ready), + .axi_read_req_o(axi_read_req), .axi_read_rsp_i(axi_read_rsp), + .axi_write_req_o(axi_write_req), .axi_write_rsp_i(axi_write_rsp), .busy_o(busy) + ); + + // watchdogs to surface deadlocks rather than hang forever + stream_watchdog #(.NumCycles(2000)) i_r_wd (.clk_i(clk), .rst_ni(rst_n), .valid_i(axi_rsp.r_valid), .ready_i(axi_req.r_ready)); + stream_watchdog #(.NumCycles(2000)) i_w_wd (.clk_i(clk), .rst_ni(rst_n), .valid_i(axi_req.w_valid), .ready_i(axi_rsp.w_ready)); + + // ── Stimulus + check via sim-memory backdoor ── + addr_t sb = 'h0000_1000; + addr_t db = 'h0000_4000; + + // every AW (incl. wstrb=0 padding rows) must stay in the padded dst allocation + // [db, db+NT*NE*MP*EB) — else a strict slave would DECERR + always @(posedge clk) if (rst_n && axi_write_req.aw_valid && axi_write_rsp.aw_ready) begin + automatic addr_t aw_end = db + addr_t'(NT*NE*MP*EB); + if (axi_write_req.aw.addr < db || axi_write_req.aw.addr >= aw_end) + $fatal(1, "[TPN] AW 0x%0h outside dst alloc [0x%0h,0x%0h) — would DECERR on a strict slave", + axi_write_req.aw.addr, db, aw_end); + end + + task automatic wr_mem(input addr_t a, input logic [7:0] d); i_axi_sim_mem.mem[a] = d; endtask + function automatic logic [7:0] rd_mem(input addr_t a); + return i_axi_sim_mem.mem.exists(a) ? i_axi_sim_mem.mem[a] : 8'hxx; + endfunction + + initial begin + automatic int unsigned errs = 0; + nd_req_valid = 1'b0; nd_rsp_ready = 1'b1; nd_req = '0; + @(posedge rst_n); + repeat (5) @(posedge clk); + + // init source matrix (row-major, M x N elements of EB bytes) + for (int unsigned r = 0; r < M; r++) + for (int unsigned c = 0; c < N; c++) + for (int unsigned b = 0; b < EB; b++) + wr_mem(sb + (r*N + c)*EB + b, 8'((( (r*N+c)*EB + b )*7 + 3) & 8'hFF)); + + // sentinel-fill the full padded Aᵀ extent; padding cols/rows must stay sentinel + // (the engine strobe suppresses them) — checked after the transfer + for (int unsigned i = 0; i < NT*NE; i++) + for (int unsigned j = 0; j < MP; j++) + for (int unsigned b = 0; b < EB; b++) + wr_mem(db + (i*MP + j)*EB + b, 8'hCC); + + // ── transposed-stride ND program (routing-plan §4.2) ── + nd_req = '0; + nd_req.burst_req.length = tf_len_t'(NE*EB); // one tile-row = StrbWidth bytes + nd_req.burst_req.src_addr = sb; + nd_req.burst_req.dst_addr = db; + nd_req.burst_req.opt.src_protocol = idma_pkg::AXI; + nd_req.burst_req.opt.dst_protocol = idma_pkg::AXI; + nd_req.burst_req.opt.src.burst = axi_pkg::BURST_INCR; + nd_req.burst_req.opt.dst.burst = axi_pkg::BURST_INCR; + nd_req.burst_req.opt.beo.decouple_rw = 1'b1; + nd_req.burst_req.opt.beo.decouple_aw = 1'b1; + nd_req.burst_req.opt.beo.src_max_llen = '0; + nd_req.burst_req.opt.beo.dst_max_llen = '0; + nd_req.burst_req.opt.compute.enable = 1'b1; + nd_req.burst_req.opt.compute.op = idma_pkg::COMPUTE_TRANSPOSE; + nd_req.burst_req.opt.compute.params.transpose.mode = 2'(MODE); + nd_req.burst_req.opt.compute.params.transpose.tensor_m = 12'(M); + nd_req.burst_req.opt.compute.params.transpose.tensor_n = 12'(N); + nd_req.burst_req.opt.last = 1'b1; + // ND midend strides are INCREMENTAL deltas (added on dim roll-over), NOT + // absolute pitches. Aᵀ uses padded pitch MP*EB (aligned writes); src keeps + // N*EB (misaligned reads coalesce in the pre-engine buffer). + // d_req[0] = local row within tile (reps NE) + nd_req.d_req[0].reps = reps_t'(NE); + nd_req.d_req[0].src_strides = addr_t'(int'(N*EB)); // next source row + nd_req.d_req[0].dst_strides = addr_t'(int'(MP*EB)); // next Aᵀ row (padded pitch) + // d_req[1] = row-tile (reps YT) + nd_req.d_req[1].reps = reps_t'(YT); + nd_req.d_req[1].src_strides = addr_t'(int'(N*EB)); // rows are consecutive + nd_req.d_req[1].dst_strides = addr_t'(int'(NE*EB) - int'((NE-1)*MP*EB)); // back up cols, next col-block + // d_req[2] = col-tile (reps NT). src rewind uses the padded row extent + // (YT*NE-1, not M-1): the read walks padding rows of the last row-tile. + nd_req.d_req[2].reps = reps_t'(NT); + nd_req.d_req[2].src_strides = addr_t'(int'(NE*EB) - int'((YT*NE-1)*N*EB)); // back to padded row0, next col-block + nd_req.d_req[2].dst_strides = addr_t'(int'(MP*EB) - int'((YT-1)*NE*EB)); // next Aᵀ col-block + + $display("[TPN] launching %0dx%0d EB=%0d transpose via ND midend (NE=%0d, %0dx%0d tiles)", M, N, EB, NE, YT, NT); + nd_req_valid = 1'b1; + // drop valid on accept; holding it one cycle past makes the midend re-walk the request + do @(posedge clk); while (!nd_req_ready); + nd_req_valid = 1'b0; + nd_req = '0; + + // wait for ND completion + while (!(nd_rsp_valid && nd_rsp_ready)) @(posedge clk); + repeat (20) @(posedge clk); + + // check 1 (data): out_T[c][r] == in[r][c], Aᵀ at padded pitch MP + for (int unsigned c = 0; c < N; c++) + for (int unsigned r = 0; r < M; r++) + for (int unsigned b = 0; b < EB; b++) begin + automatic logic [7:0] got = rd_mem(db + (c*MP + r)*EB + b); + automatic logic [7:0] exp = rd_mem(sb + (r*N + c)*EB + b); + if (got !== exp) begin + errs++; + if (errs <= 12) $display("[TPN] MISMATCH out_T[%0d][%0d].b%0d=%02h exp %02h", c, r, b, got, exp); + end + end + // check 2: padding cols [M,MP) and padding rows [N,NT*NE) must stay sentinel (strobe-suppressed) + for (int unsigned i = 0; i < NT*NE; i++) + for (int unsigned j = 0; j < MP; j++) + if (i >= N || j >= M) + for (int unsigned b = 0; b < EB; b++) begin + automatic logic [7:0] got = rd_mem(db + (i*MP + j)*EB + b); + if (got !== 8'hCC) begin + errs++; + if (errs <= 12) $display("[TPN] PADDING CLOBBERED at row=%0d col=%0d b%0d=%02h (exp CC)", i, j, b, got); + end + end + if (errs == 0) $display("[TPN] PASS: %0dx%0d EB=%0d multi-tile transpose matches golden (padding intact)", M, N, EB); + else $fatal(1, "[TPN] FAIL: %0d mismatches", errs); + repeat (5) @(posedge clk); + $finish(); + end + + initial begin #5_000_000; $fatal(1, "[TPN] timeout"); end + +endmodule diff --git a/util/gen_idma.py b/util/gen_idma.py index 2aff908e..537b9adb 100644 --- a/util/gen_idma.py +++ b/util/gen_idma.py @@ -12,7 +12,7 @@ import argparse import sys -from mario.util import prepare_ids, prepare_fids +from mario.util import prepare_ids, prepare_compute_ids, prepare_fids from mario.database import read_database from mario.transport_layer import render_transport_layer from mario.legalizer import render_legalizer @@ -44,6 +44,8 @@ def main(): parser.add_argument('--entity', choices=sorted(GENABLE_ENTITIES), dest='entity', required=True, help='The entity to generate from a given configuration.') parser.add_argument('--ids', dest='ids', nargs='*', help='configuration IDs') + parser.add_argument('--compute-ids', dest='compute_ids', nargs='*', default=[], + help='configuration IDs with on-the-fly compute enabled (IDMA_VIDMA_IDS)') parser.add_argument('--fids', dest='fids', nargs='*', help='frontend IDs') parser.add_argument('--db', dest='db', nargs='*', help='Database files') parser.add_argument('--tpl', dest='tpl', required=True, help='Template file') @@ -51,16 +53,17 @@ def main(): # prepare database and ids protocol_ids = prepare_ids(args.ids) + compute_cfg = prepare_compute_ids(args.compute_ids) frontend_ids = prepare_fids(args.fids) protocol_db = read_database(args.db) # decide what to render if args.entity == 'transport': - print(render_transport_layer(protocol_ids, protocol_db, args.tpl)) + print(render_transport_layer(protocol_ids, protocol_db, args.tpl, compute_cfg)) elif args.entity == 'legalizer': - print(render_legalizer(protocol_ids, protocol_db, args.tpl)) + print(render_legalizer(protocol_ids, protocol_db, args.tpl, compute_cfg)) elif args.entity == 'backend': - print(render_backend(protocol_ids, protocol_db, args.tpl)) + print(render_backend(protocol_ids, protocol_db, args.tpl, compute_cfg)) elif args.entity == 'vsim_wave': print(render_vsim_wave(protocol_ids, protocol_db, args.tpl)) elif args.entity == 'synth_wrapper': diff --git a/util/mario/backend.py b/util/mario/backend.py index 22a21dfd..d2a93c5b 100644 --- a/util/mario/backend.py +++ b/util/mario/backend.py @@ -12,7 +12,7 @@ from mario.util import eval_key, prot_key -def render_backend(prot_ids: dict, db: dict, tpl_file: str) -> str: +def render_backend(prot_ids: dict, db: dict, tpl_file: str, compute_cfg: dict = None) -> str: """Generate backend""" backend_rendered = '' @@ -30,6 +30,12 @@ def render_backend(prot_ids: dict, db: dict, tpl_file: str) -> str: srp = len(used_read_prots) == 1 swp = len(used_write_prots) == 1 + # on-the-fly compute is hosted at the (single) AXI write seam + enable_compute = prot_id in (compute_cfg or {}) + if enable_compute and not (swp and used_write_prots[0] == 'axi'): + raise ValueError( + f'compute (IDMA_VIDMA_IDS) requires a single AXI write port: {prot_id}') + # create context context = { 'name_uniqueifier': prot_id, @@ -39,6 +45,8 @@ def render_backend(prot_ids: dict, db: dict, tpl_file: str) -> str: 'used_protocols': prot_ids[prot_id]['used'], 'one_read_port': srp, 'one_write_port': swp, + 'enable_compute': enable_compute, + 'compute_ops': compute_cfg[prot_id]['ops'] if enable_compute else [], 'used_non_bursting_write_protocols': prot_key(used_write_prots, 'bursts', 'not_supported', db), 'combined_aw_and_w': diff --git a/util/mario/legalizer.py b/util/mario/legalizer.py index 1724c00e..51757771 100644 --- a/util/mario/legalizer.py +++ b/util/mario/legalizer.py @@ -21,7 +21,7 @@ def prot_force_decouple(used_prots: list, db: dict) -> list: return res -def render_legalizer(prot_ids: dict, db: dict, tpl_file: str) -> str: +def render_legalizer(prot_ids: dict, db: dict, tpl_file: str, compute_cfg: dict = None) -> str: """Generate legalizer""" legalizer_rendered = '' @@ -71,6 +71,7 @@ def render_legalizer(prot_ids: dict, db: dict, tpl_file: str) -> str: 'used_protocols': prot_ids[prot_id]['used'], 'one_read_port': srp, 'one_write_port': swp, + 'enable_compute': prot_id in (compute_cfg or {}), 'no_read_bursting': not has_read_bursting, 'has_page_read_bursting': diff --git a/util/mario/transport_layer.py b/util/mario/transport_layer.py index 17a23a36..f46c8a38 100644 --- a/util/mario/transport_layer.py +++ b/util/mario/transport_layer.py @@ -169,7 +169,8 @@ def render_write_mgr_inst(prot_id: str, prot_ids: dict, db: dict) -> dict: return res -def render_transport_layer(prot_ids: dict, db: dict, tpl_file: str) -> str: +def render_transport_layer(prot_ids: dict, db: dict, tpl_file: str, compute_cfg: dict = None + ) -> str: """Generate Transport Layer""" transport_rendered = '' @@ -188,6 +189,11 @@ def render_transport_layer(prot_ids: dict, db: dict, tpl_file: str) -> str: 'used_protocols': prot_ids[prot_id]['used'], 'one_read_port': len(prot_ids[prot_id]['ar']) == 1, 'one_write_port': len(prot_ids[prot_id]['aw']) == 1, + 'enable_compute': prot_id in (compute_cfg or {}), + 'compute_ops': + compute_cfg[prot_id]['ops'] if prot_id in (compute_cfg or {}) else [], + 'compute_full_duplex': + compute_cfg[prot_id]['full_duplex'] if prot_id in (compute_cfg or {}) else True, 'rendered_read_ports': render_read_mgr_inst(prot_id, prot_ids, db), 'rendered_write_ports': render_write_mgr_inst(prot_id, prot_ids, db) } diff --git a/util/mario/util.py b/util/mario/util.py index 74376c58..a961889d 100644 --- a/util/mario/util.py +++ b/util/mario/util.py @@ -135,3 +135,23 @@ def prepare_fids(fe_strs: list) -> dict: res[f'reg{reg[0]}_{reg[1]}d'] = reg return res + + +def prepare_compute_ids(compute_id_strs: list) -> dict: + """Parses compute configuration IDs: [:[,...]][:fd|hd]""" + res = {} + for cid_str in (compute_id_strs or []): + parts = cid_str.split(':') + ops = ['transpose'] + full_duplex = True + for part in parts[1:]: + if part in ('fd', 'hd'): + full_duplex = part == 'fd' + else: + ops = part.split(',') + for op in ops: + if op not in ('transpose',): + print(f'[MARIO] {op} is a non-supported compute op in {cid_str}', file=sys.stderr) + sys.exit(1) + res[parts[0]] = {'ops': ops, 'full_duplex': full_duplex} + return res