diff --git a/Bender.yml b/Bender.yml
index 9a0972b2..2c259b31 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -43,6 +43,8 @@ sources:
       - src/backend/idma_axis_write.sv
       - src/backend/idma_channel_coupler.sv
       - src/backend/idma_dataflow_element.sv
+      - src/backend/idma_otf_transpose.sv
+      - src/backend/idma_otf_compute.sv
       - src/backend/idma_error_handler.sv
       - src/backend/idma_init_read.sv
       - src/backend/idma_init_write.sv
@@ -53,11 +55,58 @@ sources:
       - src/backend/idma_tilelink_read.sv
       - src/backend/idma_tilelink_write.sv
 
-  # Generated content
-  - target: rtl
+  # Generated content (bundled single-file; default flow)
+  - target: all(rtl, not(split_rtl))
     files:
       - target/rtl/idma_generated.sv
 
+  # Generated content (per-variant files; opt-in via `-t split_rtl`).
+  # Used for the transpose-engine prototype where the rw_axi backend/transport/
+  # legalizer are hand-edited in place before being ported back to templates.
+  # Mutually exclusive with idma_generated.sv to avoid duplicate module defs.
+  - target: all(rtl, split_rtl)
+    files:
+      - target/rtl/idma_transport_layer_rw_axi.sv
+      - target/rtl/idma_transport_layer_r_obi_w_axi.sv
+      - target/rtl/idma_transport_layer_r_axi_w_obi.sv
+      - target/rtl/idma_transport_layer_rw_axi_rw_axis.sv
+      - target/rtl/idma_transport_layer_rw_obi.sv
+      - target/rtl/idma_transport_layer_r_obi_rw_init_w_axi.sv
+      - target/rtl/idma_transport_layer_r_axi_rw_init_rw_obi.sv
+      - target/rtl/idma_legalizer_rw_axi.sv
+      - target/rtl/idma_legalizer_r_obi_w_axi.sv
+      - target/rtl/idma_legalizer_r_axi_w_obi.sv
+      - target/rtl/idma_legalizer_rw_axi_rw_axis.sv
+      - target/rtl/idma_legalizer_rw_obi.sv
+      - target/rtl/idma_legalizer_r_obi_rw_init_w_axi.sv
+      - target/rtl/idma_legalizer_r_axi_rw_init_rw_obi.sv
+      - target/rtl/idma_backend_rw_axi.sv
+      - target/rtl/idma_backend_r_obi_w_axi.sv
+      - target/rtl/idma_backend_r_axi_w_obi.sv
+      - target/rtl/idma_backend_rw_axi_rw_axis.sv
+      - target/rtl/idma_backend_rw_obi.sv
+      - target/rtl/idma_backend_r_obi_rw_init_w_axi.sv
+      - target/rtl/idma_backend_r_axi_rw_init_rw_obi.sv
+      - target/rtl/idma_backend_synth_rw_axi.sv
+      - target/rtl/idma_backend_synth_r_obi_w_axi.sv
+      - target/rtl/idma_backend_synth_r_axi_w_obi.sv
+      - target/rtl/idma_backend_synth_rw_axi_rw_axis.sv
+      - target/rtl/idma_backend_synth_rw_obi.sv
+      - target/rtl/idma_backend_synth_r_obi_rw_init_w_axi.sv
+      - target/rtl/idma_backend_synth_r_axi_rw_init_rw_obi.sv
+      - target/rtl/idma_desc64_reg_pkg.sv
+      - target/rtl/idma_reg32_3d_reg_pkg.sv
+      - target/rtl/idma_reg64_2d_reg_pkg.sv
+      - target/rtl/idma_reg64_1d_reg_pkg.sv
+      - target/rtl/idma_desc64_reg_top.sv
+      - target/rtl/idma_reg32_3d_reg_top.sv
+      - target/rtl/idma_reg64_2d_reg_top.sv
+      - target/rtl/idma_reg64_1d_reg_top.sv
+      - target/rtl/idma_desc64_top.sv
+      - target/rtl/idma_reg32_3d_top.sv
+      - target/rtl/idma_reg64_2d_top.sv
+      - target/rtl/idma_reg64_1d_top.sv
+
   # Midends
   - target: rtl
     files:
@@ -66,6 +115,7 @@ sources:
       - src/midend/idma_mp_split_midend.sv
       - src/midend/idma_nd_midend.sv
       - src/midend/idma_rt_midend.sv
+      - src/midend/idma_transpose_midend.sv
 
   # RISC-V opcode package for ooc use of inst64
   - target: all(rtl,snitch_cluster)
@@ -94,6 +144,7 @@ sources:
       # Level 2
       - src/frontend/desc64/idma_desc64_top.sv
 
+
   # Synthesis wrappers
   - target: synth
     files:
@@ -119,7 +170,9 @@ sources:
       - test/future/idma_tb_per2axi.sv
       - test/future/TLToAXI4.v
       - test/midend/tb_idma_nd_midend.sv
+      - test/midend/tb_idma_nd_midend_b2b.sv
       - test/midend/tb_idma_rt_midend.sv
+      - test/midend/tb_idma_transpose_midend.sv
       # Level 2
       - test/future/idma_obi2axi_bridge.sv
       - test/future/idma_tilelink2axi_bridge.sv
@@ -128,3 +181,8 @@ sources:
   - target: idma_test
     files:
       - target/rtl/tb_idma_generated.sv
+      # Multi-tile transpose path: ND-midend (transposed strides) -> backend
+      - target: all(idma_test, split_rtl)
+        files:
+          - test/tb_idma_transpose_nd.sv
+          - test/tb_idma_transpose_b2b.sv
diff --git a/doc/transpose-engine-routing-plan.md b/doc/transpose-engine-routing-plan.md
new file mode 100644
index 00000000..1428d5aa
--- /dev/null
+++ b/doc/transpose-engine-routing-plan.md
@@ -0,0 +1,461 @@
+# Transpose Engine — Design Routing & Signaling Plan
+
+Status: design doc (no frontend/midend RTL yet). The transpose **engine**
+(`src/backend/idma_otf_transpose.sv`) is implemented and standalone-verified
+(full-duplex ping-pong, runtime int8/fp16/fp32, DPI golden). This document plans
+how it is **routed through the whole design** — frontend → midend → legalizer →
+backend → transport → engine, and the response path back.
+
+It is grounded in a re-read of the viDMA fork (the prior OTF work) and is
+deliberately **adversarial about viDMA's signaling approach** (§3). Anchors are
+`file:line` against this repo unless prefixed `vidma:`
+(`/home/dankeller/Projects/pulp_rs/vidma_rtl/src/`).
+
+---
+
+## 1. Three-plane framing
+
+The engine reorders the **data stream** only — it has **no address ports**.
+Routing therefore splits into three planes of very uneven difficulty:
+
+| Plane | Difficulty | What it needs |
+|-------|-----------|---------------|
+| **Control** | easy | deliver `{transpose_en, E, M, N}` per transfer to the engine *and* the address generator |
+| **Data** | mostly done | splice the engine `valid/ready/strb_o` into the transport `buffer_out → write-shifter` seam behind `EnableTranspose`; engine self-drains |
+| **Address** | the crux | generate read + write address streams so the bytes land transposed, with `strb_o` masking edge tiles |
+
+The central insight that makes the address plane tractable:
+
+> **The engine transposes *inside* each `NE×NE` tile. Read and write therefore
+> walk the *same* tile-visitation order — they differ only in stride
+> *magnitude*, not iteration *order*.** A full matrix transpose is expressed as
+> an ND transfer whose `dst_strides` encode the N×M output pitch while
+> `src_strides` encode the M×N input pitch, over one shared iteration space.
+
+This matters because the ND midend (`src/midend/idma_nd_midend.sv`) advances
+`src_addr` and `dst_addr` from a **single shared iteration counter**
+(`stride_sel_q`, `burst_sent_q`, `:153-186`) — it *cannot* make read and write
+visit a different order, but it *can* give them independent strides. The
+engine-internal transpose is exactly what lets us live within that constraint.
+
+```
+        CONTROL: {transpose_en, E, M, N} as typed, request-scoped idma_req_t.opt fields
+        ADDRESS: NumDim=4 ND descriptor; src_strides (M×N) + transposed dst_strides (N×M)
+  ┌──────────┐  idma_nd_req_t   ┌─────────┐  idma_req_t   ┌───────────┐
+  │ FRONTEND │ ───────────────► │ ND      │ ────────────► │ LEGALIZER │
+  │ reg/desc │ burst_req.opt.*  │ MIDEND  │ src_addr /    │ opt_tf_q  │
+  │ /inst64  │ + d_req[].reps,  │ shared  │ dst_addr      │ r_tf_q    │ AR/AW
+  │          │ src/dst_strides  │ counter │ (per-dim      │ w_tf_q    ├──────►
+  └──────────┘                  │ walks   │  strides)     └─────┬─────┘ meta
+       ▲                        │ both    │                    │ r_dp_req / w_dp_req
+       │ done_id (idma_rsp_t)   │ ptrs    │                    │ (+ transp_*)
+       │  on WRITE of last tile └────┬────┘                    ▼
+       │                            │ nd_rsp_o (on .last)  ┌────────────────────┐
+       │                            ▼                      │   BACKEND TOP       │
+       └────────────────────────────────────────────────  │ legalizer↔transport │
+                                                           └─────────┬───────────┘
+                                                                     │ w_dp_req.transp_*, shift
+   DATA: ┌────────┐ data ┌───────────┐ data ┌──────────┐ data ┌──────────┐
+         │ buffer │ ───► │ TRANSPOSE │ ───► │ write     │ ───► │ axi_write│ ─► W.strb =
+         │ _out   │ ◄─── │ ENGINE    │ ◄─── │ barrel    │strb_o│ mask_out │   align_mask
+         └────────┘ rdy  │(EnTranspose)│rdy │ shift     │  &&  │ & strb   │   & engine_strb
+                         └───────────┘      └──────────┘      └──────────┘
+         strb_o ─────────────────(shift by w_dp_req.shift)──────────► (new AND term)
+```
+
+Response semantics are unchanged: **one transfer = one `done_id`** — but it must
+fire on **write** completion of the last tile (§5.3).
+
+---
+
+## 2. Control plane — which fields travel where
+
+Four control items: `transpose_en` (1b), `transp_mode`/E (2b), `M` (12b, rows in
+elements), `N` (12b, cols in elements). Decision: carry the small per-transfer
+items as **typed fields in `options_t`**; `M`/`N` ride the ND geometry but are
+*also* surfaced as explicit `opt` fields because the engine needs them at the
+transport seam.
+
+### 2.1 The struct of record
+
+`IDMA_TYPEDEF_OPTIONS_T(options_t, axi_id_t)` (`src/include/idma/typedef.svh:21-30`)
+is the request-scoped options struct embedded in every `idma_req_t` (`:37-44`,
+the `opt` field). Add:
+
+```systemverilog
+logic        transpose_en;   // engine on/off for this transfer
+logic [1:0]  transp_mode;    // E selector: 00=1B, 01=2B, 10=4B
+logic [11:0] tensor_m;       // rows in ELEMENTS
+logic [11:0] tensor_n;       // cols in ELEMENTS
+```
+
+It rides through the ND midend bypass for free — `idma_nd_midend.sv:194` copies
+`burst_req_o = nd_req_i.burst_req` wholesale, overwriting only `src_addr`/
+`dst_addr`/`opt.last` (`:196-198`) — and through every request FIFO/fork. It is
+then mirrored into the legalizer's mutable options and into `w_dp_req_t` to reach
+the engine (§5).
+
+### 2.2 Per-frontend insertion of `transpose_en` + `E`
+
+| Frontend | Carrier | Anchor |
+|---|---|---|
+| **reg** | new `conf` CSR bits (bits 0:16 used; **17:31 free**): `transpose_en`=17, `transp_mode`=19:18 | fields `target/rtl/idma_reg64_2d.hjson:31-66`; template `src/frontend/reg/tpl/idma_reg.hjson.tpl:23-64`; wire in `proc_hw_req_conv`, `src/frontend/reg/tpl/idma_reg.sv.tpl` (~124-141) |
+| **desc64** | descriptor `flags[31:24]` reserved → `flags[24]=transpose_en`, `flags[26:25]=transp_mode` | `src/frontend/desc64/idma_desc64_top.sv:114-115`; assign in reshaper opt block `src/frontend/desc64/idma_desc64_reshaper.sv:27-60` |
+| **inst64** | new instruction `DMTRANSP` in the `casez (acc_req_i.data_op)`, **request-scoped** write to `burst_req.opt.transpose_en/.transp_mode` (contrast viDMA's sticky `otf_opcode_q`) | casez `src/frontend/inst64/idma_inst64_top.sv:386`; opt defaults `:343-364` |
+
+### 2.3 Matrix dims M, N
+
+M, N are **redundant with the ND geometry** (they equal the descriptor's
+element extents — see §4) but the engine still needs them as explicit ports
+(`tensor_size_m_i`/`tensor_size_n_i`) at the transport seam to drive its tile
+counters and edge `strb_o`. Decision: surface them as explicit `opt.tensor_m`/
+`opt.tensor_n` (24 b of flops), forwarded to `w_dp_req`, rather than
+reconstructing them from byte counts at the seam.
+
+- **reg / inst64**: drivers already program the dims/strides CSRs (length, reps,
+  strides); add two small writes for `tensor_m/tensor_n` (or compute them in the
+  reg HWIF from `length`/`reps`).
+- **desc64**: `flags` has no spare 24-bit budget — desc64 should **derive** M/N
+  from `length`/`reps`/strides in the reshaper rather than carry them literally.
+
+---
+
+## 3. Signaling decision — adversarial critique of viDMA, chosen alternative
+
+### 3.1 What viDMA does
+
+viDMA signals OTF compute with a **single sticky 8-bit `otf_opcode_q` register**,
+written by a custom RISC-V instruction `DMOPC` (`vidma:idma_inst64_top.sv:571-602`,
+`otf_opcode_q <= acc_req_i.data_arga[7:0]`, reset to passthrough `0x08`), passed
+as a **side-band port** `otf_opcode_i` into the backend
+(`vidma:idma_backend_rw_axi.sv:89`) — parallel to `idma_req_i`, bypassing the
+request descriptor and the ND midend — latched per-transfer into
+`opt_tf_q.otf_opcode` at legalizer refill (`vidma:idma_legalizer_rw_axi.sv:403-442`),
+copied into `r_dp_req`/`w_dp_req.otf_opcode`, and consumed in the transport layer
+via a **same-cycle "effective opcode" bypass** plus MX/FP **drain FSMs**
+(`vidma:idma_transport_layer_rw_axi.sv:231-243`). Element sizes and the write
+length are **derived from the opcode** by hard-coded tables
+(`vidma:vidma_otf_pkg.sv`: `otf_element_sizes`, `otf_write_length = r_len/in*out`).
+
+### 3.2 Critique — reject this model for transpose
+
+1. **Sticky + global desyncs config from the transfer.** One register feeds every
+   backend channel, but each channel has its own `nd_req` FIFO. `DMOPC` then
+   `DMCPY` is non-atomic; an interrupt or a second `DMOPC` re-tags a pending
+   transfer. The legalizer accept-time latch only *partially* masks this. For a
+   transpose interleaved with plain copies this is a correctness hazard.
+2. **8 bits cannot carry geometry.** The opcode is a pure op selector with
+   hard-coded element sizes and a scalar length ratio. Transpose needs runtime
+   `E ∈ {1,2,4}` plus 12-bit `M` and `N` (≥26 b). Extending viDMA means adding
+   sticky `DMSIZE`-class registers — reintroducing hazard (1) plus multi-register
+   atomicity.
+3. **viDMA never remaps addresses — the disqualifying gap.** Its `w_addr` is
+   monotonic; `otf_write_length`/`otf_coordinated_r_bytes` assume a *scalar
+   length ratio on a linear stream* (`vidma:vidma_otf_pkg.sv`). Transpose is 1:1
+   in bytes (ratio = identity) but needs a **tile-strided write address program**.
+   The opcode/ratio model has no concept of it.
+4. **Transport bloat is avoidable.** The same-cycle effective-opcode bypass is a
+   timing smell; the MX/FP drain FSMs and expansion bypass are viDMA-op-specific.
+   The transpose engine **self-drains via `exec_done`**, so integration needs
+   zero drain FSMs, zero opcode mux, zero bypass.
+
+### 3.3 Chosen alternative
+
+**Carry `{transpose_en, transp_mode, tensor_m, tensor_n}` as typed,
+request-scoped `idma_req_t.opt` fields, and express the address remap as an ND
+transfer (transposed `dst_strides`), not a legalizer ratio function.**
+
+| Layer | viDMA cost | Chosen cost | Why better |
+|---|---|---|---|
+| frontend | sticky `otf_opcode_q` + `DMOPC` | a few `opt` assigns | request-scoped; no cross-transfer leak |
+| port | side-band `otf_opcode_i` hand-synced | none — rides `opt` through existing FIFOs/forks | no global wire, no manual sync |
+| midend | bypassed | bypassed; **remap reuses `src/dst_strides`** | remap is native, not bolted on |
+| legalizer | latch + `otf_write_length` ratio | latch + forward; **no ratio math** (W==R) | identity length is correct as-is |
+| transport | effective-opcode bypass + drain FSMs | splice `valid/ready/strb_o`, pulse `clear_i` | engine self-drains |
+
+**Decision: adopt it.** Strictly cheaper and strictly more composable than the
+viDMA side-band.
+
+---
+
+## 4. Address plane — the corrected model
+
+### 4.1 Routing model
+
+Both `AR` and `AW` walk the **same** tile order (matching the engine's identical
+fill/drain walkers, `idma_otf_transpose.sv:172-201`). The transpose lives
+**entirely inside the engine** (intra-tile element swap, the readout at `:224-231`).
+The destination **strides** encode the N×M output pitch. This is realizable with
+the ND midend's existing logic **unchanged** — it already advances independent
+`src_addr`/`dst_addr` per dimension (`:169` src, `:181` dst) over one shared
+iteration order. The only cost is a **NumDim bump** (synth-time config), not new
+midend logic.
+
+> ⚠️ This corrects an earlier framing that described "transposed *write
+> sequence* / scattered tiles." The midend **cannot** reorder tiles between read
+> and write (single `stride_sel_q`/`burst_sent_q`, `:153-186`). It does not need
+> to: same tile order, **transposed dst strides**, engine does the element swap.
+
+### 4.2 The descriptor (NumDim = 4) and the exact stride program
+
+Let `E` = element bytes, `NE = StrbWidth/E` (tile side in elements,
+= beat in elements), `SW = StrbWidth` (bytes/beat), `YT = ceil(M/NE)`,
+`NT = ceil(N/NE)`, and `MP = YT·NE` — the **padded** output row pitch (M rounded
+up to a tile/`SW` boundary; `MP = M` when `M % NE == 0`). Iteration order
+(outer→inner): `nt` (col-tile) → `rt` (row-tile) → `j` (row/beat within tile).
+Each 1D run = one tile-row = `NE` elements = `SW` bytes, contiguous on both src
+and dst.
+
+Source `A` is M×N row-major at `src`; transposed `Aᵀ` is stored **N×MP**
+row-major at `dst` (real data in columns `[0, M)`, columns `[M, MP)` are padding).
+For iteration `(nt, rt, j)` the absolute addresses are:
+
+```
+src_addr = src + (rt·NE + j)·(N·E) + nt·(NE·E)     // reads A[rt·NE+j][nt·NE .. +NE)
+dst_addr = dst + (nt·NE + j)·(MP·E) + rt·(NE·E)    // writes Aᵀ[nt·NE+j][rt·NE .. +NE)
+```
+
+> ⚠️ **The ND midend strides are INCREMENTAL deltas, not these absolute per-dim
+> steps.** `idma_nd_midend.sv:164-186` adds exactly **one** stride per burst —
+> `addr += d_req[stride_sel].strides`, where `stride_sel = popcount(rolled-over
+> dims)`. So a dimension's `*_strides` must encode *"undo the inner loops and step
+> the outer dim once"*, i.e. the delta from the last inner position to the next
+> outer position — not the absolute pitch. The table below is the delta form
+> (verified in `test/tb_idma_transpose_nd.sv`); an absolute-stride table would
+> corrupt a real driver.
+
+| Level | reps | `src_strides` (Δ) | `dst_strides` (Δ) |
+|------|------|---------------|---------------|
+| 1D burst | `length = NE·E (= SW B)` | contiguous | contiguous |
+| `d_req[0]` = j (row in tile) | `NE` | `N·E` | `MP·E` |
+| `d_req[1]` = rt (row-tile) | `YT` | `N·E` | `NE·E − (NE−1)·MP·E` |
+| `d_req[2]` = nt (col-tile) | `NT` | `NE·E − (YT·NE−1)·N·E` | `MP·E − (YT−1)·NE·E` |
+
+⇒ **NumDim = 4** (1D + 3 `d_req`). The driver computes this program from
+`(src, dst, M, N, E)`; no new midend RTL. Pairing is positional: the engine
+fills with read beats `j=0..NE-1` of a tile, then drains output beats
+`j=0..NE-1` (columns); write iteration `j` consumes output beat `j`. `decouple_rw`
+lets the read machine run a tile ahead while the engine buffers (§5.2).
+
+**Two asymmetries that the deltas encode (both load-bearing for edge tiles):**
+
+1. **dst uses the padded pitch `MP·E`, src uses the real pitch `N·E`.** The engine
+   drains exactly one beat per `Aᵀ` row and the write path has **no buffer after
+   the engine** to coalesce a *misaligned* (word-crossing, hence 2-beat) write. If
+   `M·E` is not a multiple of `SW` (i.e. `M % NE != 0`), an unpadded row pitch
+   makes every `Aᵀ` row start at a sub-word offset → the legalizer splits each
+   row into 2 beats → the engine retires its single beat after the first and the
+   second beat starves → **write-channel deadlock**. Padding the pitch to `MP·E`
+   keeps every row `SW`-aligned (single-beat). Reads need no such padding:
+   misaligned source reads coalesce in the dataflow buffer, which sits *before*
+   the engine (verified: e.g. `8×6 EB=1` has misaligned reads and passes).
+
+2. **`d_req[2].src` rewinds by `YT·NE−1` rows, not `M−1`.** The read walks `NE`
+   rows per tile *including* the padding rows of the last row-tile (rows
+   `[M, YT·NE)`), so the column-tile rewind must clear the full *padded* column
+   height. (For aligned `M = YT·NE` the two forms coincide.)
+
+**Allocation / liveness obligations for a real driver (sim satisfies these):**
+- **`dst` MUST be allocated for the FULL tile-padded extent `NT·NE × MP` elements**
+  (not just `N × MP`). This is a hard requirement, not an optimization. When
+  `N % NE != 0` the engine drains `NT·NE` output-row beats and the all-padding
+  rows `[N, NT·NE)` are issued as real `AW`s (correct addr+len) carrying
+  `wstrb = 0` (§4.3). A permissive slave (e.g. `axi_sim_mem`) writes nothing for
+  them, but **a strict slave decodes the `AW` address *before* the strobes — an
+  under-allocated `dst` therefore risks `DECERR` on those rows**, surfaced as a
+  transfer error. The real data is the `[0, N) × [0, M)` sub-block of the padded
+  buffer. (A future hardening could suppress `AW` issuance for all-padding rows
+  so under-allocation is safe; until then, over-allocation is mandatory.)
+- `src` reads extend up to row `YT·NE−1`, col `NT·NE−1` (tile padding); the
+  source buffer must cover that or the reads must be benign (masked on write).
+- **`NumAxInFlight ≥ NE`.** The engine must buffer a whole `NE`-beat tile before
+  its first output beat, and the backend holds one transaction slot per single-beat
+  burst from `AR` until `B`; so `NE−1` bursts sit read-done/write-pending while the
+  `NE`-th completes. The tight deadlock boundary is therefore `NumAxInFlight ≥ NE−1`
+  (measured: `NE=8`→`≥7`, `NE=4`→`≥3`); the bound scales with `NE`, not buffer
+  depth (`BufferDepth` does not move it). Use `≥ NE` for a one-slot margin and a
+  clean rule. With runtime `E`, `NE = SW` in the worst case (`E=1`), so a backend
+  that must serve int8 transpose needs **synth-time `NumAxInFlight ≥ StrbWidth`**.
+  Below the threshold the write channel deadlocks *loudly* (W watchdog), never
+  silently. Verified by sweep in `test/tb_idma_transpose_nd.sv` (param
+  `NumAxInFlight`, auto-sized to `StrbWidth`).
+
+### 4.3 `strb_o → wstrb` — the load-bearing RTL change (IMPLEMENTED)
+
+`wstrb` was born **solely** from address-alignment masks in
+`src/backend/idma_axi_write.sv`: `w_first_mask = '1 << offset`, `w_last_mask =
+'1 >> (StrbWidth - tailer)`, combined into `mask_out` (`proc_out_mask_generator`),
+driven onto `write_req_o.w.strb` **and** used to pop the buffer
+`buffer_out_ready_o = mask_out`. There was no input port for an engine mask.
+
+Implemented (shared `idma_axi_write.sv`, backward-compatible):
+1. New input **`mask_ext_i`** (strb-wide), **AND**ed into `mask_out`
+   (`mask_out = alignment_masks & mask_ext_i`). Other backends tie it to `'1`
+   (no change). This is the deciding modification: it both narrows `w.strb` to
+   the real bytes of an edge tile **and** narrows the buffer pop so partial/zero
+   beats don't require the full beat.
+2. New output **`w_beat_done_o = write_happening`** — a strobe-*independent*
+   "a W beat was accepted" pulse. The transpose handshake retires the engine
+   output on `w_beat_done` (not on `|buffer_out_ready`), so an **all-padding
+   beat** (`strb_o = 0`, e.g. the non-existent `Aᵀ` rows of a `N % NE != 0`
+   edge) still issues one `wstrb=0` W beat and drains — instead of stalling on
+   `buffer_out_valid_i != 0`.
+3. In the transport (`idma_transport_layer_rw_axi.sv`), the transpose path drives
+   **presence = all-ones** (`buffer_out_valid_i = {StrbWidth{tp_out_valid}}`) and
+   carries the engine strobe separately as `wr_strb → mask_ext_i`, shifted by
+   `w_dp_req_i.shift` **identically** to the data (parallel `mask_ext_shifted`).
+   Presence-as-all-ones is what lets a zero-strobe beat satisfy `ready_to_write`
+   (`(presence & mask_out) == mask_out` holds for `mask_out = 0`) while the
+   `buffer_out_valid_i != 0` guard still protects the non-transpose path.
+
+`strb_o` is **already byte-granular** — `strb_int[p] = em[p >> transp_mode_i]`
+(`idma_otf_transpose.sv`) expands the per-element mask to one bit per byte — so it
+maps directly onto byte-granular `wstrb` for E ∈ {1,2,4}. No expansion needed.
+
+> The original plan listed only step 1. Steps 2–3 are the part it missed: edge
+> tiles produce **both** partially-masked beats (`leftover_rows`, within a row)
+> **and** all-zero beats (`leftover_cols`, whole `Aᵀ` rows that don't exist).
+> Masking alone deadlocks on the all-zero beats; the strobe-independent retire +
+> all-ones presence is what makes them drain. Both are proven load-bearing by the
+> padding-sentinel negative control in `tb_idma_transpose_nd.sv` (disable the
+> strobe → padding is clobbered with real transposed data → test fails).
+
+### 4.4 INCR-only — respected
+
+The legalizer hard-asserts `BURST_INCR` (`target/rtl/idma_legalizer_rw_axi.sv:417-419`)
+and a 1D run is one contiguous burst. Model §4.1 respects this: every 1D run is a
+contiguous tile-row; all tiling/scatter is expressed **above** the legalizer as
+ND per-dimension steps. The legalizer never sees a tile — it only page-splits a
+contiguous INCR run. Transpose is 1:1 bytes, so the legalizer's identical src/dst
+`length` (`:262, :271`, both `req_i.length`) is correct **as-is** — no
+`otf_write_length` hack.
+
+---
+
+## 5. Data plane — transport wiring
+
+### 5.1 Engine placement
+
+Behind `EnableTranspose`, route `buffer_out` (`idma_transport_layer_rw_axi.sv:201-203`)
+→ engine `data_i`; engine `data_o` → write barrel-shifter input (`:210-213`).
+Passthrough keeps today's direct path. Selection = static `EnableTranspose`
+**AND** per-transfer `w_dp_req_i.transpose_en` (mirrors viDMA's passthrough mux
+but **without** the opcode machinery). Geometry ports come from
+`w_dp_req_i.transp_mode/tensor_m/tensor_n`; `clear_i` is pulsed on the first beat
+of a transpose transfer.
+
+### 5.2 decouple_rw / decouple_aw
+
+Transpose **requires** `decouple_rw = 1`: the engine's ping-pong overlaps the
+write of tile k-1 with the read of tile k (`idma_otf_transpose.sv:19-23`).
+`decouple_aw = 1` so AWs trail the first R by the one-tile fill latency. Set them
+**per-transfer** (only when `transpose_en`) via `opt.beo.decouple_rw/decouple_aw`
+— do **not** copy viDMA's blanket unconditional `decouple_rw` workaround
+(`vidma:vidma_otf_pkg.sv` `needs_force_decouple_rw`). Keep the coupled path for
+plain copies. The package warns `decouple_rw` "can cause deadlocks"
+(`src/idma_pkg.sv:76`) — see §6 risk.
+
+### 5.3 Response / meta path
+
+No new fields: one transfer = one `done_id`. `idma_rsp_t` (`typedef.svh:45-50`)
+flows backend → error handler → ND midend (`nd_rsp_o` valid on `burst_rsp_i.last`,
+`idma_nd_midend.sv:208`) → frontend counters (inst64 `completed_id`/`next_id`
+`:140-141`; reg `done_id`/`next_id`; desc64 IRQ from `flags[0]`
+`idma_desc64_reshaper.sv:22`). **Must-fix:** with `decouple_rw` and a tile drained
+*after* its read, the last response must be gated on **write** completion of the
+last tile, not read completion — otherwise `done_id` rises before the transposed
+data is committed. The engine's ping-pong/drain busy must also feed the existing
+busy aggregation (`src/idma_pkg.sv` `idma_busy_t`) so frontend polling stays
+correct.
+
+---
+
+## 6. Per-layer change checklist
+
+- **typedef** — add `transpose_en`(1b), `transp_mode`(2b), `tensor_m`(12b),
+  `tensor_n`(12b) to `IDMA_TYPEDEF_OPTIONS_T` (`src/include/idma/typedef.svh:21-30`).
+  Req/ND/response macros unchanged.
+- **frontend/reg** — `conf` bits 17/19:18; wire in `proc_hw_req_conv`
+  (`src/frontend/reg/tpl/idma_reg.{hjson,sv}.tpl`). M/N via existing length/reps.
+- **frontend/desc64** — `flags[24]`/`flags[26:25]`
+  (`idma_desc64_top.sv:114`, reshaper `:27-60`); derive M/N.
+- **frontend/inst64** — `DMTRANSP` arm (`idma_inst64_top.sv:386`); default new
+  opt fields to 0 (`:343-364`); reuse DMREP/DMSTR for reps/strides.
+- **midend** — **no RTL logic change**; needs **NumDim = 4** backend config
+  (`RepWidth`/`NumDim` are synth params, `idma_nd_midend.sv:16,26`). Independent
+  walk (`:164-186`) + bypass (`:194-198`) carry it.
+- **legalizer** — mirror the new opt fields into `idma_mut_tf_opt_t`, latch in the
+  `opt_tf_d` literal (`target/rtl/idma_legalizer_rw_axi.sv:267-301`; template
+  `src/backend/tpl/idma_legalizer.sv.tpl`), forward into `w_dp_req`. **No**
+  `otf_write_length` math. INCR asserts (`:417-419`) stay valid.
+- **backend** — add `transpose_en/transp_mode/tensor_m/tensor_n` to `w_dp_req_t`;
+  add `EnableTranspose` param gating the transport instantiation; populate fields
+  from `opt_tf_q`.
+- **transport** — instantiate engine at the `buffer_out → shifter` seam behind
+  `EnableTranspose && w_dp_req_i.transpose_en` (`target/rtl/idma_transport_layer_rw_axi.sv:201-213`);
+  wire `clear_i`, geometry, `valid/ready`; shift `strb_o` by `w_dp_req_i.shift`.
+- **backend/axi_write** — **the one load-bearing change**: AND engine `strb_o`
+  into `mask_out` (`src/backend/idma_axi_write.sv:135-144`); flows to `w.strb`
+  (`:197`) and pop (`:174`). Define fully-masked-beat behavior (§7.1).
+- **pkg** — ensure engine busy contributes to `busy_o` (`src/idma_pkg.sv`).
+
+---
+
+## 7. Open questions & risks (be explicit — no silent gaps)
+
+1. **Partial-tile edge reads + phantom write beats (highest).** For M,N not
+   multiples of `NE` the engine still consumes/produces **full padded NE-beat
+   tiles** (`idma_otf_transpose.sv` contract). Two consequences:
+   - the last partial **row**-tile read may touch **out-of-bounds source**
+     addresses (writes are safe — masked by `strb_o`);
+   - the last partial **col**-tile produces **fully-masked phantom write beats**
+     (`strb_o == 0`). In `idma_axi_write.sv`, `mask_out == 0` makes
+     `ready_to_write` vacuously true (`:159`) and would emit a zero-strobe `W`
+     or stall the pop (`:174`) — **undefined today**.
+   Decide explicitly, do **not** let edge tiles read OOB or emit junk W:
+   (a) require NE-aligned M,N for the first cut (no OOB, no phantom); or
+   (b) add a transport-seam pad/drop block: zero-pad reads at the source edge and
+       drop fully-masked W beats (and shorten the AW length accordingly); or
+   (c) rely on an SoC contract that source/dest are allocated tile-padded.
+   Recommended: (a) first, (b) as the productized path.
+2. **`strb_o → wstrb` merge must be exact.** Shift by `w_dp_req_i.shift` identically
+   to data; **AND** (not replace) with the alignment masks so it composes with
+   address misalignment. Verify a fully-masked beat does not deadlock the
+   buffer-pop path (`idma_axi_write.sv:174`).
+3. **Last-response timing.** Gate `nd_rsp_o.last` / `done_id` on **write**
+   completion of the final tile under `decouple_rw` (§5.3), not read completion.
+   Guaranteed to bite given the ping-pong latency — a resolved design point, not
+   a "verify later".
+4. **Deadlock.** `decouple_rw = 1` is required but the package warns it can
+   deadlock (`idma_pkg.sv:76`; cf. `doc/TODO-rw-decoupled-deadlock.md`). Set it
+   **per-transfer**, keep coupled path for copies, and write a directed test that
+   the engine's full-tile buffer filling while the write side backpressures does
+   **not** form a buffer-full ↔ engine-stall ↔ write-stall cycle.
+5. **AW/AR id ordering.** All bursts use one `opt_tf_q.axi_id`
+   (`idma_legalizer_rw_axi.sv:314, 340`). With `NE` fragmented short bursts per
+   tile and `decouple_rw`, same-ID ordering forces correctness but the fragmented
+   multi-burst-per-tile W-to-AW pairing under the engine's drain is unanalyzed —
+   confirm before relying on it.
+6. **NumDim = 4 cost.** A NumDim bump widens the ND CSR/descriptor footprint
+   (reps/strides per dimension) across reg/desc64/inst64. Quantify the CSR-map and
+   descriptor-size impact per frontend; small SoCs may want a transpose-only
+   reduced-width ND variant.
+7. **Burst efficiency.** Each 1D run is one tile-row (`SW` bytes ⇒ one beat), so
+   AR/AW issue at one (short) burst per tile-row. This is inherent to the tiled
+   walk; if AR/AW issue-rate bottlenecks, coalesce contiguous tile-rows when
+   `N == NE` (single col-tile makes a tile's rows contiguous) as a special case.
+
+---
+
+## 8. Net
+
+- **Control**: typed, request-scoped `idma_req_t.opt` fields — reject viDMA's
+  sticky global opcode side-band.
+- **Address**: same tile-visitation order on AR and AW; transpose lives in the
+  engine; **transposed `dst_strides`** over a NumDim=4 ND program (midden logic
+  untouched, config bump only). Stride table in §4.2.
+- **Data**: engine spliced at the `buffer_out` seam behind `EnableTranspose`,
+  self-draining; **one load-bearing new RTL primitive**: AND the shifted engine
+  `strb_o` into `mask_out` in `idma_axi_write.sv`.
+- **Largest unresolved items**: partial-tile edge reads / phantom write beats
+  (§7.1), last-response-on-write timing (§7.3), `decouple_rw` deadlock test (§7.4).
diff --git a/idma.mk b/idma.mk
index 06d6c4b8..0034c62a 100644
--- a/idma.mk
+++ b/idma.mk
@@ -16,6 +16,9 @@ SPHINXBUILD ?= sphinx-build
 VCS         ?= vcs
 VERILATOR   ?= verilator
 VLOGAN      ?= vlogan
+VSIM        ?= vsim
+VLOG        ?= vlog
+VLIB        ?= vlib
 
 # Shell
 SHELL := /bin/bash
@@ -32,6 +35,8 @@ IDMA_OCCAMY_IDS  := \
 					r_axi_rw_init_rw_obi
 IDMA_ADD_IDS     ?=
 IDMA_BACKEND_IDS := $(IDMA_BASE_IDS) $(IDMA_OCCAMY_IDS) $(IDMA_ADD_IDS)
+# Backend variants that host the on-the-fly compute dispatcher (single AXI write)
+IDMA_VIDMA_IDS   ?= rw_axi
 
 # generated frontends
 IDMA_BASE_FE_IDS := reg32_3d reg64_2d reg64_1d
@@ -110,17 +115,17 @@ IDMA_RTL_FILES  := $(IDMA_RTL_DIR)/idma_transport_layer \
 IDMA_VSIM_DIR   := $(IDMA_ROOT)/target/sim/vsim
 
 define idma_gen
-	$(PYTHON) $(IDMA_GEN) --entity $1 --tpl $2 --db $3 --ids $4 --fids $5 > $6
+	$(PYTHON) $(IDMA_GEN) --entity $1 --tpl $2 --db $3 --ids $4 --fids $5 $(if $7,--compute-ids $7) > $6
 endef
 
 $(IDMA_RTL_DIR)/idma_transport_layer_%.sv: $(IDMA_GEN) $(IDMA_GEN_SRC) $(IDMA_ROOT)/src/backend/tpl/idma_transport_layer.sv.tpl $(IDMA_DB_FILES)
-	$(call idma_gen,transport,$(IDMA_ROOT)/src/backend/tpl/idma_transport_layer.sv.tpl,$(IDMA_DB_FILES),$*,,$@)
+	$(call idma_gen,transport,$(IDMA_ROOT)/src/backend/tpl/idma_transport_layer.sv.tpl,$(IDMA_DB_FILES),$*,,$@,$(IDMA_VIDMA_IDS))
 
 $(IDMA_RTL_DIR)/idma_legalizer_%.sv: $(IDMA_GEN) $(IDMA_GEN_SRC) $(IDMA_ROOT)/src/backend/tpl/idma_legalizer.sv.tpl $(IDMA_DB_FILES)
-	$(call idma_gen,legalizer,$(IDMA_ROOT)/src/backend/tpl/idma_legalizer.sv.tpl,$(IDMA_DB_FILES),$*,,$@)
+	$(call idma_gen,legalizer,$(IDMA_ROOT)/src/backend/tpl/idma_legalizer.sv.tpl,$(IDMA_DB_FILES),$*,,$@,$(IDMA_VIDMA_IDS))
 
 $(IDMA_RTL_DIR)/idma_backend_%.sv: $(IDMA_GEN) $(IDMA_GEN_SRC) $(IDMA_RTL_DIR)/idma_legalizer_%.sv $(IDMA_RTL_DIR)/idma_transport_layer_%.sv $(IDMA_ROOT)/src/backend/tpl/idma_backend.sv.tpl $(IDMA_DB_FILES)
-	$(call idma_gen,backend,$(IDMA_ROOT)/src/backend/tpl/idma_backend.sv.tpl,$(IDMA_DB_FILES),$*,,$@)
+	$(call idma_gen,backend,$(IDMA_ROOT)/src/backend/tpl/idma_backend.sv.tpl,$(IDMA_DB_FILES),$*,,$@,$(IDMA_VIDMA_IDS))
 
 $(IDMA_RTL_DIR)/idma_backend_synth_%.sv: $(IDMA_GEN) $(IDMA_GEN_SRC) $(IDMA_RTL_DIR)/idma_backend_%.sv $(IDMA_ROOT)/src/backend/tpl/idma_backend_synth.sv.tpl $(IDMA_DB_FILES)
 	$(call idma_gen,synth_wrapper,$(IDMA_ROOT)/src/backend/tpl/idma_backend_synth.sv.tpl,$(IDMA_DB_FILES),$*,,$@)
@@ -306,9 +311,84 @@ define idma_generate_vsim
 endef
 
 $(IDMA_VSIM_DIR)/compile.tcl: $(IDMA_BENDER_FILES) $(IDMA_FULL_TB) $(IDMA_FULL_RTL) $(IDMA_INCLUDE_ALL) $(IDMA_WAVE_ALL)
-	$(call idma_generate_vsim, $@, -t sim -t test -t idma_test -t synth -t rtl -t asic -t snitch_cluster,../../..)
+	$(call idma_generate_vsim, $@, -t sim -t test -t idma_test -t synth -t rtl -t asic -t snitch_cluster -t split_rtl,../../..)
+
+.PHONY: idma_sim_tb_idma_rt_midend
+
+idma_sim_tb_idma_rt_midend: $(IDMA_VSIM_DIR)/compile.tcl
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -do "source compile.tcl; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc \
+	    tb_idma_rt_midend -do "run -all; quit"
+
+# Standalone self-checking transpose-engine regression (DPI-C golden, no backend deps).
+# Run with the Questa SEPP wrapper, e.g.:
+#   make idma_sim_tb_idma_otf_transpose VSIM="questa-2023.4 vsim" VLOG="questa-2023.4 vlog" VLIB="questa-2023.4 vlib"
+IDMA_OTF_TP_RTL := $(abspath $(IDMA_ROOT)/src/backend/idma_otf_transpose.sv)
+IDMA_OTF_TP_TB  := $(abspath $(IDMA_ROOT)/test/tb_idma_otf_transpose.sv)
+IDMA_OTF_TP_DPI := $(abspath $(IDMA_ROOT)/test/idma_transpose_dpi.c)
+IDMA_OTF_TP_DIR := $(abspath $(IDMA_VSIM_DIR))/otf_transpose
+
+.PHONY: idma_sim_tb_idma_otf_transpose
+idma_sim_tb_idma_otf_transpose:
+	mkdir -p $(IDMA_OTF_TP_DIR)
+	cd $(IDMA_OTF_TP_DIR); $(VLIB) work
+	cd $(IDMA_OTF_TP_DIR); $(VLOG) -sv $(IDMA_OTF_TP_DPI)
+	cd $(IDMA_OTF_TP_DIR); $(VLOG) -sv -svinputport=compat -timescale "1ns/1fs" $(IDMA_OTF_TP_RTL) $(IDMA_OTF_TP_TB)
+	cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gStrbWidth=8  -gM=13  -gN=19 -gEB=1 tb_idma_otf_transpose +BP -do "run -all; quit"
+	cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gStrbWidth=8  -gM=7   -gN=5  -gEB=2 tb_idma_otf_transpose +BP -do "run -all; quit"
+	cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gStrbWidth=8  -gM=5   -gN=3  -gEB=4 tb_idma_otf_transpose +BP -do "run -all; quit"
+	cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gStrbWidth=64 -gM=130 -gN=70 -gEB=1 tb_idma_otf_transpose +BP -do "run -all; quit"
+	cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gFullDuplex=0 -gStrbWidth=8  -gM=13 -gN=19 -gEB=1 tb_idma_otf_transpose +BP -do "run -all; quit"
+	cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gFullDuplex=0 -gStrbWidth=8  -gM=7  -gN=5  -gEB=2 tb_idma_otf_transpose +BP -do "run -all; quit"
+	cd $(IDMA_OTF_TP_DIR); $(VSIM) -c -t 1ps -gFullDuplex=0 -gStrbWidth=64 -gM=130 -gN=70 -gEB=1 tb_idma_otf_transpose +BP -do "run -all; quit"
+
+# Multi-tile transpose via the ND midend (transposed strides) -> rw_axi backend
+# (engine spliced at the write seam) -> axi_sim_mem. Covers aligned and edge
+# (M or N not a multiple of NE) geometries for int8/fp16/fp32. Needs the
+# split_rtl flow (per-variant routing). Run with the Questa SEPP wrapper:
+#   make idma_sim_tb_idma_transpose_nd VSIM="questa-2023.4 vsim"
+.PHONY: idma_sim_tb_idma_transpose_nd
+idma_sim_tb_idma_transpose_nd: $(IDMA_VSIM_DIR)/compile.tcl
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -do "source compile.tcl; quit"
+	# ── aligned (regression: M,N multiples of NE) ──
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=8   -gN=8  -gEB=1 tb_idma_transpose_nd -do "run -all; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=16  -gN=16 -gEB=1 tb_idma_transpose_nd -do "run -all; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=16  -gN=8  -gEB=1 tb_idma_transpose_nd -do "run -all; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=64 -gM=32  -gN=24 -gEB=1 tb_idma_transpose_nd -do "run -all; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=8   -gN=8  -gEB=2 tb_idma_transpose_nd -do "run -all; quit"
+	# ── edge: partial output cols only (M%NE!=0, N%NE==0; within-beat wstrb) ──
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=6   -gN=8  -gEB=1 tb_idma_transpose_nd -do "run -all; quit"
+	# ── edge: partial output rows only (N%NE!=0; zero-strobe drain beats) ──
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=8   -gN=6  -gEB=1 tb_idma_transpose_nd -do "run -all; quit"
+	# ── edge: both partial (int8) ──
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=6   -gN=6  -gEB=1 tb_idma_transpose_nd -do "run -all; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=5   -gN=7  -gEB=1 tb_idma_transpose_nd -do "run -all; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=10  -gN=6  -gEB=1 tb_idma_transpose_nd -do "run -all; quit"
+	# ── edge: fp16 (EB=2) and fp32 (EB=4) ──
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=5   -gN=5  -gEB=2 tb_idma_transpose_nd -do "run -all; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=64 -gM=9   -gN=5  -gEB=4 tb_idma_transpose_nd -do "run -all; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=64 -gM=13  -gN=19 -gEB=1 tb_idma_transpose_nd -do "run -all; quit"
+
+# Back-to-back regressions: the ND midend must reload each new transfer's base
+# address (it does, for a protocol-compliant producer that drops nd_req_valid on
+# accept). tb_idma_nd_midend_b2b checks the midend's burst-address sequence under
+# backpressure; tb_idma_transpose_b2b checks two end-to-end transposes to distinct
+# destinations.  Run with the Questa SEPP wrapper.
+.PHONY: idma_sim_tb_idma_nd_midend_b2b
+idma_sim_tb_idma_nd_midend_b2b: $(IDMA_VSIM_DIR)/compile.tcl
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -do "source compile.tcl; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc tb_idma_nd_midend_b2b -do "run -all; quit"
+
+.PHONY: idma_sim_tb_idma_transpose_b2b
+idma_sim_tb_idma_transpose_b2b: $(IDMA_VSIM_DIR)/compile.tcl
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -do "source compile.tcl; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=6  -gN=8 -gEB=1 tb_idma_transpose_b2b -do "run -all; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=8  -gN=8 -gEB=1 tb_idma_transpose_b2b -do "run -all; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=64 -gM=13 -gN=19 -gEB=1 tb_idma_transpose_b2b -do "run -all; quit"
+	cd $(IDMA_VSIM_DIR); $(VSIM) -c -t 1ps -voptargs=+acc -gDataWidth=32 -gM=5  -gN=5 -gEB=2 tb_idma_transpose_b2b -do "run -all; quit"
 
 idma_sim_clean:
+	rm -rf $(IDMA_OTF_TP_DIR)
 	rm -rf $(IDMA_VSIM_DIR)/compile.tcl
 	rm -rf $(IDMA_VSIM_DIR)/work
 	rm -f  $(IDMA_VSIM_DIR)/dma_trace_*
diff --git a/jobs/backend_rw_axi/transpose_none.txt b/jobs/backend_rw_axi/transpose_none.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/src/backend/idma_axi_write.sv b/src/backend/idma_axi_write.sv
index ee27349d..faa309db 100644
--- a/src/backend/idma_axi_write.sv
+++ b/src/backend/idma_axi_write.sv
@@ -73,7 +73,11 @@ module idma_axi_write #(
     /// Valid from buffer
     input  strb_t buffer_out_valid_i,
     /// Ready to buffer
-    output strb_t buffer_out_ready_o
+    output strb_t buffer_out_ready_o,
+    /// External write-strobe mask (ANDed into wstrb); tie to '1 when unused
+    input  strb_t mask_ext_i,
+    /// Pulses when a write beat is accepted on the bus (strobe-independent)
+    output logic  w_beat_done_o
 );
     // offsets needed for masks to empty buffer
     strb_t w_first_mask;
@@ -141,6 +145,8 @@ module idma_axi_write #(
         if (w_dp_req_i.tailer != '0 & last_w) begin
             mask_out = mask_out & w_last_mask;
         end
+        // external mask: some bytes may be masked by an OTF engine
+        mask_out = mask_out & mask_ext_i;
     end
 
 
@@ -170,6 +176,8 @@ module idma_axi_write #(
     // write happening: both the bus (w_ready) and the buffer (ready_to_write) is high
     assign write_happening = ready_to_write & write_rsp_i.w_ready;
 
+    assign w_beat_done_o = write_happening;
+
     // the main buffer is conditionally to the write mask popped
     assign buffer_out_ready_o = write_happening ? mask_out : '0;
 
diff --git a/src/backend/idma_otf_compute.sv b/src/backend/idma_otf_compute.sv
new file mode 100644
index 00000000..88bfff32
--- /dev/null
+++ b/src/backend/idma_otf_compute.sv
@@ -0,0 +1,100 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+
+/// On-the-fly compute dispatcher at the transport write seam: latches the
+/// per-transfer compute options and dispatches one op per transfer to its sub-unit.
+module idma_otf_compute #(
+  /// Byte lanes per beat (= DataWidth/8)
+  parameter int unsigned StrbWidth       = 32'd8,
+  /// Compile-time per-op feature enables (value rendered by the generator)
+  parameter idma_pkg::compute_enable_t ComputeEnable = '0,
+  /// Transpose engine duplex (1: two banks full rate, 0: one bank half area)
+  parameter bit                        TransposeFullDuplex = 1'b1
+) (
+  input  logic clk_i,
+  input  logic rst_ni,
+
+  /// Per-transfer compute config; valid only while `cfg_valid_i`
+  input  idma_pkg::compute_options_t compute_i,
+  input  logic                       cfg_valid_i,
+  /// A supported compute op is armed for this transfer
+  output logic                       active_o,
+
+  /// Input beat stream (from the dataflow buffer)
+  input  logic [StrbWidth-1:0][7:0] data_i,
+  input  logic                      valid_i,
+  output logic                      in_ready_o,
+
+  /// Output beat stream (computed) with per-byte strobe for edge masking
+  output logic [StrbWidth-1:0][7:0] data_o,
+  output logic [StrbWidth-1:0]      strb_o,
+  output logic                      valid_o,
+  input  logic                      ready_i
+);
+
+  // config latch with first-beat bypass
+  idma_pkg::compute_options_t latched_q, eff_compute;
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni)          latched_q <= '0;
+    else if (cfg_valid_i) latched_q <= compute_i;
+  end
+  assign eff_compute = cfg_valid_i ? compute_i : latched_q;
+
+  // per-op select
+  logic sel_transpose;
+  assign sel_transpose = eff_compute.enable &
+                         (eff_compute.op == idma_pkg::COMPUTE_TRANSPOSE) & ComputeEnable.transpose;
+
+  assign active_o = sel_transpose;
+
+  // transpose sub-unit
+  logic [StrbWidth-1:0][7:0] tp_data;
+  logic [StrbWidth-1:0]      tp_strb;
+  logic                      tp_valid, tp_in_ready;
+
+  if (ComputeEnable.transpose) begin : gen_transpose
+    idma_otf_transpose #(
+      .StrbWidth  ( StrbWidth                   ),
+      .DimWidth   ( idma_pkg::TransposeDimWidth ),
+      .FullDuplex ( TransposeFullDuplex         )
+    ) i_idma_otf_transpose (
+      .clk_i,
+      .rst_ni,
+      .clear_i         ( ~sel_transpose                          ),
+      .transp_mode_i   ( eff_compute.params.transpose.mode       ),
+      .tensor_size_m_i ( eff_compute.params.transpose.tensor_m   ),
+      .tensor_size_n_i ( eff_compute.params.transpose.tensor_n   ),
+      .data_i          ( data_i                                  ),
+      .valid_i         ( valid_i & sel_transpose                 ),
+      .ready_o         ( tp_in_ready                             ),
+      .data_o          ( tp_data                                 ),
+      .strb_o          ( tp_strb                                 ),
+      .valid_o         ( tp_valid                                ),
+      .ready_i         ( ready_i & sel_transpose                 )
+    );
+  end else begin : gen_no_transpose
+    assign tp_data = '0; assign tp_strb = '0; assign tp_valid = 1'b0; assign tp_in_ready = 1'b0;
+  end
+
+  // output dispatch
+  always_comb begin
+    data_o     = '0;
+    strb_o     = '0;
+    valid_o    = 1'b0;
+    in_ready_o = 1'b0;
+    unique case (1'b1)
+      sel_transpose: begin
+        data_o     = tp_data;
+        strb_o     = tp_strb;
+        valid_o    = tp_valid;
+        in_ready_o = tp_in_ready;
+      end
+      default: ;
+    endcase
+  end
+
+endmodule : idma_otf_compute
diff --git a/src/backend/idma_otf_transpose.sv b/src/backend/idma_otf_transpose.sv
new file mode 100644
index 00000000..5c543f99
--- /dev/null
+++ b/src/backend/idma_otf_transpose.sv
@@ -0,0 +1,234 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+//
+// On-the-fly matrix transpose engine (ping-pong tile banks) for the iDMA
+// transport write seam. Adapted from the datamover (Ratha) HWPE:
+// pulp-platform/datamover@d58a985, rtl/datamover_engine.sv.
+//
+// Contract: input padded to full tiles, fed (col-tile, row-tile, row) order;
+// out_T[nt*NE+k][rt*NE+r] = in[rt*NE+r][nt*NE+k]. strb_o masks partial edge tiles.
+// Throughput: 1 + 1/NE cycles per NE-beat tile (one handoff bubble per tile).
+
+module idma_otf_transpose #(
+  /// Byte lanes per beat (= DataWidth/8)
+  parameter  int unsigned StrbWidth  = 32'd8,
+  /// Tensor dimension width in elements (matches idma_pkg::TransposeDimWidth)
+  parameter  int unsigned DimWidth   = 32'd12,
+  /// 1: two tile banks, fill while draining; 0: one bank (half area, half rate)
+  parameter  bit          FullDuplex = 1'b1,
+  localparam int unsigned NumBanks   = FullDuplex ? 32'd2 : 32'd1,
+  localparam int unsigned LaneW      = $clog2(StrbWidth)
+) (
+  input  logic clk_i,
+  input  logic rst_ni,
+  input  logic clear_i,
+
+  /// Element size select: 0->1B, 1->2B, 2->4B (E = 1<<transp_mode_i)
+  input  logic [1:0]  transp_mode_i,
+  /// Matrix dimensions in elements
+  input  logic [DimWidth-1:0] tensor_size_m_i,
+  input  logic [DimWidth-1:0] tensor_size_n_i,
+
+  /// Input beat stream (row-major), padded to full tiles
+  input  logic [StrbWidth-1:0][7:0] data_i,
+  input  logic                      valid_i,
+  output logic                      ready_o,
+
+  /// Output beat stream (transposed) with per-byte valid mask for edge tiles
+  output logic [StrbWidth-1:0][7:0] data_o,
+  output logic [StrbWidth-1:0]      strb_o,
+  output logic                      valid_o,
+  input  logic                      ready_i
+);
+
+  // geometry: NE is a power of two, so only shifts and AND-masks
+  logic [1:0]       eff_mode;         // element-size mode, saturated at LaneW
+  logic [LaneW:0]   ne_m1;            // NE-1
+  logic [3:0]       log2_ne;          // log2(NE) = LaneW - eff_mode
+  logic [DimWidth-1:0] y_tiles, n_tiles; // row-tiles, col-tiles
+  logic [LaneW:0]   leftover_rows, leftover_cols;  // M%NE, N%NE (run-global)
+
+  // saturate at LaneW: out-of-contract mode (E>StrbWidth) degrades to NE=1
+  assign eff_mode      = (transp_mode_i > LaneW) ? LaneW[1:0] : transp_mode_i;
+  assign ne_m1         = (1 << (LaneW - eff_mode)) - 1;            // NE-1
+  assign log2_ne       = LaneW - eff_mode;
+  // Widen the ceil-div add by one bit so it cannot wrap at the dim range.
+  assign y_tiles       = DimWidth'(((DimWidth+1)'(tensor_size_m_i) + ne_m1) >> log2_ne);
+  assign n_tiles       = DimWidth'(((DimWidth+1)'(tensor_size_n_i) + ne_m1) >> log2_ne);
+  assign leftover_rows = tensor_size_m_i & ne_m1;
+  assign leftover_cols = tensor_size_n_i & ne_m1;
+
+  // FF tile banks (ping-pong when FullDuplex), E=1 worst case (StrbWidth x StrbWidth B)
+  logic [StrbWidth-1:0][7:0] tile_q [NumBanks][StrbWidth];
+
+  // internal output + handshakes
+  logic [StrbWidth-1:0][7:0] data_int;
+  logic [StrbWidth-1:0]      strb_int;
+  logic                      valid_int, ready_int;
+  logic in_hs, out_hs;
+  assign in_hs  = valid_i   & ready_o;
+  assign out_hs = valid_int & ready_int;
+
+  // full_q[b]: bank b holds a complete tile. Producer sets on fill-complete,
+  // consumer clears on drain-complete.
+  logic [NumBanks-1:0] full_q;
+  logic             wr_bank, rd_bank;
+  logic [LaneW-1:0] wr_cnt, rd_cnt;   // intra-tile beat index (write / read)
+  logic             wr_last, rd_last;
+
+  assign wr_last = (wr_cnt == ne_m1[LaneW-1:0]);
+  assign rd_last = (rd_cnt == ne_m1[LaneW-1:0]);
+
+  assign ready_o   = ~full_q[wr_bank];
+  assign valid_int =  full_q[rd_bank];
+
+  // tile walkers (col-tile outer, row-tile inner); drain trails fill by up to one tile
+  logic [DimWidth-1:0] rtw, ntw;   // write walker: row-tile, col-tile
+  logic [DimWidth-1:0] rtr, ntr;   // read  walker: row-tile, col-tile
+
+  logic last_y_tile_w, last_n_tile_w;  // edge flags of the tile being filled
+  assign last_y_tile_w = (rtw == y_tiles - 1);
+  assign last_n_tile_w = (ntw == n_tiles - 1);
+
+  logic last_y_tile_r, last_n_tile_r;  // edge flags of the tile being drained
+  assign last_y_tile_r = (rtr == y_tiles - 1);
+  assign last_n_tile_r = (ntr == n_tiles - 1);
+
+  // per-bank edge flags, captured at fill-complete, consumed by the drain strobe
+  logic shadow_last_y [NumBanks];
+  logic shadow_last_n [NumBanks];
+
+  // fill-/drain-complete events
+  logic fill_done, drain_done, exec_done;
+  assign fill_done  = in_hs  & wr_last;
+  assign drain_done = out_hs & rd_last;
+  // transfer done once the final tile drains
+  assign exec_done  = drain_done & last_y_tile_r & last_n_tile_r;
+
+  // producer (input) side
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      for (int b = 0; b < NumBanks; b++)
+        for (int r = 0; r < StrbWidth; r++)
+          tile_q[b][r] <= '0;
+      wr_cnt           <= '0;
+      wr_bank          <= 1'b0;
+      rtw              <= '0;
+      ntw              <= '0;
+      for (int b = 0; b < NumBanks; b++) begin
+        shadow_last_y[b] <= 1'b0;
+        shadow_last_n[b] <= 1'b0;
+      end
+    end else if (clear_i || exec_done) begin
+      wr_cnt           <= '0;
+      wr_bank          <= 1'b0;
+      rtw              <= '0;
+      ntw              <= '0;
+      for (int b = 0; b < NumBanks; b++) begin
+        shadow_last_y[b] <= 1'b0;
+        shadow_last_n[b] <= 1'b0;
+      end
+    end else begin
+      if (in_hs) begin
+        tile_q[wr_bank][wr_cnt] <= data_i;
+        wr_cnt <= wr_last ? '0 : (wr_cnt + 1'b1);
+      end
+      if (fill_done) begin
+        shadow_last_y[wr_bank] <= last_y_tile_w;
+        shadow_last_n[wr_bank] <= last_n_tile_w;
+        wr_bank <= FullDuplex ? ~wr_bank : 1'b0;
+        if (rtw == y_tiles - 1) begin
+          rtw <= '0;
+          ntw <= ntw + 1'b1;
+        end else begin
+          rtw <= rtw + 1'b1;
+        end
+      end
+    end
+  end
+
+  // consumer (output) side
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni || clear_i || exec_done) begin
+      rd_cnt  <= '0;
+      rd_bank <= 1'b0;
+      rtr     <= '0;
+      ntr     <= '0;
+    end else begin
+      if (out_hs) begin
+        rd_cnt <= rd_last ? '0 : (rd_cnt + 1'b1);
+      end
+      if (drain_done) begin
+        rd_bank <= FullDuplex ? ~rd_bank : 1'b0;
+        if (rtr == y_tiles - 1) begin
+          rtr <= '0;
+          ntr <= ntr + 1'b1;
+        end else begin
+          rtr <= rtr + 1'b1;
+        end
+      end
+    end
+  end
+
+  // full/empty token
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni || clear_i || exec_done) begin
+      full_q <= 2'b00;
+    end else begin
+      if (fill_done)  full_q[wr_bank] <= 1'b1;
+      if (drain_done) full_q[rd_bank] <= 1'b0;
+    end
+  end
+
+  // transposed readout: byte p (element e=p>>logE, byte b=p&(E-1)) reads
+  // tile_q[rd_bank][e][rd_cnt*E + b]
+  always_comb begin
+    for (int p = 0; p < StrbWidth; p++) begin
+      automatic int unsigned e   = p >> eff_mode;
+      automatic int unsigned b   = p & ((1 << eff_mode) - 1);
+      automatic int unsigned col = (rd_cnt << eff_mode) | b;
+      data_int[p] = tile_q[rd_bank][e][col];
+    end
+  end
+
+  // output strobe: element-granular edge masking from the drain-side shadow flags
+  always_comb begin
+    logic [StrbWidth-1:0] em;  // per-element valid (only low NE bits meaningful)
+    logic ly, ln;
+    ly = shadow_last_y[rd_bank];
+    ln = shadow_last_n[rd_bank];
+    for (int e = 0; e < StrbWidth; e++) begin
+      logic v;
+      if ((ly && leftover_rows != 0) && (ln && leftover_cols != 0))
+        v = (rd_cnt < leftover_cols) && (e < leftover_rows);
+      else if (ly && leftover_rows != 0)
+        v = (e < leftover_rows);
+      else if (ln && leftover_cols != 0)
+        v = (rd_cnt < leftover_cols);
+      else
+        v = 1'b1;
+      em[e] = v;
+    end
+    for (int p = 0; p < StrbWidth; p++)
+      strb_int[p] = em[p >> eff_mode];
+  end
+
+  // output register; not cleared by exec_done so the final beat is held until accepted
+  assign ready_int = ~valid_o | ready_i;
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni || clear_i) begin
+      valid_o <= 1'b0;
+      data_o  <= '0;
+      strb_o  <= '0;
+    end else if (ready_int) begin
+      valid_o <= valid_int;
+      data_o  <= data_int;
+      strb_o  <= strb_int;
+    end
+  end
+
+endmodule : idma_otf_transpose
diff --git a/src/backend/tpl/idma_backend.sv.tpl b/src/backend/tpl/idma_backend.sv.tpl
index c5a2e164..a76120e9 100644
--- a/src/backend/tpl/idma_backend.sv.tpl
+++ b/src/backend/tpl/idma_backend.sv.tpl
@@ -170,9 +170,23 @@ _rsp_t ${protocol}_write_rsp_i,
     output idma_busy_t busy_o
 );
 
+    /// Extra write-descriptor slots covering the compute (transpose) tile-fill latency
+    localparam int unsigned ComputeFifoDepth = ${"StrbWidth" if enable_compute else "32'd0"};
+% if enable_compute:
+
+    /// Per-op compute set baked into this variant (frontends may cross-check)
+    localparam idma_pkg::compute_enable_t ComputeEnable =
+        '{${', '.join("%s: 1'b1" % op for op in compute_ops)}};
+`ifndef SYNTHESIS
+    // no engine flush on abort: compute is incompatible with error handling
+    initial assert (ErrorCap == idma_pkg::NO_ERROR_HANDLING) else
+        $fatal(1, "compute requires ErrorCap == NO_ERROR_HANDLING");
+`endif
+% endif
+
     /// The localparam MetaFifoDepth holds the maximum number of transfers that can be
     /// in-flight under any circumstances.
-    localparam int unsigned MetaFifoDepth = BufferDepth + NumAxInFlight + MemSysDepth;
+    localparam int unsigned MetaFifoDepth = BufferDepth + NumAxInFlight + MemSysDepth + ComputeFifoDepth;
 
     /// Address type
     typedef logic [AddrWidth-1:0]   addr_t;
@@ -231,6 +245,7 @@ _rsp_t ${protocol}_write_rsp_i,
         offset_t             shift;
         axi_pkg::len_t       num_beats;
         logic                is_single;
+        idma_pkg::compute_options_t compute;
     } w_dp_req_t;
 
     /// The datapath write response type provides feedback from the write part of the datapath:
@@ -290,6 +305,7 @@ _rsp_t ${protocol}_write_rsp_i,
         idma_pkg::axi_options_t src_axi_opt;
         idma_pkg::axi_options_t dst_axi_opt;
         logic                   super_last;
+        idma_pkg::compute_options_t compute;
     } idma_mut_tf_opt_t;
 
     /// The mutable transfer type holds important information that is mutated by the
@@ -483,7 +499,8 @@ _rsp_t ${protocol}_write_rsp_i,
             tailer:    OffsetWidth'(idma_req_i.length + idma_req_i.dst_addr[OffsetWidth-1:0]),
             shift:     OffsetWidth'(- idma_req_i.dst_addr[OffsetWidth-1:0]),
             num_beats: len,
-            is_single: len == '0
+            is_single: len == '0,
+            compute:   idma_req_i.opt.compute
         };
 
         // if the legalizer is bypassed; every burst is the last of the 1D transfer
@@ -610,7 +627,7 @@ _rsp_t ${protocol}_write_rsp_i,
     );
 
     stream_fifo_optimal_wrap #(
-        .Depth     ( NumAxInFlight ),
+        .Depth     ( NumAxInFlight + ComputeFifoDepth ),
         .type_t    ( w_dp_req_t    ),
         .PrintInfo ( PrintFifoInfo )
     ) i_w_dp_req (
diff --git a/src/backend/tpl/idma_legalizer.sv.tpl b/src/backend/tpl/idma_legalizer.sv.tpl
index 777ee1bc..c3643f69 100644
--- a/src/backend/tpl/idma_legalizer.sv.tpl
+++ b/src/backend/tpl/idma_legalizer.sv.tpl
@@ -478,8 +478,13 @@ w_num_bytes_to_pb = w_page_num_bytes_to_pb;
                 dst_protocol:   req_i.opt.dst_protocol,
                 read_shift:     '0,
                 write_shift:    '0,
+% if enable_compute:
+                decouple_rw:    req_i.opt.beo.decouple_rw | req_i.opt.compute.enable,
+                decouple_aw:    req_i.opt.beo.decouple_aw | req_i.opt.compute.enable,
+% else:
                 decouple_rw:    req_i.opt.beo.decouple_rw,
                 decouple_aw:    req_i.opt.beo.decouple_aw,
+% endif
                 src_max_llen:   req_i.opt.beo.src_max_llen,
                 dst_max_llen:   req_i.opt.beo.dst_max_llen,
                 src_reduce_len: req_i.opt.beo.src_reduce_len,
@@ -487,7 +492,8 @@ w_num_bytes_to_pb = w_page_num_bytes_to_pb;
                 axi_id:         req_i.opt.axi_id,
                 src_axi_opt:    req_i.opt.src,
                 dst_axi_opt:    req_i.opt.dst,
-                super_last:     req_i.opt.last
+                super_last:     req_i.opt.last,
+                compute:        req_i.opt.compute
             };
             // determine shift amount
             if (CombinedShifter) begin
@@ -549,7 +555,8 @@ ${database[used_write_protocols[0]]['legalizer_write_data_path']}
             tailer:       OffsetWidth'(w_num_bytes + w_addr_offset),
             shift:        opt_tf_q.write_shift,
             num_beats:    'd0,
-            is_single:    1'b1
+            is_single:    1'b1,
+            compute:      opt_tf_q.compute
         };
     % endif
     end
@@ -583,7 +590,8 @@ ${database[protocol]['legalizer_write_data_path']}
                 tailer:       OffsetWidth'(w_num_bytes + w_addr_offset),
                 shift:        opt_tf_q.write_shift,
                 num_beats:    'd0,
-                is_single:    1'b1
+                is_single:    1'b1,
+                compute:      opt_tf_q.compute
             };
         endcase
     end
diff --git a/src/backend/tpl/idma_transport_layer.sv.tpl b/src/backend/tpl/idma_transport_layer.sv.tpl
index 91d27131..019864e0 100644
--- a/src/backend/tpl/idma_transport_layer.sv.tpl
+++ b/src/backend/tpl/idma_transport_layer.sv.tpl
@@ -228,6 +228,10 @@ _rsp_t ${protocol}_write_rsp_i,
     // aligned and coalesced data leaving the buffer
     byte_t [2*StrbWidth-1:0] buffer_out_tmp;
     byte_t [StrbWidth-1:0] buffer_out, buffer_out_shifted;
+    // compute write seam (passthrough when compute is off)
+    byte_t [StrbWidth-1:0] wr_data;
+    strb_t                 wr_valid, wr_strb, mask_ext_shifted, dataflow_ready_in;
+    logic                  w_beat_done;
 % if not one_read_port:
 
     // Read multiplexed signals
@@ -439,16 +443,61 @@ ${rendered_read_ports[read_port]}
         .ready_o     ( buffer_in_ready          ),
         .data_o      ( buffer_out               ),
         .valid_o     ( buffer_out_valid         ),
-        .ready_i     ( buffer_out_ready_shifted )
+        .ready_i     ( dataflow_ready_in        )
     );
 
+    //--------------------------------------
+    // On-the-fly compute (write seam)
+    //--------------------------------------
+
+% if enable_compute:
+    logic                  cmp_active;
+    logic                  cmp_in_ready, cmp_out_valid;
+    byte_t [StrbWidth-1:0] cmp_data_o;
+    strb_t                 cmp_strb_o;
+
+    // beats retire on w_beat_done (strobe-independent)
+    idma_otf_compute #(
+        .StrbWidth           ( StrbWidth ),
+        .ComputeEnable       ( '{${', '.join("%s: 1'b1" % op for op in compute_ops)}} ),
+        .TransposeFullDuplex ( 1'b${'1' if compute_full_duplex else '0'} )
+    ) i_idma_otf_compute (
+        .clk_i,
+        .rst_ni,
+        .compute_i   ( w_dp_req_i.compute ),
+        .cfg_valid_i ( w_dp_valid_i        ),
+        .active_o    ( cmp_active          ),
+        .data_i      ( buffer_out          ),
+        .valid_i     ( &buffer_out_valid   ),
+        .in_ready_o  ( cmp_in_ready        ),
+        .data_o      ( cmp_data_o          ),
+        .strb_o      ( cmp_strb_o          ),
+        .valid_o     ( cmp_out_valid       ),
+        .ready_i     ( w_beat_done         )
+    );
+
+    // whole-beat valid; edge masking carried on wr_strb
+    assign wr_data           = cmp_active ? cmp_data_o : buffer_out;
+    assign wr_valid          = cmp_active ? {StrbWidth{cmp_out_valid}} : buffer_out_valid;
+    assign wr_strb           = cmp_active ? cmp_strb_o : '1;
+    // pop the buffer only on a compute input handshake
+    assign dataflow_ready_in = cmp_active ? {StrbWidth{(&buffer_out_valid) & cmp_in_ready}}
+                                          : buffer_out_ready_shifted;
+% else:
+    assign wr_data           = buffer_out;
+    assign wr_valid          = buffer_out_valid;
+    assign wr_strb           = '1;
+    assign dataflow_ready_in = buffer_out_ready_shifted;
+% endif
+
     //--------------------------------------
     // Write Barrel shifter
     //--------------------------------------
 
-    assign buffer_out_tmp           = {buffer_out, buffer_out} >> (w_dp_req_i.shift*8);
+    assign buffer_out_tmp           = {wr_data, wr_data} >> (w_dp_req_i.shift*8);
     assign buffer_out_shifted       = buffer_out_tmp[$bits(buffer_out_shifted)/8-1:0];
-    assign buffer_out_valid_shifted = strb_t'({buffer_out_valid, buffer_out_valid} >>   w_dp_req_i.shift);
+    assign buffer_out_valid_shifted = strb_t'({wr_valid, wr_valid} >>   w_dp_req_i.shift);
+    assign mask_ext_shifted         = strb_t'({wr_strb, wr_strb} >>   w_dp_req_i.shift);
     assign buffer_out_ready_shifted = strb_t'({buffer_out_ready, buffer_out_ready} >> - w_dp_req_i.shift);
 
 % if not one_write_port:
diff --git a/src/db/idma_axi.yml b/src/db/idma_axi.yml
index 02c7f884..497b1abd 100644
--- a/src/db/idma_axi.yml
+++ b/src/db/idma_axi.yml
@@ -66,7 +66,8 @@ legalizer_write_data_path: |
         tailer: OffsetWidth'(w_num_bytes + w_addr_offset),
         shift: opt_tf_q.write_shift,
         num_beats: w_req_o.aw_req.axi.aw_chan.len,
-        is_single: w_req_o.aw_req.axi.aw_chan.len == '0
+        is_single: w_req_o.aw_req.axi.aw_chan.len == '0,
+        compute: opt_tf_q.compute
     };
 read_template: |
     idma_axi_read #(
@@ -127,7 +128,9 @@ write_template: |
         .write_rsp_i        ( ${write_response} ),
         .buffer_out_i       ( buffer_out_shifted ),
         .buffer_out_valid_i ( buffer_out_valid_shifted ),
-        .buffer_out_ready_o ( ${buffer_out_ready} )
+        .buffer_out_ready_o ( ${buffer_out_ready} ),
+        .mask_ext_i         ( mask_ext_shifted ),
+        .w_beat_done_o      ( w_beat_done )
     );
 synth_wrapper_ports_write: |
     output id_t                    axi_aw_id_o,
diff --git a/src/db/idma_tilelink.yml b/src/db/idma_tilelink.yml
index 652a6caf..06771253 100644
--- a/src/db/idma_tilelink.yml
+++ b/src/db/idma_tilelink.yml
@@ -84,7 +84,8 @@ legalizer_write_data_path: |
         tailer: OffsetWidth'(w_num_bytes + w_addr_offset),
         shift: opt_tf_q.write_shift,
         num_beats: 'd0,
-        is_single: w_num_bytes <= StrbWidth
+        is_single: w_num_bytes <= StrbWidth,
+        compute: opt_tf_q.compute
     };
 read_template: |
     idma_tilelink_read #(
diff --git a/src/idma_pkg.sv b/src/idma_pkg.sv
index f3023499..3eb2f95c 100644
--- a/src/idma_pkg.sv
+++ b/src/idma_pkg.sv
@@ -81,6 +81,39 @@ package idma_pkg;
         logic       dst_reduce_len;
     } backend_options_t;
 
+    /// On-the-fly compute operation selector
+    typedef enum logic [3:0] {
+        COMPUTE_NONE      = 4'd0,
+        COMPUTE_TRANSPOSE = 4'd1
+    } compute_op_e;
+
+    /// Transpose tensor dimension width in elements (bound by the inst64 DMCPY argb encoding)
+    localparam int unsigned TransposeDimWidth = 32'd12;
+
+    /// Transpose option type: E = 1<<mode (0/1/2 -> 1/2/4 B); tensor in elements
+    typedef struct packed {
+        logic [1:0]                   mode;
+        logic [TransposeDimWidth-1:0] tensor_m;
+        logic [TransposeDimWidth-1:0] tensor_n;
+    } transpose_options_t;
+
+    /// Per-op compute parameter union (members must be equal width)
+    typedef union packed {
+        transpose_options_t transpose;
+    } compute_params_t;
+
+    /// Compute option type: per-transfer on-the-fly compute selection
+    typedef struct packed {
+        logic            enable;
+        compute_op_e     op;
+        compute_params_t params;
+    } compute_options_t;
+
+    /// Compile-time per-op compute feature enables
+    typedef struct packed {
+        logic transpose;
+    } compute_enable_t;
+
     /// Supported Protocols
     /// - `AXI`: Full AXI
     /// - `AXILITE`: AXI Lite
diff --git a/src/include/idma/typedef.svh b/src/include/idma/typedef.svh
index d250ec10..bceb89c9 100644
--- a/src/include/idma/typedef.svh
+++ b/src/include/idma/typedef.svh
@@ -25,8 +25,9 @@
         axi_id_t                    axi_id;                              \
         idma_pkg::axi_options_t     src;                                 \
         idma_pkg::axi_options_t     dst;                                 \
-        idma_pkg::backend_options_t beo;                                 \
-        logic                       last;                                \
+        idma_pkg::backend_options_t   beo;                               \
+        idma_pkg::compute_options_t   compute;                           \
+        logic                         last;                              \
     } options_t;
 `define IDMA_TYPEDEF_ERR_PAYLOAD_T(err_payload_t, axi_addr_t)            \
     typedef struct packed {                                              \
diff --git a/src/midend/idma_nd_midend.sv b/src/midend/idma_nd_midend.sv
index 07267c97..fd3adaee 100644
--- a/src/midend/idma_nd_midend.sv
+++ b/src/midend/idma_nd_midend.sv
@@ -65,6 +65,13 @@ module idma_nd_midend #(
     /// How many bits are required to index the counters
     localparam int unsigned StrideSelWidth = $clog2(NumDim-1) + 'd1;
 
+`ifndef SYNTHESIS
+    // strides are added with same-width arithmetic; narrower strides would not sign-extend
+    initial assert ($bits(nd_req_i.d_req[0].src_strides) == $bits(nd_req_i.burst_req.src_addr))
+        else $fatal(1, "idma_nd_midend: stride width (%0d) != address width (%0d)",
+                    $bits(nd_req_i.d_req[0].src_strides), $bits(nd_req_i.burst_req.src_addr));
+`endif
+
     // The counter currently active (this is added to the address)
     logic [StrideSelWidth-1:0] stride_sel_d, stride_sel_q;
 
diff --git a/src/midend/idma_transpose_midend.sv b/src/midend/idma_transpose_midend.sv
new file mode 100644
index 00000000..9d79d551
--- /dev/null
+++ b/src/midend/idma_transpose_midend.sv
@@ -0,0 +1,123 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Author: Daniel Keller <dankeller@iis.ee.ethz.ch>
+
+/// Transpose geometry expander. For a request carrying opt.compute = TRANSPOSE,
+/// derives the NumDim=4 tiled ND walk (row / row-tile / col-tile) from the tensor
+/// shape (M, N, element mode) and the bus StrbWidth, leaving the generic
+/// idma_nd_midend to walk it. Non-transpose requests pass through untouched.
+/// The walk reads the source up to the tile-padded bounds (ceil to NE in both
+/// dims) and writes the full padded dst extent (writes strb-masked, reads not).
+/// Combinational; outputs are quasi-static per request (multicycle-safe in STA).
+module idma_transpose_midend #(
+    /// Number of ND dimensions (must be >= 4 to express the tiled walk)
+    parameter int unsigned NumDim    = 32'd4,
+    /// Write data-path width in bytes (tile side NE = StrbWidth / element bytes)
+    parameter int unsigned StrbWidth = 32'd64,
+    /// Address type
+    parameter type addr_t        = logic,
+    /// ND request type
+    parameter type idma_nd_req_t = logic
+)(
+    input  idma_nd_req_t nd_req_i,
+    input  logic         valid_i,
+    output logic         ready_o,
+    output idma_nd_req_t nd_req_o,
+    output logic         valid_o,
+    input  logic         ready_i
+);
+
+    localparam int unsigned Log2Strb = $clog2(StrbWidth);
+    localparam int unsigned LenW     = $bits(nd_req_o.burst_req.length);
+    localparam int unsigned RepW     = $bits(nd_req_o.d_req[0].reps);
+    localparam int unsigned ModeW    = $bits(nd_req_o.burst_req.opt.compute.params.transpose.mode);
+    localparam int unsigned TensorW  =
+        $bits(nd_req_o.burst_req.opt.compute.params.transpose.tensor_m);
+    localparam int unsigned AddrW    = $bits(addr_t);
+    // working width: largest term (YT*N)<<Log2Strb, +1 for the signed rewind
+    localparam int unsigned ProdW    = 2*TensorW + Log2Strb + 1;
+    localparam int unsigned WorkW    = (ProdW > AddrW) ? ProdW : AddrW;
+
+    assign valid_o = valid_i;
+    assign ready_o = ready_i;
+
+    logic is_transpose;
+    assign is_transpose = nd_req_i.burst_req.opt.compute.enable &
+                          (nd_req_i.burst_req.opt.compute.op == idma_pkg::COMPUTE_TRANSPOSE);
+
+    // NE and E are powers of two: all geometry folds to shifts except the YT*N
+    // stride product.
+    always_comb begin : proc_expand
+        logic [ModeW-1:0]        mode;
+        logic [TensorW-1:0]      tm, tn;
+        logic signed [WorkW-1:0] m, n, log2ne, ne, yt, nt, nxe, mpe;
+        logic signed [WorkW-1:0] strb_c;   // NE*E == StrbWidth (mode cancels)
+
+        nd_req_o = nd_req_i;   // passthrough
+
+        if (is_transpose) begin
+            mode = nd_req_i.burst_req.opt.compute.params.transpose.mode;
+            tm   = nd_req_i.burst_req.opt.compute.params.transpose.tensor_m;
+            tn   = nd_req_i.burst_req.opt.compute.params.transpose.tensor_n;
+            // zero-extend bounded dims into the signed working width
+            m      = $signed({{(WorkW-TensorW){1'b0}}, tm});   // M
+            n      = $signed({{(WorkW-TensorW){1'b0}}, tn});   // N
+            log2ne = $signed(WorkW'(Log2Strb)) - $signed({{(WorkW-ModeW){1'b0}}, mode});
+            ne     = $signed(WorkW'(1)) <<< log2ne;            // tile side (elements)
+            yt     = (m + ne - 1) >>> log2ne;                  // ceil(M/NE)
+            nt     = (n + ne - 1) >>> log2ne;                  // ceil(N/NE)
+            nxe    = n  <<< mode;                              // N*E  (E = 1<<mode)
+            mpe    = yt <<< Log2Strb;                          // MP*E = YT*NE*E = YT*StrbWidth
+            strb_c = $signed(WorkW'(StrbWidth));               // NE*E (one tile-row = StrbWidth B)
+
+            nd_req_o.burst_req.length     = LenW'(StrbWidth);
+
+            // d_req[0] = local row within tile (reps NE)
+            nd_req_o.d_req[0].reps        = ne[RepW-1:0];
+            nd_req_o.d_req[0].src_strides = addr_t'(nxe);
+            nd_req_o.d_req[0].dst_strides = addr_t'(mpe);
+            // d_req[1] = row-tile (reps YT). (NE-1)*MPE = (MPE<<log2ne) - MPE.
+            nd_req_o.d_req[1].reps        = yt[RepW-1:0];
+            nd_req_o.d_req[1].src_strides = addr_t'(nxe);
+            nd_req_o.d_req[1].dst_strides = addr_t'(strb_c - (mpe <<< log2ne) + mpe);
+            // d_req[2] = col-tile (reps NT). (YT*NE-1)*NXE = ((YT*N)<<Log2Strb) - NXE;
+            //            the dst rewind MPE-(YT-1)*StrbWidth collapses to StrbWidth.
+            nd_req_o.d_req[2].reps        = nt[RepW-1:0];
+            nd_req_o.d_req[2].src_strides = addr_t'(strb_c - ((yt * n) <<< Log2Strb) + nxe);
+            nd_req_o.d_req[2].dst_strides = addr_t'(strb_c);
+            // the walk is exactly 4-D: neutralize any higher dims
+            for (int unsigned d = 3; d < NumDim-1; d++) begin
+                nd_req_o.d_req[d].reps        = RepW'(1);
+                nd_req_o.d_req[d].src_strides = '0;
+                nd_req_o.d_req[d].dst_strides = '0;
+            end
+        end
+    end
+
+`ifndef SYNTHESIS
+    initial assert (NumDim >= 4) else
+        $fatal(1, "idma_transpose_midend requires NumDim >= 4 (got %0d)", NumDim);
+    // mode 0..2 needs NE >= 1, i.e. log2(StrbWidth) >= 2
+    initial assert (Log2Strb >= 2) else
+        $fatal(1, "idma_transpose_midend requires StrbWidth >= 4 (got %0d)", StrbWidth);
+    // reps must hold tile counts (<= 2^TensorW) and ne (<= StrbWidth); length StrbWidth.
+    initial assert (RepW >= TensorW && RepW > Log2Strb) else
+        $fatal(1, "idma_transpose_midend: reps field %0d b too narrow (need >= %0d)",
+               RepW, (TensorW > Log2Strb+1) ? TensorW : Log2Strb+1);
+    initial assert (LenW > Log2Strb) else
+        $fatal(1, "idma_transpose_midend: length field %0d b cannot hold StrbWidth", LenW);
+    // reserved mode 3 (EB=8) and zero-size tensors are out of contract
+    always_comb begin : check_domain
+        if (is_transpose) begin
+            assert (nd_req_i.burst_req.opt.compute.params.transpose.mode != 2'd3) else
+                $error("idma_transpose_midend: reserved element mode 3 (EB=8)");
+            assert (nd_req_i.burst_req.opt.compute.params.transpose.tensor_m != '0 &&
+                    nd_req_i.burst_req.opt.compute.params.transpose.tensor_n != '0) else
+                $error("idma_transpose_midend: zero-size tensor (M or N == 0)");
+        end
+    end
+`endif
+
+endmodule
diff --git a/test/idma_test.sv b/test/idma_test.sv
index ece009f6..05a8ae8c 100644
--- a/test/idma_test.sv
+++ b/test/idma_test.sv
@@ -687,7 +687,11 @@ package idma_test;
             input logic [2:0] dst_max_llen,
             input logic       src_reduce_len,
             input logic       dst_reduce_len,
-            input id_t        id
+            input id_t        id,
+            input logic       transpose_en = 1'b0,
+            input logic [1:0] transp_mode  = '0,
+            input logic [11:0] tensor_m    = '0,
+            input logic [11:0] tensor_n    = '0
         );
             idma.req.length                 <= #TA length;
             idma.req.src_addr               <= #TA src_addr;
@@ -702,6 +706,12 @@ package idma_test;
             idma.req.opt.beo.dst_max_llen   <= #TA dst_max_llen;
             idma.req.opt.beo.src_reduce_len <= #TA src_reduce_len;
             idma.req.opt.beo.dst_reduce_len <= #TA dst_reduce_len;
+            idma.req.opt.compute.enable              <= #TA transpose_en;
+            idma.req.opt.compute.op                  <= #TA transpose_en ? idma_pkg::COMPUTE_TRANSPOSE
+                                                                         : idma_pkg::COMPUTE_NONE;
+            idma.req.opt.compute.params.transpose.mode     <= #TA transp_mode;
+            idma.req.opt.compute.params.transpose.tensor_m <= #TA tensor_m;
+            idma.req.opt.compute.params.transpose.tensor_n <= #TA tensor_n;
             idma.req_valid                  <= #TA 1;
             cycle_start();
             while (idma.req_ready != 1) begin cycle_end(); cycle_start(); end
diff --git a/test/idma_transpose_dpi.c b/test/idma_transpose_dpi.c
new file mode 100644
index 00000000..0f7faf3f
--- /dev/null
+++ b/test/idma_transpose_dpi.c
@@ -0,0 +1,39 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+//
+// DPI-C golden model for idma_otf_transpose: an element-granular matrix
+// transpose. Element size E in {1,2,4} bytes (int8/fp16/fp32); each E-byte
+// element is kept intact while the M x N element grid is transposed to N x M.
+// Accessor-based (no open-array marshalling): the testbench loads the row-major
+// input byte by byte, calls gm_transpose(m,n,e), then reads back the transposed
+// output bytes. Reference: out_elem[c][r] = in_elem[r][c].
+
+#include <stdint.h>
+
+#define GM_MAX_BYTES (1 << 24)  // 16 MiB
+
+static uint8_t gm_in[GM_MAX_BYTES];
+static uint8_t gm_out[GM_MAX_BYTES];
+
+// Load one input byte at flat byte index.
+void gm_load(int idx, int val) {
+  if (idx >= 0 && idx < GM_MAX_BYTES) gm_in[idx] = (uint8_t)val;
+}
+
+// Transpose an m x n matrix of e-byte elements (row-major) into n x m.
+void gm_transpose(int m, int n, int e) {
+  for (int r = 0; r < m; r++)
+    for (int c = 0; c < n; c++)
+      for (int b = 0; b < e; b++)
+        gm_out[((long)c * m + r) * e + b] = gm_in[((long)r * n + c) * e + b];
+}
+
+// Read one transposed output byte at flat byte index.
+int gm_get(int idx) {
+  if (idx >= 0 && idx < GM_MAX_BYTES) return (int)gm_out[idx];
+  return -1;
+}
diff --git a/test/midend/tb_idma_nd_midend_b2b.sv b/test/midend/tb_idma_nd_midend_b2b.sv
new file mode 100644
index 00000000..aaba907f
--- /dev/null
+++ b/test/midend/tb_idma_nd_midend_b2b.sv
@@ -0,0 +1,146 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+//
+// Back-to-back ND regression for idma_nd_midend. Drives the midend directly and
+// golden-checks the burst_req address sequence: two back-to-back transfers (no
+// gap) plus one after an idle gap must each walk from their own base. Catches a
+// stale base-address reuse across transfers.
+
+`include "idma/typedef.svh"
+
+module tb_idma_nd_midend_b2b;
+
+  localparam time TCK = 10ns;
+  localparam int unsigned AddrWidth = 32;
+  localparam int unsigned NumDim    = 4;     // 1D burst + 3 strided dims
+  localparam logic [NumDim-1:0][31:0] RepWidths = '{default: 32'd16};
+
+  typedef logic [AddrWidth-1:0] addr_t;
+  typedef logic [31:0]          tf_len_t;
+  typedef logic [11:0]          id_t;
+  typedef logic [31:0]          reps_t;
+
+  `IDMA_TYPEDEF_FULL_REQ_T(idma_req_t, id_t, addr_t, tf_len_t)
+  `IDMA_TYPEDEF_FULL_RSP_T(idma_rsp_t, addr_t)
+  `IDMA_TYPEDEF_FULL_ND_REQ_T(idma_nd_req_t, idma_req_t, reps_t, addr_t)
+
+  // ── Program (shared by all three transfers; only the base addresses differ) ──
+  localparam int unsigned R0 = 3, R1 = 2, R2 = 2;
+  localparam int unsigned NB = R0 * R1 * R2;           // bursts per transfer
+  localparam addr_t SS0 = 'h10,   DS0 = 'h100;
+  localparam addr_t SS1 = 'h40,   DS1 = 'h400;
+  localparam addr_t SS2 = 'h1000, DS2 = 'h4000;
+  localparam addr_t S1 = 'h0000_1000, D1 = 'h0001_0000;  // transfer 1 base
+  localparam addr_t S2 = 'h0000_2000, D2 = 'h0002_0000;  // transfer 2 base (back-to-back)
+  localparam addr_t S3 = 'h0000_3000, D3 = 'h0003_0000;  // transfer 3 base (after idle gap)
+
+  logic clk, rst_n;
+  idma_nd_req_t nd_req;  logic nd_req_valid, nd_req_ready;
+  idma_rsp_t    nd_rsp;  logic nd_rsp_valid, nd_rsp_ready;
+  idma_req_t    burst_req; logic burst_req_valid, burst_req_ready;
+  idma_rsp_t    burst_rsp; logic burst_rsp_valid, burst_rsp_ready;
+  logic busy;
+
+  clk_rst_gen #(.ClkPeriod(TCK), .RstClkCycles(1)) i_clk_rst_gen (.clk_o(clk), .rst_no(rst_n));
+
+  idma_nd_midend #(
+    .NumDim(NumDim), .addr_t(addr_t), .idma_req_t(idma_req_t),
+    .idma_rsp_t(idma_rsp_t), .idma_nd_req_t(idma_nd_req_t), .RepWidths(RepWidths)
+  ) i_dut (
+    .clk_i(clk), .rst_ni(rst_n),
+    .nd_req_i(nd_req), .nd_req_valid_i(nd_req_valid), .nd_req_ready_o(nd_req_ready),
+    .nd_rsp_o(nd_rsp), .nd_rsp_valid_o(nd_rsp_valid), .nd_rsp_ready_i(nd_rsp_ready),
+    .burst_req_o(burst_req), .burst_req_valid_o(burst_req_valid), .burst_req_ready_i(burst_req_ready),
+    .burst_rsp_i(burst_rsp), .burst_rsp_valid_i(burst_rsp_valid), .burst_rsp_ready_o(burst_rsp_ready),
+    .busy_o(busy)
+  );
+
+  // Backpressure on burst_req_ready is essential: during a stall stride_sel_q
+  // collapses toward 0, which is what can defeat the base reload. ready always-1 hides it.
+  logic [2:0] bp_lfsr;
+  always @(posedge clk or negedge rst_n)
+    if (!rst_n) bp_lfsr <= 3'b101;
+    else        bp_lfsr <= {bp_lfsr[1:0], bp_lfsr[2] ^ bp_lfsr[1]};
+  assign burst_req_ready = bp_lfsr[0];   // stalls ~half the cycles, incl. boundaries
+  assign burst_rsp       = '0;
+  assign burst_rsp_valid = 1'b0;
+  assign nd_rsp_ready    = 1'b1;
+
+  // ── Capture every issued burst's src/dst address in order ──
+  addr_t cap_src [$];
+  addr_t cap_dst [$];
+  always @(posedge clk) if (rst_n && burst_req_valid && burst_req_ready) begin
+    cap_src.push_back(burst_req.src_addr);
+    cap_dst.push_back(burst_req.dst_addr);
+  end
+
+  // build a NumDim=4 ND program with a given base
+  function automatic idma_nd_req_t mk_req(input addr_t s, input addr_t d);
+    idma_nd_req_t r = '0;
+    r.burst_req.length   = tf_len_t'('h8);
+    r.burst_req.src_addr = s;
+    r.burst_req.dst_addr = d;
+    r.burst_req.opt.src_protocol = idma_pkg::AXI;
+    r.burst_req.opt.dst_protocol = idma_pkg::AXI;
+    r.burst_req.opt.src.burst    = axi_pkg::BURST_INCR;
+    r.burst_req.opt.dst.burst    = axi_pkg::BURST_INCR;
+    r.d_req[0].reps = reps_t'(R0); r.d_req[0].src_strides = SS0; r.d_req[0].dst_strides = DS0;
+    r.d_req[1].reps = reps_t'(R1); r.d_req[1].src_strides = SS1; r.d_req[1].dst_strides = DS1;
+    r.d_req[2].reps = reps_t'(R2); r.d_req[2].src_strides = SS2; r.d_req[2].dst_strides = DS2;
+    return r;
+  endfunction
+
+  initial begin
+    automatic int unsigned errs = 0;
+    nd_req = '0; nd_req_valid = 1'b0;
+    @(posedge rst_n);
+    repeat (3) @(posedge clk);
+
+    // ── transfer 1 ──
+    nd_req = mk_req(S1, D1); nd_req_valid = 1'b1;
+    @(posedge clk);
+    while (!nd_req_ready) @(posedge clk);
+    // ── transfer 2 : BACK-TO-BACK (keep valid high, swap payload the cycle after accept) ──
+    nd_req = mk_req(S2, D2);
+    @(posedge clk);
+    while (!nd_req_ready) @(posedge clk);
+    nd_req_valid = 1'b0;
+    nd_req = '0;
+    // ── idle gap ──
+    repeat (5) @(posedge clk);
+    // ── transfer 3 : after the gap ──
+    nd_req = mk_req(S3, D3); nd_req_valid = 1'b1;
+    @(posedge clk);
+    while (!nd_req_ready) @(posedge clk);
+    nd_req_valid = 1'b0;
+    repeat (3) @(posedge clk);
+
+    // ── checks ──
+    if (cap_src.size() != 3*NB)
+      begin errs++; $display("[B2B] burst count %0d != %0d", cap_src.size(), 3*NB); end
+    else begin
+      // first burst of each transfer must equal its OWN base (reload happened)
+      if (cap_src[0]     !== S1 || cap_dst[0]     !== D1) begin errs++; $display("[B2B] T1[0]=(%0h,%0h) exp (%0h,%0h)", cap_src[0], cap_dst[0], S1, D1); end
+      if (cap_src[NB]    !== S2 || cap_dst[NB]    !== D2) begin errs++; $display("[B2B] T2[0]=(%0h,%0h) exp (%0h,%0h) -- back-to-back base NOT reloaded", cap_src[NB], cap_dst[NB], S2, D2); end
+      if (cap_src[2*NB]  !== S3 || cap_dst[2*NB]  !== D3) begin errs++; $display("[B2B] T3[0]=(%0h,%0h) exp (%0h,%0h)", cap_src[2*NB], cap_dst[2*NB], S3, D3); end
+      // full-sequence independence: T2 and T3 must be T1 shifted by their base delta
+      for (int unsigned i = 0; i < NB; i++) begin
+        if ((cap_src[NB+i]   - cap_src[i]) !== (S2 - S1) || (cap_dst[NB+i]   - cap_dst[i]) !== (D2 - D1)) begin
+          errs++; if (errs <= 8) $display("[B2B] T2[%0d] not T1+delta: src %0h vs %0h (Δexp %0h)", i, cap_src[NB+i], cap_src[i], S2-S1); end
+        if ((cap_src[2*NB+i] - cap_src[i]) !== (S3 - S1) || (cap_dst[2*NB+i] - cap_dst[i]) !== (D3 - D1)) begin
+          errs++; if (errs <= 8) $display("[B2B] T3[%0d] not T1+delta: src %0h vs %0h (Δexp %0h)", i, cap_src[2*NB+i], cap_src[i], S3-S1); end
+      end
+    end
+
+    if (errs == 0) $display("[B2B] PASS: %0d back-to-back + gapped ND transfers each walked from their own base", 3*NB);
+    else           $fatal(1, "[B2B] FAIL: %0d errors (back-to-back ND base-address reuse)", errs);
+    $finish();
+  end
+
+  initial begin #500_000; $fatal(1, "[B2B] timeout"); end
+
+endmodule
diff --git a/test/midend/tb_idma_transpose_midend.sv b/test/midend/tb_idma_transpose_midend.sv
new file mode 100644
index 00000000..4f7440e2
--- /dev/null
+++ b/test/midend/tb_idma_transpose_midend.sv
@@ -0,0 +1,102 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Author: Daniel Keller <dankeller@iis.ee.ethz.ch>
+
+`include "idma/typedef.svh"
+
+/// Unit check for idma_transpose_midend: the expanded NumDim=4 ND request must
+/// match the golden geometry that tb_idma_transpose_nd hand-builds, and a
+/// non-transpose request must pass through unchanged.
+module tb_idma_transpose_midend #(
+  parameter int unsigned DataWidth = 512,
+  parameter int unsigned AddrWidth = 64,
+  parameter int unsigned M  = 40,
+  parameter int unsigned N  = 24,
+  parameter int unsigned EB = 1
+);
+  import idma_pkg::*;
+
+  localparam int unsigned StrbWidth = DataWidth/8;
+  localparam int unsigned NE   = StrbWidth/EB;
+  localparam int unsigned MODE = (EB==4) ? 2 : (EB==2) ? 1 : 0;
+  localparam int unsigned YT   = (M + NE - 1)/NE;
+  localparam int unsigned NT   = (N + NE - 1)/NE;
+  localparam int unsigned MP   = YT*NE;
+  localparam int unsigned NumDim = 4;
+
+  typedef logic [AddrWidth-1:0] addr_t;
+  typedef logic [31:0]          tf_len_t;
+  typedef logic [2:0]           id_t;
+  typedef logic [31:0]          reps_t;
+
+  `IDMA_TYPEDEF_FULL_REQ_T(idma_req_t, id_t, addr_t, tf_len_t)
+  `IDMA_TYPEDEF_FULL_ND_REQ_T(idma_nd_req_t, idma_req_t, reps_t, addr_t)
+
+  idma_nd_req_t nd_in, nd_out;
+  logic vi, ro, vo, ri;
+  assign vi = 1'b1;
+  assign ri = 1'b1;
+
+  idma_transpose_midend #(
+    .NumDim(NumDim), .StrbWidth(StrbWidth), .addr_t(addr_t), .idma_nd_req_t(idma_nd_req_t)
+  ) i_dut (
+    .nd_req_i(nd_in), .valid_i(vi), .ready_o(ro),
+    .nd_req_o(nd_out), .valid_o(vo), .ready_i(ri)
+  );
+
+  int errs = 0;
+  task automatic chk(input string name, input logic [63:0] got, input logic [63:0] exp);
+    if (got !== exp) begin
+      errs++;
+      $display("[MID] %s: got %0d exp %0d", name, $signed(got), $signed(exp));
+    end
+  endtask
+
+  initial begin
+    // --- passthrough case ---
+    nd_in = '0;
+    nd_in.burst_req.src_addr = 64'hDEAD;
+    nd_in.burst_req.dst_addr = 64'hBEEF;
+    nd_in.d_req[0].reps = 7;
+    #1;
+    if (nd_out !== nd_in) begin
+      errs++;
+      $display("[MID] passthrough altered a non-transpose request");
+    end
+
+    // --- transpose case ---
+    nd_in = '0;
+    nd_in.burst_req.src_addr = 64'h1000;
+    nd_in.burst_req.dst_addr = 64'h2000;
+    nd_in.burst_req.opt.compute.enable                  = 1'b1;
+    nd_in.burst_req.opt.compute.op                      = COMPUTE_TRANSPOSE;
+    nd_in.burst_req.opt.compute.params.transpose.mode     = 2'(MODE);
+    nd_in.burst_req.opt.compute.params.transpose.tensor_m = 12'(M);
+    nd_in.burst_req.opt.compute.params.transpose.tensor_n = 12'(N);
+    #1;
+
+    // golden geometry (same formulas as tb_idma_transpose_nd)
+    chk("length",  nd_out.burst_req.length,     NE*EB);
+    chk("d0.reps", nd_out.d_req[0].reps,        NE);
+    chk("d0.src",  nd_out.d_req[0].src_strides, addr_t'(N*EB));
+    chk("d0.dst",  nd_out.d_req[0].dst_strides, addr_t'(MP*EB));
+    chk("d1.reps", nd_out.d_req[1].reps,        YT);
+    chk("d1.src",  nd_out.d_req[1].src_strides, addr_t'(N*EB));
+    chk("d1.dst",  nd_out.d_req[1].dst_strides, addr_t'(int'(NE*EB) - int'((NE-1)*MP*EB)));
+    chk("d2.reps", nd_out.d_req[2].reps,        NT);
+    chk("d2.src",  nd_out.d_req[2].src_strides, addr_t'(int'(NE*EB) - int'((YT*NE-1)*N*EB)));
+    chk("d2.dst",  nd_out.d_req[2].dst_strides, addr_t'(int'(MP*EB) - int'((YT-1)*NE*EB)));
+    // addresses + compute must survive untouched
+    chk("src_addr", nd_out.burst_req.src_addr, 64'h1000);
+    chk("dst_addr", nd_out.burst_req.dst_addr, 64'h2000);
+    chk("cmp_en",   nd_out.burst_req.opt.compute.enable, 1);
+
+    if (errs == 0)
+      $display("[MID] PASS: %0dx%0d EB=%0d golden (NE=%0d YT=%0d NT=%0d MP=%0d)",
+               M, N, EB, NE, YT, NT, MP);
+    else           $fatal(1, "[MID] FAIL: %0d mismatches", errs);
+    $finish;
+  end
+endmodule
diff --git a/test/tb_idma_otf_transpose.sv b/test/tb_idma_otf_transpose.sv
new file mode 100644
index 00000000..23e29a3d
--- /dev/null
+++ b/test/tb_idma_otf_transpose.sv
@@ -0,0 +1,167 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+//
+// Standalone self-checking testbench for idma_otf_transpose (element transpose,
+// no iDMA backend / no protocol deps). Verifies a full M x N transpose of
+// EB-byte elements (EB in {1,2,4} = int8/fp16/fp32), split into NE-square tiles
+// (NE = StrbWidth/EB): input padded to full tiles, fed in (col-tile, row-tile,
+// row) order; output collected through the engine's per-byte strb_o edge mask.
+// Expected result comes from the DPI-C golden (idma_transpose_dpi.c). Checks
+// correctness vs golden, full coverage, and no strobe asserted on padding.
+// Optional two-sided backpressure via +BP. Override with -gM= -gN= -gEB=.
+
+`timescale 1ns/1ps
+
+module tb_idma_otf_transpose #(
+  parameter int unsigned StrbWidth  = 32'd8,
+  parameter bit          FullDuplex = 1'b1,
+  parameter int unsigned M         = 32'd8,   // matrix rows (elements)
+  parameter int unsigned N         = 32'd8,   // matrix cols (elements)
+  parameter int unsigned EB        = 32'd1    // element size in bytes (1/2/4)
+);
+
+  import "DPI-C" function void gm_load(input int idx, input int val);
+  import "DPI-C" function void gm_transpose(input int m, input int n, input int e);
+  import "DPI-C" function int  gm_get(input int idx);
+
+  localparam int unsigned MODE = (EB == 4) ? 2 : (EB == 2) ? 1 : 0;  // log2(EB)
+  localparam int unsigned NE   = StrbWidth / EB;                     // elements/beat = tile side
+  localparam int unsigned YT   = (M + NE - 1) / NE;                  // row-tiles
+  localparam int unsigned NT   = (N + NE - 1) / NE;                  // col-tiles
+  localparam int unsigned LR   = M % NE;
+  localparam int unsigned LC   = N % NE;
+  localparam logic [7:0]  PAD  = 8'hFF;
+
+  logic clk = 1'b0, rst_n = 1'b0, clear = 1'b0;
+  always #5 clk = ~clk;
+
+  logic [StrbWidth-1:0][7:0] din_data;
+  logic                      din_valid, din_ready;
+  logic [StrbWidth-1:0][7:0] dout_data;
+  logic [StrbWidth-1:0]      dout_strb;
+  logic                      dout_valid, dout_ready;
+
+  idma_otf_transpose #(
+    .StrbWidth  (StrbWidth),
+    .FullDuplex (FullDuplex)
+  ) i_dut (
+    .clk_i           (clk),
+    .rst_ni          (rst_n),
+    .clear_i         (clear),
+    .transp_mode_i   (2'(MODE)),
+    .tensor_size_m_i (12'(M)),
+    .tensor_size_n_i (12'(N)),
+    .data_i          (din_data),
+    .valid_i         (din_valid),
+    .ready_o         (din_ready),
+    .data_o          (dout_data),
+    .strb_o          (dout_strb),
+    .valid_o         (dout_valid),
+    .ready_i         (dout_ready)
+  );
+
+  logic [7:0] inb [M*N*EB];     // row-major input bytes
+  bit         wrote [N*M];      // per transposed-element coverage
+  int unsigned errors = 0;
+  bit backpressure = 1'b0;
+
+  task automatic drive_inputs();
+    int unsigned beat = 0;
+    din_valid = 1'b0; din_data = '0;
+    @(posedge clk);
+    for (int nt = 0; nt < NT; nt++)
+      for (int rt = 0; rt < YT; rt++)
+        for (int row = 0; row < NE; row++) begin
+          if (backpressure) begin din_valid = 1'b0; repeat (beat % 3) @(posedge clk); end
+          for (int c = 0; c < NE; c++) begin
+            int gr = rt*NE + row;
+            int gc = nt*NE + c;
+            for (int b = 0; b < EB; b++)
+              din_data[c*EB + b] = (gr < M && gc < N) ? inb[(gr*N + gc)*EB + b] : PAD;
+          end
+          din_valid = 1'b1;
+          do @(posedge clk); while (!din_ready);
+          beat++;
+        end
+    din_valid = 1'b0;
+  endtask
+
+  task automatic capture_outputs();
+    int unsigned beat = 0;
+    dout_ready = 1'b0;
+    for (int nt = 0; nt < NT; nt++)
+      for (int rt = 0; rt < YT; rt++)
+        for (int k = 0; k < NE; k++) begin
+          if (backpressure) begin dout_ready = 1'b0; repeat (beat % 4) @(posedge clk); end
+          dout_ready = 1'b1;
+          do @(posedge clk); while (!dout_valid);
+          for (int e = 0; e < NE; e++) begin
+            if (dout_strb[e*EB]) begin     // element e valid (element-granular mask)
+              int tr = nt*NE + k;          // transposed row (= original col, 0..N-1)
+              int tc = rt*NE + e;          // transposed col (= original row, 0..M-1)
+              if (tr >= N || tc >= M) begin
+                errors++;
+                if (errors <= 16) $display("STRB-ON-PAD beat(nt%0d rt%0d k%0d) elem %0d -> (%0d,%0d) OOB", nt, rt, k, e, tr, tc);
+              end else begin
+                for (int b = 0; b < EB; b++) begin
+                  automatic int gold = gm_get((tr*M + tc)*EB + b);
+                  if (int'(dout_data[e*EB + b]) !== gold) begin
+                    errors++;
+                    if (errors <= 16) $display("MISMATCH T(%0d,%0d).b%0d=%0d golden=%0d", tr, tc, b, dout_data[e*EB+b], gold);
+                  end
+                end
+                wrote[tr*M + tc] = 1'b1;
+              end
+            end
+          end
+          beat++;
+        end
+    @(posedge clk);
+    dout_ready = 1'b0;
+  endtask
+
+  initial begin
+    for (int i = 0; i < M*N*EB; i++) inb[i] = 8'((i * 7 + 3) & 8'hFF);  // varied stimulus
+    for (int i = 0; i < M*N*EB; i++) gm_load(i, int'(inb[i]));
+    gm_transpose(M, N, EB);
+    for (int i = 0; i < N*M; i++) wrote[i] = 1'b0;
+
+    din_valid = 1'b0; dout_ready = 1'b0;
+    if ($test$plusargs("BP")) backpressure = 1'b1;
+    $display("[TB] idma_otf_transpose M=%0d N=%0d EB=%0d (tile=%0d elems, %0dx%0d tiles, LR=%0d LC=%0d) BP=%0d",
+             M, N, EB, NE, YT, NT, LR, LC, backpressure);
+
+    rst_n = 1'b0; clear = 1'b1;
+    repeat (4) @(posedge clk);
+    rst_n = 1'b1;
+    repeat (2) @(posedge clk);
+    clear = 1'b0;
+    @(posedge clk);
+
+    fork drive_inputs(); capture_outputs(); join
+
+    for (int tr = 0; tr < N; tr++)
+      for (int tc = 0; tc < M; tc++)
+        if (!wrote[tr*M + tc]) begin
+          errors++;
+          if (errors <= 16) $display("MISSING transposed elem (%0d,%0d)", tr, tc);
+        end
+
+    if (errors == 0) $display("[TB] PASS: %0dx%0d EB=%0d transpose matches DPI golden", M, N, EB);
+    else             $display("[TB] FAIL: %0d errors", errors);
+
+    repeat (5) @(posedge clk);
+    $finish;
+  end
+
+  initial begin
+    #5000000;
+    $display("[TB] FAIL: timeout");
+    $finish;
+  end
+
+endmodule
diff --git a/test/tb_idma_transpose_b2b.sv b/test/tb_idma_transpose_b2b.sv
new file mode 100644
index 00000000..4337dce1
--- /dev/null
+++ b/test/tb_idma_transpose_b2b.sv
@@ -0,0 +1,214 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+//
+// End-to-end back-to-back transpose regression: two transposes of one source to
+// DIFFERENT dst bases through the ND midend -> rw_axi backend -> axi_sim_mem. A
+// stale base across transfers would leave the second dst untouched. Both checked.
+
+`include "axi/typedef.svh"
+`include "idma/typedef.svh"
+
+module tb_idma_transpose_b2b
+  import idma_pkg::*;
+#(
+  parameter int unsigned DataWidth = 32,
+  parameter int unsigned AddrWidth = 32,
+  parameter int unsigned UserWidth = 1,
+  parameter int unsigned AxiIdWidth = 12,
+  parameter int unsigned TFLenWidth = 32,
+  parameter int unsigned M  = 6,
+  parameter int unsigned N  = 8,
+  parameter int unsigned EB = 1
+);
+
+  localparam time TA = 1ns, TT = 9ns, TCK = 10ns;
+  localparam int unsigned StrbWidth = DataWidth / 8;
+  localparam int unsigned NE   = StrbWidth / EB;
+  localparam int unsigned MODE = (EB == 4) ? 2 : (EB == 2) ? 1 : 0;
+  localparam int unsigned YT   = (M + NE - 1) / NE;
+  localparam int unsigned NT   = (N + NE - 1) / NE;
+  localparam int unsigned MP   = YT * NE;
+  localparam int unsigned NumDim = 4;
+  localparam logic [NumDim-1:0][31:0] RepWidths = '{default: 32'd16};
+
+  typedef logic [AddrWidth-1:0]  addr_t;
+  typedef logic [DataWidth-1:0]  data_t;
+  typedef logic [StrbWidth-1:0]  strb_t;
+  typedef logic [AxiIdWidth-1:0] id_t;
+  typedef logic [UserWidth-1:0]  user_t;
+  typedef logic [TFLenWidth-1:0] tf_len_t;
+  typedef logic [31:0]           reps_t;
+
+  `AXI_TYPEDEF_AW_CHAN_T(axi_aw_chan_t, addr_t, id_t, user_t)
+  `AXI_TYPEDEF_W_CHAN_T(axi_w_chan_t, data_t, strb_t, user_t)
+  `AXI_TYPEDEF_B_CHAN_T(axi_b_chan_t, id_t, user_t)
+  `AXI_TYPEDEF_AR_CHAN_T(axi_ar_chan_t, addr_t, id_t, user_t)
+  `AXI_TYPEDEF_R_CHAN_T(axi_r_chan_t, data_t, id_t, user_t)
+  `AXI_TYPEDEF_REQ_T(axi_req_t, axi_aw_chan_t, axi_w_chan_t, axi_ar_chan_t)
+  `AXI_TYPEDEF_RESP_T(axi_rsp_t, axi_b_chan_t, axi_r_chan_t)
+
+  `IDMA_TYPEDEF_FULL_REQ_T(idma_req_t, id_t, addr_t, tf_len_t)
+  `IDMA_TYPEDEF_FULL_RSP_T(idma_rsp_t, addr_t)
+  `IDMA_TYPEDEF_FULL_ND_REQ_T(idma_nd_req_t, idma_req_t, reps_t, addr_t)
+
+  typedef struct packed { axi_ar_chan_t ar_chan; } axi_read_meta_channel_t;
+  typedef struct packed { axi_read_meta_channel_t axi; } read_meta_channel_t;
+  typedef struct packed { axi_aw_chan_t aw_chan; } axi_write_meta_channel_t;
+  typedef struct packed { axi_write_meta_channel_t axi; } write_meta_channel_t;
+
+  logic clk, rst_n;
+  idma_req_t   idma_req;   logic req_valid, req_ready;
+  idma_rsp_t   idma_rsp;   logic rsp_valid, rsp_ready;
+  idma_eh_req_t idma_eh_req; logic eh_req_valid, eh_req_ready;
+  idma_nd_req_t nd_req;    logic nd_req_valid, nd_req_ready;
+  idma_rsp_t   nd_rsp;     logic nd_rsp_valid, nd_rsp_ready;
+  axi_req_t axi_read_req, axi_write_req, axi_req, axi_req_mem;
+  axi_rsp_t axi_read_rsp, axi_write_rsp, axi_rsp, axi_rsp_mem;
+  idma_busy_t busy; logic nd_busy;
+
+  assign idma_eh_req = '0;
+  assign eh_req_valid = 1'b0;
+
+  clk_rst_gen #(.ClkPeriod(TCK), .RstClkCycles(1)) i_clk_rst_gen (.clk_o(clk), .rst_no(rst_n));
+
+  axi_rw_join #(.axi_req_t(axi_req_t), .axi_resp_t(axi_rsp_t)) i_axi_rw_join (
+    .clk_i(clk), .rst_ni(rst_n),
+    .slv_read_req_i(axi_read_req),  .slv_read_resp_o(axi_read_rsp),
+    .slv_write_req_i(axi_write_req), .slv_write_resp_o(axi_write_rsp),
+    .mst_req_o(axi_req), .mst_resp_i(axi_rsp)
+  );
+  assign axi_req_mem = axi_req;
+  assign axi_rsp     = axi_rsp_mem;
+
+  axi_sim_mem #(
+    .AddrWidth(AddrWidth), .DataWidth(DataWidth), .IdWidth(AxiIdWidth), .UserWidth(UserWidth),
+    .axi_req_t(axi_req_t), .axi_rsp_t(axi_rsp_t),
+    .WarnUninitialized(1'b0), .ClearErrOnAccess(1'b1), .ApplDelay(TA), .AcqDelay(TT)
+  ) i_axi_sim_mem (
+    .clk_i(clk), .rst_ni(rst_n), .axi_req_i(axi_req_mem), .axi_rsp_o(axi_rsp_mem),
+    .mon_r_last_o(), .mon_r_beat_count_o(), .mon_r_user_o(), .mon_r_id_o(),
+    .mon_r_data_o(), .mon_r_addr_o(), .mon_r_valid_o(),
+    .mon_w_last_o(), .mon_w_beat_count_o(), .mon_w_user_o(), .mon_w_id_o(),
+    .mon_w_data_o(), .mon_w_addr_o(), .mon_w_valid_o()
+  );
+
+  idma_nd_midend #(
+    .NumDim(NumDim), .addr_t(addr_t), .idma_req_t(idma_req_t),
+    .idma_rsp_t(idma_rsp_t), .idma_nd_req_t(idma_nd_req_t), .RepWidths(RepWidths)
+  ) i_nd_midend (
+    .clk_i(clk), .rst_ni(rst_n),
+    .nd_req_i(nd_req), .nd_req_valid_i(nd_req_valid), .nd_req_ready_o(nd_req_ready),
+    .nd_rsp_o(nd_rsp), .nd_rsp_valid_o(nd_rsp_valid), .nd_rsp_ready_i(nd_rsp_ready),
+    .burst_req_o(idma_req), .burst_req_valid_o(req_valid), .burst_req_ready_i(req_ready),
+    .burst_rsp_i(idma_rsp), .burst_rsp_valid_i(rsp_valid), .burst_rsp_ready_o(rsp_ready),
+    .busy_o(nd_busy)
+  );
+
+  idma_backend_rw_axi #(
+    .CombinedShifter(1'b0), .DataWidth(DataWidth), .AddrWidth(AddrWidth), .AxiIdWidth(AxiIdWidth),
+    .UserWidth(UserWidth), .TFLenWidth(TFLenWidth), .MaskInvalidData(1'b1), .BufferDepth(3),
+    .RAWCouplingAvail(1'b1), .HardwareLegalizer(1'b1), .RejectZeroTransfers(1'b1),
+    .ErrorCap(idma_pkg::NO_ERROR_HANDLING), .PrintFifoInfo(1'b0), .NumAxInFlight(StrbWidth), .MemSysDepth(0),
+    .idma_req_t(idma_req_t), .idma_rsp_t(idma_rsp_t), .idma_eh_req_t(idma_eh_req_t),
+    .idma_busy_t(idma_busy_t), .axi_req_t(axi_req_t), .axi_rsp_t(axi_rsp_t),
+    .write_meta_channel_t(write_meta_channel_t), .read_meta_channel_t(read_meta_channel_t)
+  ) i_idma_backend (
+    .clk_i(clk), .rst_ni(rst_n), .testmode_i(1'b0),
+    .idma_req_i(idma_req), .req_valid_i(req_valid), .req_ready_o(req_ready),
+    .idma_rsp_o(idma_rsp), .rsp_valid_o(rsp_valid), .rsp_ready_i(rsp_ready),
+    .idma_eh_req_i(idma_eh_req), .eh_req_valid_i(eh_req_valid), .eh_req_ready_o(eh_req_ready),
+    .axi_read_req_o(axi_read_req), .axi_read_rsp_i(axi_read_rsp),
+    .axi_write_req_o(axi_write_req), .axi_write_rsp_i(axi_write_rsp), .busy_o(busy)
+  );
+
+  stream_watchdog #(.NumCycles(4000)) i_r_wd (.clk_i(clk), .rst_ni(rst_n), .valid_i(axi_rsp.r_valid), .ready_i(axi_req.r_ready));
+  stream_watchdog #(.NumCycles(4000)) i_w_wd (.clk_i(clk), .rst_ni(rst_n), .valid_i(axi_req.w_valid), .ready_i(axi_rsp.w_ready));
+
+  addr_t sb = 'h0000_1000;
+
+  task automatic wr_mem(input addr_t a, input logic [7:0] d); i_axi_sim_mem.mem[a] = d; endtask
+  function automatic logic [7:0] rd_mem(input addr_t a);
+    return i_axi_sim_mem.mem.exists(a) ? i_axi_sim_mem.mem[a] : 8'hxx;
+  endfunction
+
+  // one transpose of the source at sb -> dst base `db`; returns error count
+  task automatic do_transpose(input addr_t db, output int unsigned errs);
+    errs = 0;
+    // pre-fill full padded dst extent with sentinel
+    for (int unsigned i = 0; i < NT*NE; i++)
+      for (int unsigned j = 0; j < MP; j++)
+        for (int unsigned b = 0; b < EB; b++)
+          wr_mem(db + (i*MP + j)*EB + b, 8'hCC);
+    nd_req = '0;
+    nd_req.burst_req.length   = tf_len_t'(NE*EB);
+    nd_req.burst_req.src_addr = sb;
+    nd_req.burst_req.dst_addr = db;
+    nd_req.burst_req.opt.src_protocol = idma_pkg::AXI;
+    nd_req.burst_req.opt.dst_protocol = idma_pkg::AXI;
+    nd_req.burst_req.opt.src.burst    = axi_pkg::BURST_INCR;
+    nd_req.burst_req.opt.dst.burst    = axi_pkg::BURST_INCR;
+    nd_req.burst_req.opt.beo.decouple_rw = 1'b1;
+    nd_req.burst_req.opt.beo.decouple_aw = 1'b1;
+    nd_req.burst_req.opt.compute.enable                  = 1'b1;
+    nd_req.burst_req.opt.compute.op                      = idma_pkg::COMPUTE_TRANSPOSE;
+    nd_req.burst_req.opt.compute.params.transpose.mode     = 2'(MODE);
+    nd_req.burst_req.opt.compute.params.transpose.tensor_m = 12'(M);
+    nd_req.burst_req.opt.compute.params.transpose.tensor_n = 12'(N);
+    nd_req.burst_req.opt.last         = 1'b1;
+    nd_req.d_req[0].reps = reps_t'(NE); nd_req.d_req[0].src_strides = addr_t'(int'(N*EB));                       nd_req.d_req[0].dst_strides = addr_t'(int'(MP*EB));
+    nd_req.d_req[1].reps = reps_t'(YT); nd_req.d_req[1].src_strides = addr_t'(int'(N*EB));                       nd_req.d_req[1].dst_strides = addr_t'(int'(NE*EB) - int'((NE-1)*MP*EB));
+    nd_req.d_req[2].reps = reps_t'(NT); nd_req.d_req[2].src_strides = addr_t'(int'(NE*EB) - int'((YT*NE-1)*N*EB)); nd_req.d_req[2].dst_strides = addr_t'(int'(MP*EB) - int'((YT-1)*NE*EB));
+    nd_req_valid = 1'b1;
+    do @(posedge clk); while (!nd_req_ready);   // drop valid the cycle accept is seen (compliant)
+    nd_req_valid = 1'b0;
+    nd_req = '0;
+    while (!(nd_rsp_valid && nd_rsp_ready)) @(posedge clk);
+    repeat (20) @(posedge clk);
+    // data + padding checks
+    for (int unsigned c = 0; c < N; c++)
+      for (int unsigned r = 0; r < M; r++)
+        for (int unsigned b = 0; b < EB; b++)
+          if (rd_mem(db + (c*MP + r)*EB + b) !== rd_mem(sb + (r*N + c)*EB + b)) begin
+            errs++; if (errs <= 8) $display("[B2BT] @db=%0h MISMATCH out_T[%0d][%0d].b%0d=%02h exp %02h", db, c, r, b, rd_mem(db+(c*MP+r)*EB+b), rd_mem(sb+(r*N+c)*EB+b));
+          end
+    for (int unsigned i = 0; i < NT*NE; i++)
+      for (int unsigned j = 0; j < MP; j++)
+        if (i >= N || j >= M)
+          for (int unsigned b = 0; b < EB; b++)
+            if (rd_mem(db + (i*MP + j)*EB + b) !== 8'hCC) begin
+              errs++; if (errs <= 8) $display("[B2BT] @db=%0h PADDING CLOBBERED row=%0d col=%0d", db, i, j);
+            end
+  endtask
+
+  initial begin
+    automatic int unsigned e1, e2;
+    automatic addr_t db1 = 'h0000_4000;
+    automatic addr_t db2 = 'h0000_8000;   // DIFFERENT base — a stale-addr bug misplaces xfer 2
+    nd_req_valid = 1'b0; nd_rsp_ready = 1'b1; nd_req = '0;
+    @(posedge rst_n);
+    repeat (5) @(posedge clk);
+    for (int unsigned r = 0; r < M; r++)
+      for (int unsigned c = 0; c < N; c++)
+        for (int unsigned b = 0; b < EB; b++)
+          wr_mem(sb + (r*N + c)*EB + b, 8'((( (r*N+c)*EB + b )*7 + 3) & 8'hFF));
+
+    $display("[B2BT] transfer 1 -> db=%0h", db1);
+    do_transpose(db1, e1);
+    $display("[B2BT] transfer 2 (back-to-back) -> db=%0h", db2);
+    do_transpose(db2, e2);
+
+    if (e1 == 0 && e2 == 0)
+      $display("[B2BT] PASS: two back-to-back %0dx%0d EB=%0d transposes both correct (xfer2 landed at its own dst)", M, N, EB);
+    else
+      $fatal(1, "[B2BT] FAIL: xfer1 errs=%0d xfer2 errs=%0d", e1, e2);
+    repeat (5) @(posedge clk);
+    $finish();
+  end
+
+  initial begin #5_000_000; $fatal(1, "[B2BT] timeout"); end
+
+endmodule
diff --git a/test/tb_idma_transpose_nd.sv b/test/tb_idma_transpose_nd.sv
new file mode 100644
index 00000000..1fa3e3ba
--- /dev/null
+++ b/test/tb_idma_transpose_nd.sv
@@ -0,0 +1,263 @@
+// Copyright 2026 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Authors:
+// - Daniel Keller <dankeller@iis.ee.ethz.ch>
+//
+// Self-checking multi-tile transpose testbench: idma_nd_midend (NumDim=4,
+// transposed-stride program) -> idma_backend_rw_axi (EnableTranspose) ->
+// axi_sim_mem. The ND midend generates the tiled read order (col-tile,
+// row-tile, local-row) and the transposed destination placement the engine
+// requires, so a full M x N transpose works end-to-end (not just one tile).
+// Reference: out_T[c][r] = in[r][c] over E-byte elements.
+
+`include "axi/typedef.svh"
+`include "idma/typedef.svh"
+
+module tb_idma_transpose_nd
+  import idma_pkg::*;
+#(
+  parameter int unsigned DataWidth  = 32,
+  parameter int unsigned AddrWidth  = 32,
+  parameter int unsigned UserWidth  = 1,
+  parameter int unsigned AxiIdWidth = 12,
+  parameter int unsigned TFLenWidth = 32,
+  parameter int unsigned M          = 8,   // matrix rows (elements)
+  parameter int unsigned N          = 8,   // matrix cols (elements)
+  parameter int unsigned EB         = 1,   // element size in bytes (1/2/4)
+  // 0 = auto: NumAxInFlight = StrbWidth (transpose needs >= NE in-flight; NE = StrbWidth at E=1)
+  parameter int unsigned NumAxInFlight = 0,
+  parameter int unsigned BufferDepth   = 3
+);
+
+  localparam time TA  = 1ns;
+  localparam time TT  = 9ns;
+  localparam time TCK = 10ns;
+
+  localparam int unsigned StrbWidth = DataWidth / 8;
+  localparam int unsigned NE        = StrbWidth / EB;        // tile side (elements)
+  localparam int unsigned AxIF      = (NumAxInFlight == 0) ? StrbWidth : NumAxInFlight;
+  localparam int unsigned MODE      = (EB == 4) ? 2 : (EB == 2) ? 1 : 0;
+  localparam int unsigned YT        = (M + NE - 1) / NE;     // row-tiles
+  localparam int unsigned NT        = (N + NE - 1) / NE;     // col-tiles
+  // padded Aᵀ row pitch: M rounded up to a tile so every row is StrbWidth-aligned
+  localparam int unsigned MP        = YT * NE;
+  localparam int unsigned NumDim    = 4;                     // 1D + {row, row-tile, col-tile}
+  localparam logic [NumDim-1:0][31:0] RepWidths = '{default: 32'd16};
+
+  // ── Types ──
+  typedef logic [AddrWidth-1:0]   addr_t;
+  typedef logic [DataWidth-1:0]   data_t;
+  typedef logic [StrbWidth-1:0]   strb_t;
+  typedef logic [AxiIdWidth-1:0]  id_t;
+  typedef logic [UserWidth-1:0]   user_t;
+  typedef logic [TFLenWidth-1:0]  tf_len_t;
+  typedef logic [31:0]            reps_t;
+
+  `AXI_TYPEDEF_AW_CHAN_T(axi_aw_chan_t, addr_t, id_t, user_t)
+  `AXI_TYPEDEF_W_CHAN_T(axi_w_chan_t, data_t, strb_t, user_t)
+  `AXI_TYPEDEF_B_CHAN_T(axi_b_chan_t, id_t, user_t)
+  `AXI_TYPEDEF_AR_CHAN_T(axi_ar_chan_t, addr_t, id_t, user_t)
+  `AXI_TYPEDEF_R_CHAN_T(axi_r_chan_t, data_t, id_t, user_t)
+  `AXI_TYPEDEF_REQ_T(axi_req_t, axi_aw_chan_t, axi_w_chan_t, axi_ar_chan_t)
+  `AXI_TYPEDEF_RESP_T(axi_rsp_t, axi_b_chan_t, axi_r_chan_t)
+
+  `IDMA_TYPEDEF_FULL_REQ_T(idma_req_t, id_t, addr_t, tf_len_t)
+  `IDMA_TYPEDEF_FULL_RSP_T(idma_rsp_t, addr_t)
+  `IDMA_TYPEDEF_FULL_ND_REQ_T(idma_nd_req_t, idma_req_t, reps_t, addr_t)
+
+  typedef struct packed { axi_ar_chan_t ar_chan; } axi_read_meta_channel_t;
+  typedef struct packed { axi_read_meta_channel_t axi; } read_meta_channel_t;
+  typedef struct packed { axi_aw_chan_t aw_chan; } axi_write_meta_channel_t;
+  typedef struct packed { axi_write_meta_channel_t axi; } write_meta_channel_t;
+
+  // ── Signals ──
+  logic clk, rst_n;
+  idma_req_t   idma_req;   logic req_valid, req_ready;
+  idma_rsp_t   idma_rsp;   logic rsp_valid, rsp_ready;
+  idma_eh_req_t idma_eh_req; logic eh_req_valid, eh_req_ready;
+  idma_nd_req_t nd_req;    logic nd_req_valid, nd_req_ready;
+  idma_rsp_t   nd_rsp;     logic nd_rsp_valid, nd_rsp_ready;
+  axi_req_t axi_read_req, axi_write_req, axi_req, axi_req_mem;
+  axi_rsp_t axi_read_rsp, axi_write_rsp, axi_rsp, axi_rsp_mem;
+  idma_busy_t busy; logic nd_busy;
+
+  assign idma_eh_req = '0;
+  assign eh_req_valid = 1'b0;
+
+  // ── Clock / reset ──
+  clk_rst_gen #(.ClkPeriod(TCK), .RstClkCycles(1)) i_clk_rst_gen (.clk_o(clk), .rst_no(rst_n));
+
+  // ── AXI sim memory (read+write joined) ──
+  axi_rw_join #(.axi_req_t(axi_req_t), .axi_resp_t(axi_rsp_t)) i_axi_rw_join (
+    .clk_i(clk), .rst_ni(rst_n),
+    .slv_read_req_i(axi_read_req),  .slv_read_resp_o(axi_read_rsp),
+    .slv_write_req_i(axi_write_req), .slv_write_resp_o(axi_write_rsp),
+    .mst_req_o(axi_req), .mst_resp_i(axi_rsp)
+  );
+  assign axi_req_mem = axi_req;
+  assign axi_rsp     = axi_rsp_mem;
+
+  axi_sim_mem #(
+    .AddrWidth(AddrWidth), .DataWidth(DataWidth), .IdWidth(AxiIdWidth), .UserWidth(UserWidth),
+    .axi_req_t(axi_req_t), .axi_rsp_t(axi_rsp_t),
+    .WarnUninitialized(1'b0), .ClearErrOnAccess(1'b1), .ApplDelay(TA), .AcqDelay(TT)
+  ) i_axi_sim_mem (
+    .clk_i(clk), .rst_ni(rst_n), .axi_req_i(axi_req_mem), .axi_rsp_o(axi_rsp_mem),
+    .mon_r_last_o(), .mon_r_beat_count_o(), .mon_r_user_o(), .mon_r_id_o(),
+    .mon_r_data_o(), .mon_r_addr_o(), .mon_r_valid_o(),
+    .mon_w_last_o(), .mon_w_beat_count_o(), .mon_w_user_o(), .mon_w_id_o(),
+    .mon_w_data_o(), .mon_w_addr_o(), .mon_w_valid_o()
+  );
+
+  // ── ND midend: ND transpose descriptor -> 1D bursts ──
+  idma_nd_midend #(
+    .NumDim(NumDim), .addr_t(addr_t), .idma_req_t(idma_req_t),
+    .idma_rsp_t(idma_rsp_t), .idma_nd_req_t(idma_nd_req_t), .RepWidths(RepWidths)
+  ) i_nd_midend (
+    .clk_i(clk), .rst_ni(rst_n),
+    .nd_req_i(nd_req), .nd_req_valid_i(nd_req_valid), .nd_req_ready_o(nd_req_ready),
+    .nd_rsp_o(nd_rsp), .nd_rsp_valid_o(nd_rsp_valid), .nd_rsp_ready_i(nd_rsp_ready),
+    .burst_req_o(idma_req), .burst_req_valid_o(req_valid), .burst_req_ready_i(req_ready),
+    .burst_rsp_i(idma_rsp), .burst_rsp_valid_i(rsp_valid), .burst_rsp_ready_o(rsp_ready),
+    .busy_o(nd_busy)
+  );
+
+  // ── Backend (rw_axi) with transpose engine ──
+  idma_backend_rw_axi #(
+    .CombinedShifter(1'b0), .DataWidth(DataWidth), .AddrWidth(AddrWidth), .AxiIdWidth(AxiIdWidth),
+    .UserWidth(UserWidth), .TFLenWidth(TFLenWidth), .MaskInvalidData(1'b1), .BufferDepth(BufferDepth),
+    .RAWCouplingAvail(1'b1), .HardwareLegalizer(1'b1), .RejectZeroTransfers(1'b1),
+    .ErrorCap(idma_pkg::NO_ERROR_HANDLING), .PrintFifoInfo(1'b0), .NumAxInFlight(AxIF), .MemSysDepth(0),
+    .idma_req_t(idma_req_t), .idma_rsp_t(idma_rsp_t), .idma_eh_req_t(idma_eh_req_t),
+    .idma_busy_t(idma_busy_t), .axi_req_t(axi_req_t), .axi_rsp_t(axi_rsp_t),
+    .write_meta_channel_t(write_meta_channel_t), .read_meta_channel_t(read_meta_channel_t)
+  ) i_idma_backend (
+    .clk_i(clk), .rst_ni(rst_n), .testmode_i(1'b0),
+    .idma_req_i(idma_req), .req_valid_i(req_valid), .req_ready_o(req_ready),
+    .idma_rsp_o(idma_rsp), .rsp_valid_o(rsp_valid), .rsp_ready_i(rsp_ready),
+    .idma_eh_req_i(idma_eh_req), .eh_req_valid_i(eh_req_valid), .eh_req_ready_o(eh_req_ready),
+    .axi_read_req_o(axi_read_req), .axi_read_rsp_i(axi_read_rsp),
+    .axi_write_req_o(axi_write_req), .axi_write_rsp_i(axi_write_rsp), .busy_o(busy)
+  );
+
+  // watchdogs to surface deadlocks rather than hang forever
+  stream_watchdog #(.NumCycles(2000)) i_r_wd (.clk_i(clk), .rst_ni(rst_n), .valid_i(axi_rsp.r_valid), .ready_i(axi_req.r_ready));
+  stream_watchdog #(.NumCycles(2000)) i_w_wd (.clk_i(clk), .rst_ni(rst_n), .valid_i(axi_req.w_valid), .ready_i(axi_rsp.w_ready));
+
+  // ── Stimulus + check via sim-memory backdoor ──
+  addr_t sb = 'h0000_1000;
+  addr_t db = 'h0000_4000;
+
+  // every AW (incl. wstrb=0 padding rows) must stay in the padded dst allocation
+  // [db, db+NT*NE*MP*EB) — else a strict slave would DECERR
+  always @(posedge clk) if (rst_n && axi_write_req.aw_valid && axi_write_rsp.aw_ready) begin
+    automatic addr_t aw_end = db + addr_t'(NT*NE*MP*EB);
+    if (axi_write_req.aw.addr < db || axi_write_req.aw.addr >= aw_end)
+      $fatal(1, "[TPN] AW 0x%0h outside dst alloc [0x%0h,0x%0h) — would DECERR on a strict slave",
+             axi_write_req.aw.addr, db, aw_end);
+  end
+
+  task automatic wr_mem(input addr_t a, input logic [7:0] d); i_axi_sim_mem.mem[a] = d; endtask
+  function automatic logic [7:0] rd_mem(input addr_t a);
+    return i_axi_sim_mem.mem.exists(a) ? i_axi_sim_mem.mem[a] : 8'hxx;
+  endfunction
+
+  initial begin
+    automatic int unsigned errs = 0;
+    nd_req_valid = 1'b0; nd_rsp_ready = 1'b1; nd_req = '0;
+    @(posedge rst_n);
+    repeat (5) @(posedge clk);
+
+    // init source matrix (row-major, M x N elements of EB bytes)
+    for (int unsigned r = 0; r < M; r++)
+      for (int unsigned c = 0; c < N; c++)
+        for (int unsigned b = 0; b < EB; b++)
+          wr_mem(sb + (r*N + c)*EB + b, 8'((( (r*N+c)*EB + b )*7 + 3) & 8'hFF));
+
+    // sentinel-fill the full padded Aᵀ extent; padding cols/rows must stay sentinel
+    // (the engine strobe suppresses them) — checked after the transfer
+    for (int unsigned i = 0; i < NT*NE; i++)
+      for (int unsigned j = 0; j < MP; j++)
+        for (int unsigned b = 0; b < EB; b++)
+          wr_mem(db + (i*MP + j)*EB + b, 8'hCC);
+
+    // ── transposed-stride ND program (routing-plan §4.2) ──
+    nd_req = '0;
+    nd_req.burst_req.length   = tf_len_t'(NE*EB);   // one tile-row = StrbWidth bytes
+    nd_req.burst_req.src_addr = sb;
+    nd_req.burst_req.dst_addr = db;
+    nd_req.burst_req.opt.src_protocol = idma_pkg::AXI;
+    nd_req.burst_req.opt.dst_protocol = idma_pkg::AXI;
+    nd_req.burst_req.opt.src.burst    = axi_pkg::BURST_INCR;
+    nd_req.burst_req.opt.dst.burst    = axi_pkg::BURST_INCR;
+    nd_req.burst_req.opt.beo.decouple_rw = 1'b1;
+    nd_req.burst_req.opt.beo.decouple_aw = 1'b1;
+    nd_req.burst_req.opt.beo.src_max_llen = '0;
+    nd_req.burst_req.opt.beo.dst_max_llen = '0;
+    nd_req.burst_req.opt.compute.enable                  = 1'b1;
+    nd_req.burst_req.opt.compute.op                      = idma_pkg::COMPUTE_TRANSPOSE;
+    nd_req.burst_req.opt.compute.params.transpose.mode     = 2'(MODE);
+    nd_req.burst_req.opt.compute.params.transpose.tensor_m = 12'(M);
+    nd_req.burst_req.opt.compute.params.transpose.tensor_n = 12'(N);
+    nd_req.burst_req.opt.last         = 1'b1;
+    // ND midend strides are INCREMENTAL deltas (added on dim roll-over), NOT
+    // absolute pitches. Aᵀ uses padded pitch MP*EB (aligned writes); src keeps
+    // N*EB (misaligned reads coalesce in the pre-engine buffer).
+    // d_req[0] = local row within tile (reps NE)
+    nd_req.d_req[0].reps        = reps_t'(NE);
+    nd_req.d_req[0].src_strides = addr_t'(int'(N*EB));                        // next source row
+    nd_req.d_req[0].dst_strides = addr_t'(int'(MP*EB));                       // next Aᵀ row (padded pitch)
+    // d_req[1] = row-tile (reps YT)
+    nd_req.d_req[1].reps        = reps_t'(YT);
+    nd_req.d_req[1].src_strides = addr_t'(int'(N*EB));                        // rows are consecutive
+    nd_req.d_req[1].dst_strides = addr_t'(int'(NE*EB) - int'((NE-1)*MP*EB));  // back up cols, next col-block
+    // d_req[2] = col-tile (reps NT). src rewind uses the padded row extent
+    // (YT*NE-1, not M-1): the read walks padding rows of the last row-tile.
+    nd_req.d_req[2].reps        = reps_t'(NT);
+    nd_req.d_req[2].src_strides = addr_t'(int'(NE*EB) - int'((YT*NE-1)*N*EB)); // back to padded row0, next col-block
+    nd_req.d_req[2].dst_strides = addr_t'(int'(MP*EB) - int'((YT-1)*NE*EB));   // next Aᵀ col-block
+
+    $display("[TPN] launching %0dx%0d EB=%0d transpose via ND midend (NE=%0d, %0dx%0d tiles)", M, N, EB, NE, YT, NT);
+    nd_req_valid = 1'b1;
+    // drop valid on accept; holding it one cycle past makes the midend re-walk the request
+    do @(posedge clk); while (!nd_req_ready);
+    nd_req_valid = 1'b0;
+    nd_req = '0;
+
+    // wait for ND completion
+    while (!(nd_rsp_valid && nd_rsp_ready)) @(posedge clk);
+    repeat (20) @(posedge clk);
+
+    // check 1 (data): out_T[c][r] == in[r][c], Aᵀ at padded pitch MP
+    for (int unsigned c = 0; c < N; c++)
+      for (int unsigned r = 0; r < M; r++)
+        for (int unsigned b = 0; b < EB; b++) begin
+          automatic logic [7:0] got = rd_mem(db + (c*MP + r)*EB + b);
+          automatic logic [7:0] exp = rd_mem(sb + (r*N + c)*EB + b);
+          if (got !== exp) begin
+            errs++;
+            if (errs <= 12) $display("[TPN] MISMATCH out_T[%0d][%0d].b%0d=%02h exp %02h", c, r, b, got, exp);
+          end
+        end
+    // check 2: padding cols [M,MP) and padding rows [N,NT*NE) must stay sentinel (strobe-suppressed)
+    for (int unsigned i = 0; i < NT*NE; i++)
+      for (int unsigned j = 0; j < MP; j++)
+        if (i >= N || j >= M)
+          for (int unsigned b = 0; b < EB; b++) begin
+            automatic logic [7:0] got = rd_mem(db + (i*MP + j)*EB + b);
+            if (got !== 8'hCC) begin
+              errs++;
+              if (errs <= 12) $display("[TPN] PADDING CLOBBERED at row=%0d col=%0d b%0d=%02h (exp CC)", i, j, b, got);
+            end
+          end
+    if (errs == 0) $display("[TPN] PASS: %0dx%0d EB=%0d multi-tile transpose matches golden (padding intact)", M, N, EB);
+    else           $fatal(1, "[TPN] FAIL: %0d mismatches", errs);
+    repeat (5) @(posedge clk);
+    $finish();
+  end
+
+  initial begin #5_000_000; $fatal(1, "[TPN] timeout"); end
+
+endmodule
diff --git a/util/gen_idma.py b/util/gen_idma.py
index 2aff908e..537b9adb 100644
--- a/util/gen_idma.py
+++ b/util/gen_idma.py
@@ -12,7 +12,7 @@
 import argparse
 import sys
 
-from mario.util import prepare_ids, prepare_fids
+from mario.util import prepare_ids, prepare_compute_ids, prepare_fids
 from mario.database import read_database
 from mario.transport_layer import render_transport_layer
 from mario.legalizer import render_legalizer
@@ -44,6 +44,8 @@ def main():
     parser.add_argument('--entity', choices=sorted(GENABLE_ENTITIES), dest='entity', required=True,
         help='The entity to generate from a given configuration.')
     parser.add_argument('--ids', dest='ids', nargs='*', help='configuration IDs')
+    parser.add_argument('--compute-ids', dest='compute_ids', nargs='*', default=[],
+        help='configuration IDs with on-the-fly compute enabled (IDMA_VIDMA_IDS)')
     parser.add_argument('--fids', dest='fids', nargs='*', help='frontend IDs')
     parser.add_argument('--db', dest='db', nargs='*', help='Database files')
     parser.add_argument('--tpl', dest='tpl', required=True, help='Template file')
@@ -51,16 +53,17 @@ def main():
 
     # prepare database and ids
     protocol_ids = prepare_ids(args.ids)
+    compute_cfg = prepare_compute_ids(args.compute_ids)
     frontend_ids = prepare_fids(args.fids)
     protocol_db = read_database(args.db)
 
     # decide what to render
     if args.entity == 'transport':
-        print(render_transport_layer(protocol_ids, protocol_db, args.tpl))
+        print(render_transport_layer(protocol_ids, protocol_db, args.tpl, compute_cfg))
     elif args.entity == 'legalizer':
-        print(render_legalizer(protocol_ids, protocol_db, args.tpl))
+        print(render_legalizer(protocol_ids, protocol_db, args.tpl, compute_cfg))
     elif args.entity == 'backend':
-        print(render_backend(protocol_ids, protocol_db, args.tpl))
+        print(render_backend(protocol_ids, protocol_db, args.tpl, compute_cfg))
     elif args.entity == 'vsim_wave':
         print(render_vsim_wave(protocol_ids, protocol_db, args.tpl))
     elif args.entity == 'synth_wrapper':
diff --git a/util/mario/backend.py b/util/mario/backend.py
index 22a21dfd..d2a93c5b 100644
--- a/util/mario/backend.py
+++ b/util/mario/backend.py
@@ -12,7 +12,7 @@
 from mario.util import eval_key, prot_key
 
 
-def render_backend(prot_ids: dict, db: dict, tpl_file: str) -> str:
+def render_backend(prot_ids: dict, db: dict, tpl_file: str, compute_cfg: dict = None) -> str:
     """Generate backend"""
     backend_rendered = ''
 
@@ -30,6 +30,12 @@ def render_backend(prot_ids: dict, db: dict, tpl_file: str) -> str:
         srp = len(used_read_prots) == 1
         swp = len(used_write_prots) == 1
 
+        # on-the-fly compute is hosted at the (single) AXI write seam
+        enable_compute = prot_id in (compute_cfg or {})
+        if enable_compute and not (swp and used_write_prots[0] == 'axi'):
+            raise ValueError(
+                f'compute (IDMA_VIDMA_IDS) requires a single AXI write port: {prot_id}')
+
         # create context
         context = {
             'name_uniqueifier': prot_id,
@@ -39,6 +45,8 @@ def render_backend(prot_ids: dict, db: dict, tpl_file: str) -> str:
             'used_protocols': prot_ids[prot_id]['used'],
             'one_read_port': srp,
             'one_write_port': swp,
+            'enable_compute': enable_compute,
+            'compute_ops': compute_cfg[prot_id]['ops'] if enable_compute else [],
             'used_non_bursting_write_protocols':
                 prot_key(used_write_prots, 'bursts', 'not_supported', db),
             'combined_aw_and_w':
diff --git a/util/mario/legalizer.py b/util/mario/legalizer.py
index 1724c00e..51757771 100644
--- a/util/mario/legalizer.py
+++ b/util/mario/legalizer.py
@@ -21,7 +21,7 @@ def prot_force_decouple(used_prots: list, db: dict) -> list:
     return res
 
 
-def render_legalizer(prot_ids: dict, db: dict, tpl_file: str) -> str:
+def render_legalizer(prot_ids: dict, db: dict, tpl_file: str, compute_cfg: dict = None) -> str:
     """Generate legalizer"""
     legalizer_rendered = ''
 
@@ -71,6 +71,7 @@ def render_legalizer(prot_ids: dict, db: dict, tpl_file: str) -> str:
             'used_protocols': prot_ids[prot_id]['used'],
             'one_read_port': srp,
             'one_write_port': swp,
+            'enable_compute': prot_id in (compute_cfg or {}),
             'no_read_bursting':
                 not has_read_bursting,
             'has_page_read_bursting':
diff --git a/util/mario/transport_layer.py b/util/mario/transport_layer.py
index 17a23a36..f46c8a38 100644
--- a/util/mario/transport_layer.py
+++ b/util/mario/transport_layer.py
@@ -169,7 +169,8 @@ def render_write_mgr_inst(prot_id: str, prot_ids: dict, db: dict) -> dict:
     return res
 
 
-def render_transport_layer(prot_ids: dict, db: dict, tpl_file: str) -> str:
+def render_transport_layer(prot_ids: dict, db: dict, tpl_file: str, compute_cfg: dict = None
+                           ) -> str:
     """Generate Transport Layer"""
     transport_rendered = ''
 
@@ -188,6 +189,11 @@ def render_transport_layer(prot_ids: dict, db: dict, tpl_file: str) -> str:
             'used_protocols': prot_ids[prot_id]['used'],
             'one_read_port': len(prot_ids[prot_id]['ar']) == 1,
             'one_write_port': len(prot_ids[prot_id]['aw']) == 1,
+            'enable_compute': prot_id in (compute_cfg or {}),
+            'compute_ops':
+                compute_cfg[prot_id]['ops'] if prot_id in (compute_cfg or {}) else [],
+            'compute_full_duplex':
+                compute_cfg[prot_id]['full_duplex'] if prot_id in (compute_cfg or {}) else True,
             'rendered_read_ports': render_read_mgr_inst(prot_id, prot_ids, db),
             'rendered_write_ports': render_write_mgr_inst(prot_id, prot_ids, db)
         }
diff --git a/util/mario/util.py b/util/mario/util.py
index 74376c58..a961889d 100644
--- a/util/mario/util.py
+++ b/util/mario/util.py
@@ -135,3 +135,23 @@ def prepare_fids(fe_strs: list) -> dict:
         res[f'reg{reg[0]}_{reg[1]}d'] = reg
 
     return res
+
+
+def prepare_compute_ids(compute_id_strs: list) -> dict:
+    """Parses compute configuration IDs: <variant>[:<op>[,<op>...]][:fd|hd]"""
+    res = {}
+    for cid_str in (compute_id_strs or []):
+        parts = cid_str.split(':')
+        ops = ['transpose']
+        full_duplex = True
+        for part in parts[1:]:
+            if part in ('fd', 'hd'):
+                full_duplex = part == 'fd'
+            else:
+                ops = part.split(',')
+        for op in ops:
+            if op not in ('transpose',):
+                print(f'[MARIO] {op} is a non-supported compute op in {cid_str}', file=sys.stderr)
+                sys.exit(1)
+        res[parts[0]] = {'ops': ops, 'full_duplex': full_duplex}
+    return res