diff --git a/.github/scripts/build_csp_pr_comment.py b/.github/scripts/build_csp_pr_comment.py
index 31e9670f9..edd633e67 100644
--- a/.github/scripts/build_csp_pr_comment.py
+++ b/.github/scripts/build_csp_pr_comment.py
@@ -2,9 +2,10 @@
 """Build a sticky PR comment for the CSP benchmarks workflow.
 
 Reads the CSV emitted by ``scripts/run_csp_benchmarks.sh`` (one row per
-circuit) and renders it as a markdown table with human-readable units. If
-``--baseline-csv`` is given, each metric cell appends a percentage delta
-versus the baseline value (last successful CSP-benchmarks run on main).
+(circuit, backend)) and renders one markdown table per backend with
+human-readable units. If ``--baseline-csv`` is given, each metric cell
+appends a percentage delta versus the baseline value (last successful
+CSP-benchmarks run on main) keyed by (circuit, backend).
 """
 
 from __future__ import annotations
@@ -111,30 +112,46 @@ def read_rows(csv_path: Path) -> list[dict[str, str]]:
         return list(csv.DictReader(f))
 
 
-def index_baseline(rows: list[dict[str, str]]) -> dict[str, dict[str, float]]:
-    """Index baseline rows by circuit name with float metric values."""
-    out: dict[str, dict[str, float]] = {}
+def index_baseline(rows: list[dict[str, str]]) -> dict[tuple[str, str], dict[str, float]]:
+    """Index baseline rows by (circuit, backend) with float metric values.
+
+    Older baseline CSVs without a `backend` column are treated as `whir`
+    (the only backend that existed before backend-aware benchmarks landed),
+    so deltas remain valid across the schema bump.
+    """
+    out: dict[tuple[str, str], dict[str, float]] = {}
     for row in rows:
         circuit = (row.get("circuit") or "").strip()
         if not circuit:
             continue
+        backend = (row.get("backend") or "whir").strip() or "whir"
         metrics: dict[str, float] = {}
         for metric, _unit in METRIC_COLUMNS:
             try:
                 metrics[metric] = float(row.get(metric) or 0)
             except ValueError:
                 metrics[metric] = 0.0
-        out[circuit] = metrics
+        out[(circuit, backend)] = metrics
+    return out
+
+
+def group_by_backend(rows: list[dict[str, str]]) -> dict[str, list[dict[str, str]]]:
+    """Bucket result rows by backend, preserving insertion order of backends."""
+    out: dict[str, list[dict[str, str]]] = {}
+    for row in rows:
+        backend = (row.get("backend") or "whir").strip() or "whir"
+        out.setdefault(backend, []).append(row)
     return out
 
 
 def render_table(
     rows: list[dict[str, str]],
-    baseline: dict[str, dict[str, float]],
+    backend: str,
+    baseline: dict[tuple[str, str], dict[str, float]],
     has_baseline_file: bool,
 ) -> str:
     if not rows:
-        return "_No benchmark results were produced._"
+        return "_No benchmark results were produced for this backend._"
 
     header = (
         "| Circuit | Constraints | Witnesses | Prover time | Peak RSS | "
@@ -145,7 +162,7 @@ def render_table(
 
     for row in sorted(rows, key=lambda r: r.get("circuit", "")):
         circuit = row.get("circuit", "")
-        baseline_metrics = baseline.get(circuit)
+        baseline_metrics = baseline.get((circuit, backend))
 
         cells = [f"`{circuit}`"]
         for metric, unit in METRIC_COLUMNS:
@@ -169,9 +186,18 @@ def render_table(
     return "\n".join(lines)
 
 
+# Display order for backends. Anything not listed here is appended in the
+# order it appeared in the CSV.
+BACKEND_DISPLAY_ORDER: tuple[str, ...] = ("whir", "groth16")
+BACKEND_TITLES: dict[str, str] = {
+    "whir": "WHIR backend",
+    "groth16": "Groth16 backend",
+}
+
+
 def compose_comment(
     rows: list[dict[str, str]],
-    baseline: dict[str, dict[str, float]],
+    baseline: dict[tuple[str, str], dict[str, float]],
     baseline_run_id: str,
     has_baseline_file: bool,
     run_id: str,
@@ -181,7 +207,15 @@ def compose_comment(
     runs_per_circuit: str,
 ) -> str:
     short_sha = sha[:12] if sha else "unknown"
-    table = render_table(rows, baseline, has_baseline_file)
+    by_backend = group_by_backend(rows)
+
+    # Stable backend display order: known backends first, unknown ones after.
+    backends_present = list(by_backend.keys())
+    ordered_backends = [b for b in BACKEND_DISPLAY_ORDER if b in by_backend]
+    ordered_backends += [b for b in backends_present if b not in BACKEND_DISPLAY_ORDER]
+
+    distinct_circuits = sorted({(row.get("circuit") or "") for row in rows})
+    distinct_circuits = [c for c in distinct_circuits if c]
 
     if has_baseline_file:
         if baseline_run_id:
@@ -189,13 +223,13 @@ def compose_comment(
                 f"Each metric cell shows the current value followed by the "
                 f"percentage delta against the latest successful "
                 f"[`main` run #{baseline_run_id}](https://github.com/worldfnd/provekit/actions/runs/{baseline_run_id}). "
-                f"`(new)` marks circuits absent from the baseline."
+                f"`(new)` marks (circuit, backend) pairs absent from the baseline."
             )
         else:
             baseline_note = (
                 "Each metric cell shows the current value followed by the "
                 "percentage delta against the latest successful `main` run. "
-                "`(new)` marks circuits absent from the baseline."
+                "`(new)` marks (circuit, backend) pairs absent from the baseline."
             )
     else:
         baseline_note = (
@@ -203,6 +237,10 @@ def compose_comment(
             "workflow has produced at least one successful `main` run._"
         )
 
+    backend_summary = ", ".join(
+        f"{BACKEND_TITLES.get(b, b)} ({len(by_backend[b])})" for b in ordered_backends
+    ) or "—"
+
     lines = [
         MARKER,
         "## CSP benchmarks",
@@ -212,8 +250,9 @@ def compose_comment(
         f"| Workflow status | {status_with_icon(status)} |",
         f"| Commit | `{short_sha}` |",
         f"| Run | [#{run_id}]({run_url}) |",
-        f"| Circuits benchmarked | {len(rows)} |",
-        f"| Iterations averaged per circuit | {runs_per_circuit} |",
+        f"| Distinct circuits | {len(distinct_circuits)} |",
+        f"| Backends benchmarked | {backend_summary} |",
+        f"| Iterations averaged per (circuit, backend) | {runs_per_circuit} |",
         "",
         "Prover time, peak RSS, peak heap, and verifier time are arithmetic means "
         "across the iterations. Peak heap comes from the largest "
@@ -222,14 +261,27 @@ def compose_comment(
         "",
         baseline_note,
         "",
-        "<details open>",
-        "<summary>Results</summary>",
-        "",
-        table,
-        "",
-        "</details>",
-        "",
     ]
+
+    if not ordered_backends:
+        lines.append("_No benchmark results were produced._")
+        lines.append("")
+    else:
+        for backend in ordered_backends:
+            title = BACKEND_TITLES.get(backend, backend)
+            table = render_table(by_backend[backend], backend, baseline, has_baseline_file)
+            lines.extend([
+                f"### {title}",
+                "",
+                "<details open>",
+                "<summary>Results</summary>",
+                "",
+                table,
+                "",
+                "</details>",
+                "",
+            ])
+
     return "\n".join(lines)
 
 
diff --git a/.github/workflows/csp-benchmarks.yml b/.github/workflows/csp-benchmarks.yml
index 3e17f6368..a7f4bc351 100644
--- a/.github/workflows/csp-benchmarks.yml
+++ b/.github/workflows/csp-benchmarks.yml
@@ -7,9 +7,17 @@ on:
   workflow_dispatch:
     inputs:
       bench_runs:
-        description: "Iterations per circuit (default: 3)"
+        description: "Iterations per (circuit, backend) (default: 3)"
         required: false
         default: "3"
+      bench_backends:
+        description: "Backends to benchmark, space-separated (default: \"whir groth16\")"
+        required: false
+        default: "whir groth16"
+      bench_skip_groth16:
+        description: "Regex of circuits to skip on the groth16 backend (default: empty)"
+        required: false
+        default: ""
 
 permissions:
   contents: read
@@ -22,6 +30,8 @@ permissions:
 env:
   CARGO_TERM_COLOR: always
   BENCH_RUNS: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.bench_runs != '' && github.event.inputs.bench_runs || '3') || '3' }}
+  BENCH_BACKENDS: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.bench_backends != '' && github.event.inputs.bench_backends || 'whir groth16') || 'whir groth16' }}
+  BENCH_SKIP_GROTH16: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.bench_skip_groth16 || '' }}
   REQUIRED_NARGO_VERSION: "1.0.0-beta.19"
 
 concurrency:
@@ -58,6 +68,8 @@ jobs:
           PROVEKIT_BIN: ${{ github.workspace }}/target/release/provekit-cli
           BENCH_DIR: ${{ github.workspace }}/csp-bench-logs
           BENCH_RUNS: ${{ env.BENCH_RUNS }}
+          BENCH_BACKENDS: ${{ env.BENCH_BACKENDS }}
+          BENCH_SKIP_GROTH16: ${{ env.BENCH_SKIP_GROTH16 }}
         run: |
           bash scripts/run_csp_benchmarks.sh
 
diff --git a/Cargo.lock b/Cargo.lock
index f93dc8543..5290adef5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -264,6 +264,7 @@ dependencies = [
  "num-bigint",
  "num-integer",
  "num-traits",
+ "rayon",
  "zeroize",
 ]
 
@@ -452,6 +453,7 @@ dependencies = [
  "arrayvec",
  "digest 0.10.7",
  "num-bigint",
+ "rayon",
 ]
 
 [[package]]
@@ -493,6 +495,7 @@ checksum = "246a225cc6131e9ee4f24619af0f19d67761fff15d7ccc22e42b80846e69449a"
 dependencies = [
  "num-traits",
  "rand 0.8.5",
+ "rayon",
 ]
 
 [[package]]
@@ -3211,6 +3214,15 @@ version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
 
+[[package]]
+name = "memmap2"
+version = "0.9.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "memoffset"
 version = "0.6.5"
@@ -4595,17 +4607,19 @@ dependencies = [
  "acir",
  "anyhow",
  "argh",
+ "ark-bn254",
+ "ark-ec",
  "ark-ff 0.5.0",
+ "ark-serialize 0.5.0",
  "base64",
  "hex",
- "nargo",
  "nargo_toml",
- "noir_artifact_cli",
  "noirc_abi",
  "noirc_driver",
  "postcard",
  "provekit-common",
  "provekit-gnark",
+ "provekit-groth16",
  "provekit-prover",
  "provekit-r1cs-compiler",
  "provekit-verifier",
@@ -4687,25 +4701,58 @@ dependencies = [
  "whir",
 ]
 
+[[package]]
+name = "provekit-groth16"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "ark-bn254",
+ "ark-ec",
+ "ark-ff 0.5.0",
+ "ark-poly",
+ "ark-serialize 0.5.0",
+ "ark-std 0.5.0",
+ "memmap2",
+ "provekit-common",
+ "rayon",
+ "serde",
+ "sha2 0.10.9",
+ "sha3",
+ "tempfile",
+ "tracing",
+ "zeroize",
+]
+
 [[package]]
 name = "provekit-prover"
 version = "0.1.0"
 dependencies = [
  "acir",
  "anyhow",
+ "ark-bn254",
+ "ark-ec",
  "ark-ff 0.5.0",
+ "ark-poly",
+ "ark-serialize 0.5.0",
  "ark-std 0.5.0",
  "bn254_blackbox_solver",
+ "bytes",
  "mavros-artifacts",
  "mavros-vm",
+ "memmap2",
  "nargo",
  "noir_artifact_cli",
  "noirc_abi",
  "num-bigint",
  "postcard",
  "provekit-common",
+ "provekit-groth16",
+ "rayon",
+ "serde",
  "tracing",
  "whir",
+ "xz2",
+ "zstd",
 ]
 
 [[package]]
@@ -4734,8 +4781,11 @@ name = "provekit-verifier"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "ark-bn254",
+ "ark-serialize 0.5.0",
  "ark-std 0.5.0",
  "provekit-common",
+ "provekit-groth16",
  "rayon",
  "tracing",
  "whir",
diff --git a/Cargo.toml b/Cargo.toml
index 73d5ac541..1c428a66c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,6 +10,7 @@ members = [
   "provekit/r1cs-compiler",
   "provekit/prover",
   "provekit/verifier",
+  "provekit/groth16",
   "tooling/cli",
   "tooling/provekit-bench",
   "tooling/provekit-ffi",
@@ -100,6 +101,7 @@ provekit-cli = { path = "tooling/cli" }
 provekit-common = { path = "provekit/common" , features = ["provekit_ntt"]}
 provekit-ffi = { path = "tooling/provekit-ffi" }
 provekit-gnark = { path = "tooling/provekit-gnark" }
+provekit-groth16 = { path = "provekit/groth16" }
 provekit-prover = { path = "provekit/prover", default-features = false }
 provekit-r1cs-compiler = { path = "provekit/r1cs-compiler" }
 provekit-verifier = { path = "provekit/verifier" }
@@ -155,6 +157,7 @@ parking_lot = "0.12"
 # and calls keccak::f1600(), which was removed in keccak 0.2.0 stable. Pinning to
 # the RC prevents `cargo update` from bumping acvm_blackbox_solver's keccak to stable.
 keccak = "=0.2.0-rc.2"
+memmap2 = "0.9.5"
 xz2 = "0.1.7"
 zerocopy = "0.8.25"
 zeroize = "1.8.1"
@@ -187,12 +190,14 @@ noirc_driver = { git = "https://github.com/noir-lang/noir", rev = "v1.0.0-beta.1
 # Cryptography and proof systems
 ark-bn254 = { version = "0.5.0", default-features = false, features = [
   "scalar_field",
+  "curve",
 ] }
 
+ark-ec = { version = "0.5", features = ["parallel"] }
 ark-ff = { version = "0.5", features = ["asm", "std"] }
 ark-poly = "0.5"
 ark-serialize = "0.5"
-ark-std = { version = "0.5", features = ["std"] }
+ark-std = { version = "0.5", features = ["std", "parallel"] }
 mavros-vm = { git = "https://github.com/reilabs/mavros", rev = "3e47fd58001a0109a0314bc080b5246fd807ba04" }
 mavros-artifacts = { git = "https://github.com/reilabs/mavros", rev = "3e47fd58001a0109a0314bc080b5246fd807ba04" }
 spongefish = { git = "https://github.com/arkworks-rs/spongefish", features = [
diff --git a/README.md b/README.md
index a7e8f0b18..14d7973a4 100644
--- a/README.md
+++ b/README.md
@@ -40,18 +40,41 @@ cargo run --release --bin provekit-cli verify
 
 `prepare` writes a **ProveKit Prover** key (`.pkp`) and a **ProveKit Verifier** key (`.pkv`). `prove` reads the PKP plus `Prover.toml` and writes `proof.np`. `verify` reads the PKV and the proof.
 
+### On-chain verification (Groth16 backend)
+
+For proofs produced with `prepare --backend groth16`, the same PKV + proof can be checked by a Solidity verifier. Two commands generate the artifacts:
+
+```sh
+# Render a circuit-specific Solidity verifier from the PKV.
+cargo run --release --bin provekit-cli export-solidity \
+  --pkv <circuit>.pkv \
+  --template provekit/groth16/contracts/ProvekitGroth16Verifier.sol \
+  --out <circuit>Verifier.sol
+
+# Convert the proof to EVM big-endian calldata + a public-inputs file.
+cargo run --release --bin provekit-cli export-evm-proof \
+  --proof proof.np \
+  --out-dir out/
+```
+
+`export-solidity` substitutes all `CODEGEN` markers in the in-repo template with constants from the verifying key (VK scalars, G1/G2 coordinates, gnark-style negations, public-input bases). `export-evm-proof` produces `out/proof.hex` (uncompressed `Ar‖Bs‖Krs‖Commit‖PoK` in EVM big-endian) and `out/inputs.txt` (one decimal field element per line, sized to `N_PUB`). Deploy the rendered `.sol` and call `verifyProof(bytes, uint256[N])` with those two inputs. Single-commitment circuits only for now — see [`provekit/groth16/contracts/README.md`](./provekit/groth16/contracts/README.md) for scope, constraints, and the multi-commitment roadmap.
+
 ### Command reference
 
 | Command | Purpose | Key options |
 | :--- | :--- | :--- |
-| `prepare` | Compile a Noir package and write prover/verifier keys | `--pkp`/`-p`, `--pkv`/`-v`, `--hash`; default hash: `skyscraper` |
+| `prepare` | Compile a Noir package and write prover/verifier keys | `--pkp`/`-p`, `--pkv`/`-v`, `--hash`, `--backend`, `--mmap` (Groth16 only); default hash: `skyscraper`, default backend: `whir` |
 | `prove` | Produce `proof.np` from a prover key and inputs | `--prover`/`-p`, `--input`/`-i`, `--out`/`-o` |
 | `verify` | Verify a proof against a verifier key | `--verifier`/`-v`, `--proof` |
+| `export-solidity` | Render a circuit-specific Solidity verifier from a Groth16 PKV (substitutes `CODEGEN` markers in the template) | `--pkv`/`-v`, `--template`/`-t`, `--out`/`-o` |
+| `export-evm-proof` | Re-emit a Groth16 `proof.np` as EVM big-endian calldata + a public-inputs file for `verifyProof(bytes, uint256[N])` | `--proof`/`-p`, `--out-dir`/`-o` |
 
 Read the table per command: the short `-p` flag changes meaning between `prepare` and `prove`.
 
 Available `prepare --hash` choices are `skyscraper`, `sha256`, `keccak`, `blake3`, and `poseidon2`.
 
+Available `prepare --backend` choices are `whir` (default) and `groth16`.
+
 ## How It Works
 
 ```mermaid
@@ -109,6 +132,7 @@ For larger circuits and integration experiments, see [`noir-examples/`](./noir-e
 
 ## Advanced Usage
 
+- **Mmap-format `.pkp`** (Groth16 only): pass `--mmap` to `prepare` to write an mmap-friendly `.pkp` instead of the zstd-compressed default. Larger artifact (no compression, raw in-memory layout for curve-point and R1CS arrays), but near-instant load — the kernel pages bytes in lazily as the MSM touches them, matching rapidsnark's zkey-loading model. Both layouts share the `.pkp` extension; `prove` auto-detects via the file's `MMAP` sentinel.
 - **Direct R1CS frontend:** after generating Mavros artifacts, call `provekit-cli prepare --compiler mavros <artifacts.json> --r1cs <r1cs.bin>`.
 - **Recursive verifier inputs:** `provekit-cli generate-gnark-inputs <verifier.pkv> <proof.np>` writes `params_for_recursive_verifier` and `r1cs.json` by default; use `--params` and `--r1cs` to override those paths.
 - **Inspection commands:** use `circuit-stats` for Noir ACIR/R1CS structure, `analyze-pkp` for Noir prover-key size breakdowns, and `show-inputs` for public inputs.
diff --git a/noir-examples/noir_sha256/Prover.toml b/noir-examples/noir_sha256/Prover.toml
index 689c76d15..7c0585c7b 100644
--- a/noir-examples/noir_sha256/Prover.toml
+++ b/noir-examples/noir_sha256/Prover.toml
@@ -1,3 +1,3 @@
 input = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 
-expected = [67, 25, 141, 183, 254, 43, 174, 230, 241, 12, 52, 52, 209, 164, 42, 198, 77, 148, 199, 2, 25, 96, 124, 119, 128, 33, 172, 170, 236, 162, 201, 30]
+expected = [248, 42, 35, 54, 144, 202, 70, 153, 180, 85, 249, 220, 89, 110, 125, 88, 66, 133, 186, 224, 63, 42, 42, 43, 212, 248, 195, 112, 11, 16, 217, 156]
diff --git a/noir-examples/noir_sha256/src/main.nr b/noir-examples/noir_sha256/src/main.nr
index 31f5f3f92..cf5aec930 100644
--- a/noir-examples/noir_sha256/src/main.nr
+++ b/noir-examples/noir_sha256/src/main.nr
@@ -1,7 +1,7 @@
 use sha256::sha256_var;
 
-// Chain 17 SHA-256 rounds over a 32-byte state.
-global NUM_SHA_CALLS: u32 = 17;
+// Change this to control how many SHA256 calls are generated.
+global NUM_SHA_CALLS: u32 = 35;
 
 fn main(input: [u8; 32], expected: pub [u8; 32]) {
     let mut data = input;
diff --git a/playground/passport-input-gen/src/bin/passport_cli/main.rs b/playground/passport-input-gen/src/bin/passport_cli/main.rs
index 0a21bb4fa..fe9d96eff 100644
--- a/playground/passport-input-gen/src/bin/passport_cli/main.rs
+++ b/playground/passport-input-gen/src/bin/passport_cli/main.rs
@@ -247,7 +247,7 @@ fn prove_circuit<T: serde::Serialize>(
         "\n  [{circuit_name}] Loading prover from: {}",
         pkp_path.display()
     );
-    let prover: provekit_common::Prover = provekit_common::file::read(pkp_path)
+    let prover: provekit_prover::Prover = provekit_prover::read_pkp(pkp_path)
         .with_context(|| format!("Reading prover key for {circuit_name}"))?;
 
     let (num_constraints, num_witnesses) = prover.size();
@@ -259,8 +259,9 @@ fn prove_circuit<T: serde::Serialize>(
     let json = serde_json::to_string(inputs)
         .with_context(|| format!("Serializing {circuit_name} inputs to JSON"))?;
     let abi = match &prover {
-        provekit_common::Prover::Noir(p) => p.witness_generator.abi(),
-        provekit_common::Prover::Mavros(p) => &p.abi,
+        provekit_prover::Prover::Noir(p) => p.witness_generator.abi(),
+        provekit_prover::Prover::Mavros(p) => &p.abi,
+        provekit_prover::Prover::Groth16(p) => p.witness_generator.abi(),
     };
     let input_map = Format::Json
         .parse(&json, abi)
diff --git a/provekit/common/src/file/binary_format.rs b/provekit/common/src/file/binary_format.rs
index 44ff55717..949d491fd 100644
--- a/provekit/common/src/file/binary_format.rs
+++ b/provekit/common/src/file/binary_format.rs
@@ -15,13 +15,13 @@ pub const XZ_MAGIC: [u8; 6] = [0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00];
 // ---------------------------------------------------------------------------
 
 pub const PROVER_FORMAT: [u8; 8] = *b"PrvKitPr";
-pub const PROVER_VERSION: (u16, u16) = (1, 2);
+pub const PROVER_VERSION: (u16, u16) = (1, 5);
 
 pub const VERIFIER_FORMAT: [u8; 8] = *b"PrvKitVr";
-pub const VERIFIER_VERSION: (u16, u16) = (1, 3);
+pub const VERIFIER_VERSION: (u16, u16) = (1, 4);
 
 pub const NOIR_PROOF_SCHEME_FORMAT: [u8; 8] = *b"NrProScm";
 pub const NOIR_PROOF_SCHEME_VERSION: (u16, u16) = (1, 2);
 
 pub const NOIR_PROOF_FORMAT: [u8; 8] = *b"NPSProof";
-pub const NOIR_PROOF_VERSION: (u16, u16) = (1, 1);
+pub const NOIR_PROOF_VERSION: (u16, u16) = (1, 2);
diff --git a/provekit/common/src/file/io/bin.rs b/provekit/common/src/file/io/bin.rs
index a092b9462..4ad79f585 100644
--- a/provekit/common/src/file/io/bin.rs
+++ b/provekit/common/src/file/io/bin.rs
@@ -111,12 +111,29 @@ pub fn read_hash_config(
 }
 
 /// Read a compressed binary file, auto-detecting zstd or XZ compression.
+///
+/// The decompressed bytes are streamed directly into postcard's deserializer
+/// instead of being materialized into a single `Vec<u8>`. This keeps peak
+/// memory close to the size of the deserialized struct, instead of paying
+/// twice (once for the decompressed buffer, once for the parsed value).
+///
+/// `postcard::from_io` needs a scratch buffer sized to fit the largest
+/// `deserialize_bytes` / `deserialize_byte_buf` read it will encounter. For
+/// our types that's bounded by the on-disk file size (the largest single
+/// borrowed-bytes field — currently the Groth16 proving key — encodes
+/// ~1:1 against the compressed file because arkworks-serialized curve points
+/// are essentially random). The scratch buffer is sized to the file size,
+/// floored at 1 MB and capped at 16 MB (see `PROVEKIT_SCRATCH_MAX_MB`).
 #[instrument(fields(size = path.metadata().map(|m| m.len()).ok()))]
 pub fn read_bin<T: for<'a> Deserialize<'a>>(
     path: &Path,
     format: [u8; 8],
     (major, minor): (u16, u16),
 ) -> Result<T> {
+    use std::io::BufRead;
+
+    let file_size = path.metadata().map(|m| m.len()).unwrap_or(0) as usize;
+
     let mut file = BufReader::new(File::open(path).context("while opening input file")?);
 
     let mut buffer = [0; HEADER_SIZE];
@@ -140,9 +157,79 @@ pub fn read_bin<T: for<'a> Deserialize<'a>>(
     // Skip hash_config byte (can be read separately via read_hash_config if needed)
     let _hash_config_byte = header.get_u8();
 
-    let uncompressed = decompress_stream(&mut file)?;
+    // Detect compression via magic bytes.
+    let peek = file.fill_buf().context("while peeking compression magic")?;
+    ensure!(
+        peek.len() >= 6,
+        "File too small to detect compression format"
+    );
+    let is_zstd = peek[..4] == ZSTD_MAGIC;
+    let is_xz = peek[..6] == XZ_MAGIC;
+
+    // Scratch buffer for postcard streaming. Must be at least as large as
+    // the largest single `deserialize_byte_buf` read; in practice this is
+    // a few MB at most for our formats. Cap the default at 16 MB so that
+    // opening a 1 GB .pkp doesn't allocate a 1 GB scratch on top of the
+    // decoder buffer and the parsed value. Floor at 1 MB for tiny .np
+    // proofs. Override with `PROVEKIT_SCRATCH_MAX_MB`; an unparsable value
+    // or one whose byte count overflows `usize` falls back to the default.
+    const DEFAULT_SCRATCH_CAP: usize = 16 << 20;
+    let scratch_cap = std::env::var("PROVEKIT_SCRATCH_MAX_MB")
+        .ok()
+        .and_then(|s| s.parse::<usize>().ok())
+        .and_then(|mb| mb.checked_mul(1 << 20))
+        .unwrap_or(DEFAULT_SCRATCH_CAP);
+    let scratch_size = file_size.min(scratch_cap).max(1 << 20);
+    let mut scratch = vec![0u8; scratch_size];
+
+    // Wrap the streaming decoder in a `BufReader` so postcard's per-byte
+    // `pop()` calls become fast in-memory reads instead of one syscall each.
+    // 256 KB is large enough to amortize syscall overhead without holding more
+    // decompressed data in memory than necessary.
+    const DECODER_BUF: usize = 256 * 1024;
+
+    // If the cap shrank scratch below the (compressed) file size, the failure
+    // mode for an oversized `deserialize_byte_buf` is opaque ("postcard
+    // streaming failed"). Attach a hint pointing at the env-var escape hatch
+    // so users don't have to guess. Compressed-vs-decompressed is an
+    // intentional under-approximation: if the file is small but the
+    // decompressed payload contains a huge byte_buf, the hint does not fire.
+    let scratch_capped = scratch_size < file_size;
+    let postcard_err = |stage: &'static str, e: postcard::Error| -> anyhow::Error {
+        let err = anyhow::Error::from(e).context(stage);
+        if scratch_capped {
+            err.context(format!(
+                "postcard scratch capped at {} MB (file is {} MB); if a single \
+                 `deserialize_byte_buf` read exceeded the cap, raise it with \
+                 `PROVEKIT_SCRATCH_MAX_MB=<MB>`",
+                scratch_size >> 20,
+                file_size >> 20,
+            ))
+        } else {
+            err
+        }
+    };
+
+    let value = if is_zstd {
+        let decoder = zstd::Decoder::new(file).context("while initializing zstd decoder")?;
+        let buffered = BufReader::with_capacity(DECODER_BUF, decoder);
+        let (value, _) = postcard::from_io::<T, _>((buffered, &mut scratch))
+            .map_err(|e| postcard_err("while streaming postcard from zstd", e))?;
+        value
+    } else if is_xz {
+        let decoder = xz2::read::XzDecoder::new(file);
+        let buffered = BufReader::with_capacity(DECODER_BUF, decoder);
+        let (value, _) = postcard::from_io::<T, _>((buffered, &mut scratch))
+            .map_err(|e| postcard_err("while streaming postcard from xz", e))?;
+        value
+    } else {
+        anyhow::bail!(
+            "Unknown compression format (first bytes: {:02X?})",
+            &peek[..peek.len().min(6)]
+        );
+    };
 
-    postcard::from_bytes(&uncompressed).context("while decoding from postcard")
+    Ok(value)
 }
 
 /// Serialize a value to bytes in the same format as `write_bin` (header +
@@ -234,40 +321,3 @@ fn decompress_bytes(data: &[u8]) -> Result<Vec<u8>> {
         );
     }
 }
-
-/// Peek at the first bytes to detect compression format, then
-/// stream-decompress.
-fn decompress_stream(reader: &mut BufReader<File>) -> Result<Vec<u8>> {
-    use std::io::BufRead;
-
-    let buf = reader
-        .fill_buf()
-        .context("while peeking compression magic")?;
-    ensure!(
-        buf.len() >= 6,
-        "File too small to detect compression format"
-    );
-
-    let is_zstd = buf[..4] == ZSTD_MAGIC;
-    let is_xz = buf[..6] == XZ_MAGIC;
-
-    let mut out = Vec::new();
-    if is_zstd {
-        let mut decoder = zstd::Decoder::new(reader).context("while initializing zstd decoder")?;
-        decoder
-            .read_to_end(&mut out)
-            .context("while decompressing zstd data")?;
-    } else if is_xz {
-        let mut decoder = xz2::read::XzDecoder::new(reader);
-        decoder
-            .read_to_end(&mut out)
-            .context("while decompressing XZ data")?;
-    } else {
-        anyhow::bail!(
-            "Unknown compression format (first bytes: {:02X?})",
-            &buf[..buf.len().min(6)]
-        );
-    }
-
-    Ok(out)
-}
diff --git a/provekit/common/src/file/io/mod.rs b/provekit/common/src/file/io/mod.rs
index 049c984a7..c2e6ae9aa 100644
--- a/provekit/common/src/file/io/mod.rs
+++ b/provekit/common/src/file/io/mod.rs
@@ -3,17 +3,18 @@ mod buf_ext;
 mod counting_writer;
 mod json;
 
+pub use self::bin::Compression;
 use {
     self::{
         bin::{
             deserialize_from_bytes, read_bin, read_hash_config as read_hash_config_bin,
-            serialize_to_bytes, write_bin, Compression,
+            serialize_to_bytes, write_bin,
         },
         buf_ext::BufExt,
         counting_writer::CountingWriter,
         json::{read_json, write_json},
     },
-    crate::{HashConfig, NoirProof, NoirProofScheme, Prover, Verifier},
+    crate::{HashConfig, NoirProof, NoirProofScheme, Verifier},
     anyhow::Result,
     serde::{Deserialize, Serialize},
     std::{ffi::OsStr, path::Path},
@@ -29,20 +30,13 @@ pub trait FileFormat: Serialize + for<'a> Deserialize<'a> {
 }
 
 /// Helper trait to optionally extract hash config.
-pub(crate) trait MaybeHashAware {
+///
+/// `pub` so downstream crates (e.g. `provekit_prover`) can implement it for
+/// types they own. Internal helpers in this module are the only consumers.
+pub trait MaybeHashAware {
     fn maybe_hash_config(&self) -> Option<HashConfig>;
 }
 
-/// Impl for Prover (has hash config).
-impl MaybeHashAware for Prover {
-    fn maybe_hash_config(&self) -> Option<HashConfig> {
-        match self {
-            Prover::Noir(p) => Some(p.hash_config),
-            Prover::Mavros(p) => Some(p.hash_config),
-        }
-    }
-}
-
 /// Impl for Verifier (has hash config).
 impl MaybeHashAware for Verifier {
     fn maybe_hash_config(&self) -> Option<HashConfig> {
@@ -74,13 +68,6 @@ impl FileFormat for NoirProofScheme {
     const COMPRESSION: Compression = Compression::Zstd;
 }
 
-impl FileFormat for Prover {
-    const FORMAT: [u8; 8] = crate::binary_format::PROVER_FORMAT;
-    const EXTENSION: &'static str = "pkp";
-    const VERSION: (u16, u16) = crate::binary_format::PROVER_VERSION;
-    const COMPRESSION: Compression = Compression::Xz;
-}
-
 impl FileFormat for Verifier {
     const FORMAT: [u8; 8] = crate::binary_format::VERIFIER_FORMAT;
     const EXTENSION: &'static str = "pkv";
diff --git a/provekit/common/src/interner.rs b/provekit/common/src/interner.rs
index 822a6a7dd..413885f95 100644
--- a/provekit/common/src/interner.rs
+++ b/provekit/common/src/interner.rs
@@ -39,4 +39,30 @@ impl Interner {
     pub fn get(&self, el: InternedFieldElement) -> Option<FieldElement> {
         self.values.get(el.0).copied()
     }
+
+    /// Borrow the deduplicated values array. Used by mmap-format writers
+    /// that need the raw bytes.
+    pub fn values_raw(&self) -> &[FieldElement] {
+        &self.values
+    }
+
+    /// Construct an Interner from a pre-built values vector. Bypasses the
+    /// dedup work in `intern()` — used by mmap-format readers that have
+    /// already loaded a deduplicated set of values from disk.
+    pub fn from_values(values: Vec<FieldElement>) -> Self {
+        Self { values }
+    }
+}
+
+impl InternedFieldElement {
+    /// Construct an InternedFieldElement from a raw index. Used by
+    /// mmap-format readers that load the index Vec from raw bytes.
+    pub const fn new(idx: usize) -> Self {
+        Self(idx)
+    }
+
+    /// Inner index value.
+    pub const fn index(&self) -> usize {
+        self.0
+    }
 }
diff --git a/provekit/common/src/lib.rs b/provekit/common/src/lib.rs
index 3953207d8..cba3fda9f 100644
--- a/provekit/common/src/lib.rs
+++ b/provekit/common/src/lib.rs
@@ -19,19 +19,17 @@ mod verifier;
 mod whir_r1cs;
 pub mod witness;
 
-use crate::{
-    interner::{InternedFieldElement, Interner},
-    sparse_matrix::{HydratedSparseMatrix, SparseMatrix},
-};
 pub use {
     acir::FieldElement as NoirElement,
     ark_bn254::Fr as FieldElement,
     hash_config::HashConfig,
+    interner::{InternedFieldElement, Interner},
     mavros::{MavrosProver, MavrosSchemeData},
     noir_proof_scheme::{NoirProof, NoirProofScheme, NoirSchemeData},
     prefix_covector::{OffsetCovector, PrefixCovector, SparseCovector},
-    prover::{NoirProver, Prover},
+    prover::NoirProver,
     r1cs::R1CS,
+    sparse_matrix::{HydratedSparseMatrix, SparseMatrix},
     transcript_sponge::TranscriptSponge,
     verifier::Verifier,
     whir_r1cs::{R1csHash, WhirConfig, WhirR1CSProof, WhirR1CSScheme, WhirZkConfig},
diff --git a/provekit/common/src/noir_proof_scheme.rs b/provekit/common/src/noir_proof_scheme.rs
index 7731d3c47..d084190ed 100644
--- a/provekit/common/src/noir_proof_scheme.rs
+++ b/provekit/common/src/noir_proof_scheme.rs
@@ -27,10 +27,48 @@ pub enum NoirProofScheme {
     Mavros(MavrosSchemeData),
 }
 
+// INVARIANT: Variant order is wire-format-critical (postcard uses positional
+// discriminants). Do not reorder, cfg-gate, or insert variants without
+// verifying cross-target deserialization (native <-> WASM).
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
-pub struct NoirProof {
-    pub public_inputs:   PublicInputs,
-    pub whir_r1cs_proof: WhirR1CSProof,
+pub enum NoirProof {
+    Whir {
+        public_inputs:   PublicInputs,
+        whir_r1cs_proof: WhirR1CSProof,
+    },
+    Groth16 {
+        public_inputs: PublicInputs,
+        /// CanonicalSerialize'd `provekit_groth16::Proof`.
+        groth16_proof: Vec<u8>,
+    },
+}
+
+impl NoirProof {
+    /// Access public inputs regardless of proof variant.
+    pub fn public_inputs(&self) -> &PublicInputs {
+        match self {
+            NoirProof::Whir { public_inputs, .. } => public_inputs,
+            NoirProof::Groth16 { public_inputs, .. } => public_inputs,
+        }
+    }
+
+    /// Mutably access public inputs regardless of proof variant.
+    pub fn public_inputs_mut(&mut self) -> &mut PublicInputs {
+        match self {
+            NoirProof::Whir { public_inputs, .. } => public_inputs,
+            NoirProof::Groth16 { public_inputs, .. } => public_inputs,
+        }
+    }
+
+    /// Access the WHIR proof. Panics if this is a Groth16 proof.
+    pub fn whir_r1cs_proof(&self) -> &WhirR1CSProof {
+        match self {
+            NoirProof::Whir {
+                whir_r1cs_proof, ..
+            } => whir_r1cs_proof,
+            NoirProof::Groth16 { .. } => panic!("called whir_r1cs_proof() on a Groth16 proof"),
+        }
+    }
 }
 
 impl NoirProofScheme {
diff --git a/provekit/common/src/prover.rs b/provekit/common/src/prover.rs
index 88e2da07b..88b89e6b8 100644
--- a/provekit/common/src/prover.rs
+++ b/provekit/common/src/prover.rs
@@ -1,12 +1,19 @@
+//! Backend-specific prover types that don't introduce a `provekit_groth16`
+//! dependency.
+//!
+//! `NoirProver` lives here because it's referenced by the WHIR pipeline that
+//! is shared by everything in the workspace. The Groth16 prover and the
+//! `Prover` enum live in `provekit_prover::prover_types` so they can hold a
+//! typed `provekit_groth16::ProvingKey` without creating a dependency cycle
+//! (`provekit_groth16` depends on this crate for `R1CS`).
+
 use {
     crate::{
-        noir_proof_scheme::NoirProofScheme,
         whir_r1cs::WhirR1CSScheme,
         witness::{NoirWitnessGenerator, SplitWitnessBuilders},
-        HashConfig, MavrosProver, NoirElement, R1CS,
+        HashConfig, NoirElement, R1CS,
     },
     acir::circuit::Program,
-    noirc_abi::Abi,
     serde::{Deserialize, Serialize},
 };
 
@@ -19,68 +26,3 @@ pub struct NoirProver {
     pub witness_generator:      NoirWitnessGenerator,
     pub whir_for_witness:       WhirR1CSScheme,
 }
-
-/// On-disk **ProveKit Prover** (PKP) — the prover-side scheme that gets
-/// serialized to a `.pkp` file by `prepare` and loaded by `prove`.
-///
-/// Holds the R1CS, witness builders, WHIR config, and frontend-specific
-/// program data needed to produce a proof.
-///
-/// INVARIANT: Variant order is wire-format-critical (postcard uses positional
-/// discriminants). Do not reorder, cfg-gate, or insert variants without
-/// verifying cross-target deserialization (native <-> WASM).
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub enum Prover {
-    Noir(NoirProver),
-    Mavros(MavrosProver),
-}
-
-impl Prover {
-    /// Convert a compilation output into the on-disk prover format.
-    pub fn from_noir_proof_scheme(scheme: NoirProofScheme) -> Self {
-        match scheme {
-            NoirProofScheme::Noir(d) => Prover::Noir(NoirProver {
-                hash_config:            d.hash_config,
-                program:                d.program,
-                r1cs:                   d.r1cs,
-                split_witness_builders: d.split_witness_builders,
-                witness_generator:      d.witness_generator,
-                whir_for_witness:       d.whir_for_witness,
-            }),
-            NoirProofScheme::Mavros(d) => Prover::Mavros(MavrosProver {
-                abi:                d.abi,
-                num_public_inputs:  d.num_public_inputs,
-                whir_for_witness:   d.whir_for_witness,
-                witgen_binary:      d.witgen_binary,
-                ad_binary:          d.ad_binary,
-                constraints_layout: d.constraints_layout,
-                witness_layout:     d.witness_layout,
-                hash_config:        d.hash_config,
-            }),
-        }
-    }
-
-    pub fn abi(&self) -> &Abi {
-        match self {
-            Prover::Noir(p) => p.witness_generator.abi(),
-            Prover::Mavros(p) => &p.abi,
-        }
-    }
-
-    pub fn size(&self) -> (usize, usize) {
-        match self {
-            Prover::Noir(p) => (p.r1cs.num_constraints(), p.r1cs.num_witnesses()),
-            Prover::Mavros(p) => (
-                p.constraints_layout.algebraic_size,
-                p.witness_layout.algebraic_size,
-            ),
-        }
-    }
-
-    pub fn whir_for_witness(&self) -> &WhirR1CSScheme {
-        match self {
-            Prover::Noir(p) => &p.whir_for_witness,
-            Prover::Mavros(p) => &p.whir_for_witness,
-        }
-    }
-}
diff --git a/provekit/common/src/sparse_matrix.rs b/provekit/common/src/sparse_matrix.rs
index 012a3b447..887fd727a 100644
--- a/provekit/common/src/sparse_matrix.rs
+++ b/provekit/common/src/sparse_matrix.rs
@@ -312,6 +312,45 @@ impl SparseMatrix {
         }
     }
 
+    /// Borrow the internal `new_row_indices` array. Used by mmap-format
+    /// writers that need the raw bytes.
+    pub fn new_row_indices_raw(&self) -> &[u32] {
+        &self.new_row_indices
+    }
+
+    /// Borrow the internal `col_indices` array (absolute, not
+    /// delta-encoded). Used by mmap-format writers that need the raw
+    /// bytes.
+    pub fn col_indices_raw(&self) -> &[u32] {
+        &self.col_indices
+    }
+
+    /// Borrow the internal `values` array (interner indices). Used by
+    /// mmap-format writers that need the raw bytes.
+    pub fn values_raw(&self) -> &[InternedFieldElement] {
+        &self.values
+    }
+
+    /// Construct a `SparseMatrix` directly from its three internal arrays.
+    /// Used by mmap-format readers that have just memcpy'd the bytes
+    /// from disk into owned `Vec`s. Skips the per-entry insertion path
+    /// that goes through `set` / `push_row` / delta decoding.
+    pub fn from_raw_parts(
+        num_rows: usize,
+        num_cols: usize,
+        new_row_indices: Vec<u32>,
+        col_indices: Vec<u32>,
+        values: Vec<InternedFieldElement>,
+    ) -> Self {
+        Self {
+            num_rows,
+            num_cols,
+            new_row_indices,
+            col_indices,
+            values,
+        }
+    }
+
     pub const fn hydrate<'a>(&'a self, interner: &'a Interner) -> HydratedSparseMatrix<'a> {
         HydratedSparseMatrix {
             matrix: self,
diff --git a/provekit/common/src/verifier.rs b/provekit/common/src/verifier.rs
index 2663cff61..a82dcf569 100644
--- a/provekit/common/src/verifier.rs
+++ b/provekit/common/src/verifier.rs
@@ -20,6 +20,10 @@ pub struct Verifier {
     pub whir_for_witness: Option<WhirR1CSScheme>,
     #[serde(with = "serde_jsonify")]
     pub abi:              Abi,
+    /// CanonicalSerialize'd `provekit_groth16::VerifyingKey` (None for WHIR
+    /// proofs).
+    #[serde(default)]
+    pub groth16_vk:       Option<Vec<u8>>,
 }
 
 impl Verifier {
@@ -30,12 +34,14 @@ impl Verifier {
                 whir_for_witness: Some(d.whir_for_witness),
                 abi:              d.witness_generator.abi.clone(),
                 hash_config:      d.hash_config,
+                groth16_vk:       None,
             },
             NoirProofScheme::Mavros(d) => Self {
                 r1cs:             d.r1cs,
                 whir_for_witness: Some(d.whir_for_witness),
                 abi:              d.abi.clone(),
                 hash_config:      d.hash_config,
+                groth16_vk:       None,
             },
         }
     }
diff --git a/provekit/groth16/Cargo.toml b/provekit/groth16/Cargo.toml
new file mode 100644
index 000000000..6cb53c45a
--- /dev/null
+++ b/provekit/groth16/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "provekit-groth16"
+version = "0.1.0"
+edition.workspace = true
+rust-version.workspace = true
+authors.workspace = true
+license.workspace = true
+
+[dependencies]
+ark-bn254 = { workspace = true }
+ark-ff = { workspace = true }
+ark-ec = { workspace = true }
+ark-poly = { workspace = true }
+ark-serialize = { workspace = true }
+ark-std = { workspace = true }
+rayon = { workspace = true }
+anyhow = { workspace = true }
+sha3 = { workspace = true }
+serde = { workspace = true }
+tracing = { workspace = true }
+zeroize = { workspace = true, features = ["derive"] }
+provekit-common = { workspace = true }
+
+# Target-specific: mmap-backed proving key loader is non-WASM only.
+[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
+memmap2 = { workspace = true }
+
+[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies]
+tempfile = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/provekit/groth16/src/lib.rs b/provekit/groth16/src/lib.rs
new file mode 100644
index 000000000..966a477bf
--- /dev/null
+++ b/provekit/groth16/src/lib.rs
@@ -0,0 +1,93 @@
+//! Groth16 proof system with BSB22 commitment extension for BN254.
+//!
+//! Built on arkworks primitives for elliptic curve operations, pairings,
+//! FFT, and MSM.
+//!
+//! Reference: DIZK paper <https://eprint.iacr.org/2018/691.pdf> (Figure 4)
+//! BSB22 extension: <https://eprint.iacr.org/2022/1072>
+pub mod pedersen;
+pub mod prover;
+pub mod setup;
+pub mod types;
+pub mod verifier;
+
+#[cfg(not(target_arch = "wasm32"))]
+pub mod mmap_pk;
+
+#[cfg(not(target_arch = "wasm32"))]
+pub use mmap_pk::{MmapProvingKey, MMAP_SENTINEL};
+pub use types::{Proof, ProvingKey, VerifyingKey};
+
+/// Extension trait for [`provekit_common::Verifier`] that decodes the
+/// `groth16_vk: Option<Vec<u8>>` field into a typed [`VerifyingKey`] in one
+/// place — so consumers don't each repeat the
+/// `CanonicalDeserialize::deserialize_uncompressed(&bytes[..])` dance.
+pub trait VerifierGroth16Ext {
+    /// Decode the embedded Groth16 verifying key, if present.
+    ///
+    /// `Ok(None)` means this PKV is for the WHIR backend; `Err` means a VK
+    /// was present but failed to deserialize.
+    fn groth16_vk_typed(&self) -> anyhow::Result<Option<VerifyingKey>>;
+}
+
+impl VerifierGroth16Ext for provekit_common::Verifier {
+    fn groth16_vk_typed(&self) -> anyhow::Result<Option<VerifyingKey>> {
+        use {anyhow::Context, ark_serialize::CanonicalDeserialize};
+        self.groth16_vk
+            .as_ref()
+            .map(|bytes| {
+                VerifyingKey::deserialize_uncompressed(&bytes[..]).context("decoding groth16_vk")
+            })
+            .transpose()
+    }
+}
+
+/// Domain separator for BSB22 commitment hashing.
+pub const COMMITMENT_DST: &[u8] = b"bsb22-commitment";
+
+/// Domain separator for folding PoKs.
+pub const BSB22_FOLD_DST: &[u8] = b"G16-BSB22";
+
+/// Field element byte length for BN254.
+pub const FR_BYTES: usize = 32;
+
+/// Information about a single BSB22 commitment within the R1CS.
+///
+/// All wire indices in this struct are **absolute witness indices**: position
+/// 0 is the constant-1 ONE_WIRE, public input `i` lives at index `1 + i`, and
+/// private/challenge wires follow. The verifier subtracts 1 when looking up
+/// values in its `extended_public` vector (which excludes the ONE_WIRE), so
+/// index 0 is never a valid entry in `public_and_commitment_committed`.
+#[derive(Clone, Debug, Default)]
+pub struct CommitmentInfo {
+    /// Indices of public wires and other commitment wires hashed with this
+    /// commitment. See struct-level docs for index convention.
+    pub public_and_commitment_committed: Vec<usize>,
+    /// Indices of private/internal wires committed to.
+    pub private_committed:               Vec<usize>,
+    /// Wire indices that hold derived challenge values for this commitment,
+    /// in the order the verifier inserts them into `extended_public`.
+    /// `setup()` flattens these across `commitment_info` to know which wires
+    /// belong to `vk.g1_k`; the prover writes derived challenges into the
+    /// witness at these positions.
+    pub challenge_indices:               Vec<usize>,
+    /// Number of entries in `public_and_commitment_committed` that are public
+    /// (as opposed to other commitment indices).
+    pub nb_public_committed:             usize,
+}
+
+impl CommitmentInfo {
+    /// Returns the public wire indices committed to.
+    pub fn public_committed(&self) -> &[usize] {
+        &self.public_and_commitment_committed[..self.nb_public_committed]
+    }
+
+    /// Returns the commitment wire indices committed to.
+    pub fn commitment_committed(&self) -> &[usize] {
+        &self.public_and_commitment_committed[self.nb_public_committed..]
+    }
+}
+
+pub(crate) fn msm_err(e: usize) -> anyhow::Error {
+    anyhow::anyhow!("MSM error: bases/scalars length mismatch ({})", e)
+}
diff --git a/provekit/groth16/src/mmap_pk.rs b/provekit/groth16/src/mmap_pk.rs
new file mode 100644
index 000000000..260b7482d
--- /dev/null
+++ b/provekit/groth16/src/mmap_pk.rs
@@ -0,0 +1,1434 @@
+//! mmap-backed Groth16 proving key.
+//!
+//! Mirrors rapidsnark's zkey loading approach (see
+//! `rapidsnark/src/fileloader.cpp` + `binfile_utils.cpp` + `zkey_utils.cpp`):
+//! the file is mmap'd once, sections are indexed from a small in-file table,
+//! and big curve-point arrays are exposed as `&[G1Affine]` / `&[G2Affine]`
+//! slices that point directly into the mmap'd region. No per-point
+//! deserialization, no copy.
+//!
+//! Coexists with the existing zstd-compressed `.pkp` path
+//! (`provekit_prover::pkp_io`); the on-disk discriminator is the 4-byte
+//! sentinel that follows the 21-byte common header — `MMAP_SENTINEL` here vs.
+//! zstd/xz magic in the legacy path.
+//!
+//! ## On-disk layout (after the 21-byte common header)
+//!
+//! ```text
+//! [ MMAP_SENTINEL                    4 bytes   ]
+//! [ metadata_len  (u64 LE)           8 bytes   ]
+//! [ postcard-encoded Prover          metadata_len bytes  (PK = zero-byte placeholder) ]
+//! [ pad to 8-byte align                       ]
+//! [ section_count (u32 LE)           4 bytes   ]
+//! [ section table (id u32, off u64, len u64) × section_count ]
+//! [ pad to MMAP_ALIGN                         ]
+//! [ section bodies (raw arkworks in-memory layout for big arrays) ]
+//! ```
+//!
+//! Section IDs are listed in [`SectionId`].
+//!
+//! ## Why this layout assumes raw Montgomery in-memory bytes
+//!
+//! Arkworks `G1Affine` / `G2Affine` for BN254 are repr-Rust structs containing
+//! `Fp<MontBackend, 4>` field elements. The bytes stored on disk are produced
+//! by `slice::from_raw_parts(slice.as_ptr() as *const u8, ...)` — i.e. the
+//! exact in-memory representation including Montgomery form. On read, the
+//! mmap'd bytes are reinterpreted via [`std::slice::from_raw_parts`] back into
+//! `&[G1Affine]`. This matches rapidsnark's `(G1PointAffine *)ptr` cast.
+//!
+//! The cost is layout coupling: a future arkworks version that changes the
+//! `Affine` struct layout (or its `Fp` representation) silently breaks the
+//! file format. The format is therefore versioned via the common header's
+//! `PROVER_VERSION`; bump the version when the layout assumption changes.
+
+#![cfg(not(target_arch = "wasm32"))]
+
+use {
+    crate::pedersen,
+    anyhow::{bail, ensure, Context, Result},
+    ark_bn254::{Fr, G1Affine, G2Affine},
+    ark_serialize::{CanonicalDeserialize, CanonicalSerialize},
+    memmap2::Mmap,
+    provekit_common::{InternedFieldElement, Interner, SparseMatrix, R1CS},
+    std::{
+        fs::{File, OpenOptions},
+        io::{Read, Seek, SeekFrom, Write},
+        path::Path,
+    },
+    tracing::info_span,
+};
+
+/// 4-byte sentinel that distinguishes a mmap-format `.pkp` from a
+/// zstd/xz-compressed one. Placed immediately after the 21-byte common
+/// header.
+pub const MMAP_SENTINEL: [u8; 4] = *b"MMAP";
+
+/// Required alignment for the start of every section body. Picked to match
+/// `align_of::<G1Affine>()` (which is `align_of::<u64>() == 8` on every
+/// supported target). Section bodies for `bool` arrays only need 1-byte
+/// alignment, but we pad them to `MMAP_ALIGN` too for consistency.
+pub const MMAP_ALIGN: usize = 8;
+
+/// Section IDs in the mmap-format `.pkp` file.
+#[repr(u32)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum SectionId {
+    G1A                 = 1,
+    G1B                 = 2,
+    G1Z                 = 3,
+    G1K                 = 4,
+    G2B                 = 5,
+    InfinityA           = 6,
+    InfinityB           = 7,
+    /// Small fixed-size data: domain_size, domain_gen, g1_alpha, g1_beta,
+    /// g1_delta, g2_beta, g2_delta, nb_infinity_a, nb_infinity_b. Stored
+    /// arkworks-uncompressed. As of PROVER_VERSION (1, 4) pedersen
+    /// commitment keys are no longer appended here — they live in
+    /// dedicated raw sections (`PedersenIndex`, `PedersenBases`,
+    /// `PedersenBasesExpSigma`) so loading them does not require
+    /// per-point arkworks deserialization.
+    Scalars             = 8,
+    /// Per-commitment lengths: `u64 num_commitments` followed by
+    /// `num_commitments × (u64 basis_len, u64 sigma_len)`. Lets the
+    /// reader split the two raw G1Affine sections below into per-commit
+    /// slices.
+    PedersenIndex       = 9,
+    /// Raw `G1Affine` bytes for `pedersen::ProvingKey::basis`, concatenated
+    /// across all commitments. Same in-memory Montgomery layout as the
+    /// `G1A` / `G1B` sections — borrowed in place by
+    /// `MmapPedersenProvingKey` (no copy, no per-point deserialize).
+    PedersenBases       = 10,
+    /// Raw `G1Affine` bytes for `pedersen::ProvingKey::basis_exp_sigma`,
+    /// concatenated across all commitments. Same layout as
+    /// `PedersenBases`.
+    PedersenBasesExpSigma = 11,
+    /// R1CS scalars: a small fixed-shape header with `num_public_inputs`,
+    /// `num_virtual`, and per-matrix `num_rows` / `num_cols` for A / B /
+    /// C. Stored as raw `u64` bytes (8 fields × 8 bytes = 64 bytes
+    /// total).
+    R1CSScalars         = 12,
+    /// R1CS interner: deduplicated `Vec<FieldElement>` in raw `Fr` bytes
+    /// (Montgomery layout, like the G1 sections).
+    R1CSInterner        = 13,
+    /// `r1cs.a.new_row_indices` raw `u32` bytes.
+    R1CSAMatrixRowIndices = 14,
+    /// `r1cs.a.col_indices` raw `u32` bytes (absolute column indices —
+    /// the mmap format does not delta-encode).
+    R1CSAMatrixColIndices = 15,
+    /// `r1cs.a.values` raw `usize` (`InternedFieldElement` newtype)
+    /// bytes.
+    R1CSAMatrixValues   = 16,
+    /// `r1cs.b.new_row_indices` raw `u32` bytes.
+    R1CSBMatrixRowIndices = 17,
+    /// `r1cs.b.col_indices` raw `u32` bytes.
+    R1CSBMatrixColIndices = 18,
+    /// `r1cs.b.values` raw `usize` bytes.
+    R1CSBMatrixValues   = 19,
+    /// `r1cs.c.new_row_indices` raw `u32` bytes.
+    R1CSCMatrixRowIndices = 20,
+    /// `r1cs.c.col_indices` raw `u32` bytes.
+    R1CSCMatrixColIndices = 21,
+    /// `r1cs.c.values` raw `usize` bytes.
+    R1CSCMatrixValues   = 22,
+    /// Commitment-info index: `u64 num_commitments` followed by
+    /// `num_commitments × (u64 pub_len, u64 priv_len, u64 chal_len)`.
+    /// Lets the reader split the three raw `u64` sections below into
+    /// per-commitment slices.
+    CommitmentInfoIndex = 23,
+    /// `Groth16CommitmentInfo::public_committed` raw `u64` bytes,
+    /// concatenated across all commitments. (`usize` on 64-bit hosts is
+    /// 8 bytes; we always store as `u64` for portability.)
+    CommitmentInfoPublicCommitted = 24,
+    /// `Groth16CommitmentInfo::private_committed` raw `u64` bytes.
+    CommitmentInfoPrivateCommitted = 25,
+    /// `Groth16CommitmentInfo::challenge_indices` raw `u64` bytes.
+    CommitmentInfoChallengeIndices = 26,
+}
+
+impl SectionId {
+    fn from_u32(v: u32) -> Option<Self> {
+        match v {
+            1 => Some(Self::G1A),
+            2 => Some(Self::G1B),
+            3 => Some(Self::G1Z),
+            4 => Some(Self::G1K),
+            5 => Some(Self::G2B),
+            6 => Some(Self::InfinityA),
+            7 => Some(Self::InfinityB),
+            8 => Some(Self::Scalars),
+            9 => Some(Self::PedersenIndex),
+            10 => Some(Self::PedersenBases),
+            11 => Some(Self::PedersenBasesExpSigma),
+            12 => Some(Self::R1CSScalars),
+            13 => Some(Self::R1CSInterner),
+            14 => Some(Self::R1CSAMatrixRowIndices),
+            15 => Some(Self::R1CSAMatrixColIndices),
+            16 => Some(Self::R1CSAMatrixValues),
+            17 => Some(Self::R1CSBMatrixRowIndices),
+            18 => Some(Self::R1CSBMatrixColIndices),
+            19 => Some(Self::R1CSBMatrixValues),
+            20 => Some(Self::R1CSCMatrixRowIndices),
+            21 => Some(Self::R1CSCMatrixColIndices),
+            22 => Some(Self::R1CSCMatrixValues),
+            23 => Some(Self::CommitmentInfoIndex),
+            24 => Some(Self::CommitmentInfoPublicCommitted),
+            25 => Some(Self::CommitmentInfoPrivateCommitted),
+            26 => Some(Self::CommitmentInfoChallengeIndices),
+            _ => None,
+        }
+    }
+}
+
+/// Compile-time assertion that arkworks BN254 `G1Affine` / `G2Affine` align to
+/// at most `MMAP_ALIGN`. If a future arkworks version raises alignment, this
+/// trips and the file format must be revisited.
+const _: () = {
+    assert!(std::mem::align_of::<G1Affine>() <= MMAP_ALIGN);
+    assert!(std::mem::align_of::<G2Affine>() <= MMAP_ALIGN);
+};
+
+/// Mmap-backed proving key: identical fields to [`crate::ProvingKey`] but the
+/// large arrays are slices into an mmap'd file rather than owned `Vec`s.
+///
+/// The `_mmap` field keeps the file mapping alive for the lifetime of the
+/// struct; the raw pointer/length pairs index into it. The accessor methods
+/// (`g1_a()` etc.) return slices with the struct's lifetime, so the borrow
+/// checker prevents callers from outliving the mapping.
+///
+/// SAFETY: `*_ptr` fields point into `_mmap`'s mapped region. Constructed
+/// only via [`MmapProvingKey::load`], which validates section bounds and
+/// alignment.
+pub struct MmapProvingKey {
+    /// Holds the file mapping alive. Never accessed after construction.
+    _mmap: Mmap,
+
+    pub domain_size: u64,
+    pub domain_gen:  Fr,
+
+    pub g1_alpha: G1Affine,
+    pub g1_beta:  G1Affine,
+    pub g1_delta: G1Affine,
+
+    g1_a_ptr: *const G1Affine,
+    g1_a_len: usize,
+    g1_b_ptr: *const G1Affine,
+    g1_b_len: usize,
+    g1_k_ptr: *const G1Affine,
+    g1_k_len: usize,
+    g1_z_ptr: *const G1Affine,
+    g1_z_len: usize,
+
+    pub g2_beta:  G2Affine,
+    pub g2_delta: G2Affine,
+    g2_b_ptr:     *const G2Affine,
+    g2_b_len:     usize,
+
+    infinity_a_ptr: *const bool,
+    infinity_a_len: usize,
+    infinity_b_ptr: *const bool,
+    infinity_b_len: usize,
+
+    pub nb_infinity_a: u64,
+    pub nb_infinity_b: u64,
+
+    /// Wire indices where `A(τ) != 0`, derived once at load from
+    /// `infinity_a`. Owned (not borrowed from the mmap) — the file format
+    /// doesn't store this; it's a cheap O(n) one-time computation.
+    pub non_inf_a: Vec<usize>,
+    /// Wire indices where `B(τ) != 0`, derived once at load from
+    /// `infinity_b`.
+    pub non_inf_b: Vec<usize>,
+
+    /// Raw-pointer descriptors for each Pedersen commitment key. The
+    /// pointers index into the same `_mmap` mapping above. Lifetime is
+    /// implicit through `&self` — accessors return `&[G1Affine]` slices
+    /// bound to `&self`. No memcpy on load, unlike the legacy
+    /// `Vec<pedersen::ProvingKey>` field this replaces.
+    pub commitment_keys: Vec<MmapPedersenProvingKey>,
+}
+
+/// Borrowed Pedersen proving key whose basis arrays point into an mmap'd
+/// `.pkp` file. Layout-compatible with [`pedersen::ProvingKey`] (the
+/// underlying `G1Affine` bytes are in the same in-memory Montgomery form
+/// as the `G1A` / `G1B` sections), but no `Vec<G1Affine>` is ever
+/// allocated — the pointers reference file pages directly.
+///
+/// SAFETY: the pointers are only valid while the parent `MmapProvingKey`
+/// (and its `_mmap`) is alive. NOTE(review): accessors borrow this key, not
+/// the parent, and `commitment_keys` is `pub` — a moved-out key can dangle.
+pub struct MmapPedersenProvingKey {
+    basis_ptr:           *const G1Affine,
+    basis_len:           usize,
+    basis_exp_sigma_ptr: *const G1Affine,
+    basis_exp_sigma_len: usize,
+}
+
+// SAFETY: raw pointers into a read-only `Mmap`, same justification as the
+// `MmapProvingKey` Send / Sync impls below.
+unsafe impl Send for MmapPedersenProvingKey {}
+unsafe impl Sync for MmapPedersenProvingKey {}
+
+impl MmapPedersenProvingKey {
+    pub fn basis(&self) -> &[G1Affine] {
+        // SAFETY: pointer / length validated by `load_pedersen_commitment_keys`
+        // (alignment + bounds against the section); mapping outlives `&self`.
+        unsafe { std::slice::from_raw_parts(self.basis_ptr, self.basis_len) }
+    }
+
+    pub fn basis_exp_sigma(&self) -> &[G1Affine] {
+        // SAFETY: see `basis`.
+        unsafe { std::slice::from_raw_parts(self.basis_exp_sigma_ptr, self.basis_exp_sigma_len) }
+    }
+
+    /// Borrow this mmap-backed key as a `pedersen::ProvingKeyView`, so
+    /// callers can run the same `commit` / `prove_knowledge` logic
+    /// whether the bases are owned or mmap-backed.
+    pub fn view(&self) -> pedersen::ProvingKeyView<'_> {
+        pedersen::ProvingKeyView {
+            basis:           self.basis(),
+            basis_exp_sigma: self.basis_exp_sigma(),
+        }
+    }
+}
+
+// SAFETY: `*_ptr` fields point into a read-only `Mmap`. Mmap pages are
+// shareable across threads (the kernel handles paging), and we never mutate
+// through the pointers. `Vec<MmapPedersenProvingKey>` is Send + Sync (above).
+unsafe impl Send for MmapProvingKey {}
+// SAFETY: same as Send — read-only access through aliasable pointers into a
+// shared mapping.
+unsafe impl Sync for MmapProvingKey {}
+
+impl std::fmt::Debug for MmapProvingKey {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("MmapProvingKey")
+            .field("domain_size", &self.domain_size)
+            .field("g1_a_len", &self.g1_a_len)
+            .field("g1_b_len", &self.g1_b_len)
+            .field("g1_k_len", &self.g1_k_len)
+            .field("g1_z_len", &self.g1_z_len)
+            .field("g2_b_len", &self.g2_b_len)
+            .field("nb_infinity_a", &self.nb_infinity_a)
+            .field("nb_infinity_b", &self.nb_infinity_b)
+            .field("nb_commitment_keys", &self.commitment_keys.len())
+            .finish()
+    }
+}
+
+impl MmapProvingKey {
+    pub fn g1_a(&self) -> &[G1Affine] {
+        // SAFETY: pointer/len validated in `load`; mapping outlives `&self`.
+        unsafe { std::slice::from_raw_parts(self.g1_a_ptr, self.g1_a_len) }
+    }
+
+    pub fn g1_b(&self) -> &[G1Affine] {
+        // SAFETY: see `g1_a`.
+        unsafe { std::slice::from_raw_parts(self.g1_b_ptr, self.g1_b_len) }
+    }
+
+    pub fn g1_k(&self) -> &[G1Affine] {
+        // SAFETY: see `g1_a`.
+        unsafe { std::slice::from_raw_parts(self.g1_k_ptr, self.g1_k_len) }
+    }
+
+    pub fn g1_z(&self) -> &[G1Affine] {
+        // SAFETY: see `g1_a`.
+        unsafe { std::slice::from_raw_parts(self.g1_z_ptr, self.g1_z_len) }
+    }
+
+    pub fn g2_b(&self) -> &[G2Affine] {
+        // SAFETY: see `g1_a`.
+        unsafe { std::slice::from_raw_parts(self.g2_b_ptr, self.g2_b_len) }
+    }
+
+    pub fn infinity_a(&self) -> &[bool] {
+        // SAFETY: see `g1_a`. `bool` has alignment 1, len validated.
+        unsafe { std::slice::from_raw_parts(self.infinity_a_ptr, self.infinity_a_len) }
+    }
+
+    pub fn infinity_b(&self) -> &[bool] {
+        // SAFETY: see `infinity_a`.
+        unsafe { std::slice::from_raw_parts(self.infinity_b_ptr, self.infinity_b_len) }
+    }
+
+    /// Load a proving key from a mmap-format file. The file's 21-byte common
+    /// header has already been read and validated by the caller;
+    /// `sections_start` is the offset (within the mmap) of `section_count`.
+    ///
+    /// The caller is responsible for any postcard metadata that lives in the
+    /// same file — this function reads only the section table and section
+    /// bodies for the proving key.
+    pub fn load(mmap: Mmap, sections_start: usize) -> Result<Self> {
+        ensure!(
+            sections_start + 4 <= mmap.len(),
+            "mmap pkp: section_count out of bounds"
+        );
+        let section_count =
+            u32::from_le_bytes(mmap[sections_start..sections_start + 4].try_into().unwrap());
+        let table_start = sections_start + 4;
+        let table_entry_bytes = 4 + 8 + 8;
+        let table_end = table_start + section_count as usize * table_entry_bytes;
+        ensure!(
+            table_end <= mmap.len(),
+            "mmap pkp: section table out of bounds (table_end={}, file_len={})",
+            table_end,
+            mmap.len()
+        );
+
+        // Parse section table.
+        let section_offsets = {
+            let _s = info_span!("section_table_parse", section_count).entered();
+            let mut section_offsets = std::collections::HashMap::<SectionId, (usize, usize)>::new();
+            for i in 0..section_count {
+                let entry = table_start + i as usize * table_entry_bytes;
+                let id = u32::from_le_bytes(mmap[entry..entry + 4].try_into().unwrap());
+                let off =
+                    u64::from_le_bytes(mmap[entry + 4..entry + 12].try_into().unwrap()) as usize;
+                let len =
+                    u64::from_le_bytes(mmap[entry + 12..entry + 20].try_into().unwrap()) as usize;
+                ensure!(
+                    off + len <= mmap.len(),
+                    "mmap pkp: section {} body out of bounds",
+                    id
+                );
+                let Some(sid) = SectionId::from_u32(id) else {
+                    bail!("mmap pkp: unknown section id {}", id);
+                };
+                section_offsets.insert(sid, (off, len));
+            }
+            section_offsets
+        };
+
+        let g1_size = std::mem::size_of::<G1Affine>();
+        let g2_size = std::mem::size_of::<G2Affine>();
+
+        let load_g1_section = |sid: SectionId| -> Result<(*const G1Affine, usize)> {
+            let (off, len) = *section_offsets
+                .get(&sid)
+                .ok_or_else(|| anyhow::anyhow!("missing section {:?}", sid))?;
+            ensure!(
+                off % MMAP_ALIGN == 0,
+                "section {:?} body not aligned (off={})",
+                sid,
+                off
+            );
+            ensure!(
+                len % g1_size == 0,
+                "section {:?} body length {} not a multiple of size_of::<G1Affine>()={}",
+                sid,
+                len,
+                g1_size
+            );
+            let count = len / g1_size;
+            let ptr = unsafe { mmap.as_ptr().add(off) } as *const G1Affine;
+            Ok((ptr, count))
+        };
+
+        let load_g2_section = |sid: SectionId| -> Result<(*const G2Affine, usize)> {
+            let (off, len) = *section_offsets
+                .get(&sid)
+                .ok_or_else(|| anyhow::anyhow!("missing section {:?}", sid))?;
+            ensure!(
+                off % MMAP_ALIGN == 0,
+                "section {:?} body not aligned (off={})",
+                sid,
+                off
+            );
+            ensure!(
+                len % g2_size == 0,
+                "section {:?} body length {} not a multiple of size_of::<G2Affine>()={}",
+                sid,
+                len,
+                g2_size
+            );
+            let count = len / g2_size;
+            let ptr = unsafe { mmap.as_ptr().add(off) } as *const G2Affine;
+            Ok((ptr, count))
+        };
+
+        let load_bool_section = |sid: SectionId| -> Result<(*const bool, usize)> {
+            let (off, len) = *section_offsets
+                .get(&sid)
+                .ok_or_else(|| anyhow::anyhow!("missing section {:?}", sid))?;
+            ensure!(
+                off.checked_add(len).map_or(false, |end| end <= mmap.len()),
+                "section {:?} body out of bounds (off={}, len={})",
+                sid,
+                off,
+                len
+            );
+            // SAFETY: reinterpreting bytes as `&[bool]` is UB unless every byte
+            // is 0 or 1. The mmap is attacker-controllable on iOS/Android, so
+            // validate the bool validity invariant before exposing the slice.
+            let bytes = &mmap[off..off + len];
+            ensure!(
+                bytes.iter().all(|&b| b <= 1),
+                "section {:?} contains invalid bool byte (not 0 or 1)",
+                sid
+            );
+            let ptr = bytes.as_ptr() as *const bool;
+            Ok((ptr, len))
+        };
+
+        let (
+            g1_a_ptr,
+            g1_a_len,
+            g1_b_ptr,
+            g1_b_len,
+            g1_z_ptr,
+            g1_z_len,
+            g1_k_ptr,
+            g1_k_len,
+            g2_b_ptr,
+            g2_b_len,
+            infinity_a_ptr,
+            infinity_a_len,
+            infinity_b_ptr,
+            infinity_b_len,
+        ) = {
+            let _s = info_span!("bulk_section_pointers").entered();
+            let (g1_a_ptr, g1_a_len) = load_g1_section(SectionId::G1A)?;
+            let (g1_b_ptr, g1_b_len) = load_g1_section(SectionId::G1B)?;
+            let (g1_z_ptr, g1_z_len) = load_g1_section(SectionId::G1Z)?;
+            let (g1_k_ptr, g1_k_len) = load_g1_section(SectionId::G1K)?;
+            let (g2_b_ptr, g2_b_len) = load_g2_section(SectionId::G2B)?;
+            let (infinity_a_ptr, infinity_a_len) = load_bool_section(SectionId::InfinityA)?;
+            let (infinity_b_ptr, infinity_b_len) = load_bool_section(SectionId::InfinityB)?;
+            (
+                g1_a_ptr,
+                g1_a_len,
+                g1_b_ptr,
+                g1_b_len,
+                g1_z_ptr,
+                g1_z_len,
+                g1_k_ptr,
+                g1_k_len,
+                g2_b_ptr,
+                g2_b_len,
+                infinity_a_ptr,
+                infinity_a_len,
+                infinity_b_ptr,
+                infinity_b_len,
+            )
+        };
+
+        // Scalars: arkworks-uncompressed.
+        let (sp_off, sp_len) = *section_offsets
+            .get(&SectionId::Scalars)
+            .ok_or_else(|| anyhow::anyhow!("missing scalars section"))?;
+        let mut sp = &mmap[sp_off..sp_off + sp_len];
+
+        let (
+            domain_size,
+            domain_gen,
+            g1_alpha,
+            g1_beta,
+            g1_delta,
+            g2_beta,
+            g2_delta,
+            nb_infinity_a,
+            nb_infinity_b,
+        ) = {
+            let _s = info_span!("scalars_deserialize", sp_len).entered();
+            let domain_size =
+                u64::deserialize_uncompressed_unchecked(&mut sp).context("read domain_size")?;
+            let domain_gen =
+                Fr::deserialize_uncompressed_unchecked(&mut sp).context("read domain_gen")?;
+            let g1_alpha =
+                G1Affine::deserialize_uncompressed_unchecked(&mut sp).context("read g1_alpha")?;
+            let g1_beta =
+                G1Affine::deserialize_uncompressed_unchecked(&mut sp).context("read g1_beta")?;
+            let g1_delta =
+                G1Affine::deserialize_uncompressed_unchecked(&mut sp).context("read g1_delta")?;
+            let g2_beta =
+                G2Affine::deserialize_uncompressed_unchecked(&mut sp).context("read g2_beta")?;
+            let g2_delta =
+                G2Affine::deserialize_uncompressed_unchecked(&mut sp).context("read g2_delta")?;
+            let nb_infinity_a =
+                u64::deserialize_uncompressed_unchecked(&mut sp).context("read nb_infinity_a")?;
+            let nb_infinity_b =
+                u64::deserialize_uncompressed_unchecked(&mut sp).context("read nb_infinity_b")?;
+            (
+                domain_size,
+                domain_gen,
+                g1_alpha,
+                g1_beta,
+                g1_delta,
+                g2_beta,
+                g2_delta,
+                nb_infinity_a,
+                nb_infinity_b,
+            )
+        };
+
+        // Pedersen commitment keys: raw G1Affine bytes in three sections.
+        // Layout-compatible with the bulk G1 sections (same in-memory
+        // Montgomery form), but built into owned `Vec<G1Affine>` here so
+        // the existing `pedersen::ProvingKey` API stays intact. The copy
+        // is one memcpy per basis/sigma slice — no per-point arkworks
+        // deserialization, no Montgomery rebuild.
+        let commitment_keys = {
+            let _s = info_span!("pedersen_commitment_keys_load").entered();
+            load_pedersen_commitment_keys(&mmap, &section_offsets)?
+        };
+
+        // Derive non-infinity index lists from the mmap'd `infinity_a/b`
+        // bytes. One-time O(n) walk at load — amortized across every
+        // subsequent prove call.
+        // SAFETY: pointers / lengths were validated by `load_bool_section`
+        // above, and the mapping outlives this scope.
+        let infinity_a_slice: &[bool] =
+            unsafe { std::slice::from_raw_parts(infinity_a_ptr, infinity_a_len) };
+        let infinity_b_slice: &[bool] =
+            unsafe { std::slice::from_raw_parts(infinity_b_ptr, infinity_b_len) };
+        let non_inf_a: Vec<usize> = infinity_a_slice
+            .iter()
+            .enumerate()
+            .filter_map(|(i, &x)| if !x { Some(i) } else { None })
+            .collect();
+        let non_inf_b: Vec<usize> = infinity_b_slice
+            .iter()
+            .enumerate()
+            .filter_map(|(i, &x)| if !x { Some(i) } else { None })
+            .collect();
+
+        Ok(MmapProvingKey {
+            _mmap: mmap,
+            domain_size,
+            domain_gen,
+            g1_alpha,
+            g1_beta,
+            g1_delta,
+            g1_a_ptr,
+            g1_a_len,
+            g1_b_ptr,
+            g1_b_len,
+            g1_k_ptr,
+            g1_k_len,
+            g1_z_ptr,
+            g1_z_len,
+            g2_beta,
+            g2_delta,
+            g2_b_ptr,
+            g2_b_len,
+            infinity_a_ptr,
+            infinity_a_len,
+            infinity_b_ptr,
+            infinity_b_len,
+            nb_infinity_a,
+            nb_infinity_b,
+            non_inf_a,
+            non_inf_b,
+            commitment_keys,
+        })
+    }
+}
+
+/// Read the three Pedersen sections and build
+/// `Vec<MmapPedersenProvingKey>` with raw pointers into the mmap. Pure
+/// zero-copy — no `Vec<G1Affine>` is allocated, no per-point arkworks
+/// deserialization, no memcpy of the basis bytes. Allocation cost is one
+/// outer `Vec<MmapPedersenProvingKey>` of `num_commitments` × 32-byte
+/// descriptors (a few hundred bytes for typical circuits).
+///
+/// If there are no commitment keys (circuit without BSB22 commitments)
+/// the index section still exists but encodes zero commitments, and
+/// the two byte sections are empty.
+///
+/// Every offset, length and count read here comes from the file and is
+/// treated as untrusted: arithmetic on them is checked so that a crafted
+/// value cannot wrap around, slip past a bounds check and yield a raw
+/// pointer outside the mapping.
+///
+/// # Errors
+///
+/// Fails if any of the three sections is missing, lies outside the
+/// mapping, is misaligned or not a whole number of `G1Affine`s, if the
+/// index length disagrees with its `num_commitments` field, or if the
+/// per-commitment counts do not tile the two body sections exactly.
+fn load_pedersen_commitment_keys(
+    mmap: &Mmap,
+    section_offsets: &std::collections::HashMap<SectionId, (usize, usize)>,
+) -> Result<Vec<MmapPedersenProvingKey>> {
+    let g1_size = std::mem::size_of::<G1Affine>();
+
+    let (idx_off, idx_len) = *section_offsets
+        .get(&SectionId::PedersenIndex)
+        .ok_or_else(|| anyhow::anyhow!("missing pedersen index section"))?;
+    let (bases_off, bases_len) = *section_offsets
+        .get(&SectionId::PedersenBases)
+        .ok_or_else(|| anyhow::anyhow!("missing pedersen bases section"))?;
+    let (sigma_off, sigma_len) = *section_offsets
+        .get(&SectionId::PedersenBasesExpSigma)
+        .ok_or_else(|| anyhow::anyhow!("missing pedersen basis_exp_sigma section"))?;
+
+    // Re-validate the section extents with overflow-checked arithmetic
+    // before deriving any raw pointer from them (same check as
+    // `load_bool_section`).
+    let in_bounds = |off: usize, len: usize| {
+        off.checked_add(len)
+            .map_or(false, |end| end <= mmap.len())
+    };
+    ensure!(
+        in_bounds(idx_off, idx_len),
+        "PedersenIndex body out of bounds (off={}, len={})",
+        idx_off,
+        idx_len
+    );
+    ensure!(
+        in_bounds(bases_off, bases_len),
+        "PedersenBases body out of bounds (off={}, len={})",
+        bases_off,
+        bases_len
+    );
+    ensure!(
+        in_bounds(sigma_off, sigma_len),
+        "PedersenBasesExpSigma body out of bounds (off={}, len={})",
+        sigma_off,
+        sigma_len
+    );
+
+    ensure!(
+        bases_off % MMAP_ALIGN == 0,
+        "PedersenBases body not aligned (off={})",
+        bases_off
+    );
+    ensure!(
+        sigma_off % MMAP_ALIGN == 0,
+        "PedersenBasesExpSigma body not aligned (off={})",
+        sigma_off
+    );
+    ensure!(
+        bases_len % g1_size == 0,
+        "PedersenBases body length {} not a multiple of size_of::<G1Affine>()={}",
+        bases_len,
+        g1_size
+    );
+    ensure!(
+        sigma_len % g1_size == 0,
+        "PedersenBasesExpSigma body length {} not a multiple of size_of::<G1Affine>()={}",
+        sigma_len,
+        g1_size
+    );
+
+    // Parse the index: u64 num_commitments followed by num × (u64
+    // basis_len, u64 sigma_len). Validate that the sum of per-commit
+    // lengths exactly matches the byte sections.
+    ensure!(idx_len >= 8, "pedersen index too short for num_commitments");
+    let num_commitments_raw = u64::from_le_bytes(mmap[idx_off..idx_off + 8].try_into().unwrap());
+    // Saturating math: a hostile count saturates to u64::MAX, which can
+    // never equal a real section length, instead of wrapping to a small
+    // value that happens to match.
+    let expected_idx_len = num_commitments_raw.saturating_mul(16).saturating_add(8);
+    ensure!(
+        idx_len as u64 == expected_idx_len,
+        "pedersen index length mismatch (got {}, expected {})",
+        idx_len,
+        expected_idx_len
+    );
+    // Lossless: `8 + 16 * num_commitments == idx_len`, which is a usize.
+    let num_commitments = num_commitments_raw as usize;
+
+    let mut commitment_keys = Vec::with_capacity(num_commitments);
+    let mut basis_cursor = bases_off;
+    let mut sigma_cursor = sigma_off;
+    // Cannot overflow: both sums were bounds-checked above.
+    let bases_end = bases_off + bases_len;
+    let sigma_end = sigma_off + sigma_len;
+
+    for i in 0..num_commitments {
+        // Stays inside the index section because `i < num_commitments`.
+        let entry = idx_off + 8 + i * 16;
+        let basis_count_raw = u64::from_le_bytes(mmap[entry..entry + 8].try_into().unwrap());
+        let sigma_count_raw =
+            u64::from_le_bytes(mmap[entry + 8..entry + 16].try_into().unwrap());
+
+        // Invariant: `basis_cursor <= bases_end` and `sigma_cursor <=
+        // sigma_end`, so the subtractions below cannot underflow.
+        let basis_bytes = match basis_count_raw.checked_mul(g1_size as u64) {
+            Some(b) if b <= (bases_end - basis_cursor) as u64 => b as usize,
+            _ => bail!("pedersen basis #{} runs past PedersenBases section", i),
+        };
+        let sigma_bytes = match sigma_count_raw.checked_mul(g1_size as u64) {
+            Some(b) if b <= (sigma_end - sigma_cursor) as u64 => b as usize,
+            _ => bail!(
+                "pedersen basis_exp_sigma #{} runs past PedersenBasesExpSigma section",
+                i
+            ),
+        };
+        // Lossless: count × g1_size was just shown to fit in a usize.
+        let basis_count = basis_count_raw as usize;
+        let sigma_count = sigma_count_raw as usize;
+
+        // SAFETY: section offsets validated MMAP_ALIGN-aligned above,
+        // lengths are multiples of size_of::<G1Affine>(), pointers stay
+        // within the section bounds we just checked. The raw bytes are
+        // in the same in-memory Montgomery layout written by
+        // `write_pk_sections` (see the `[G1Affine] as &[u8]` cast there
+        // — the inverse cast here is layout-compatible). The pointers
+        // are stored alongside the mmap they index into in
+        // `MmapProvingKey`; accessors are bound to `&self` on that
+        // struct so the pointers can never outlive the mapping.
+        let basis_ptr = unsafe { mmap.as_ptr().add(basis_cursor) as *const G1Affine };
+        let basis_exp_sigma_ptr = unsafe { mmap.as_ptr().add(sigma_cursor) as *const G1Affine };
+        commitment_keys.push(MmapPedersenProvingKey {
+            basis_ptr,
+            basis_len: basis_count,
+            basis_exp_sigma_ptr,
+            basis_exp_sigma_len: sigma_count,
+        });
+
+        basis_cursor += basis_bytes;
+        sigma_cursor += sigma_bytes;
+    }
+
+    ensure!(
+        basis_cursor == bases_end,
+        "PedersenBases section has {} trailing bytes after all commitments",
+        bases_end - basis_cursor
+    );
+    ensure!(
+        sigma_cursor == sigma_end,
+        "PedersenBasesExpSigma section has {} trailing bytes after all commitments",
+        sigma_end - sigma_cursor
+    );
+
+    Ok(commitment_keys)
+}
+
+/// Write the curve-point sections of a [`crate::ProvingKey`] in mmap-friendly
+/// raw layout, plus a small arkworks-encoded scalars section and three
+/// Pedersen sections (a hand-rolled index and two raw `G1Affine` bodies).
+///
+/// Writes at the current file position. The 21-byte common header,
+/// [`MMAP_SENTINEL`], and the postcard-encoded prover metadata are written by
+/// the caller (lives in `provekit_prover::pkp_io`); this function appends the
+/// section table and section bodies.
+///
+/// On-disk shape produced here: a `u32` section count, then one
+/// `(u32 id, u64 offset, u64 len)` table entry per section, zero padding up
+/// to the first body, then each body followed by zero padding up to the next
+/// `MMAP_ALIGN` boundary. Offsets in the table are absolute file offsets.
+///
+/// Returns the number of bytes written, counted from the section-count field
+/// through the padding after the last body.
+///
+/// # Errors
+///
+/// Propagates serialization and I/O errors, and fails via `pad_to` if the
+/// file position ever ends up past a precomputed section offset.
+pub fn write_pk_sections(pk: &crate::ProvingKey, file: &mut File) -> Result<u64> {
+    // Build the scalars blob first so we know its length. As of
+    // PROVER_VERSION (1, 4) pedersen `commitment_keys` are no longer
+    // included here — they live in dedicated raw G1Affine sections
+    // (PedersenIndex / PedersenBases / PedersenBasesExpSigma) and are
+    // read back as raw bytes, not arkworks-deserialized, on load.
+    let mut sp_bytes: Vec<u8> = Vec::new();
+    pk.domain_size
+        .serialize_uncompressed(&mut sp_bytes)
+        .context("write domain_size")?;
+    pk.domain_gen
+        .serialize_uncompressed(&mut sp_bytes)
+        .context("write domain_gen")?;
+    pk.g1_alpha
+        .serialize_uncompressed(&mut sp_bytes)
+        .context("write g1_alpha")?;
+    pk.g1_beta
+        .serialize_uncompressed(&mut sp_bytes)
+        .context("write g1_beta")?;
+    pk.g1_delta
+        .serialize_uncompressed(&mut sp_bytes)
+        .context("write g1_delta")?;
+    pk.g2_beta
+        .serialize_uncompressed(&mut sp_bytes)
+        .context("write g2_beta")?;
+    pk.g2_delta
+        .serialize_uncompressed(&mut sp_bytes)
+        .context("write g2_delta")?;
+    pk.nb_infinity_a
+        .serialize_uncompressed(&mut sp_bytes)
+        .context("write nb_infinity_a")?;
+    pk.nb_infinity_b
+        .serialize_uncompressed(&mut sp_bytes)
+        .context("write nb_infinity_b")?;
+
+    // Build the pedersen index: u64 num_commitments, then per-commit
+    // (u64 basis_len, u64 sigma_len). The two body sections store the
+    // raw G1Affine bytes concatenated in the same order.
+    let mut pedersen_index: Vec<u8> = Vec::new();
+    pedersen_index.extend_from_slice(&(pk.commitment_keys.len() as u64).to_le_bytes());
+    // Running element totals, used below to size the two body sections.
+    let mut total_basis_count: u64 = 0;
+    let mut total_sigma_count: u64 = 0;
+    for ck in &pk.commitment_keys {
+        pedersen_index.extend_from_slice(&(ck.basis.len() as u64).to_le_bytes());
+        pedersen_index.extend_from_slice(&(ck.basis_exp_sigma.len() as u64).to_le_bytes());
+        total_basis_count += ck.basis.len() as u64;
+        total_sigma_count += ck.basis_exp_sigma.len() as u64;
+    }
+
+    // Section bodies (in the order they'll be written).
+    let g1_size = std::mem::size_of::<G1Affine>();
+    let g2_size = std::mem::size_of::<G2Affine>();
+
+    // (id, body_byte_len)
+    // NOTE: the order of this array must match the order of the body writes
+    // further down, which address `section_offsets` by position.
+    let sections: [(SectionId, u64); 11] = [
+        (SectionId::G1A, (pk.g1_a.len() * g1_size) as u64),
+        (SectionId::G1B, (pk.g1_b.len() * g1_size) as u64),
+        (SectionId::G1Z, (pk.g1_z.len() * g1_size) as u64),
+        (SectionId::G1K, (pk.g1_k.len() * g1_size) as u64),
+        (SectionId::G2B, (pk.g2_b.len() * g2_size) as u64),
+        (SectionId::InfinityA, pk.infinity_a.len() as u64),
+        (SectionId::InfinityB, pk.infinity_b.len() as u64),
+        (SectionId::Scalars, sp_bytes.len() as u64),
+        (SectionId::PedersenIndex, pedersen_index.len() as u64),
+        (SectionId::PedersenBases, total_basis_count * g1_size as u64),
+        (
+            SectionId::PedersenBasesExpSigma,
+            total_sigma_count * g1_size as u64,
+        ),
+    ];
+
+    // Compute byte offsets for each section body, padding each to MMAP_ALIGN.
+    // Offsets are absolute in the file. We need to know:
+    //   table_start = current file pos + 4 (section_count u32)
+    //   table_end   = table_start + section_count * (4+8+8)
+    //   body_start  = round_up(table_end, MMAP_ALIGN)
+    let table_start = file.stream_position()? + 4;
+    let table_end = table_start + sections.len() as u64 * 20;
+    let mut cur_off = round_up(table_end, MMAP_ALIGN as u64);
+
+    // (id, absolute body offset, body byte length), in write order.
+    let mut section_offsets: Vec<(SectionId, u64, u64)> = Vec::with_capacity(sections.len());
+    for &(id, len) in &sections {
+        section_offsets.push((id, cur_off, len));
+        cur_off = round_up(cur_off + len, MMAP_ALIGN as u64);
+    }
+    // Aligned end of the last body; the file is padded out to here.
+    let total_end = cur_off;
+
+    // Write section count.
+    file.write_all(&(sections.len() as u32).to_le_bytes())?;
+    // Write section table.
+    for &(id, off, len) in &section_offsets {
+        file.write_all(&(id as u32).to_le_bytes())?;
+        file.write_all(&off.to_le_bytes())?;
+        file.write_all(&len.to_le_bytes())?;
+    }
+    // Pad to body_start.
+    let body_start = section_offsets[0].1;
+    pad_to(file, body_start)?;
+
+    // Write section bodies, each followed by alignment padding for the next.
+    // The third argument of every `write_section_body` call is the offset of
+    // the *following* section (`section_offsets[k + 1].1`), i.e. the position
+    // to pad up to after body `k`.
+    //
+    // The `unsafe` blocks below view each point/bool slice as its raw bytes;
+    // the byte length is the element count times the element size.
+    let g1_a_bytes = unsafe {
+        std::slice::from_raw_parts(pk.g1_a.as_ptr() as *const u8, pk.g1_a.len() * g1_size)
+    };
+    write_section_body(file, g1_a_bytes, section_offsets[1].1)?;
+
+    let g1_b_bytes = unsafe {
+        std::slice::from_raw_parts(pk.g1_b.as_ptr() as *const u8, pk.g1_b.len() * g1_size)
+    };
+    write_section_body(file, g1_b_bytes, section_offsets[2].1)?;
+
+    let g1_z_bytes = unsafe {
+        std::slice::from_raw_parts(pk.g1_z.as_ptr() as *const u8, pk.g1_z.len() * g1_size)
+    };
+    write_section_body(file, g1_z_bytes, section_offsets[3].1)?;
+
+    let g1_k_bytes = unsafe {
+        std::slice::from_raw_parts(pk.g1_k.as_ptr() as *const u8, pk.g1_k.len() * g1_size)
+    };
+    write_section_body(file, g1_k_bytes, section_offsets[4].1)?;
+
+    let g2_b_bytes = unsafe {
+        std::slice::from_raw_parts(pk.g2_b.as_ptr() as *const u8, pk.g2_b.len() * g2_size)
+    };
+    write_section_body(file, g2_b_bytes, section_offsets[5].1)?;
+
+    // One byte per flag; the section length recorded above is the element
+    // count itself.
+    let infinity_a_bytes = unsafe {
+        std::slice::from_raw_parts(pk.infinity_a.as_ptr() as *const u8, pk.infinity_a.len())
+    };
+    write_section_body(file, infinity_a_bytes, section_offsets[6].1)?;
+
+    let infinity_b_bytes = unsafe {
+        std::slice::from_raw_parts(pk.infinity_b.as_ptr() as *const u8, pk.infinity_b.len())
+    };
+    write_section_body(file, infinity_b_bytes, section_offsets[7].1)?;
+
+    // Scalars (small, arkworks-encoded).
+    write_section_body(file, &sp_bytes, section_offsets[8].1)?;
+
+    // Pedersen index (small, hand-rolled).
+    write_section_body(file, &pedersen_index, section_offsets[9].1)?;
+
+    // Pedersen bases: raw G1Affine bytes concatenated. Mirrors the layout
+    // for the G1A/G1B/G1Z/G1K sections so the reader can use the raw bytes
+    // directly instead of arkworks per-point deserialize.
+    for ck in &pk.commitment_keys {
+        let bytes = unsafe {
+            std::slice::from_raw_parts(ck.basis.as_ptr() as *const u8, ck.basis.len() * g1_size)
+        };
+        file.write_all(bytes)?;
+    }
+    // Pad up to the start of the PedersenBasesExpSigma section.
+    pad_to(file, section_offsets[10].1)?;
+
+    // Pedersen basis_exp_sigma: raw G1Affine bytes concatenated.
+    for ck in &pk.commitment_keys {
+        let bytes = unsafe {
+            std::slice::from_raw_parts(
+                ck.basis_exp_sigma.as_ptr() as *const u8,
+                ck.basis_exp_sigma.len() * g1_size,
+            )
+        };
+        file.write_all(bytes)?;
+    }
+    pad_to(file, total_end)?;
+
+    // `table_start - 4` is the file position on entry (before the u32
+    // section count), so this is the total span written by this call.
+    Ok(total_end - (table_start - 4))
+}
+
+/// Round `v` up to the nearest multiple of `align` (identity when `v` is
+/// already a multiple). `align` must be non-zero.
+fn round_up(v: u64, align: u64) -> u64 {
+    // Number of whole `align`-sized units needed to cover `v`.
+    let units = (v + align - 1) / align;
+    units * align
+}
+
+/// Zero-fill `file` from its current position up to the absolute offset
+/// `target`. Does nothing when the file is already at `target`; fails when
+/// the current position is already past it.
+fn pad_to(file: &mut File, target: u64) -> Result<()> {
+    let cur = file.stream_position()?;
+    if cur > target {
+        bail!("pad_to: current position {} is past target {}", cur, target);
+    }
+    let gap = (target - cur) as usize;
+    if gap > 0 {
+        let zeros = vec![0u8; gap];
+        file.write_all(&zeros)?;
+    }
+    Ok(())
+}
+
+/// Write one section `body`, then zero-pad the file up to
+/// `next_section_off` (the absolute offset where the following section
+/// starts).
+fn write_section_body(file: &mut File, body: &[u8], next_section_off: u64) -> Result<()> {
+    file.write_all(body)?;
+    pad_to(file, next_section_off)?;
+    Ok(())
+}
+
+// ---------------------------------------------------------------------------
+// R1CS chunk: raw-byte layout for the R1CS struct, appended after the PK
+// section bodies so the mmap reader can memcpy it back without going through
+// postcard (~70 ms → ~3-5 ms on the noir_sha256 benchmark).
+// ---------------------------------------------------------------------------
+
+/// Per-commitment index vectors carried by the commitment-info chunk:
+/// the writer serializes their lengths followed by their contents, and
+/// the reader returns them after parsing the chunk. Each triple is
+/// `(public_committed, private_committed, challenge_indices)` as
+/// `Vec<u64>` (the prover crate converts to and from `Vec<usize>` at the
+/// boundary).
+pub type CommitmentInfoTriple = (Vec<u64>, Vec<u64>, Vec<u64>);
+
+// 4-byte tag at the start of the R1CS chunk; `read_r1cs_chunk` rejects
+// input whose first four bytes differ.
+const R1CS_CHUNK_MAGIC: [u8; 4] = *b"R1CS";
+// 4-byte tag at the start of the commitment-info chunk; checked by
+// `read_commitment_info_chunk`.
+const CI_CHUNK_MAGIC: [u8; 4] = *b"CINF";
+
+/// Serialize an [`R1CS`] into `file` as a raw-byte, mmap-friendly chunk.
+///
+/// The chunk begins at the current file position rounded up to
+/// `MMAP_ALIGN` (zero padding is inserted as needed), and every
+/// variable-length array inside it is followed by padding up to the next
+/// `MMAP_ALIGN` boundary. The return value is the absolute file position
+/// right after the chunk, which is where the next chunk (e.g.
+/// commitment_info) should be written.
+///
+/// Layout (`pad` = zero bytes up to the next `MMAP_ALIGN` boundary):
+/// ```text
+/// [ "R1CS" magic (4) ]
+/// [ pad (4) ]
+/// [ u64 num_public_inputs ]
+/// [ u64 num_virtual ]
+/// [ u64 a_num_rows ]   [ u64 a_num_cols ]
+/// [ u64 b_num_rows ]   [ u64 b_num_cols ]
+/// [ u64 c_num_rows ]   [ u64 c_num_cols ]
+/// [ u64 interner_len ]
+/// [ raw Fr bytes × interner_len ]      [ pad ]
+/// for each matrix (a, b, c):
+///     [ u64 new_row_indices_len ]      [ raw u32 bytes ]                  [ pad ]
+///     [ u64 col_indices_len ]          [ raw u32 bytes ]                  [ pad ]
+///     [ u64 values_len ]               [ raw InternedFieldElement bytes ] [ pad ]
+/// ```
+pub fn write_r1cs_chunk(r1cs: &R1CS, file: &mut File) -> Result<u64> {
+    // Zero-pad `file` up to the next MMAP_ALIGN boundary.
+    fn align_file(file: &mut File) -> Result<()> {
+        let here = file.stream_position()?;
+        pad_to(file, round_up(here, MMAP_ALIGN as u64))
+    }
+
+    // The chunk must start aligned so the raw arrays inside it can be
+    // slice-cast by the reader.
+    align_file(file)?;
+
+    // Fixed-size header: magic, 4 bytes of padding so the u64 fields that
+    // follow sit on an 8-byte boundary, then the scalar dimensions.
+    file.write_all(&R1CS_CHUNK_MAGIC)?;
+    file.write_all(&[0u8; 4])?;
+    file.write_all(&(r1cs.num_public_inputs as u64).to_le_bytes())?;
+    file.write_all(&(r1cs.num_virtual as u64).to_le_bytes())?;
+    for dims in [&r1cs.a, &r1cs.b, &r1cs.c] {
+        file.write_all(&(dims.num_rows as u64).to_le_bytes())?;
+        file.write_all(&(dims.num_cols as u64).to_le_bytes())?;
+    }
+
+    // Interner values: element count, then the elements' in-memory bytes.
+    let interned = r1cs.interner.values_raw();
+    let interned_byte_len = interned.len() * std::mem::size_of::<Fr>();
+    file.write_all(&(interned.len() as u64).to_le_bytes())?;
+    let interned_raw =
+        unsafe { std::slice::from_raw_parts(interned.as_ptr() as *const u8, interned_byte_len) };
+    file.write_all(interned_raw)?;
+    align_file(file)?;
+
+    // The three sparse matrices, in a / b / c order.
+    for sparse in [&r1cs.a, &r1cs.b, &r1cs.c] {
+        write_sparse_matrix_arrays(sparse, file)?;
+    }
+
+    let chunk_end = file.stream_position()?;
+    Ok(chunk_end)
+}
+
+/// Append the three backing arrays of `matrix` (new-row indices, column
+/// indices, values) to `file`. Each array is written as a `u64` element
+/// count, the elements' raw in-memory bytes, and zero padding up to the
+/// next `MMAP_ALIGN` boundary.
+fn write_sparse_matrix_arrays(matrix: &SparseMatrix, file: &mut File) -> Result<()> {
+    // Emit one `[u64 count][raw bytes][pad]` record.
+    fn emit(file: &mut File, count: usize, raw: &[u8]) -> Result<()> {
+        file.write_all(&(count as u64).to_le_bytes())?;
+        file.write_all(raw)?;
+        let here = file.stream_position()?;
+        pad_to(file, round_up(here, MMAP_ALIGN as u64))
+    }
+
+    // Row-start indices: 4 bytes per entry.
+    let rows = matrix.new_row_indices_raw();
+    let rows_raw =
+        unsafe { std::slice::from_raw_parts(rows.as_ptr() as *const u8, rows.len() * 4) };
+    emit(file, rows.len(), rows_raw)?;
+
+    // Column indices: 4 bytes per entry.
+    let cols = matrix.col_indices_raw();
+    let cols_raw =
+        unsafe { std::slice::from_raw_parts(cols.as_ptr() as *const u8, cols.len() * 4) };
+    emit(file, cols.len(), cols_raw)?;
+
+    // Interned value handles, at their in-memory size.
+    let vals = matrix.values_raw();
+    let vals_byte_len = vals.len() * std::mem::size_of::<InternedFieldElement>();
+    let vals_raw = unsafe { std::slice::from_raw_parts(vals.as_ptr() as *const u8, vals_byte_len) };
+    emit(file, vals.len(), vals_raw)?;
+
+    Ok(())
+}
+
+/// Parse the PK section table at `sections_start` and return the
+/// position where the PK section bodies end (max of `offset + len` over
+/// all sections, rounded up to `MMAP_ALIGN`). The R1CS chunk starts at
+/// this position. Does not consume the mmap.
+///
+/// The table comes from the file and is treated as untrusted: all offset
+/// arithmetic is overflow-checked, so a crafted entry produces an error
+/// instead of wrapping around to a bogus (small) end offset.
+///
+/// # Errors
+///
+/// Fails if the section count or the table itself does not fit inside
+/// `mmap`, or if any section body (`offset + len`) overflows or extends
+/// past the end of `mmap`.
+pub fn pk_sections_end_offset(mmap: &[u8], sections_start: usize) -> Result<usize> {
+    let table_start = match sections_start.checked_add(4) {
+        Some(end) if end <= mmap.len() => end,
+        _ => bail!("section_count out of bounds"),
+    };
+    let section_count =
+        u32::from_le_bytes(mmap[sections_start..table_start].try_into().unwrap()) as usize;
+    // Each table entry is (u32 id, u64 offset, u64 len).
+    let table_entry_bytes = 4 + 8 + 8;
+    let table_end = match section_count
+        .checked_mul(table_entry_bytes)
+        .and_then(|table_len| table_start.checked_add(table_len))
+    {
+        Some(end) if end <= mmap.len() => end,
+        _ => bail!("pk section table out of bounds"),
+    };
+
+    // `table_end` and every accepted section end are bounded by
+    // `mmap.len()`, so rounding them up cannot overflow a u64.
+    let mut max_end: usize = round_up(table_end as u64, MMAP_ALIGN as u64) as usize;
+    for i in 0..section_count {
+        // In bounds: `i < section_count` and the whole table fits in `mmap`.
+        let entry = table_start + i * table_entry_bytes;
+        let off = u64::from_le_bytes(mmap[entry + 4..entry + 12].try_into().unwrap());
+        let len = u64::from_le_bytes(mmap[entry + 12..entry + 20].try_into().unwrap());
+        let end = match off.checked_add(len) {
+            Some(end) if end <= mmap.len() as u64 => end,
+            _ => bail!("pk section #{} body out of bounds", i),
+        };
+        let end_rounded = round_up(end, MMAP_ALIGN as u64) as usize;
+        if end_rounded > max_end {
+            max_end = end_rounded;
+        }
+    }
+    Ok(max_end)
+}
+
+/// Read an R1CS chunk back from mmap bytes via memcpy. `bytes` should be
+/// the mmap slice starting at the chunk's first byte; the chunk consumes
+/// however many bytes its layout requires. Returns the parsed R1CS plus
+/// the number of bytes consumed (so the caller can advance to the next
+/// chunk).
+///
+/// Lengths are read from the chunk and treated as untrusted: the byte
+/// extent of the interner array is computed with overflow-checked
+/// arithmetic and the source pointer is checked for `Fr` alignment before
+/// it is reinterpreted.
+///
+/// # Errors
+///
+/// Fails on a short or mis-tagged chunk, on a length field that does not
+/// fit in `usize`, on an interner array that overflows or runs past
+/// `bytes`, on a misaligned interner array, and on any error from
+/// `read_sparse_matrix_arrays`.
+pub fn read_r1cs_chunk(bytes: &[u8]) -> Result<(R1CS, usize)> {
+    ensure!(bytes.len() >= 8, "r1cs chunk too short for magic");
+    ensure!(bytes[..4] == R1CS_CHUNK_MAGIC, "r1cs chunk magic mismatch");
+    // Skip the 4-byte magic and the 4 bytes of padding that follow it.
+    let mut pos = 8usize;
+    // Read one little-endian u64 header field and convert it to usize
+    // without silent truncation.
+    let read_len = |bytes: &[u8], pos: &mut usize| -> Result<usize> {
+        ensure!(*pos + 8 <= bytes.len(), "r1cs chunk: short read for u64");
+        let raw = u64::from_le_bytes(bytes[*pos..*pos + 8].try_into().unwrap());
+        *pos += 8;
+        let value: usize = raw
+            .try_into()
+            .map_err(|_| anyhow::anyhow!("r1cs chunk: value {} does not fit in usize", raw))?;
+        Ok(value)
+    };
+
+    let num_public_inputs = read_len(bytes, &mut pos)?;
+    let num_virtual = read_len(bytes, &mut pos)?;
+    let a_num_rows = read_len(bytes, &mut pos)?;
+    let a_num_cols = read_len(bytes, &mut pos)?;
+    let b_num_rows = read_len(bytes, &mut pos)?;
+    let b_num_cols = read_len(bytes, &mut pos)?;
+    let c_num_rows = read_len(bytes, &mut pos)?;
+    let c_num_cols = read_len(bytes, &mut pos)?;
+
+    // Interner
+    let interner_len = read_len(bytes, &mut pos)?;
+    let fr_size = std::mem::size_of::<Fr>();
+    // Checked math: a hostile `interner_len` must not wrap `len * size`
+    // into a small value that passes the bounds check below.
+    let interner_end = match interner_len
+        .checked_mul(fr_size)
+        .and_then(|byte_len| pos.checked_add(byte_len))
+    {
+        Some(end) if end <= bytes.len() => end,
+        _ => bail!("r1cs chunk: short read for interner"),
+    };
+    let interner_ptr = bytes[pos..interner_end].as_ptr();
+    ensure!(
+        (interner_ptr as usize) % std::mem::align_of::<Fr>() == 0,
+        "r1cs chunk: interner array is not aligned for Fr"
+    );
+    // SAFETY: `interner_ptr..interner_ptr + interner_len * size_of::<Fr>()`
+    // lies inside `bytes` (checked above) and the pointer is aligned for
+    // `Fr` (checked above). The source bytes are in the same in-memory
+    // Montgomery layout written by `write_r1cs_chunk` (Fr-as-raw-bytes
+    // cast).
+    let interner_slice: &[Fr] =
+        unsafe { std::slice::from_raw_parts(interner_ptr as *const Fr, interner_len) };
+    let interner = Interner::from_values(interner_slice.to_vec());
+    // Skip the padding the writer inserted after the interner array.
+    pos = round_up(interner_end as u64, MMAP_ALIGN as u64) as usize;
+
+    let a = read_sparse_matrix_arrays(bytes, &mut pos, a_num_rows, a_num_cols)?;
+    let b = read_sparse_matrix_arrays(bytes, &mut pos, b_num_rows, b_num_cols)?;
+    let c = read_sparse_matrix_arrays(bytes, &mut pos, c_num_rows, c_num_cols)?;
+
+    let r1cs = R1CS {
+        num_public_inputs,
+        interner,
+        a,
+        b,
+        c,
+        num_virtual,
+    };
+    Ok((r1cs, pos))
+}
+
+/// Read the three backing arrays of one sparse matrix (new-row indices,
+/// column indices, values) from `bytes` starting at `*pos`, and assemble
+/// them into a `SparseMatrix` with the given dimensions. On success `*pos`
+/// is advanced past the last array and its trailing padding.
+///
+/// Each array is stored as a `u64` element count, the raw element bytes,
+/// and padding up to the next `MMAP_ALIGN` boundary. Counts are untrusted:
+/// byte extents are computed with overflow-checked arithmetic and each
+/// source pointer is checked for alignment before being reinterpreted.
+///
+/// # Errors
+///
+/// Fails if a count field or an array body overflows or runs past the end
+/// of `bytes`, or if an array is not aligned for its element type.
+fn read_sparse_matrix_arrays(
+    bytes: &[u8],
+    pos: &mut usize,
+    num_rows: usize,
+    num_cols: usize,
+) -> Result<SparseMatrix> {
+    // Read one `[u64 count][count × T raw bytes][pad]` record into an owned
+    // Vec and advance `*pos` past the padding. `short_msg` is the error
+    // reported when the array body does not fit in `bytes`.
+    fn read_array<T: Clone>(
+        bytes: &[u8],
+        pos: &mut usize,
+        short_msg: &'static str,
+    ) -> Result<Vec<T>> {
+        let start = match (*pos).checked_add(8) {
+            Some(start) if start <= bytes.len() => start,
+            _ => bail!("r1cs chunk: short read for u64"),
+        };
+        let raw_count = u64::from_le_bytes(bytes[*pos..start].try_into().unwrap());
+        let count: usize = raw_count
+            .try_into()
+            .map_err(|_| anyhow::anyhow!("{}", short_msg))?;
+        // Checked math: `count * size_of::<T>()` must not wrap into a
+        // small value that slips past the bounds check.
+        let end = match count
+            .checked_mul(std::mem::size_of::<T>())
+            .and_then(|byte_len| start.checked_add(byte_len))
+        {
+            Some(end) if end <= bytes.len() => end,
+            _ => bail!("{}", short_msg),
+        };
+        let ptr = bytes[start..end].as_ptr();
+        ensure!(
+            (ptr as usize) % std::mem::align_of::<T>() == 0,
+            "r1cs chunk: array is not aligned for its element type"
+        );
+        // SAFETY: the `count` elements starting at `ptr` lie inside `bytes`
+        // (checked above) and `ptr` is aligned for `T` (checked above). The
+        // writer cast the typed array to bytes; this is the inverse cast.
+        // NOTE(review): assumes every bit pattern is a valid `T`; true for
+        // `u32`, presumably true for `InternedFieldElement` — confirm.
+        let items = unsafe { std::slice::from_raw_parts(ptr as *const T, count) }.to_vec();
+        // `end <= bytes.len()`, so rounding up cannot overflow.
+        *pos = round_up(end as u64, MMAP_ALIGN as u64) as usize;
+        Ok(items)
+    }
+
+    let new_row_indices: Vec<u32> = read_array(bytes, pos, "r1cs chunk: short row")?;
+    let col_indices: Vec<u32> = read_array(bytes, pos, "r1cs chunk: short cols")?;
+    let values: Vec<InternedFieldElement> = read_array(bytes, pos, "r1cs chunk: short values")?;
+
+    Ok(SparseMatrix::from_raw_parts(
+        num_rows,
+        num_cols,
+        new_row_indices,
+        col_indices,
+        values,
+    ))
+}
+
+// ---------------------------------------------------------------------------
+// commitment_info chunk: raw-byte layout for `Vec<Groth16CommitmentInfo>`,
+// stored as triples of `Vec<u64>` (the prover crate converts to/from
+// `Vec<usize>` at the boundary).
+// ---------------------------------------------------------------------------
+
+/// Serialize the commitment-info triples into `file` as a raw-byte chunk
+/// and return the absolute file position right after it. The chunk starts
+/// at the current position rounded up to `MMAP_ALIGN`.
+///
+/// Layout (`pad` = zero bytes up to the next `MMAP_ALIGN` boundary):
+/// ```text
+/// [ "CINF" magic (4) ][ pad (4) ]
+/// [ u64 num_commitments ]
+/// [ for each commitment: u64 pub_len, u64 priv_len, u64 chal_len ]
+/// [ pad ]
+/// [ raw u64 bytes: all pub_committed concatenated ][ pad ]
+/// [ raw u64 bytes: all priv_committed concatenated ][ pad ]
+/// [ raw u64 bytes: all chal_indices concatenated ][ pad ]
+/// ```
+pub fn write_commitment_info_chunk(
+    triples: &[CommitmentInfoTriple],
+    file: &mut File,
+) -> Result<u64> {
+    // Zero-pad `file` up to the next MMAP_ALIGN boundary.
+    fn align_file(file: &mut File) -> Result<()> {
+        let here = file.stream_position()?;
+        pad_to(file, round_up(here, MMAP_ALIGN as u64))
+    }
+
+    align_file(file)?;
+
+    // Header: magic, 4 padding bytes, commitment count, then the three
+    // vector lengths of every commitment.
+    file.write_all(&CI_CHUNK_MAGIC)?;
+    file.write_all(&[0u8; 4])?;
+    file.write_all(&(triples.len() as u64).to_le_bytes())?;
+    for triple in triples {
+        for len in [triple.0.len(), triple.1.len(), triple.2.len()] {
+            file.write_all(&(len as u64).to_le_bytes())?;
+        }
+    }
+    align_file(file)?;
+
+    // Bodies are grouped by component, not by commitment: first every
+    // triple's `.0`, then every `.1`, then every `.2`, each group padded.
+    for component in 0..3 {
+        for triple in triples {
+            let values = match component {
+                0 => triple.0.as_slice(),
+                1 => triple.1.as_slice(),
+                _ => triple.2.as_slice(),
+            };
+            let raw = unsafe {
+                std::slice::from_raw_parts(values.as_ptr() as *const u8, values.len() * 8)
+            };
+            file.write_all(raw)?;
+        }
+        align_file(file)?;
+    }
+
+    let chunk_end = file.stream_position()?;
+    Ok(chunk_end)
+}
+
+/// Read a commitment-info chunk back from `bytes`, which must begin at the
+/// chunk's magic. Returns the triples and the number of bytes consumed.
+///
+/// `bytes` is treated as untrusted: lengths are validated with checked
+/// arithmetic before anything is allocated, and the `u64` payload is decoded
+/// with safe native-endian byte copies (the writer dumps raw in-memory
+/// words), so corrupt or misaligned input yields `Err` instead of UB.
+pub fn read_commitment_info_chunk(bytes: &[u8]) -> Result<(Vec<CommitmentInfoTriple>, usize)> {
+    ensure!(bytes.len() >= 8, "ci chunk too short");
+    ensure!(bytes[..4] == CI_CHUNK_MAGIC, "ci chunk magic mismatch");
+    let mut pos = 8usize;
+    let read_len = |bytes: &[u8], pos: &mut usize| -> Result<usize> {
+        ensure!(*pos + 8 <= bytes.len(), "ci chunk: short read for u64");
+        let v = u64::from_le_bytes(bytes[*pos..*pos + 8].try_into().unwrap());
+        *pos += 8;
+        Ok(usize::try_from(v)?)
+    };
+
+    let num_commitments = read_len(bytes, &mut pos)?;
+    // Each commitment owns three u64 lengths (24 bytes); rejecting impossible
+    // counts here keeps a corrupt header from driving a huge allocation.
+    ensure!(
+        num_commitments <= (bytes.len() - pos) / 24,
+        "ci chunk: commitment count exceeds chunk size"
+    );
+    let mut pub_lens = Vec::with_capacity(num_commitments);
+    let mut priv_lens = Vec::with_capacity(num_commitments);
+    let mut chal_lens = Vec::with_capacity(num_commitments);
+    for _ in 0..num_commitments {
+        pub_lens.push(read_len(bytes, &mut pos)?);
+        priv_lens.push(read_len(bytes, &mut pos)?);
+        chal_lens.push(read_len(bytes, &mut pos)?);
+    }
+    pos = round_up(pos as u64, MMAP_ALIGN as u64) as usize;
+
+    // Decode one section (all vectors of one kind, concatenated) and step over
+    // its trailing padding. `what` only feeds the error message.
+    let mut read_section = |lens: Vec<usize>, what: &str| -> Result<Vec<Vec<u64>>> {
+        let mut vecs = Vec::with_capacity(lens.len());
+        for len in lens {
+            let end = len
+                .checked_mul(8)
+                .and_then(|n| pos.checked_add(n))
+                .filter(|&end| end <= bytes.len())
+                .ok_or_else(|| anyhow::anyhow!("ci chunk: short {what}"))?;
+            let words = bytes[pos..end]
+                .chunks_exact(8)
+                .map(|w| u64::from_ne_bytes(w.try_into().unwrap()));
+            vecs.push(words.collect::<Vec<u64>>());
+            pos = end;
+        }
+        pos = round_up(pos as u64, MMAP_ALIGN as u64) as usize;
+        Ok(vecs)
+    };
+    let pub_vecs = read_section(pub_lens, "pub")?;
+    let priv_vecs = read_section(priv_lens, "priv")?;
+    let chal_vecs = read_section(chal_lens, "chal")?;
+
+    let triples = pub_vecs.into_iter().zip(priv_vecs).zip(chal_vecs);
+    let triples: Vec<CommitmentInfoTriple> = triples.map(|((p, pr), ch)| (p, pr, ch)).collect();
+    Ok((triples, pos))
+}
+
+/// Open a file and validate it is a mmap-format `.pkp` (i.e. has the
+/// [`MMAP_SENTINEL`] following the 21-byte common header). Returns the open
+/// file handle and the offset (header + sentinel = 25) at which the postcard
+/// metadata starts. The common header itself is not re-checked here — callers
+/// validate it via `provekit_common::binary_format`. Seek/read failures
+/// (e.g. a truncated file) are reported together with the path.
+pub fn open_mmap_pkp(path: &Path) -> Result<(File, u64)> {
+    const HEADER_LEN: u64 = 21;
+    let mut file = OpenOptions::new()
+        .read(true)
+        .open(path)
+        .with_context(|| format!("opening {}", path.display()))?;
+    let mut sentinel = [0u8; 4];
+    file.seek(SeekFrom::Start(HEADER_LEN))
+        .and_then(|_| file.read_exact(&mut sentinel))
+        .with_context(|| format!("reading mmap sentinel from {}", path.display()))?;
+    ensure!(
+        sentinel == MMAP_SENTINEL,
+        "not an mmap-format .pkp (sentinel mismatch)"
+    );
+    Ok((file, HEADER_LEN + sentinel.len() as u64))
+}
+
+#[cfg(test)]
+mod tests {
+    use {super::*, ark_ec::AffineRepr, provekit_common::R1CS, tempfile::tempdir};
+
+    /// Round-trip: setup a tiny PK, write its sections via
+    /// `write_pk_sections` into a bare file, then mmap-load it and assert the
+    /// big arrays compare equal to the originals (and the small scalars equal
+    /// theirs). This is the format-stability test for the on-disk layout.
+    #[test]
+    fn test_mmap_pk_roundtrip() {
+        // Trivial circuit: x * x = y
+        let mut r1cs = R1CS::new();
+        r1cs.num_public_inputs = 1;
+        r1cs.add_witnesses(3);
+        let one = ark_bn254::Fr::from(1u64);
+        r1cs.add_constraint(&[(one, 2)], &[(one, 2)], &[(one, 1)]);
+        let (pk, _vk) = crate::setup::setup(&r1cs, &[], &[]).unwrap();
+
+        let dir = tempdir().unwrap();
+        let path = dir.path().join("pk_sections.bin");
+
+        // Layout the test file as: [section_count + table + bodies] starting
+        // at offset 0, matching what `MmapProvingKey::load(mmap, 0)` expects.
+        {
+            let mut f = File::create(&path).unwrap();
+            write_pk_sections(&pk, &mut f).unwrap();
+            f.sync_all().unwrap();
+        }
+
+        let file = std::fs::File::open(&path).unwrap();
+        let mmap = unsafe { Mmap::map(&file).unwrap() };
+        let loaded = MmapProvingKey::load(mmap, 0).unwrap();
+
+        // Big arrays: element-wise equality (`assert_eq!` on slices) between
+        // the live PK and the mmap-loaded view.
+        assert_eq!(loaded.g1_a().len(), pk.g1_a.len(), "g1_a length");
+        assert_eq!(loaded.g1_a(), pk.g1_a.as_slice(), "g1_a contents");
+        assert_eq!(loaded.g1_b(), pk.g1_b.as_slice(), "g1_b contents");
+        assert_eq!(loaded.g1_z(), pk.g1_z.as_slice(), "g1_z contents");
+        assert_eq!(loaded.g1_k(), pk.g1_k.as_slice(), "g1_k contents");
+        assert_eq!(loaded.g2_b(), pk.g2_b.as_slice(), "g2_b contents");
+        assert_eq!(loaded.infinity_a(), pk.infinity_a.as_slice());
+        assert_eq!(loaded.infinity_b(), pk.infinity_b.as_slice());
+
+        // Small scalars / individual points. For `commitment_keys` only the
+        // number of keys is compared, not their contents.
+        assert_eq!(loaded.domain_size, pk.domain_size);
+        assert_eq!(loaded.domain_gen, pk.domain_gen);
+        assert_eq!(loaded.g1_alpha, pk.g1_alpha);
+        assert_eq!(loaded.g1_beta, pk.g1_beta);
+        assert_eq!(loaded.g1_delta, pk.g1_delta);
+        assert_eq!(loaded.g2_beta, pk.g2_beta);
+        assert_eq!(loaded.g2_delta, pk.g2_delta);
+        assert_eq!(loaded.nb_infinity_a, pk.nb_infinity_a);
+        assert_eq!(loaded.nb_infinity_b, pk.nb_infinity_b);
+        assert_eq!(loaded.commitment_keys.len(), pk.commitment_keys.len());
+
+        // Sanity: the points are still on the curve after the mmap cast.
+        for p in loaded.g1_a() {
+            assert!(p.is_on_curve() || p.is_zero());
+        }
+    }
+
+    #[test]
+    fn test_section_id_roundtrip() {
+        for sid in [
+            SectionId::G1A,
+            SectionId::G1B,
+            SectionId::G1Z,
+            SectionId::G1K,
+            SectionId::G2B,
+            SectionId::InfinityA,
+            SectionId::InfinityB,
+            SectionId::Scalars,
+            SectionId::PedersenIndex,
+            SectionId::PedersenBases,
+            SectionId::PedersenBasesExpSigma,
+        ] {
+            let v = sid as u32;
+            assert_eq!(SectionId::from_u32(v), Some(sid));
+        }
+        assert_eq!(SectionId::from_u32(0), None); // 0 is expected to be unassigned
+        assert_eq!(SectionId::from_u32(99), None); // as is an arbitrary large value
+    }
+}
diff --git a/provekit/groth16/src/pedersen.rs b/provekit/groth16/src/pedersen.rs
new file mode 100644
index 000000000..0efb0cc9c
--- /dev/null
+++ b/provekit/groth16/src/pedersen.rs
@@ -0,0 +1,338 @@
+//! Pedersen commitment scheme for BSB22 extension.
+//!
+//! A Pedersen commitment C = Σ vᵢ·Gᵢ binds the prover to values v₁..vₖ
+//! using bases G₁..Gₖ from the trusted setup. The proof of knowledge (PoK)
+//! proves the prover knows the committed values without revealing them.
+use anyhow::{ensure, Result};
+use {
+    ark_bn254::{Fr, G1Affine, G1Projective, G2Affine, G2Projective},
+    ark_ec::{AffineRepr, CurveGroup, VariableBaseMSM},
+    ark_ff::{One, UniformRand, Zero},
+    ark_serialize::{CanonicalDeserialize, CanonicalSerialize},
+    zeroize::Zeroizing,
+};
+
+/// A Pedersen commitment `C = Σ vᵢ · Gᵢ` (as produced by `commit`) — binds the prover to values.
+///
+/// Wrapping `G1Affine` in a distinct newtype prevents accidentally passing a
+/// proof of knowledge where a commitment is expected (or vice versa): they're
+/// both `G1Affine` at the curve level but represent semantically distinct
+/// objects, and a swap would silently verify the wrong pairing equation.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub struct Commitment(pub G1Affine);
+
+/// A Pedersen proof of knowledge `PoK = Σ vᵢ · (σ·Gᵢ)`, with `σ` the secret
+/// sampled in [`setup`] — proves the prover knows the opening of a
+/// [`Commitment`] without revealing it. See [`Commitment`] for why it is a newtype.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub struct ProofOfKnowledge(pub G1Affine);
+
+/// Pedersen proving key: bases for commitment and PoK generation.
+#[derive(Clone, Debug, CanonicalSerialize, CanonicalDeserialize)]
+pub struct ProvingKey {
+    /// Original bases [G₁, G₂, ..., Gₖ] from trusted setup.
+    pub basis:           Vec<G1Affine>,
+    /// Bases multiplied by the secret sigma: [σ·G₁, σ·G₂, ..., σ·Gₖ].
+    pub basis_exp_sigma: Vec<G1Affine>,
+}
+
+/// Pedersen verifying key: G2 elements for pairing-based verification.
+#[derive(Clone, Debug, CanonicalSerialize, CanonicalDeserialize)]
+pub struct VerifyingKey {
+    /// G2 point fixed during setup (caller-supplied, or random when none is given).
+    pub g:           G2Affine,
+    /// −σ·G (written G^(-σ) multiplicatively), where σ is the secret from setup.
+    pub g_sigma_neg: G2Affine,
+}
+
+/// Generate Pedersen commitment keys from bases.
+///
+/// `bases_per_commitment` holds one set of bases per commitment; one
+/// [`ProvingKey`] is returned for each, all sharing a single freshly sampled
+/// secret `σ` and therefore a single [`VerifyingKey`]. `g2_point` optionally
+/// fixes the G2 point (a random one is sampled if `None`).
+///
+/// Errors if the G2 point is the identity (every pairing against it is 1, so
+/// verification would accept any proof) or if `σ` samples to zero.
+pub fn setup(
+    bases_per_commitment: &[&[G1Affine]],
+    g2_point: Option<G2Affine>,
+) -> Result<(Vec<ProvingKey>, VerifyingKey)> {
+    let mut rng = ark_std::rand::thread_rng();
+
+    let g = g2_point.unwrap_or_else(|| G2Projective::rand(&mut rng).into_affine());
+    ensure!(!g.is_zero(), "pedersen setup: G2 point must not be the identity");
+
+    // `Zeroizing` wipes the field element when it drops, so the toxic Pedersen
+    // secret can't be recovered from freed memory after setup returns.
+    let sigma = Zeroizing::new(Fr::rand(&mut rng));
+    ensure!(!sigma.is_zero(), "sigma must be non-zero");
+
+    // −σ·G, the G2 element the verifier pairs commitments against.
+    let g_sigma_neg: G2Affine = (-(G2Projective::from(g) * *sigma)).into_affine();
+    let vk = VerifyingKey { g, g_sigma_neg };
+
+    let pks: Vec<ProvingKey> = bases_per_commitment
+        .iter()
+        .map(|bases| {
+            // BasisExpSigma[i] = Basis[i] * sigma
+            let basis_exp_sigma: Vec<G1Affine> = bases
+                .iter()
+                .map(|b| (G1Projective::from(*b) * *sigma).into_affine())
+                .collect();
+            ProvingKey {
+                basis: bases.to_vec(),
+                basis_exp_sigma,
+            }
+        })
+        .collect();
+    Ok((pks, vk))
+}
+
+/// Chunk size for Pedersen MSMs. arkworks' `VariableBaseMSM` keeps a projective
+/// copy of every base plus per-thread bucket state, so a single 1M-element call
+/// holds hundreds of MB of transient memory. Splitting into 100k-element chunks
+/// caps that to ~tens of MB at the cost of ~10% wall clock.
+const PEDERSEN_MSM_CHUNK: usize = 100_000;
+
+/// MSM of `bases` by `values`, evaluated chunk by chunk and summed; errors on length mismatch.
+fn chunked_g1_msm(bases: &[G1Affine], values: &[Fr]) -> Result<G1Projective> {
+    let (n_bases, n_values) = (bases.len(), values.len());
+    ensure!(
+        n_bases == n_values,
+        "chunked_g1_msm length mismatch: {} bases vs {} values",
+        n_bases,
+        n_values
+    );
+    // Lengths match, so zipping the chunk iterators pairs every base with its value.
+    let pairs = bases.chunks(PEDERSEN_MSM_CHUNK).zip(values.chunks(PEDERSEN_MSM_CHUNK));
+    let mut total = G1Projective::zero();
+    for (base_chunk, value_chunk) in pairs {
+        total += G1Projective::msm(base_chunk, value_chunk).map_err(crate::msm_err)?;
+    }
+    Ok(total)
+}
+
+/// Borrowed view over a Pedersen `ProvingKey`'s bases. Same `commit` /
+/// `prove_knowledge` API as [`ProvingKey`], but the basis slices can point
+/// at either owned `Vec<G1Affine>`s (legacy path) or mmap'd file pages
+/// (rapidsnark-style raw layout). Lets callers be polymorphic over the
+/// backing store without a runtime indirection or memcpy.
+#[derive(Clone, Copy)]
+pub struct ProvingKeyView<'a> {
+    pub basis:           &'a [G1Affine],
+    pub basis_exp_sigma: &'a [G1Affine],
+}
+
+impl<'a> ProvingKeyView<'a> {
+    /// Shared body of [`Self::commit`] and [`Self::prove_knowledge`]: check
+    /// that there is one value per base, then evaluate `Σ vᵢ · bases[i]`.
+    /// `op` is only used to prefix the length-mismatch error message.
+    fn weighted_sum(op: &str, bases: &[G1Affine], values: &[Fr]) -> Result<G1Affine> {
+        ensure!(
+            values.len() == bases.len(),
+            "{}: got {} values, expected {}",
+            op,
+            values.len(),
+            bases.len()
+        );
+        // Nothing to sum: return the identity without invoking the MSM.
+        if values.is_empty() {
+            return Ok(G1Affine::zero());
+        }
+        Ok(chunked_g1_msm(bases, values)?.into_affine())
+    }
+
+    /// Compute Pedersen commitment: `C = Σ vᵢ · Basis[i]`.
+    ///
+    /// Fails if `values` and `basis` differ in length, or if the
+    /// underlying MSM reports an error.
+    pub fn commit(&self, values: &[Fr]) -> Result<Commitment> {
+        Self::weighted_sum("commit", self.basis, values).map(Commitment)
+    }
+
+    /// Generate proof of knowledge: `PoK = Σ vᵢ · BasisExpSigma[i]`.
+    ///
+    /// Fails if `values` and `basis_exp_sigma` differ in length.
+    pub fn prove_knowledge(&self, values: &[Fr]) -> Result<ProofOfKnowledge> {
+        let pok = Self::weighted_sum("prove_knowledge", self.basis_exp_sigma, values)?;
+        Ok(ProofOfKnowledge(pok))
+    }
+}
+
+impl ProvingKey {
+    /// Borrow this owned key as a view. Cheap — just two slice references.
+    pub fn view(&self) -> ProvingKeyView<'_> {
+        ProvingKeyView {
+            basis:           &self.basis,
+            basis_exp_sigma: &self.basis_exp_sigma,
+        }
+    }
+
+    /// Compute Pedersen commitment `C = Σ vᵢ · Basis[i]` (delegates to [`ProvingKeyView::commit`]).
+    pub fn commit(&self, values: &[Fr]) -> Result<Commitment> {
+        self.view().commit(values)
+    }
+
+    /// Generate proof of knowledge: `PoK = Σ vᵢ · BasisExpSigma[i]`.
+    ///
+    /// Proves the prover knows the values inside the commitment without revealing
+    /// them; delegates to the view. The verifier checks e(C, G^(-σ)) · e(PoK, G) == 1.
+    pub fn prove_knowledge(&self, values: &[Fr]) -> Result<ProofOfKnowledge> {
+        self.view().prove_knowledge(values)
+    }
+}
+
+/// Fold multiple G1 points into one using a random linear combination.
+///
+/// Returns: `points[0] + coeff·points[1] + coeff²·points[2] + ...`
+///
+/// An empty slice folds to the identity and a single point is returned as
+/// is; only the general case runs an MSM (whose failure is the sole error).
+pub fn fold(points: &[G1Affine], coeff: Fr) -> Result<G1Affine> {
+    match points {
+        [] => Ok(G1Affine::zero()),
+        [single] => Ok(*single),
+        _ => {
+            // Scalars 1, coeff, coeff², … — exactly one per point.
+            let powers: Vec<Fr> = std::iter::successors(Some(Fr::one()), |p| Some(*p * coeff))
+                .take(points.len())
+                .collect();
+
+            // Σ powers[i] · points[i]
+            let folded = G1Projective::msm(points, &powers).map_err(crate::msm_err)?;
+            Ok(folded.into_affine())
+        }
+    }
+}
+
+/// Batch verify multiple commitments against multiple verifying keys.
+///
+/// Checks, in one multi-pairing, the `rⁱ`-weighted product of the per-commitment
+/// equations (with `r = folding_challenge`):
+///   Π e(rⁱ·Cᵢ, VKᵢ.GSigmaNeg) · e(PoK_folded, G) == 1
+/// where each individual equation is
+///   e(Cᵢ, VKᵢ.GSigmaNeg) · e(PoKᵢ, VKᵢ.G) == 1
+/// and `PoK_folded = Σ rⁱ·PoKᵢ` (see [`fold`]).
+///
+/// All PoKs are expected to have already been folded into a single point, and
+/// all verifying keys must share the same `G`.
+///
+/// Each commitment is paired with its *own* `GSigmaNeg`. Folding both the
+/// commitments and the `GSigmaNeg`s first and pairing the two sums is not
+/// equivalent: bilinearity expands e(Σ rⁱ·Cᵢ, Σ rʲ·GSNⱼ) into every (i, j)
+/// cross term, so honest proofs over two or more commitments would be
+/// rejected.
+///
+/// # Errors
+/// Returns an error if the slice lengths differ, if the verifying keys do not
+/// share one `G`, or if the pairing check fails. An empty batch is accepted.
+pub fn batch_verify_multi_vk(
+    vks: &[VerifyingKey],
+    commitments: &[Commitment],
+    folded_pok: ProofOfKnowledge,
+    folding_challenge: Fr,
+) -> Result<()> {
+    use {ark_bn254::Bn254, ark_ec::pairing::Pairing};
+
+    ensure!(
+        vks.len() == commitments.len(),
+        "batch_verify: {} vks vs {} commitments",
+        vks.len(),
+        commitments.len()
+    );
+
+    if vks.is_empty() {
+        return Ok(());
+    }
+
+    // All VKs must share the same G point. `setup()` always emits a single G,
+    // but a deserialized batch could mix VKs whose `g` differs — pairing the
+    // folded PoK against `vks[0].g` would then quietly check the wrong
+    // equation, so reject the batch outright.
+    let g = vks[0].g;
+    ensure!(
+        vks.iter().all(|v| v.g == g),
+        "batch_verify: all verifying keys must share the same G point"
+    );
+
+    // One pairing input per commitment, (rⁱ·Cᵢ, GSigmaNegᵢ), plus a final
+    // (PoK_folded, G) pair.
+    let mut g1_inputs: Vec<G1Affine> = Vec::with_capacity(vks.len() + 1);
+    let mut g2_inputs: Vec<G2Affine> = Vec::with_capacity(vks.len() + 1);
+    let mut power = Fr::one();
+    for (vk, commitment) in vks.iter().zip(commitments) {
+        g1_inputs.push((G1Projective::from(commitment.0) * power).into_affine());
+        g2_inputs.push(vk.g_sigma_neg);
+        power *= folding_challenge;
+    }
+    g1_inputs.push(folded_pok.0);
+    g2_inputs.push(g);
+
+    let result = Bn254::multi_pairing(g1_inputs, g2_inputs);
+    ensure!(
+        result.0.is_one(),
+        "pedersen batch verification failed: pairing check did not pass"
+    );
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use {super::*, ark_ff::UniformRand};
+
+    #[test]
+    fn test_commit_and_verify() {
+        let mut rng = ark_std::test_rng();
+
+        // Generate random bases
+        let bases: Vec<G1Affine> = (0..5)
+            .map(|_| G1Projective::rand(&mut rng).into_affine())
+            .collect();
+
+        let (pks, vk) = setup(&[&bases], None).unwrap();
+        let pk = &pks[0];
+
+        // Commit to random values
+        let values: Vec<Fr> = (0..5).map(|_| Fr::rand(&mut rng)).collect();
+        let commitment = pk.commit(&values).unwrap();
+        let pok = pk.prove_knowledge(&values).unwrap();
+
+        // Verify (single commitment only; no multi-commitment batch is exercised here)
+        batch_verify_multi_vk(
+            &[vk],
+            &[commitment],
+            pok,
+            Fr::one(), // trivial challenge for single commitment
+        )
+        .unwrap();
+    }
+
+    #[test]
+    fn test_fold_single() {
+        let mut rng = ark_std::test_rng();
+        let p = G1Projective::rand(&mut rng).into_affine();
+        let result = fold(&[p], Fr::rand(&mut rng)).unwrap();
+        assert_eq!(result, p); // a lone point comes back unchanged, whatever the coefficient
+    }
+
+    #[test]
+    fn test_commit_wrong_values_fails() {
+        let mut rng = ark_std::test_rng();
+        let bases: Vec<G1Affine> = (0..3)
+            .map(|_| G1Projective::rand(&mut rng).into_affine())
+            .collect();
+        let (pks, vk) = setup(&[&bases], None).unwrap();
+        let pk = &pks[0];
+
+        let values: Vec<Fr> = (0..3).map(|_| Fr::rand(&mut rng)).collect();
+        let commitment = pk.commit(&values).unwrap();
+
+        // Generate PoK with WRONG values (freshly sampled, unrelated to `values`)
+        let wrong_values: Vec<Fr> = (0..3).map(|_| Fr::rand(&mut rng)).collect();
+        let wrong_pok = pk.prove_knowledge(&wrong_values).unwrap();
+
+        let result = batch_verify_multi_vk(&[vk], &[commitment], wrong_pok, Fr::one());
+        assert!(result.is_err());
+    }
+}
diff --git a/provekit/groth16/src/prover.rs b/provekit/groth16/src/prover.rs
new file mode 100644
index 000000000..4bcc7f397
--- /dev/null
+++ b/provekit/groth16/src/prover.rs
@@ -0,0 +1,373 @@
+//! Groth16+BSB22 prover building blocks: generates proofs from R1CS + witness.
+
+use {
+    crate::{pedersen, CommitmentInfo, BSB22_FOLD_DST, COMMITMENT_DST, FR_BYTES},
+    anyhow::{ensure, Result},
+    ark_bn254::{Fr, G1Affine, G1Projective, G2Affine, G2Projective},
+    ark_ec::{AffineRepr, CurveGroup, VariableBaseMSM},
+    ark_ff::{FftField, Field, One, PrimeField, Zero},
+    ark_poly::{EvaluationDomain, Radix2EvaluationDomain},
+    rayon::{self, prelude::*},
+    tracing::{info_span, instrument},
+};
+
+/// BSB22 batched proof of knowledge over all commitments, folded into a
+/// single G1 element. Independent of `H`, so callers can run this in parallel
+/// with [`compute_h`]. Errors on a key/value count mismatch or a bad wire index.
+#[instrument(skip_all)]
+pub fn bsb22_pok(
+    commitment_keys: &[pedersen::ProvingKeyView<'_>],
+    committed_values: &[Vec<Fr>],
+    challenge_wire_indices: &[usize],
+    wire_values: &[Fr],
+) -> Result<G1Affine> {
+    let (n_keys, n_vals) = (commitment_keys.len(), committed_values.len());
+    // `zip` stops at the shorter side, which would silently drop PoKs.
+    ensure!(n_keys == n_vals, "bsb22_pok: {n_keys} commitment keys vs {n_vals} value sets");
+    let poks: Vec<G1Affine> = commitment_keys
+        .iter()
+        .zip(committed_values.iter())
+        .map(|(ck, vals)| ck.prove_knowledge(vals).map(|p| p.0))
+        .collect::<Result<Vec<_>>>()?;
+    if poks.is_empty() {
+        return Ok(G1Affine::zero());
+    }
+
+    // The fold challenge is the hash of the challenge wires' serialized values.
+    let mut challenge_input: Vec<u8> = Vec::with_capacity(FR_BYTES * challenge_wire_indices.len());
+    for &wire_idx in challenge_wire_indices {
+        let val = wire_values.get(wire_idx).ok_or_else(|| {
+            anyhow::anyhow!(
+                "challenge wire index {wire_idx} out of bounds (witness len = {})",
+                wire_values.len()
+            )
+        })?;
+        challenge_input.extend_from_slice(&fr_to_bytes(val)?);
+    }
+    pedersen::fold(&poks, hash_to_fr(&challenge_input, BSB22_FOLD_DST)?)
+}
+
+/// Compute `A_r`, `B_s`, and `Bs1` (the G1 form of `B_s` needed later in the
+/// `Krs` cross-term). Independent of `H`, so callers can run this in parallel
+/// with `compute_h`. Panics if a `non_inf_a`/`non_inf_b` index is out of range.
+#[allow(clippy::too_many_arguments)]
+#[instrument(skip_all)]
+pub fn prove_ar_bs_bs1(
+    g1_a: &[G1Affine],
+    g1_b: &[G1Affine],
+    g2_b: &[G2Affine],
+    non_inf_a: &[usize],
+    non_inf_b: &[usize],
+    wire_values: &[Fr],
+    g1_alpha: G1Affine,
+    g1_beta: G1Affine,
+    g2_beta: G2Affine,
+    g2_delta: G2Affine,
+    r_delta: G1Affine,
+    s_delta: G1Affine,
+    s_scalar: Fr,
+) -> Result<(G1Affine, G2Affine, G1Projective)> {
+    // Direct gather using the precomputed non-infinity index lists from
+    // setup. Replaces the original "iterate all wires, filter by
+    // `infinity_a/b[i]`" pattern — fewer iterations, no bool branch.
+    let (wire_values_a, wire_values_b) = {
+        let _s = info_span!("gather_wires_ab").entered();
+        rayon::join(
+            || {
+                non_inf_a
+                    .iter()
+                    .map(|&i| wire_values[i])
+                    .collect::<Vec<Fr>>()
+            },
+            || {
+                non_inf_b
+                    .iter()
+                    .map(|&i| wire_values[i])
+                    .collect::<Vec<Fr>>()
+            },
+        )
+    };
+
+    let _s = info_span!("msm_ar_bs").entered();
+    // Sequential, not nested-rayon::join: arkworks' MSM is already rayon-
+    // parallel internally, so concurrent MSMs would just stack bucket
+    // allocators (~3×) without speeding up wall-clock. Sequential keeps one
+    // bucket set alive at a time — important when this whole function runs
+    // in parallel with `compute_h`.
+    let ar = {
+        let msm = G1Projective::msm(g1_a, &wire_values_a).map_err(crate::msm_err)?;
+        let mut result = msm;
+        result += G1Projective::from(g1_alpha);
+        result += G1Projective::from(r_delta);
+        result.into_affine()
+    };
+    let bs = {
+        let msm =
+            <G2Projective as VariableBaseMSM>::msm(g2_b, &wire_values_b).map_err(crate::msm_err)?;
+        let mut result = msm;
+        result += G2Projective::from(g2_beta);
+        result += G2Projective::from(g2_delta) * s_scalar;
+        result.into_affine()
+    };
+    let bs1 = {
+        let msm = G1Projective::msm(g1_b, &wire_values_b).map_err(crate::msm_err)?;
+        let mut result = msm;
+        result += G1Projective::from(g1_beta);
+        result += G1Projective::from(s_delta);
+        result
+    };
+    Ok((ar, bs, bs1))
+}
+
+/// Compute `Krs`: MSM of `g1_k` over the private wires that are neither
+/// privately committed nor challenge wires, plus the MSM of `g1_z` with the
+/// first `domain_size - 1` coefficients of `h`, plus `kr_delta` and the
+/// cross-terms `s·Ar + r·Bs1`. Errors (instead of panicking) if
+/// `r1cs_nb_public` exceeds the witness length or `domain_size` is zero.
+#[allow(clippy::too_many_arguments)]
+#[instrument(skip_all)]
+pub fn prove_krs(
+    g1_k: &[G1Affine],
+    g1_z: &[G1Affine],
+    h: &[Fr],
+    wire_values: &[Fr],
+    r1cs_nb_public: usize,
+    commitment_info: &[CommitmentInfo],
+    challenge_wire_indices: &[usize],
+    domain_size: u64,
+    ar: G1Affine,
+    bs1: G1Projective,
+    kr_delta: G1Affine,
+    r_scalar: Fr,
+    s_scalar: Fr,
+) -> Result<G1Affine> {
+    ensure!(r1cs_nb_public <= wire_values.len(), "prove_krs: more public inputs than wires");
+    ensure!(domain_size > 0, "prove_krs: domain_size must be non-zero");
+    let private_wire_values: Vec<Fr> = {
+        let _s = info_span!("filter_private_wires").entered();
+        let mut to_remove: Vec<usize> = Vec::new();
+        for ci in commitment_info {
+            to_remove.extend_from_slice(&ci.private_committed);
+        }
+        to_remove.extend_from_slice(challenge_wire_indices);
+        to_remove.sort_unstable();
+        to_remove.dedup();
+        filter_by_sorted_indices(&wire_values[r1cs_nb_public..], &to_remove, r1cs_nb_public)
+    };
+    ensure!(
+        private_wire_values.len() == g1_k.len(),
+        "private wire count mismatch: got {}, expected {}",
+        private_wire_values.len(),
+        g1_k.len()
+    );
+
+    let _s = info_span!("msm_krs").entered();
+    let size_h = domain_size as usize - 1;
+    let (krs1_result, krs2_result) = rayon::join(
+        || G1Projective::msm(g1_k, &private_wire_values).map_err(crate::msm_err),
+        || {
+            if h.is_empty() || g1_z.is_empty() {
+                return Ok(G1Projective::zero());
+            }
+            let len = size_h.min(h.len()).min(g1_z.len());
+            G1Projective::msm(&g1_z[..len], &h[..len]).map_err(crate::msm_err)
+        },
+    );
+    let mut result = krs1_result? + krs2_result?;
+    result += G1Projective::from(kr_delta);
+
+    // Cross-terms: s·Ar + r·Bs1
+    let (s_ar, r_bs1) = rayon::join(|| G1Projective::from(ar) * s_scalar, || bs1 * r_scalar);
+    result += s_ar;
+    result += r_bs1;
+    Ok(result.into_affine())
+}
+
+/// Return the elements of `slice` whose absolute index (position in `slice`
+/// plus `base_offset`) is not listed in `sorted_indices`.
+///
+/// Single merge-style pass, O(n + k). `sorted_indices` must be sorted
+/// ascending and deduplicated; entries that fall outside the index range
+/// covered by `slice` never match and are skipped.
+fn filter_by_sorted_indices(slice: &[Fr], sorted_indices: &[usize], base_offset: usize) -> Vec<Fr> {
+    let mut pending = sorted_indices.iter().copied().peekable();
+    let mut kept = Vec::with_capacity(slice.len());
+
+    for (offset, value) in slice.iter().enumerate() {
+        let absolute = offset + base_offset;
+        // Discard removal entries that lie before the current position.
+        while pending.next_if(|&idx| idx < absolute).is_some() {}
+        // A matching entry is consumed and the element dropped; otherwise
+        // the element is kept.
+        if pending.next_if_eq(&absolute).is_none() {
+            kept.push(*value);
+        }
+    }
+    kept
+}
+
+/// Compute quotient polynomial H from the R1CS solution vectors.
+///
+/// Buffers are consumed: `a_evals` is reused in-place for the returned H
+/// coefficients (avoiding a second domain-sized allocation); `b_evals` /
+/// `c_evals` are dropped at end of call. Short buffers are zero-padded to
+/// `domain.size()` internally.
+///
+/// # Errors
+/// Fails if any buffer is longer than `domain.size()` (padding with `resize`
+/// would otherwise silently truncate it and drop constraint evaluations), if
+/// the coset domain cannot be constructed, or if `Z` vanishes on the coset.
+#[instrument(skip_all)]
+pub fn compute_h(
+    mut a_evals: Vec<Fr>,
+    mut b_evals: Vec<Fr>,
+    mut c_evals: Vec<Fr>,
+    domain: &Radix2EvaluationDomain<Fr>,
+) -> Result<Vec<Fr>> {
+    let n = domain.size();
+    ensure!(
+        a_evals.len() <= n && b_evals.len() <= n && c_evals.len() <= n,
+        "compute_h: evaluation buffers must not exceed the domain size {n}"
+    );
+
+    // Pad to domain size
+    a_evals.resize(n, Fr::zero());
+    b_evals.resize(n, Fr::zero());
+    c_evals.resize(n, Fr::zero());
+
+    let coset_domain = domain
+        .get_coset(Fr::GENERATOR)
+        .ok_or_else(|| anyhow::anyhow!("failed to construct coset domain"))?;
+
+    // IFFT → coset FFT for each buffer. The three pipelines are independent
+    // (separate buffers, immutable domain refs), so run them in parallel.
+    let to_coset_evals = |buf: &mut Vec<Fr>| {
+        domain.ifft_in_place(buf);
+        coset_domain.fft_in_place(buf);
+    };
+    rayon::join(
+        || to_coset_evals(&mut a_evals),
+        || {
+            rayon::join(
+                || to_coset_evals(&mut b_evals),
+                || to_coset_evals(&mut c_evals),
+            )
+        },
+    );
+
+    // Pointwise: a[i] = (a[i] * b[i] - c[i]) / Z(coset), computed in parallel.
+    // Reuses a_evals in-place to avoid an extra domain-sized allocation.
+    // Z(g·ωⁱ) = (g·ωⁱ)^N - 1 = g^N - 1 (constant on coset)
+    let z_inv = (Fr::GENERATOR.pow([n as u64]) - Fr::one())
+        .inverse()
+        .ok_or_else(|| anyhow::anyhow!("Z(coset) is zero, cannot invert"))?;
+    a_evals
+        .par_iter_mut()
+        .zip(b_evals.par_iter())
+        .zip(c_evals.par_iter())
+        .for_each(|((a, b), c)| {
+            *a = (*a * b - c) * z_inv;
+        });
+
+    // IFFT on coset: evaluation on coset → coefficient form
+    coset_domain.ifft_in_place(&mut a_evals);
+    Ok(a_evals)
+}
+
+/// Serialize `val` (arkworks compressed encoding) into a new buffer of `FR_BYTES` bytes.
+pub fn fr_to_bytes(val: &Fr) -> Result<Vec<u8>> {
+    let mut buf = vec![0u8; FR_BYTES];
+    ark_serialize::CanonicalSerialize::serialize_compressed(val, buf.as_mut_slice())
+        .map_err(|e| anyhow::anyhow!("failed to serialize Fr: {e}"))?;
+    Ok(buf)
+}
+
+/// Hash bytes with a domain separator to produce a field element.
+///
+/// Uses EVM-native Keccak-256 over `dst || msg`, then interprets the 32-byte
+/// digest as a big-endian integer reduced mod R.
+///
+/// Bias note: the output is NOT near-uniform. 2^256 ≈ 5.29·R, so each residue
+/// has 5 or 6 preimages (statistical distance from uniform ≈ 0.04). It still
+/// carries > 253 bits of min-entropy, which should suffice for BSB22 challenges.
+///
+/// Intentionally diverges from the BSB22-spec hash (RFC 9380
+/// `expand_message_xmd-SHA256`) — the trade is ~130 k gas of on-chain
+/// SHA-256 + XMD scaffolding for a single `keccak256` opcode.
+pub fn hash_to_fr(msg: &[u8], dst: &[u8]) -> Result<Fr> {
+    use sha3::{Digest, Keccak256};
+    let mut h = Keccak256::new();
+    h.update(dst);
+    h.update(msg);
+    let digest: [u8; 32] = h.finalize().into();
+    Ok(Fr::from_be_bytes_mod_order(&digest))
+}
+
+/// Derive `count` field elements from `msg` under the domain separator `dst`.
+///
+/// Single-root counter chain:
+/// ```text
+///   root      = keccak256(dst || msg)
+///   out[i]    = keccak256(root || I2OSP(i, 1))   reduced mod R
+/// ```
+///
+/// `msg` is hashed only once; every output then costs one 33-byte hash of the
+/// root plus a one-byte counter, which is why `count` is limited to 255.
+pub fn hash_to_fr_multi(msg: &[u8], dst: &[u8], count: usize) -> Result<Vec<Fr>> {
+    use sha3::{Digest, Keccak256};
+    // The counter is hashed as a single byte, hence the cap.
+    ensure!(count <= 255, "hash_to_fr_multi: count must fit in one byte");
+
+    // root = keccak256(dst || msg)
+    let mut root_hasher = Keccak256::new();
+    root_hasher.update(dst);
+    root_hasher.update(msg);
+    let root: [u8; 32] = root_hasher.finalize().into();
+
+    // out[i] = keccak256(root || i), read big-endian and reduced mod R.
+    let mut elements = Vec::with_capacity(count);
+    for counter in 0..count {
+        let mut hasher = Keccak256::new();
+        hasher.update(root);
+        hasher.update([counter as u8]);
+        let digest: [u8; 32] = hasher.finalize().into();
+        elements.push(Fr::from_be_bytes_mod_order(&digest));
+    }
+    Ok(elements)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_hash_to_fr_deterministic() {
+        let data = b"test data";
+        let dst = b"test dst";
+        let h1 = hash_to_fr(data, dst).unwrap();
+        let h2 = hash_to_fr(data, dst).unwrap();
+        assert_eq!(h1, h2);
+    }
+
+    #[test]
+    fn test_hash_to_fr_different_inputs() {
+        let h1 = hash_to_fr(b"input1", b"dst").unwrap();
+        let h2 = hash_to_fr(b"input2", b"dst").unwrap();
+        assert_ne!(h1, h2);
+    }
+
+    #[test]
+    fn test_hash_to_fr_produces_nonzero() {
+        let h = hash_to_fr(b"test", b"dst").unwrap();
+        assert!(!h.is_zero()); // holds for this fixed input; zero is not excluded in general
+    }
+
+    #[test]
+    fn test_hash_to_fr_multi() {
+        let results = hash_to_fr_multi(b"test", b"dst", 3).unwrap();
+        assert_eq!(results.len(), 3);
+        // Adjacent outputs must differ (results[0] vs results[2] is not compared)
+        assert_ne!(results[0], results[1]);
+        assert_ne!(results[1], results[2]);
+    }
+}
diff --git a/provekit/groth16/src/setup.rs b/provekit/groth16/src/setup.rs
new file mode 100644
index 000000000..49c7502b6
--- /dev/null
+++ b/provekit/groth16/src/setup.rs
@@ -0,0 +1,477 @@
+/// Groth16 trusted setup: generates ProvingKey and VerifyingKey from R1CS.
+///
+/// Notation follows DIZK paper Figure 4.
+use anyhow::Result;
+use {
+    crate::{pedersen, CommitmentInfo},
+    ark_bn254::{Fr, G1Affine, G1Projective, G2Affine, G2Projective},
+    ark_ec::{scalar_mul::BatchMulPreprocessing, AffineRepr, CurveGroup},
+    ark_ff::{Field, One, UniformRand, Zero},
+    ark_poly::{EvaluationDomain, Radix2EvaluationDomain},
+    ark_std::rand::Rng,
+    provekit_common::R1CS,
+    rayon::prelude::*,
+};
+
+/// Toxic waste: secret random values used during setup and then destroyed.
+///
+/// `ZeroizeOnDrop` wipes every secret field when the value goes out of scope,
+/// so the trusted-setup secrets can't be recovered from freed memory.
+#[derive(zeroize::Zeroize, zeroize::ZeroizeOnDrop)]
+struct ToxicWaste {
+    t:         Fr, // evaluation point for A, B, C and the Z(t) powers
+    alpha:     Fr, // multiplies B(i) in K(i); published as [α]₁
+    beta:      Fr, // multiplies A(i) in K(i); published as [β]₁ and [β]₂
+    gamma:     Fr, // published as [γ]₂
+    delta:     Fr, // published as [δ]₁ and [δ]₂
+    gamma_inv: Fr, // 1/γ: scales K(i) for public, challenge and committed wires
+    delta_inv: Fr, // 1/δ: scales K(i) for the other private wires and Z(t)
+}
+
+impl ToxicWaste {
+    /// Draws the five setup secrets from `rng`, redrawing any zero so that
+    /// `gamma` and `delta` are invertible, and caches both inverses.
+    fn sample<R: Rng>(rng: &mut R) -> Result<Self> {
+        // Rejection sampling: keep drawing until the element is nonzero.
+        fn nonzero<G: Rng>(rng: &mut G) -> Fr {
+            let mut candidate = Fr::rand(rng);
+            while candidate.is_zero() {
+                candidate = Fr::rand(rng);
+            }
+            candidate
+        }
+
+        // `from_fn` fills front to back: draw order stays t, α, β, γ, δ.
+        let [t, alpha, beta, gamma, delta] = std::array::from_fn(|_| nonzero(&mut *rng));
+        let gamma_inv = gamma
+            .inverse()
+            .ok_or_else(|| anyhow::anyhow!("gamma is zero, cannot invert"))?;
+        let delta_inv = delta
+            .inverse()
+            .ok_or_else(|| anyhow::anyhow!("delta is zero, cannot invert"))?;
+        Ok(ToxicWaste {
+            t,
+            alpha,
+            beta,
+            gamma,
+            delta,
+            gamma_inv,
+            delta_inv,
+        })
+    }
+}
+
+/// Run the Groth16 trusted setup.
+///
+/// Challenge wires are taken from `commitment_info.challenge_indices` —
+/// single source of truth, no separate `challenge_wire_indices` to keep
+/// in sync between setup, prover, and verifier.
+///
+/// Errors, rather than panicking or emitting keys the verifier rejects, when
+/// `commitment_info` has out-of-range, duplicated, overlapping or empty wire
+/// lists, or differs in length from `num_challenges_per_commitment`.
+pub fn setup(
+    r1cs: &R1CS,
+    commitment_info: &[CommitmentInfo],
+    num_challenges_per_commitment: &[usize],
+) -> Result<(crate::ProvingKey, crate::VerifyingKey)> {
+    let nb_wires = r1cs.num_witnesses();
+    // nb_public_variables includes constant-1 wire
+    let nb_public_variables = 1 + r1cs.num_public_inputs;
+    anyhow::ensure!(nb_public_variables <= nb_wires, "more public variables than R1CS wires");
+    anyhow::ensure!(
+        num_challenges_per_commitment.len() == commitment_info.len(),
+        "num_challenges_per_commitment must have one entry per commitment"
+    );
+    let private_range = nb_public_variables..nb_wires;
+
+    // Wire-id → commitment-index lookup. Wire ids are dense in `0..nb_wires`,
+    // so a direct-indexed `Vec<Option<usize>>` is both faster (no hashing, hot
+    // in cache) and smaller than a `HashMap<usize, usize>` for the typical
+    // case where most wires belong to no commitment.
+    let mut committed_map: Vec<Option<usize>> = vec![None; nb_wires];
+    for (ci, info) in commitment_info.iter().enumerate() {
+        // An empty commitment would get no Pedersen key, leaving the VK with
+        // fewer keys than commitments, which the verifier rejects.
+        anyhow::ensure!(!info.private_committed.is_empty(), "commitment {ci} commits no wires");
+        for &wire_id in &info.private_committed {
+            // `contains` is tested first, so the index below is in bounds.
+            anyhow::ensure!(
+                private_range.contains(&wire_id) && committed_map[wire_id].is_none(),
+                "commitment {ci}: wire {wire_id} is not private or is committed twice"
+            );
+            committed_map[wire_id] = Some(ci);
+        }
+    }
+
+    // Flatten challenge wire indices across commitments in iteration order;
+    // within each commitment, in `challenge_indices` order. Single source of
+    // truth, so prover and setup cannot drift.
+    let challenge_wire_indices: Vec<usize> = commitment_info
+        .iter()
+        .flat_map(|ci| ci.challenge_indices.iter().copied())
+        .collect();
+    let total_challenge_wires = challenge_wire_indices.len();
+
+    // Challenge wires must be distinct private-range wires that no commitment
+    // also commits to; the bucket sizes computed below rely on that.
+    let mut commitment_wire_set: Vec<bool> = vec![false; nb_wires];
+    for &wire_idx in &challenge_wire_indices {
+        anyhow::ensure!(
+            private_range.contains(&wire_idx)
+                && !commitment_wire_set[wire_idx]
+                && committed_map[wire_idx].is_none(),
+            "challenge wire {wire_idx} is out of range, duplicated, or also committed"
+        );
+        commitment_wire_set[wire_idx] = true;
+    }
+
+    let nb_private_committed: usize = commitment_info
+        .iter()
+        .map(|c| c.private_committed.len())
+        .sum();
+    // All challenge wire indices are treated as public on the Groth16 level.
+    let nb_public = nb_public_variables + total_challenge_wires;
+    // No underflow: both wire sets are disjoint subsets of the private range.
+    let nb_private = nb_wires - nb_public - nb_private_committed;
+
+    let mut rng = ark_std::rand::thread_rng();
+    let toxic = ToxicWaste::sample(&mut rng)?;
+
+    // FFT domain
+    let domain = Radix2EvaluationDomain::<Fr>::new(r1cs.num_constraints())
+        .ok_or_else(|| anyhow::anyhow!("failed to create FFT domain"))?;
+    let domain_size = domain.size() as u64;
+
+    // Evaluate A, B, C at the toxic waste point t using Lagrange basis.
+    let (a_at_t, b_at_t, c_at_t) = evaluate_abc_at_t(r1cs, &domain, &toxic)?;
+
+    // Compute K values: K(i) = (β·A(i) + α·B(i) + C(i)) / γ or / δ
+    let mut pk_k = Vec::with_capacity(nb_private); // private wires → divided by δ
+    let mut vk_k = Vec::with_capacity(nb_public); // public wires → divided by γ
+    let mut ck_k: Vec<Vec<Fr>> = commitment_info
+        .iter()
+        .map(|c| Vec::with_capacity(c.private_committed.len()))
+        .collect();
+
+    // K(i) = β·A(i) + α·B(i) + C(i)
+    let k_at = |i: usize| toxic.beta * a_at_t[i] + toxic.alpha * b_at_t[i] + c_at_t[i];
+
+    // Pass 1: public wires (constant + Noir public inputs), in wire-index
+    // order. `vk.g1_k[0]` corresponds to the constant-1 wire and is paired
+    // with the implicit `1` term in the verifier; `vk.g1_k[1..1+num_public]`
+    // is paired with `public_witness` in the same order Noir emits public
+    // inputs.
+    vk_k.extend((0..nb_public_variables).map(|i| k_at(i) * toxic.gamma_inv));
+
+    // Pass 2: challenge wires in commitment-iteration order. The verifier
+    // appends derived challenges to `extended_public` in this same order
+    // (`for (i, _) in vk.public_and_commitment_committed.iter().enumerate()`
+    // → `extended_public.extend_from_slice(&challenges)`), so the bases
+    // emitted here line up with the scalars the verifier produces.
+    vk_k.extend(challenge_wire_indices.iter().map(|&w| k_at(w) * toxic.gamma_inv));
+
+    // Pass 3: private wires. Each goes either to a commitment bucket (if
+    // it's in `private_committed` for some commitment) or to `pk_k`.
+    // Challenge wires that landed in the private range are skipped — they
+    // were already pushed to `vk_k` in pass 2.
+    for i in nb_public_variables..nb_wires {
+        if commitment_wire_set[i] {
+            continue;
+        }
+        let k_val = k_at(i);
+        if let Some(ci) = committed_map[i] {
+            ck_k[ci].push(k_val * toxic.gamma_inv);
+        } else {
+            pk_k.push(k_val * toxic.delta_inv);
+        }
+    }
+
+    // Z(τ) scalars: Z(t)/δ · t^i for i in 0..domain_size
+    let z_at_t = (toxic.t.pow([domain_size]) - Fr::one()) * toxic.delta_inv;
+    let z_scalars: Vec<Fr> = std::iter::successors(Some(z_at_t), |z| Some(*z * toxic.t))
+        .take(domain_size as usize)
+        .collect();
+
+    // Mark infinity points (where A(τ) or B(τ) is zero)
+    let infinity_a: Vec<bool> = (0..nb_wires).map(|i| a_at_t[i].is_zero()).collect();
+    let infinity_b: Vec<bool> = (0..nb_wires).map(|i| b_at_t[i].is_zero()).collect();
+
+    // Precompute non-infinity wire indices. Lets the prover build the MSM
+    // input by direct indexing instead of re-scanning `infinity_a/b` on every
+    // prove call. Pure circuit-structural data — no soundness implication.
+    let non_inf_a: Vec<usize> = (0..nb_wires).filter(|&i| !infinity_a[i]).collect();
+    let non_inf_b: Vec<usize> = (0..nb_wires).filter(|&i| !infinity_b[i]).collect();
+    let nb_infinity_a = (nb_wires - non_inf_a.len()) as u64;
+    let nb_infinity_b = (nb_wires - non_inf_b.len()) as u64;
+
+    // Only the non-infinity scalars are turned into curve points.
+    let a_scalars_filtered: Vec<Fr> = non_inf_a.iter().map(|&i| a_at_t[i]).collect();
+    let b_scalars_filtered: Vec<Fr> = non_inf_b.iter().map(|&i| b_at_t[i]).collect();
+
+    // Scalar multiplication on the fixed generators g1_gen / g2_gen.
+    //
+    // Each batch below multiplies many scalars by the SAME base point. The
+    // previous code rebuilt the doubling chain per scalar; `BatchMulPreprocessing`
+    // precomputes a window table for the generator once, then reads several
+    // scalar bits per add. ~1.5–2× faster on the big lists (SHA-style setup).
+    //
+    // Parallelism: `batch_mul` uses `ark_std::cfg_iter!` internally, which is
+    // rayon-backed because the workspace enables `ark-std/parallel`.
+    let g1_gen = G1Affine::generator();
+    let g2_gen = G2Affine::generator();
+
+    // Size each window table for the biggest batch it'll be reused for —
+    // smaller batches still benefit from the precomputed table.
+    let max_g1_batch = [
+        a_scalars_filtered.len(),
+        b_scalars_filtered.len(),
+        z_scalars.len(),
+        vk_k.len(),
+        pk_k.len(),
+    ]
+    .into_iter()
+    .chain(ck_k.iter().map(|v| v.len()))
+    .max()
+    .unwrap_or(3)
+    .max(3);
+    let g1_prep =
+        BatchMulPreprocessing::<G1Projective>::new(G1Projective::from(g1_gen), max_g1_batch);
+
+    let max_g2_batch = b_scalars_filtered.len().max(3);
+    let g2_prep =
+        BatchMulPreprocessing::<G2Projective>::new(G2Projective::from(g2_gen), max_g2_batch);
+
+    let fb_g1 = |scalars: &[Fr]| -> Vec<G1Affine> {
+        if scalars.is_empty() {
+            return Vec::new();
+        }
+        g1_prep.batch_mul(scalars)
+    };
+    let fb_g2 = |scalars: &[Fr]| -> Vec<G2Affine> {
+        if scalars.is_empty() {
+            return Vec::new();
+        }
+        g2_prep.batch_mul(scalars)
+    };
+
+    // Batch the three toxic-scalar muls into a single call per group.
+    let [g1_alpha, g1_beta, g1_delta] = {
+        let v = fb_g1(&[toxic.alpha, toxic.beta, toxic.delta]);
+        [v[0], v[1], v[2]]
+    };
+
+    let g1_a = fb_g1(&a_scalars_filtered);
+    let g1_b = fb_g1(&b_scalars_filtered);
+
+    let mut g1_z = fb_g1(&z_scalars);
+    // No bit-reverse permutation: arkworks' IFFT outputs H in natural order,
+    // so Z points must also be in natural order for the MSM Σ h[i]·Z[i].
+    // deg(H) = (n-1)+(n-1)-n = n-2, so we need n-1 Z points
+    let size_z = domain_size as usize - 1;
+    g1_z.truncate(size_z);
+
+    let g1_vk_k = fb_g1(&vk_k);
+    let g1_pk_k = fb_g1(&pk_k);
+
+    // Commitment bases in G1
+    let g1_ck_k: Vec<Vec<G1Affine>> = ck_k.iter().map(|ck| fb_g1(ck)).collect();
+
+    // G2: same pattern.
+    let [g2_beta, g2_delta, g2_gamma] = {
+        let v = fb_g2(&[toxic.beta, toxic.delta, toxic.gamma]);
+        [v[0], v[1], v[2]]
+    };
+
+    let g2_b = fb_g2(&b_scalars_filtered);
+
+    // Pedersen commitment setup
+    let g2_random = G2Projective::rand(&mut rng).into_affine();
+    let mut pk_commitment_keys = Vec::new();
+    let mut vk_commitment_keys = Vec::new();
+
+    for ck_bases in &g1_ck_k {
+        let (pks, vk) = pedersen::setup(&[ck_bases], Some(g2_random))?;
+        let pk = pks
+            .into_iter()
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("pedersen::setup returned empty proving key vector"))?;
+        pk_commitment_keys.push(pk);
+        vk_commitment_keys.push(vk);
+    }
+
+    // Public and commitment committed indices for verification
+    let public_and_commitment_committed: Vec<Vec<usize>> = commitment_info
+        .iter()
+        .map(|c| c.public_and_commitment_committed.clone())
+        .collect();
+
+    // Build VerifyingKey
+    let mut vk = crate::VerifyingKey {
+        g1_alpha,
+        g1_k: g1_vk_k,
+        g2_beta,
+        g2_delta,
+        g2_gamma,
+        g2_delta_neg: G2Affine::zero(), // will be set by precompute
+        g2_gamma_neg: G2Affine::zero(),
+        e_alpha_beta: ark_ff::AdditiveGroup::ZERO,
+        commitment_keys: vk_commitment_keys,
+        public_and_commitment_committed,
+        num_challenges_per_commitment: num_challenges_per_commitment.to_vec(),
+    };
+    vk.precompute()?;
+
+    // Build ProvingKey
+    let pk = crate::ProvingKey {
+        domain_size,
+        domain_gen: Fr::from(domain.group_gen()),
+        g1_alpha,
+        g1_beta,
+        g1_delta,
+        g1_a,
+        g1_b,
+        g1_k: g1_pk_k,
+        g1_z,
+        g2_beta,
+        g2_delta,
+        g2_b,
+        infinity_a,
+        infinity_b,
+        nb_infinity_a,
+        nb_infinity_b,
+        non_inf_a,
+        non_inf_b,
+        commitment_keys: pk_commitment_keys,
+    };
+
+    // toxic waste is dropped here — in production this is the MPC ceremony's job.
+    // `ToxicWaste` is `ZeroizeOnDrop`, so the secret field elements are wiped
+    // from memory when this drop runs.
+    drop(toxic);
+
+    Ok((pk, vk))
+}
+
+/// Evaluate A(τ), B(τ), C(τ) for each wire using Lagrange interpolation at τ.
+///
+/// `τ` is `toxic.t`. Each returned vector has one entry per wire: the sum
+/// over constraint rows `j` of that wire's matrix coefficient times `L_j(τ)`.
+///
+/// # Errors
+///
+/// Fails if the FFT domain size has no inverse in `Fr` or if a matrix entry
+/// refers to a coefficient missing from `r1cs.interner`.
+fn evaluate_abc_at_t(
+    r1cs: &R1CS,
+    domain: &Radix2EvaluationDomain<Fr>,
+    toxic: &ToxicWaste,
+) -> Result<(Vec<Fr>, Vec<Fr>, Vec<Fr>)> {
+    let wire_count = r1cs.num_witnesses();
+    let omega = domain.group_gen();
+    let rows = r1cs.num_constraints();
+
+    // Differences τ - ω^i for i in 0..=rows, plus a batch-inverted copy so
+    // the recurrence below needs no per-row field inversion.
+    let diffs: Vec<Fr> = std::iter::successors(Some(Fr::one()), |power| Some(*power * omega))
+        .take(rows + 1)
+        .map(|power| toxic.t - power)
+        .collect();
+    let mut diff_invs = diffs.clone();
+    ark_ff::batch_inversion(&mut diff_invs);
+
+    // Phase 1: tabulate the Lagrange values L_j(τ) for every row j using the
+    // recurrence
+    //
+    //   L_{j+1}(τ) = L_j(τ) · ω · (τ - ω^j) / (τ - ω^(j+1))
+    //
+    // seeded with
+    //
+    //   L₀(τ) = (τⁿ - 1) / (n · (τ - ω⁰)),   n = domain size.
+    //
+    // Each value depends on the previous one, so this pass is serial, but it
+    // is a single O(rows) sweep. With the table materialized, phase 2 has no
+    // dependency between rows and can run in parallel.
+    let size = domain.size() as u64;
+    let size_inv = Fr::from(size)
+        .inverse()
+        .ok_or_else(|| anyhow::anyhow!("FFT domain size is zero, cannot invert"))?;
+    let mut lagrange = Vec::with_capacity(rows);
+    if rows > 0 {
+        lagrange.push((toxic.t.pow([size]) - Fr::one()) * diff_invs[0] * size_inv);
+    }
+    for j in 1..rows {
+        let next = lagrange[j - 1] * omega * diffs[j - 1] * diff_invs[j];
+        lagrange.push(next);
+    }
+
+    // Phase 2: parallel scatter. For each row j, accumulate
+    //   X[col] += coeff(j, col) · L_j(τ)
+    // into per-worker (a, b, c) vectors: rayon's `try_fold` gives every
+    // worker its own accumulator and `try_reduce` sums them afterwards.
+    // The reduction costs O(threads · wire_count), which the matrix work
+    // dwarfs for any non-trivial circuit.
+    let coeff_of = |interned| -> Result<Fr> {
+        r1cs.interner
+            .get(interned)
+            .ok_or_else(|| anyhow::anyhow!("R1CS interner missing value for matrix entry"))
+    };
+    let empty_triple = || {
+        (
+            vec![Fr::zero(); wire_count],
+            vec![Fr::zero(); wire_count],
+            vec![Fr::zero(); wire_count],
+        )
+    };
+    (0..rows)
+        .into_par_iter()
+        .try_fold(empty_triple, |(mut a, mut b, mut c), j| -> Result<_> {
+            let weight = lagrange[j];
+            for (col, interned) in r1cs.a.iter_row(j) {
+                a[col] += coeff_of(interned)? * weight;
+            }
+            for (col, interned) in r1cs.b.iter_row(j) {
+                b[col] += coeff_of(interned)? * weight;
+            }
+            for (col, interned) in r1cs.c.iter_row(j) {
+                c[col] += coeff_of(interned)? * weight;
+            }
+            Ok((a, b, c))
+        })
+        .try_reduce(
+            empty_triple,
+            |(mut a, mut b, mut c), (a2, b2, c2)| -> Result<_> {
+                // Element-wise sum of two partial accumulators.
+                a.iter_mut().zip(&a2).for_each(|(acc, x)| *acc += x);
+                b.iter_mut().zip(&b2).for_each(|(acc, x)| *acc += x);
+                c.iter_mut().zip(&c2).for_each(|(acc, x)| *acc += x);
+                Ok((a, b, c))
+            },
+        )
+}
+
+#[cfg(test)]
+mod tests {
+    use {super::*, provekit_common::FieldElement};
+
+    /// Setup on a one-constraint R1CS succeeds and sizes the keys consistently.
+    #[test]
+    fn test_setup_trivial() {
+        // x * x = y (where wire 0=constant, wire 1=public output y, wire 2=secret x)
+        let mut r1cs = R1CS::new();
+        r1cs.num_public_inputs = 1; // one public input (y), excludes constant wire
+        r1cs.add_witnesses(3); // wire 0 (const), wire 1 (y), wire 2 (x)
+
+        let one = FieldElement::from(1u64);
+        // A: 1·x (wire 2), B: 1·x (wire 2), C: 1·y (wire 1)
+        r1cs.add_constraint(&[(one, 2)], &[(one, 2)], &[(one, 1)]);
+
+        let (pk, vk) = setup(&r1cs, &[], &[]).unwrap();
+        assert!(!pk.g1_a.is_empty());
+        // No commitments: VK holds constant + public wires, the rest are private.
+        assert_eq!(vk.g1_k.len(), 1 + r1cs.num_public_inputs);
+        assert_eq!(pk.g1_k.len(), r1cs.num_witnesses() - vk.g1_k.len());
+        assert_eq!(pk.g1_a.len(), pk.non_inf_a.len());
+        assert_eq!(pk.g1_z.len() as u64, pk.domain_size - 1);
+    }
+}
diff --git a/provekit/groth16/src/types.rs b/provekit/groth16/src/types.rs
new file mode 100644
index 000000000..d3d153be8
--- /dev/null
+++ b/provekit/groth16/src/types.rs
@@ -0,0 +1,347 @@
+/// Core Groth16+BSB22 types: Proof, ProvingKey, VerifyingKey.
+///
+/// Notation follows Figure 4 in the DIZK paper.
+use ark_bn254::{Bn254, Fr, G1Affine, G2Affine, G2Projective};
+use {
+    crate::pedersen,
+    ark_ec::{pairing::Pairing, AffineRepr},
+    ark_ff::Zero,
+    ark_serialize::{CanonicalDeserialize, CanonicalSerialize},
+    serde::{Deserialize, Deserializer, Serialize, Serializer},
+};
+
+/// A Groth16+BSB22 proof.
+///
+/// Contains the standard Groth16 elements (Ar, Bs, Krs) plus BSB22 Pedersen
+/// commitments and a batched proof of knowledge; see [`Proof::is_valid`].
+#[derive(Clone, Debug, CanonicalSerialize, CanonicalDeserialize)]
+pub struct Proof {
+    /// `[A]₁ = Σ wᵢ·[Aᵢ(τ)]₁ + [α]₁ + r·[δ]₁`
+    pub ar:             G1Affine,
+    /// `[B]₂ = Σ wᵢ·[Bᵢ(τ)]₂ + [β]₂ + s·[δ]₂`
+    pub bs:             G2Affine,
+    /// `[C]₁ = Σ wᵢ·[Kᵢ(τ)]₁ + Σ hⱼ·[Zⱼ(τ)]₁ + s·[A]₁ + r·[B]₁ - rs·[δ]₁`
+    pub krs:            G1Affine,
+    /// Pedersen commitments (BSB22 extension), one per commitment in the VK.
+    pub commitments:    Vec<G1Affine>,
+    /// Batched proof of knowledge for all commitments.
+    pub commitment_pok: G1Affine,
+}
+
+impl Proof {
+    /// Checks that proof elements are on the curve and in the correct subgroup.
+    ///
+    /// Requirements, evaluated in this order with short-circuiting:
+    /// - `ar`: on the curve and non-zero;
+    /// - `bs`: on the curve, non-zero, and in the correct subgroup;
+    /// - `krs`: on the curve and non-zero;
+    /// - every entry of `commitments`, and `commitment_pok`: on the curve.
+    ///
+    /// Returns `true` only if all of them hold.
+    pub fn is_valid(&self) -> bool {
+        // Shared rule for `ar` and `krs`: a non-zero G1 point on the curve.
+        // G1 has cofactor 1 on BN254, so on-curve implies in-subgroup and no
+        // explicit subgroup test is needed.
+        let g1_on_curve_nonzero = |point: &G1Affine| point.is_on_curve() && !point.is_zero();
+
+        // BN254 G2 has a non-trivial cofactor, so on-curve does NOT imply
+        // in-subgroup; `bs` needs the explicit check.
+        let bs_ok = || {
+            self.bs.is_on_curve()
+                && !self.bs.is_zero()
+                && self.bs.is_in_correct_subgroup_assuming_on_curve()
+        };
+
+        // A zero Krs is overwhelmingly unlikely for an honest prover (`r`/`s`
+        // are sampled uniformly), and accepting it widens the surface for
+        // malformed or malicious proofs, hence the non-zero requirement.
+        let groth16_points_ok =
+            g1_on_curve_nonzero(&self.ar) && bs_ok() && g1_on_curve_nonzero(&self.krs);
+        if !groth16_points_ok {
+            return false;
+        }
+
+        // Commitment points (G1) and the batched proof of knowledge only
+        // have to be on the curve.
+        self.commitments.iter().all(|c| c.is_on_curve())
+            && self.commitment_pok.is_on_curve()
+    }
+}
+
+/// Groth16 proving key.
+///
+/// Contains all curve points needed by the prover to generate a proof.
+/// These are computed during trusted setup from the toxic waste.
+#[derive(Clone, Debug, CanonicalSerialize, CanonicalDeserialize)]
+pub struct ProvingKey {
+    /// FFT domain cardinality (number of constraints rounded up to power of 2).
+    pub domain_size: u64,
+    /// Generator of the FFT domain.
+    pub domain_gen:  Fr,
+
+    // -- G1 elements --
+    /// `[α]₁`
+    pub g1_alpha: G1Affine,
+    /// `[β]₁`
+    pub g1_beta:  G1Affine,
+    /// `[δ]₁`
+    pub g1_delta: G1Affine,
+    /// `[Aᵢ(τ)]₁` for each wire listed in `non_inf_a`, in that order.
+    pub g1_a:     Vec<G1Affine>,
+    /// `[Bᵢ(τ)]₁` for each wire listed in `non_inf_b`, in that order.
+    pub g1_b:     Vec<G1Affine>,
+    /// `[Kᵢ(τ)/δ]₁` for private wires that are neither committed nor challenges.
+    pub g1_k:     Vec<G1Affine>,
+    /// `[τⁱ · Z(τ)/δ]₁` for i in `0..domain_size-1`.
+    pub g1_z:     Vec<G1Affine>,
+
+    // -- G2 elements --
+    /// `[β]₂`
+    pub g2_beta:  G2Affine,
+    /// `[δ]₂`
+    pub g2_delta: G2Affine,
+    /// `[Bᵢ(τ)]₂` for each wire listed in `non_inf_b`, in that order.
+    pub g2_b:     Vec<G2Affine>,
+
+    // -- Infinity tracking --
+    /// `infinity_a[i] == true` means wire `i` has `A(τ) == 0`.
+    pub infinity_a:    Vec<bool>,
+    /// `infinity_b[i] == true` means wire `i` has `B(τ) == 0`.
+    pub infinity_b:    Vec<bool>,
+    /// Count of infinity points in A.
+    pub nb_infinity_a: u64,
+    /// Count of infinity points in B.
+    pub nb_infinity_b: u64,
+    /// Wire indices where `A(τ) != 0`, precomputed at setup. Lets the prover
+    /// build the A-side MSM input by direct indexing instead of re-filtering
+    /// the `infinity_a` bool array on every prove call. Always satisfies
+    /// `non_inf_a.len() == nb_wires - nb_infinity_a`.
+    pub non_inf_a:     Vec<usize>,
+    /// Wire indices where `B(τ) != 0`. Analogous to `non_inf_a`.
+    pub non_inf_b:     Vec<usize>,
+
+    /// Pedersen commitment proving keys (one per BSB22 commitment).
+    pub commitment_keys: Vec<pedersen::ProvingKey>,
+}
+
+/// Groth16 verifying key.
+///
+/// Contains the minimal curve points needed by the verifier.
+/// Note: precomputed fields (g2_delta_neg, g2_gamma_neg, e_alpha_beta) are
+/// not serialized; deserialization recomputes them, as does `precompute()`.
+#[derive(Clone, Debug)]
+pub struct VerifyingKey {
+    // -- G1 elements --
+    /// `[α]₁`
+    pub g1_alpha: G1Affine,
+    /// `[Kᵢ(τ)/γ]₁`: constant wire, public inputs, then challenge wires.
+    pub g1_k:     Vec<G1Affine>,
+
+    // -- G2 elements --
+    /// `[β]₂`
+    pub g2_beta:  G2Affine,
+    /// `[δ]₂`
+    pub g2_delta: G2Affine,
+    /// `[γ]₂`
+    pub g2_gamma: G2Affine,
+
+    // -- Precomputed (by precompute() or on deserialization; not serialized) --
+    /// `-[δ]₂`
+    pub g2_delta_neg: G2Affine,
+    /// `-[γ]₂`
+    pub g2_gamma_neg: G2Affine,
+    /// `e([α]₁, [β]₂)`
+    pub e_alpha_beta: <Bn254 as Pairing>::TargetField,
+
+    /// Pedersen commitment verifying keys (one per BSB22 commitment).
+    pub commitment_keys:                 Vec<pedersen::VerifyingKey>,
+    /// For each commitment, the indices of public/commitment-committed wires.
+    ///
+    /// Indices are **absolute witness indices**: position 0 is the constant-1
+    /// ONE_WIRE, public input `i` lives at index `1 + i`, and challenge wires
+    /// follow the public range. Index 0 is therefore never a valid entry —
+    /// the verifier maps `idx` → `extended_public[idx - 1]` to strip the
+    /// ONE_WIRE offset (see [`crate::verifier::verify`]). Producers (e.g.
+    /// `cli/src/cmd/prepare.rs`) populate this with `(1..num_public)`.
+    pub public_and_commitment_committed: Vec<Vec<usize>>,
+    /// Number of challenges derived from each commitment.
+    /// Single-challenge: all 1s. Multi-challenge: `[N]` for one commitment
+    /// producing N challenges.
+    pub num_challenges_per_commitment:   Vec<usize>,
+}
+
+impl CanonicalSerialize for VerifyingKey {
+    /// Writes every field except the three cached ones, in declaration order.
+    fn serialize_with_mode<W: std::io::Write>(
+        &self,
+        mut writer: W,
+        compress: ark_serialize::Compress,
+    ) -> Result<(), ark_serialize::SerializationError> {
+        self.g1_alpha.serialize_with_mode(&mut writer, compress)?;
+        self.g1_k.serialize_with_mode(&mut writer, compress)?;
+        self.g2_beta.serialize_with_mode(&mut writer, compress)?;
+        self.g2_delta.serialize_with_mode(&mut writer, compress)?;
+        self.g2_gamma.serialize_with_mode(&mut writer, compress)?;
+        self.commitment_keys.serialize_with_mode(&mut writer, compress)?;
+        self.public_and_commitment_committed
+            .serialize_with_mode(&mut writer, compress)?;
+        self.num_challenges_per_commitment
+            .serialize_with_mode(&mut writer, compress)
+    }
+
+    fn serialized_size(&self, compress: ark_serialize::Compress) -> usize {
+        let field_sizes = [
+            self.g1_alpha.serialized_size(compress),
+            self.g1_k.serialized_size(compress),
+            self.g2_beta.serialized_size(compress),
+            self.g2_delta.serialized_size(compress),
+            self.g2_gamma.serialized_size(compress),
+            self.commitment_keys.serialized_size(compress),
+            self.public_and_commitment_committed.serialized_size(compress),
+            self.num_challenges_per_commitment.serialized_size(compress),
+        ];
+        field_sizes.iter().sum()
+    }
+}
+
+impl ark_serialize::Valid for VerifyingKey {
+    /// Validates every curve point and commitment key, then requires one
+    /// committed-index list and one challenge count per commitment key.
+    fn check(&self) -> Result<(), ark_serialize::SerializationError> {
+        self.g1_alpha.check()?;
+        self.g1_k.iter().try_for_each(|point| point.check())?;
+        self.g2_beta.check()?;
+        self.g2_delta.check()?;
+        self.g2_gamma.check()?;
+        self.commitment_keys.iter().try_for_each(|key| key.check())?;
+        // Both per-commitment vectors must be as long as `commitment_keys`.
+        let key_count = self.commitment_keys.len();
+        let lengths_agree = key_count == self.public_and_commitment_committed.len()
+            && key_count == self.num_challenges_per_commitment.len();
+        if lengths_agree {
+            Ok(())
+        } else {
+            Err(ark_serialize::SerializationError::InvalidData)
+        }
+    }
+}
+
+impl CanonicalDeserialize for VerifyingKey {
+    /// Reads the fields in serialization order and rebuilds the cached values.
+    /// Under `Validate::Yes` it also runs `Valid::check`, whose cross-field
+    /// length invariants per-field validation alone would not enforce.
+    fn deserialize_with_mode<R: std::io::Read>(
+        mut reader: R,
+        compress: ark_serialize::Compress,
+        validate: ark_serialize::Validate,
+    ) -> Result<Self, ark_serialize::SerializationError> {
+        let g1_alpha = G1Affine::deserialize_with_mode(&mut reader, compress, validate)?;
+        let g1_k = Vec::<G1Affine>::deserialize_with_mode(&mut reader, compress, validate)?;
+        let g2_beta = G2Affine::deserialize_with_mode(&mut reader, compress, validate)?;
+        let g2_delta = G2Affine::deserialize_with_mode(&mut reader, compress, validate)?;
+        let g2_gamma = G2Affine::deserialize_with_mode(&mut reader, compress, validate)?;
+        let commitment_keys =
+            Vec::<pedersen::VerifyingKey>::deserialize_with_mode(&mut reader, compress, validate)?;
+        let public_and_commitment_committed =
+            Vec::<Vec<usize>>::deserialize_with_mode(&mut reader, compress, validate)?;
+        let num_challenges_per_commitment =
+            Vec::<usize>::deserialize_with_mode(&mut reader, compress, validate)?;
+
+        // Cache the derived values so the VK is ready to verify immediately.
+        let vk = Self {
+            g2_delta_neg: (-G2Projective::from(g2_delta)).into(),
+            g2_gamma_neg: (-G2Projective::from(g2_gamma)).into(),
+            e_alpha_beta: Bn254::pairing(g1_alpha, g2_beta).0,
+            g1_alpha,
+            g1_k,
+            g2_beta,
+            g2_delta,
+            g2_gamma,
+            commitment_keys,
+            public_and_commitment_committed,
+            num_challenges_per_commitment,
+        };
+        if matches!(validate, ark_serialize::Validate::Yes) {
+            ark_serialize::Valid::check(&vk)?;
+        }
+        Ok(vk)
+    }
+}
+
+impl VerifyingKey {
+    /// Precompute cached values: e(α,β), -δ₂, -γ₂.
+    ///
+    /// Needed for keys built field by field (as the trusted setup does);
+    /// `CanonicalDeserialize` already fills the cache. Currently never fails.
+    pub fn precompute(&mut self) -> anyhow::Result<()> {
+        self.e_alpha_beta = Bn254::pairing(self.g1_alpha, self.g2_beta).0;
+        self.g2_delta_neg = (-G2Projective::from(self.g2_delta)).into();
+        self.g2_gamma_neg = (-G2Projective::from(self.g2_gamma)).into();
+        Ok(())
+    }
+
+    /// Number of entries in `g1_k` besides the constant-1 wire: the public
+    /// witness elements plus any challenge wires appended by setup.
+    ///
+    /// Returns 0 for an empty `g1_k` instead of underflowing.
+    pub fn nb_public_witness(&self) -> usize {
+        self.g1_k.len().saturating_sub(1)
+    }
+}
+
+// Serde adapters for ProvingKey.
+//
+// The proving key is large (hundreds of MB) and arkworks-serialized bytes are
+// best read/written outside postcard's wire format to avoid materializing the
+// full byte stream in memory. The .pkp file layout treats the PK as an
+// out-of-band section appended after the postcard-encoded `Prover` (see
+// `provekit_prover::pkp_io`), so the serde impls here are no-ops:
+//   * `Serialize` writes `()` (postcard emits zero bytes).
+//   * `Deserialize` ignores the input and yields `ProvingKey::empty()`.
+//
+// In practice these impls only run for `Groth16Prover` round-trips; the file
+// I/O layer fills in the real PK after postcard returns.
+impl Serialize for ProvingKey {
+    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
+        // The PK travels out of band, so only a unit value is written here;
+        // postcard encodes `()` as zero bytes.
+        ().serialize(serializer)
+    }
+}
+
+impl<'de> Deserialize<'de> for ProvingKey {
+    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        // Consume the unit placeholder, then hand back an empty key.
+        <()>::deserialize(deserializer).map(|()| ProvingKey::empty())
+    }
+}
+
+impl ProvingKey {
+    /// Builds the all-zero, all-empty placeholder that stands in for the key
+    /// while a `Groth16Prover` is being reconstituted out of band. The .pkp
+    /// I/O path loads the actual proving key separately and replaces this
+    /// placeholder before any cryptographic operations occur.
+    pub fn empty() -> Self {
+        Self {
+            domain_size: 0,
+            domain_gen: Fr::zero(),
+            g1_alpha: G1Affine::zero(),
+            g1_beta: G1Affine::zero(),
+            g1_delta: G1Affine::zero(),
+            g1_a: vec![],
+            g1_b: vec![],
+            g1_k: vec![],
+            g1_z: vec![],
+            g2_beta: G2Affine::zero(),
+            g2_delta: G2Affine::zero(),
+            g2_b: vec![],
+            infinity_a: vec![],
+            infinity_b: vec![],
+            nb_infinity_a: 0,
+            nb_infinity_b: 0,
+            non_inf_a: vec![],
+            non_inf_b: vec![],
+            commitment_keys: vec![],
+        }
+    }
+}
diff --git a/provekit/groth16/src/verifier.rs b/provekit/groth16/src/verifier.rs
new file mode 100644
index 000000000..6418d9d6b
--- /dev/null
+++ b/provekit/groth16/src/verifier.rs
@@ -0,0 +1,176 @@
+//! Groth16+BSB22 verifier: verifies proofs against a verifying key.
+//!
+//! Verification steps:
+//! 1. Subgroup check on proof elements
+//! 2. Recompute BSB22 commitment challenges from proof commitments
+//! 3. Verify Pedersen commitment PoKs via batch verification
+//! 4. Compute public input contribution via MSM
+//! 5. Check the Groth16 pairing equation
+use anyhow::{ensure, Context, Result};
+use {
+    crate::{
+        pedersen,
+        prover::{hash_to_fr, hash_to_fr_multi},
+        types::{Proof, VerifyingKey},
+        BSB22_FOLD_DST, COMMITMENT_DST, FR_BYTES,
+    },
+    ark_bn254::{Bn254, Fr, G1Projective},
+    ark_ec::{pairing::Pairing, CurveGroup, VariableBaseMSM},
+};
+
+/// Verify a Groth16+BSB22 proof; returns `Err` on a malformed key/proof or a
+/// failed check. `vk` must have had [`VerifyingKey::precompute`] called;
+/// `public_witness` excludes the constant-1 wire.
+pub fn verify(proof: &Proof, vk: &VerifyingKey, public_witness: &[Fr]) -> Result<()> {
+    // The VK may be untrusted: sum the per-commitment challenge counts with
+    // overflow checking so a wrapped total cannot slip past the guard below.
+    let total_challenges = vk
+        .num_challenges_per_commitment
+        .iter()
+        .try_fold(0usize, |acc, &n| acc.checked_add(n))
+        .context("invalid verifying key: total challenge count overflows usize")?;
+    // Guard the subtraction below (strict `>` also avoids a `+ 1` overflow).
+    ensure!(
+        vk.g1_k.len() > total_challenges,
+        "invalid verifying key: g1_k has {} entries but {} challenges + ONE_WIRE were declared",
+        vk.g1_k.len(),
+        total_challenges,
+    );
+    let nb_public_vars = vk.g1_k.len() - total_challenges;
+    let expected_commitments = vk.public_and_commitment_committed.len();
+
+    ensure!(
+        vk.commitment_keys.len() == expected_commitments,
+        "invalid verifying key: got {} commitment keys, expected {}",
+        vk.commitment_keys.len(),
+        expected_commitments
+    );
+    ensure!(
+        proof.commitments.len() == expected_commitments,
+        "invalid proof: got {} commitments, expected {}",
+        proof.commitments.len(),
+        expected_commitments
+    );
+    ensure!(
+        vk.num_challenges_per_commitment.len() == expected_commitments,
+        "invalid verifying key: got {} challenge counts, expected {}",
+        vk.num_challenges_per_commitment.len(),
+        expected_commitments
+    );
+    ensure!(
+        public_witness.len() == nb_public_vars - 1,
+        "invalid witness size: got {}, expected {} (public - ONE_WIRE)",
+        public_witness.len(),
+        nb_public_vars - 1
+    );
+
+    // Step 1: Subgroup check
+    ensure!(proof.is_valid(), "proof elements not in correct subgroup");
+
+    // Step 2: Recompute commitment challenges and verify BSB22
+    let mut extended_public = public_witness.to_vec();
+    let mut commitments_serialized = vec![0u8; total_challenges * FR_BYTES];
+    let mut serial_offset = 0usize;
+
+    for (i, committed_indices) in vk.public_and_commitment_committed.iter().enumerate() {
+        let num_challenges = vk.num_challenges_per_commitment[i];
+
+        let public_vals: Vec<Fr> = committed_indices
+            .iter()
+            .map(|&idx| {
+                ensure!(
+                    idx > 0 && idx - 1 < extended_public.len(),
+                    "commitment public index {} out of bounds (extended_public len = {})",
+                    idx,
+                    extended_public.len()
+                );
+                Ok(extended_public[idx - 1])
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        // Always use the counter-chain `hash_to_fr_multi` — including the
+        // N=1 case — so the verifier matches the prover, which calls
+        // `hash_to_fr_multi(..., N)` unconditionally for every N.
+        let challenge_data = {
+            use ark_serialize::CanonicalSerialize;
+            let mut data: Vec<u8> = Vec::new();
+            proof.commitments[i]
+                .serialize_uncompressed(&mut data)
+                .map_err(|e| anyhow::anyhow!("serialize commitment: {e}"))?;
+            for val in &public_vals {
+                let bytes = crate::prover::fr_to_bytes(val)?;
+                data.extend_from_slice(&bytes);
+            }
+            data
+        };
+
+        let challenges = hash_to_fr_multi(&challenge_data, COMMITMENT_DST, num_challenges)?;
+
+        for ch in &challenges {
+            let bytes = crate::prover::fr_to_bytes(ch)?;
+            commitments_serialized[FR_BYTES * serial_offset..FR_BYTES * (serial_offset + 1)]
+                .copy_from_slice(&bytes);
+            serial_offset += 1;
+        }
+
+        extended_public.extend_from_slice(&challenges);
+    }
+
+    // Step 3: Verify BSB22 Pedersen commitments
+    if !vk.commitment_keys.is_empty() {
+        let folding_challenge = hash_to_fr(&commitments_serialized, BSB22_FOLD_DST)?;
+
+        let typed_commitments: Vec<pedersen::Commitment> = proof
+            .commitments
+            .iter()
+            .copied()
+            .map(pedersen::Commitment)
+            .collect();
+        pedersen::batch_verify_multi_vk(
+            &vk.commitment_keys,
+            &typed_commitments,
+            pedersen::ProofOfKnowledge(proof.commitment_pok),
+            folding_challenge,
+        )
+        .context("Pedersen batch verification failed")?;
+    }
+
+    // Step 4: Compute public input contribution
+    let k_sum = {
+        let mut sum = G1Projective::from(vk.g1_k[0]);
+
+        if !extended_public.is_empty() {
+            let msm_bases = &vk.g1_k[1..1 + extended_public.len()];
+            let msm = G1Projective::msm(msm_bases, &extended_public).map_err(crate::msm_err)?;
+            sum += msm;
+        }
+
+        for c in &proof.commitments {
+            sum += G1Projective::from(*c);
+        }
+
+        sum.into_affine()
+    };
+
+    // Step 5: Pairing check
+    let left = Bn254::multi_pairing([proof.krs, proof.ar, k_sum], [
+        vk.g2_delta_neg,
+        proof.bs,
+        vk.g2_gamma_neg,
+    ]);
+
+    ensure!(
+        left.0 == vk.e_alpha_beta,
+        "pairing check failed: proof is invalid"
+    );
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // TODO: add integration tests covering a full setup → prove → verify
+    // cycle; none exist in this module yet.
+}
diff --git a/provekit/prover/Cargo.toml b/provekit/prover/Cargo.toml
index 82f848326..a4916d015 100644
--- a/provekit/prover/Cargo.toml
+++ b/provekit/prover/Cargo.toml
@@ -16,13 +16,18 @@ parallel = ["provekit-common/parallel"]
 [dependencies]
 # Workspace crates
 provekit-common.workspace = true
+provekit-groth16.workspace = true
 
 # Noir language
 acir.workspace = true
 noirc_abi.workspace = true
 
 # Cryptography and proof systems
+ark-bn254.workspace = true
+ark-ec.workspace = true
 ark-ff.workspace = true
+ark-poly.workspace = true
+ark-serialize.workspace = true
 ark-std.workspace = true
 whir.workspace = true
 
@@ -30,15 +35,23 @@ whir.workspace = true
 anyhow.workspace = true
 num-bigint.workspace = true
 postcard.workspace = true
+rayon.workspace = true
+serde.workspace = true
 tracing.workspace = true
 
-# Target-specific dependencies: only on non-WASM targets
+# Target-specific dependencies: only on non-WASM targets.
+# `xz2` and `zstd` wrap C libraries and don't build for wasm32; `bytes` is only
+# used by `pkp_io`, which is itself non-wasm.
 [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
 bn254_blackbox_solver = { workspace = true, optional = true }
 nargo = { workspace = true, optional = true }
 noir_artifact_cli = { workspace = true, optional = true }
 mavros-vm.workspace = true
 mavros-artifacts.workspace = true
+bytes.workspace = true
+memmap2 = { workspace = true }
+xz2.workspace = true
+zstd.workspace = true
 
 [lints]
 workspace = true
diff --git a/provekit/prover/src/lib.rs b/provekit/prover/src/lib.rs
index 28af714d9..c3858f934 100644
--- a/provekit/prover/src/lib.rs
+++ b/provekit/prover/src/lib.rs
@@ -8,8 +8,8 @@ use {
     acir::native_types::{Witness, WitnessMap},
     anyhow::{Context, Result},
     provekit_common::{
-        utils::noir_to_native, FieldElement, NoirElement, NoirProof, NoirProver, Prover,
-        PublicInputs, TranscriptSponge,
+        utils::noir_to_native, FieldElement, NoirElement, NoirProof, NoirProver, PublicInputs,
+        TranscriptSponge,
     },
     std::mem::size_of,
     tracing::{debug, info_span, instrument},
@@ -30,12 +30,28 @@ pub(crate) mod bigint_mod;
 pub(crate) mod ec_arith;
 #[cfg(not(target_arch = "wasm32"))]
 pub mod input_utils;
+// `pkp_io` depends on `xz2`/`zstd`/`bytes`, none of which build on wasm32.
+#[cfg(not(target_arch = "wasm32"))]
+pub mod pkp_io;
+// Mmap-backed `.pkp` I/O (rapidsnark-style). Same extension as legacy `.pkp`,
+// distinguished by an in-file sentinel; see `pkp_mmap_io` module docs.
+#[cfg(not(target_arch = "wasm32"))]
+pub mod pkp_mmap_io;
+pub mod prover_types;
 pub(crate) mod r1cs;
 mod whir_r1cs;
 mod witness;
 
 // Public re-exports for items used by integration tests and benchmarks.
-pub use {ec_arith::ec_scalar_mul, r1cs::solve_witness_vec};
+#[cfg(not(target_arch = "wasm32"))]
+pub use pkp_io::{deserialize_pkp, read_pkp, serialize_pkp, write_pkp};
+#[cfg(not(target_arch = "wasm32"))]
+pub use pkp_mmap_io::{is_mmap_pkp, read_pkp_mmap, write_pkp_mmap};
+pub use {
+    ec_arith::ec_scalar_mul,
+    prover_types::{Groth16CommitmentInfo, Groth16PkSource, Groth16Prover, Prover},
+    r1cs::solve_witness_vec,
+};
 
 /// `prove` and `prove_with_toml` are native-only (cfg-gated out on wasm32).
 /// `prove_with_witness` is available on all targets. `MavrosProver` does not
@@ -82,6 +98,38 @@ fn generate_noir_witness(
         .witness)
 }
 
+#[instrument(skip_all)]
+#[cfg(all(feature = "witness-generation", not(target_arch = "wasm32")))]
+fn generate_noir_witness_for_groth16(
+    prover: &mut Groth16Prover,
+    input_map: InputMap,
+) -> Result<WitnessMap<NoirElement>> {
+    // ABI-encode the inputs into the initial ACIR witness assignment.
+    let initial_witness = prover.witness_generator.abi().encode(&input_map, None)?;
+
+    // Foreign-call output lands in a local buffer that is never read back.
+    let mut foreign_call_output = Vec::new();
+    let mut executor = DefaultForeignCallBuilder {
+        output:       &mut foreign_call_output,
+        enable_mocks: false,
+        resolver_url: None,
+        root_path:    None,
+        package_name: None,
+    }
+    .build();
+
+    let blackbox_solver = Bn254BlackBoxSolver::default();
+    let mut stack = nargo::ops::execute_program(
+        &prover.program,
+        initial_witness,
+        &blackbox_solver,
+        &mut executor,
+    )?;
+
+    let last = stack.pop().context("Missing witness results")?;
+    Ok(last.witness)
+}
+
 impl Prove for NoirProver {
     #[cfg(all(feature = "witness-generation", not(target_arch = "wasm32")))]
     #[instrument(skip_all)]
@@ -260,7 +308,7 @@ impl Prove for NoirProver {
             .prove_noir(merlin, r1cs, commitments, full_witness, &public_inputs)
             .context("While proving R1CS instance")?;
 
-        Ok(NoirProof {
+        Ok(NoirProof::Whir {
             public_inputs,
             whir_r1cs_proof,
         })
@@ -354,7 +402,7 @@ impl Prove for MavrosProver {
             )
             .context("While proving R1CS instance")?;
 
-        Ok(NoirProof {
+        Ok(NoirProof::Whir {
             public_inputs,
             whir_r1cs_proof,
         })
@@ -380,12 +428,357 @@ impl Prove for MavrosProver {
     }
 }
 
+impl Prove for Groth16Prover {
+    #[cfg(all(feature = "witness-generation", not(target_arch = "wasm32")))]
+    #[instrument(skip_all)]
+    fn prove(mut self, input_map: InputMap) -> Result<NoirProof> {
+        let witness = generate_noir_witness_for_groth16(&mut self, input_map)?;
+        self.prove_with_witness(witness)
+    }
+
+    #[cfg(all(feature = "witness-generation", not(target_arch = "wasm32")))]
+    #[instrument(skip_all)]
+    fn prove_with_toml(self, prover_toml: impl AsRef<Path>) -> Result<NoirProof> {
+        let (input_map, _return_value) =
+            read_inputs_from_file(prover_toml.as_ref(), self.witness_generator.abi())?;
+        self.prove(input_map)
+    }
+
+    /// Solves the witness and produces a compressed, serialized Groth16 proof.
+    #[instrument(skip_all)]
+    fn prove_with_witness(
+        self,
+        acir_witness_idx_to_value_map: WitnessMap<NoirElement>,
+    ) -> Result<NoirProof> {
+        use ark_serialize::CanonicalSerialize;
+
+        // Take ownership of each field so we can drop the large ones the
+        // moment they stop being used.
+        let Groth16Prover {
+            program,
+            r1cs,
+            split_witness_builders,
+            witness_generator,
+            groth16_pk: pk,
+            commitment_info,
+        } = self;
+
+        let mut public_input_indices = program.functions[0].public_inputs().indices();
+        public_input_indices.sort_unstable();
+        let public_inputs = if public_input_indices.is_empty() {
+            PublicInputs::new()
+        } else {
+            let values = public_input_indices
+                .iter()
+                .map(|&idx| {
+                    let noir_val = acir_witness_idx_to_value_map
+                        .get(&Witness::from(idx))
+                        .ok_or_else(|| anyhow::anyhow!("Missing public input at index {idx}"))?;
+                    Ok(noir_to_native(*noir_val))
+                })
+                .collect::<Result<Vec<_>>>()?;
+            PublicInputs::from_vec(values)
+        };
+
+        // ABI / circuit metadata aren't touched after public-input extraction.
+        // Dropping them shrinks resident memory before witness solving — the
+        // current peak phase.
+        drop(program);
+        drop(witness_generator);
+
+        let num_witnesses = r1cs.num_witnesses();
+
+        let has_commitments = !commitment_info.is_empty();
+
+        // Allocate the witness vector; every slot starts unsolved (`None`).
+        let mut witness: Vec<Option<FieldElement>> = vec![None; r1cs.num_witnesses_for_solving()];
+
+        // `solve_witness_vec` requires a `&mut ProverState` because the WHIR
+        // pipeline absorbs intermediate values into the transcript while
+        // solving. Groth16 doesn't share that protocol — its only Fiat-Shamir
+        // step is the BSB22 commitment hash, done explicitly below — so we
+        // pass a throwaway transcript here purely to satisfy the signature.
+        // Nothing absorbed into it ever leaves this function.
+        let dummy_instance: Vec<u8> = Vec::new();
+        let ds =
+            whir::transcript::DomainSeparator::protocol(&"groth16-dummy").instance(&dummy_instance);
+        let mut dummy_transcript = ProverState::new(&ds, TranscriptSponge::default());
+
+        // --- Phase 1: Solve w1 witnesses (pre-commitment) ---
+        {
+            let _s = info_span!("solve_groth16_w1").entered();
+            crate::r1cs::solve_witness_vec(
+                &mut witness,
+                split_witness_builders.w1_layers,
+                &acir_witness_idx_to_value_map,
+                &mut dummy_transcript,
+            )
+            .context("While solving Groth16 w1 witnesses")?;
+        }
+
+        // --- Phase 2: BSB22 commitment (WHIR-style: one commit, N challenges) ---
+        let mut pedersen_commitments: Vec<ark_bn254::G1Affine> = Vec::new();
+        let mut committed_values: Vec<Vec<FieldElement>> = Vec::new();
+        let mut groth16_ci: Vec<provekit_groth16::CommitmentInfo> = Vec::new();
+
+        if has_commitments {
+            let _s = info_span!("groth16_bsb22_commit").entered();
+
+            // One commitment covering all private w1 wires; only entry 0 is read.
+            let ci = &commitment_info[0];
+
+            // Gather private committed witness values
+            let private_vals: Vec<FieldElement> = ci
+                .private_committed
+                .iter()
+                .map(|&wire_idx| {
+                    witness[wire_idx].ok_or_else(|| {
+                        anyhow::anyhow!(
+                            "BSB22: private committed wire {wire_idx} not solved before commitment"
+                        )
+                    })
+                })
+                .collect::<Result<Vec<_>>>()?;
+
+            // Compute Pedersen commitment: C = Σ vᵢ · Basis[i]
+            let commitment = pk.view().commitment_keys[0].commit(&private_vals)?.0;
+
+            // Gather public values for hashing
+            let public_vals: Vec<FieldElement> = ci
+                .public_committed
+                .iter()
+                .map(|&wire_idx| {
+                    witness[wire_idx].ok_or_else(|| {
+                        anyhow::anyhow!(
+                            "BSB22: public wire {wire_idx} not solved before commitment"
+                        )
+                    })
+                })
+                .collect::<Result<Vec<_>>>()?;
+
+            // Derive N challenges from one commitment via hash_to_fr_multi
+            let challenge_data = {
+                use ark_serialize::CanonicalSerialize;
+                let mut data = Vec::new();
+                let mut commitment_bytes = Vec::new();
+                commitment
+                    .serialize_uncompressed(&mut commitment_bytes)
+                    .context("while serializing commitment")?;
+                data.extend_from_slice(&commitment_bytes);
+                for val in &public_vals {
+                    let bytes = provekit_groth16::prover::fr_to_bytes(val)?;
+                    data.extend_from_slice(&bytes);
+                }
+                data
+            };
+
+            let challenges = provekit_groth16::prover::hash_to_fr_multi(
+                &challenge_data,
+                provekit_groth16::COMMITMENT_DST,
+                ci.challenge_indices.len(),
+            )?;
+
+            // Insert each challenge into its wire index
+            for (challenge, &wire_idx) in challenges.iter().zip(ci.challenge_indices.iter()) {
+                witness[wire_idx] = Some(*challenge);
+            }
+
+            // Build groth16 CommitmentInfo for the inner prove() call.
+            groth16_ci.push(provekit_groth16::CommitmentInfo {
+                public_and_commitment_committed: ci.public_committed.clone(),
+                private_committed:               ci.private_committed.clone(),
+                challenge_indices:               ci.challenge_indices.clone(),
+                nb_public_committed:             ci.public_committed.len(),
+            });
+
+            pedersen_commitments.push(commitment);
+            committed_values.push(private_vals);
+        }
+
+        // --- Phase 3: Solve w2 witnesses (post-commitment, if any) ---
+        if has_commitments {
+            let _s = info_span!("solve_groth16_w2").entered();
+            crate::r1cs::solve_witness_vec(
+                &mut witness,
+                split_witness_builders.w2_layers,
+                &acir_witness_idx_to_value_map,
+                &mut dummy_transcript,
+            )
+            .context("While solving Groth16 w2 witnesses")?;
+        }
+
+        // Extract solved witness vector, then free the Option wrapper vec
+        let full_witness: Vec<FieldElement> = witness[..num_witnesses]
+            .iter()
+            .enumerate()
+            .map(|(i, w)| w.ok_or_else(|| anyhow::anyhow!("Witness {i} unsolved")))
+            .collect::<Result<Vec<_>>>()?;
+        drop(witness);
+        drop(acir_witness_idx_to_value_map);
+
+        // Compute R1CS solution vectors: A·w, B·w, C·w
+        let (a_evals, b_evals, c_evals) = {
+            let _s = info_span!("r1cs_matvec").entered();
+            let a = r1cs.a() * full_witness.as_slice();
+            let b = r1cs.b() * full_witness.as_slice();
+            let c = r1cs.c() * full_witness.as_slice();
+            (a, b, c)
+        };
+
+        // Save values needed later, then free R1CS (~200+ MB of sparse matrices)
+        let nb_public = 1 + r1cs.num_public_inputs;
+        let num_constraints = r1cs.num_constraints();
+        let challenge_wire_indices: Vec<usize> = commitment_info
+            .iter()
+            .flat_map(|ci| ci.challenge_indices.iter().copied())
+            .collect();
+        drop(r1cs);
+        drop(commitment_info);
+
+        // Borrowed view over the PK. Uniform across owned and mmap-backed
+        // sources — the mmap variant exposes the same slice shape, just
+        // pointing at file pages.
+        //
+        // Memory note: for the Owned source we explicitly drop the bases used
+        // by stage 1 (g1_a/g1_b/g2_b/infinity_a/b/non_inf_a/b/commitment_keys)
+        // before stage 2 (`prove_krs`) starts — see the `mem::take` block
+        // below. Without it, ~230 MB of stage-1-only bases stay resident while
+        // `prove_krs`'s MSM bucket allocator runs, doubling prove-time peak
+        // RSS on big circuits (SHA256: 848 MB → ~380 MB observed). The Mmap
+        // path skips this — its slices borrow into the file mapping, so
+        // there's nothing on the heap to release.
+        let mut pk = pk;
+        let domain_size;
+        let g1_delta;
+        {
+            let pk_view = pk.view();
+            domain_size = pk_view.domain_size;
+            g1_delta = pk_view.g1_delta;
+        }
+
+        // r/s and the δ multiples are needed by both the H-independent stages
+        // and `prove_krs`, so sample them before the rayon::join below.
+        use {ark_ec::CurveGroup, ark_std::UniformRand};
+        let mut rng = ark_std::rand::thread_rng();
+        let r_scalar = FieldElement::rand(&mut rng);
+        let s_scalar = FieldElement::rand(&mut rng);
+        let kr_scalar = -(r_scalar * s_scalar);
+        let r_delta = (ark_bn254::G1Projective::from(g1_delta) * r_scalar).into_affine();
+        let s_delta = (ark_bn254::G1Projective::from(g1_delta) * s_scalar).into_affine();
+        let kr_delta = (ark_bn254::G1Projective::from(g1_delta) * kr_scalar).into_affine();
+
+        let domain: ark_poly::Radix2EvaluationDomain<FieldElement> =
+            ark_poly::EvaluationDomain::new(num_constraints)
+                .ok_or_else(|| anyhow::anyhow!("failed to create FFT domain"))?;
+
+        // Stage 1: overlap the FFT-bound `compute_h` with the H-independent
+        // Groth16 stages (`bsb22_pok` + `prove_ar_bs_bs1`). `prove_ar_bs_bs1`
+        // serializes its three internal MSMs so only one bucket allocator
+        // is alive at a time — without that, FFT scratch and MSM buckets
+        // would stack and inflate peak under rayon contention.
+        let (h, branch_b) = {
+            let pk_view = pk.view();
+            rayon::join(
+                move || provekit_groth16::prover::compute_h(a_evals, b_evals, c_evals, &domain),
+                || -> Result<(
+                    ark_bn254::G1Affine,
+                    ark_bn254::G1Affine,
+                    ark_bn254::G2Affine,
+                    ark_bn254::G1Projective,
+                )> {
+                    let pok = provekit_groth16::prover::bsb22_pok(
+                        &pk_view.commitment_keys,
+                        &committed_values,
+                        &challenge_wire_indices,
+                        &full_witness,
+                    )
+                    .context("while computing BSB22 proof of knowledge")?;
+                    let (ar, bs, bs1) = provekit_groth16::prover::prove_ar_bs_bs1(
+                        pk_view.g1_a,
+                        pk_view.g1_b,
+                        pk_view.g2_b,
+                        pk_view.non_inf_a,
+                        pk_view.non_inf_b,
+                        &full_witness,
+                        pk_view.g1_alpha,
+                        pk_view.g1_beta,
+                        pk_view.g2_beta,
+                        pk_view.g2_delta,
+                        r_delta,
+                        s_delta,
+                        s_scalar,
+                    )
+                    .context("while computing Ar/Bs/Bs1")?;
+                    Ok((pok, ar, bs, bs1))
+                },
+            )
+        };
+
+        let h = h.context("while computing quotient polynomial H")?;
+        let (commitment_pok, ar, bs, bs1) = branch_b?;
+
+        // Free stage-1-only bases before stage 2's MSM bucket allocator runs.
+        // No-op for the Mmap variant (nothing on the heap to free).
+        if let Groth16PkSource::Owned(ref mut inner) = pk {
+            drop(std::mem::take(&mut inner.g1_a));
+            drop(std::mem::take(&mut inner.g1_b));
+            drop(std::mem::take(&mut inner.g2_b));
+            drop(std::mem::take(&mut inner.infinity_a));
+            drop(std::mem::take(&mut inner.infinity_b));
+            drop(std::mem::take(&mut inner.non_inf_a));
+            drop(std::mem::take(&mut inner.non_inf_b));
+            drop(std::mem::take(&mut inner.commitment_keys));
+        }
+
+        // Stage 2: prove_krs only needs g1_k and g1_z from the PK.
+        let pk_view = pk.view();
+        let krs = provekit_groth16::prover::prove_krs(
+            pk_view.g1_k,
+            pk_view.g1_z,
+            &h,
+            &full_witness,
+            nb_public,
+            &groth16_ci,
+            &challenge_wire_indices,
+            domain_size,
+            ar,
+            bs1,
+            kr_delta,
+            r_scalar,
+            s_scalar,
+        )
+        .context("while computing Krs")?;
+        drop(pk_view);
+        drop(pk);
+
+        let proof = provekit_groth16::Proof {
+            ar,
+            bs,
+            krs,
+            commitments: pedersen_commitments,
+            commitment_pok,
+        };
+
+        // Serialize the proof in arkworks compressed form
+        let mut proof_bytes = Vec::new();
+        proof
+            .serialize_compressed(&mut proof_bytes)
+            .context("while serializing Groth16 proof")?;
+
+        Ok(NoirProof::Groth16 {
+            public_inputs,
+            groth16_proof: proof_bytes,
+        })
+    }
+}
+
 impl Prove for Prover {
     #[cfg(all(feature = "witness-generation", not(target_arch = "wasm32")))]
     fn prove(self, input_map: InputMap) -> Result<NoirProof> {
         match self {
             Prover::Noir(p) => p.prove(input_map),
             Prover::Mavros(p) => p.prove(input_map),
+            Prover::Groth16(p) => p.prove(input_map),
         }
     }
 
@@ -394,6 +787,7 @@ impl Prove for Prover {
         match self {
             Prover::Noir(p) => p.prove_with_toml(prover_toml),
             Prover::Mavros(p) => p.prove_with_toml(prover_toml),
+            Prover::Groth16(p) => p.prove_with_toml(prover_toml),
         }
     }
 
@@ -406,6 +800,7 @@ impl Prove for Prover {
             Prover::Mavros(_) => {
                 anyhow::bail!("Mavros prover is not supported on WASM")
             }
+            Prover::Groth16(p) => p.prove_with_witness(witness),
         }
     }
 }
diff --git a/provekit/prover/src/pkp_io.rs b/provekit/prover/src/pkp_io.rs
new file mode 100644
index 000000000..31e552e60
--- /dev/null
+++ b/provekit/prover/src/pkp_io.rs
@@ -0,0 +1,266 @@
+//! `.pkp` file I/O with split PK section.
+//!
+//! For the Groth16 variant the proving key is large (hundreds of MB) and its
+//! bytes are essentially uniform random (arkworks-serialized curve points), so
+//! they don't compress meaningfully. To avoid materializing the PK in any
+//! intermediate buffer during read, the file layout is:
+//!
+//! ```text
+//! [ header (21 bytes) ]
+//! [ zstd stream {
+//!     [ postcard-encoded Prover (PK serialized as zero-byte placeholder) ]
+//!     [ arkworks-encoded ProvingKey, then u64-LE length-prefixed postcard `r1cs` and `commitment_info` ]   // only for Groth16 variant
+//!   } ]
+//! ```
+//!
+//! Reading streams from disk → zstd Decoder → postcard for the metadata, then
+//! continues from the same decoder into arkworks `CanonicalDeserialize` for
+//! the PK. Peak memory tracks the deserialized struct size, not the file
+//! size.
+
+use {
+    crate::prover_types::Prover,
+    anyhow::{ensure, Context, Result},
+    ark_serialize::{CanonicalDeserialize, CanonicalSerialize},
+    bytes::BufMut as _,
+    provekit_common::{
+        binary_format::{
+            HEADER_SIZE, MAGIC_BYTES, PROVER_FORMAT, PROVER_VERSION, XZ_MAGIC, ZSTD_MAGIC,
+        },
+        file::MaybeHashAware,
+    },
+    std::{
+        fs::File,
+        io::{BufRead, BufReader, Read, Write},
+        path::Path,
+    },
+    tracing::{info, instrument},
+};
+
+/// Capacity of the decode-side `BufReader`. 256 KB amortizes postcard's
+/// per-byte reads; codec throughput is bounded by zstd, not buffer size.
+const DECODER_BUF: usize = 256 * 1024;
+/// Capacity of the `BufWriter` over the output file in `write_pkp`.
+const ENCODER_BUF: usize = DECODER_BUF;
+
+/// Zstd compression level matching the rest of the file IO layer.
+const ZSTD_LEVEL: i32 = 3;
+
+/// Persist `prover` at `path` in the legacy zstd `.pkp` layout and fsync it.
+#[instrument(skip(prover), fields(path = %path.display()))]
+pub fn write_pkp(prover: &Prover, path: &Path) -> Result<()> {
+    let out = File::create(path).context("while creating output file")?;
+    let buffered = std::io::BufWriter::with_capacity(ENCODER_BUF, out);
+    let mut buffered = write_pkp_to_writer(prover, buffered)?;
+    buffered.flush().context("while flushing writer")?;
+    let out = buffered
+        .into_inner()
+        .context("while flushing buffered writer")?;
+    out.sync_all().context("while syncing output file")?;
+    // Size is only for the log line; a failed `metadata` call is logged as 0.
+    let size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
+    info!(?path, size, "Wrote .pkp");
+    Ok(())
+}
+
+/// Encode `prover` into `writer`: the fixed-size header, then one zstd stream
+/// carrying the postcard-encoded `Prover` and, for `Prover::Groth16`, the
+/// uncompressed arkworks PK followed by `r1cs` and `commitment_info` as
+/// u64-LE length-prefixed postcard blobs.
+///
+/// Returns the writer recovered from the finished zstd encoder. Fails on any
+/// encode/write error, or when a Groth16 PK is mmap-backed; `write_pkp_mmap`
+/// rejects such keys as well, so the error does not point callers at it.
+fn write_pkp_to_writer<W: Write>(prover: &Prover, mut writer: W) -> Result<W> {
+    /// Write `blob` preceded by its length as a little-endian `u64`.
+    fn put_blob<S: Write>(sink: &mut S, what: &str, blob: &[u8]) -> Result<()> {
+        let prefix = (blob.len() as u64).to_le_bytes();
+        sink.write_all(&prefix)
+            .with_context(|| format!("while writing {what} length"))?;
+        sink.write_all(blob)
+            .with_context(|| format!("while writing {what} bytes"))
+    }
+
+    // Header: MAGIC(8) + FORMAT(8) + MAJOR(2) + MINOR(2) + HASH_CONFIG(1).
+    // A prover without a hash config stores 0xff in the last byte.
+    let hash_byte = prover.maybe_hash_config().map_or(0xff, |c| c.to_byte());
+    let mut header = Vec::with_capacity(HEADER_SIZE);
+    header.put(MAGIC_BYTES);
+    header.put(&PROVER_FORMAT[..]);
+    header.put_u16_le(PROVER_VERSION.0);
+    header.put_u16_le(PROVER_VERSION.1);
+    header.put_u8(hash_byte);
+    writer.write_all(&header).context("while writing header")?;
+
+    // Everything after the header goes into a single zstd stream.
+    let mut encoder =
+        zstd::Encoder::new(writer, ZSTD_LEVEL).context("while initializing zstd encoder")?;
+
+    // The Prover is postcard-encoded into a temporary buffer first; the PK
+    // field is a zero-byte placeholder thanks to its custom `Serialize`.
+    let postcard_bytes = postcard::to_allocvec(prover).context("while postcard-encoding Prover")?;
+    encoder
+        .write_all(&postcard_bytes)
+        .context("while writing postcard payload")?;
+
+    if let Prover::Groth16(g) = prover {
+        // Only an `Owned` PK can be serialized through this path.
+        let pk = g.groth16_pk.as_owned().ok_or_else(|| {
+            anyhow::anyhow!(
+                "write_pkp: groth16_pk is mmap-backed; serializing an mmap-backed proving key \
+                 is not supported"
+            )
+        })?;
+
+        // arkworks writes incrementally into the encoder, so the full PK is
+        // never materialized as a `Vec<u8>`.
+        pk.serialize_uncompressed(&mut encoder)
+            .context("while writing arkworks-encoded ProvingKey")?;
+
+        // `r1cs` and `commitment_info` are `#[serde(skip)]` on `Groth16Prover`
+        // as of PROVER_VERSION (1, 5), so postcard does not carry them. They
+        // follow the PK as length-prefixed blobs for the legacy reader.
+        let r1cs_bytes =
+            postcard::to_allocvec(&g.r1cs).context("while postcard-encoding r1cs (legacy path)")?;
+        put_blob(&mut encoder, "r1cs", &r1cs_bytes)?;
+
+        let ci_bytes = postcard::to_allocvec(&g.commitment_info)
+            .context("while postcard-encoding commitment_info (legacy path)")?;
+        put_blob(&mut encoder, "commitment_info", &ci_bytes)?;
+    }
+
+    encoder.finish().context("while finishing zstd stream")
+}
+
+/// Load a `Prover` from a `.pkp` file, whichever layout it uses.
+///
+/// The file is probed with [`crate::pkp_mmap_io::is_mmap_pkp`]; on a match the
+/// work goes to [`crate::pkp_mmap_io::read_pkp_mmap`]. Every other outcome,
+/// a failed probe included, takes the streaming zstd / xz path below.
+#[instrument(fields(path = %path.display(), size = path.metadata().map(|m| m.len()).ok()))]
+pub fn read_pkp(path: &Path) -> Result<Prover> {
+    let mmap_layout = matches!(crate::pkp_mmap_io::is_mmap_pkp(path), Ok(true));
+    if mmap_layout {
+        crate::pkp_mmap_io::read_pkp_mmap(path)
+    } else {
+        let file = File::open(path).context("while opening input file")?;
+        read_pkp_from_reader(BufReader::new(file))
+    }
+}
+
+/// Decode a `Prover` from in-memory `.pkp` bytes such as [`serialize_pkp`] output.
+pub fn deserialize_pkp(data: &[u8]) -> Result<Prover> {
+    let cursor = std::io::Cursor::new(data);
+    read_pkp_from_reader(cursor)
+}
+
+/// Encode `prover` into an in-memory buffer using the same routine that
+/// backs [`write_pkp`], so the bytes match what that function puts on disk.
+pub fn serialize_pkp(prover: &Prover) -> Result<Vec<u8>> {
+    // The helper returns its writer once the zstd stream is finished, so an
+    // owned `Vec` passed as the writer comes back holding the payload.
+    write_pkp_to_writer(prover, Vec::new())
+}
+
+/// Validate the `.pkp` header on `reader`, sniff the compression codec from the
+/// bytes that follow, and decode the rest through `read_split_stream`.
+fn read_pkp_from_reader<R: Read + BufRead>(mut reader: R) -> Result<Prover> {
+    let (want_major, want_minor) = PROVER_VERSION;
+    // Header layout: MAGIC(8) + FORMAT(8) + MAJOR(2) + MINOR(2) + HASH_CONFIG(1)
+    let mut header = [0u8; HEADER_SIZE];
+    reader
+        .read_exact(&mut header)
+        .context("while reading header")?;
+    ensure!(&header[0..8] == MAGIC_BYTES, "Invalid magic bytes");
+    ensure!(header[8..16] == PROVER_FORMAT, "Invalid format");
+    let file_major = u16::from_le_bytes([header[16], header[17]]);
+    let file_minor = u16::from_le_bytes([header[18], header[19]]);
+    ensure!(
+        file_major == want_major,
+        "Incompatible major version: file is v{}.{}, this build expects v{}.{}",
+        file_major,
+        file_minor,
+        want_major,
+        want_minor,
+    );
+    ensure!(
+        file_minor >= want_minor,
+        "Incompatible minor version: file is v{}.{}, this build requires v{}.{} or newer",
+        file_major,
+        file_minor,
+        want_major,
+        want_minor,
+    );
+    // The hash-config byte is part of the header but is not interpreted here.
+    let _hash_config = header[20];
+
+    // Sniff the codec without consuming input: `fill_buf` only exposes bytes.
+    let head = reader
+        .fill_buf()
+        .context("while peeking compression magic")?;
+    ensure!(head.len() >= 6, "File too small to detect compression");
+    let is_zstd = head[..4] == ZSTD_MAGIC;
+    ensure!(
+        is_zstd || head[..6] == XZ_MAGIC,
+        "Unknown compression format (first bytes: {:02X?})",
+        &head[..6]
+    );
+    if !is_zstd {
+        return read_split_stream(xz2::read::XzDecoder::new(reader));
+    }
+    let decoder = zstd::Decoder::new(reader).context("while initializing zstd decoder")?;
+    read_split_stream(decoder)
+}
+
+/// Read one `u64`-LE length-prefixed blob (`what` names it in errors). The
+/// length comes from the file, so the buffer grows only as bytes actually
+/// arrive instead of being pre-allocated from an unvalidated prefix.
+fn read_len_prefixed<R: Read>(src: &mut R, what: &str) -> Result<Vec<u8>> {
+    let mut len_buf = [0u8; 8];
+    src.read_exact(&mut len_buf)
+        .with_context(|| format!("while reading {what} length"))?;
+    let expected = u64::from_le_bytes(len_buf);
+    let mut blob = Vec::new();
+    src.by_ref()
+        .take(expected)
+        .read_to_end(&mut blob)
+        .with_context(|| format!("while reading {what} bytes"))?;
+    let got = blob.len() as u64;
+    ensure!(got == expected, "while reading {what} bytes: expected {expected} bytes, got {got}");
+    Ok(blob)
+}
+
+/// Drive the postcard streaming + arkworks streaming reads off a single
+/// `Read`. Generic over the decoder type so zstd and xz reuse the same logic.
+fn read_split_stream<R: Read>(decoder: R) -> Result<Prover> {
+    let buffered = BufReader::with_capacity(DECODER_BUF, decoder);
+
+    // Phase 1: postcard streams the metadata; the PK field deserializes to a
+    // placeholder that consumes zero bytes. 1 MB scratch is enough — no other
+    // field uses `deserialize_bytes` / `deserialize_byte_buf`.
+    let mut scratch = vec![0u8; 1024 * 1024];
+    let (mut prover, (buffered, _)): (Prover, _) =
+        postcard::from_io((buffered, scratch.as_mut_slice()))
+            .context("while postcard-decoding Prover")?;
+
+    // Phase 2 (Groth16 only): overwrite the placeholder PK by streaming the
+    // arkworks bytes off the same decoder, then read the trailing blobs.
+    if let Prover::Groth16(ref mut g) = prover {
+        let mut buffered = buffered;
+        let pk = provekit_groth16::ProvingKey::deserialize_uncompressed_unchecked(&mut buffered)
+            .context("while reading arkworks-encoded ProvingKey")?;
+        g.groth16_pk = crate::prover_types::Groth16PkSource::Owned(pk);
+
+        // `r1cs` and `commitment_info` follow as length-prefixed postcard
+        // blobs (PROVER_VERSION (1, 5) split).
+        let r1cs_bytes = read_len_prefixed(&mut buffered, "r1cs")?;
+        g.r1cs = postcard::from_bytes(&r1cs_bytes)
+            .context("while postcard-decoding r1cs (legacy path)")?;
+
+        let ci_bytes = read_len_prefixed(&mut buffered, "commitment_info")?;
+        g.commitment_info = postcard::from_bytes(&ci_bytes)
+            .context("while postcard-decoding commitment_info (legacy path)")?;
+    }
+
+    Ok(prover)
+}
diff --git a/provekit/prover/src/pkp_mmap_io.rs b/provekit/prover/src/pkp_mmap_io.rs
new file mode 100644
index 000000000..1700b060c
--- /dev/null
+++ b/provekit/prover/src/pkp_mmap_io.rs
@@ -0,0 +1,352 @@
+//! Mmap-backed `.pkp` I/O — a rapidsnark-style alternative to the
+//! zstd-compressed legacy path in [`crate::pkp_io`].
+//!
+//! The two formats coexist under the same `.pkp` extension; they're
+//! distinguished by a 4-byte sentinel that follows the 21-byte common header
+//! ([`provekit_groth16::mmap_pk::MMAP_SENTINEL`] = `b"MMAP"`). The legacy
+//! reader sees zstd / xz magic; the mmap reader sees `MMAP`. Either reader
+//! can detect the wrong format and bail out, and [`crate::pkp_io::read_pkp`]
+//! auto-dispatches to the right one.
+//!
+//! ## Layout (recap, see `provekit_groth16::mmap_pk` for full detail)
+//!
+//! ```text
+//! [ HEADER (21 bytes, common with legacy)              ]
+//! [ MMAP_SENTINEL "MMAP" (4 bytes)                     ]
+//! [ metadata_len (u64 LE, 8 bytes)                     ]
+//! [ postcard-encoded Prover (PK/r1cs/commitment_info all zero-byte placeholders) ]
+//! [ pad to 8                                           ]
+//! [ section table + section bodies (PK + Pedersen, raw layout) ]
+//! [ R1CS chunk (raw byte layout — read via memcpy)     ]
+//! [ commitment_info chunk (raw byte layout — read via memcpy) ]
+//! ```
+//!
+//! ## Why mmap
+//!
+//! The legacy path zstd-decompresses + arkworks-deserializes the proving key
+//! at startup, which materializes hundreds of megabytes in RAM and runs a
+//! per-point Montgomery conversion. The mmap path stores curve points in
+//! their in-memory Montgomery layout and exposes `&[G1Affine]` slices that
+//! point directly at file pages. The kernel pages bytes in lazily as the MSM
+//! touches them — same trick rapidsnark uses for its zkey loader.
+//!
+//! ## When to use which writer
+//!
+//! - [`crate::pkp_io::write_pkp`] (zstd): smaller artifact, slower load,
+//!   portable across arkworks versions (canonical-bytes serialization).
+//! - [`write_pkp_mmap`] (this module): larger artifact (no compression),
+//!   near-instant load, file format coupled to the current arkworks in-memory
+//!   layout.
+//!
+//! Both write the same `.pkp` extension. Pick based on whether load time or
+//! artifact size matters more for your deployment.
+
+use {
+    crate::prover_types::{Groth16PkSource, Groth16Prover, Prover},
+    acir::circuit::Program,
+    anyhow::{bail, ensure, Context, Result},
+    bytes::BufMut as _,
+    memmap2::Mmap,
+    provekit_common::{
+        binary_format::{HEADER_SIZE, MAGIC_BYTES, PROVER_FORMAT, PROVER_VERSION},
+        file::MaybeHashAware,
+        witness::{NoirWitnessGenerator, SplitWitnessBuilders},
+        NoirElement, R1CS,
+    },
+    provekit_groth16::mmap_pk::{MmapProvingKey, MMAP_SENTINEL},
+    std::{
+        fs::{File, OpenOptions},
+        io::{Read, Seek, SeekFrom, Write},
+        path::Path,
+        sync::Arc,
+    },
+    tracing::{info, info_span, instrument},
+};
+
+/// Write a `Prover::Groth16(...)` to disk in the mmap-friendly `.pkp` layout.
+///
+/// Errors if the prover is not the Groth16 variant, or if its PK source is
+/// already mmap-backed: re-serializing from a mapping is not implemented.
+/// [`crate::pkp_io::write_pkp`] rejects mmap-backed keys as well, so neither
+/// writer accepts such a prover.
+#[instrument(skip(prover), fields(path = %path.display()))]
+pub fn write_pkp_mmap(prover: &Prover, path: &Path) -> Result<()> {
+    let groth16 = match prover {
+        Prover::Groth16(g) => g,
+        _ => bail!("write_pkp_mmap: only the Groth16 prover variant is supported"),
+    };
+    let pk = match &groth16.groth16_pk {
+        Groth16PkSource::Owned(pk) => pk,
+        Groth16PkSource::Mmap(_) => bail!(
+            "write_pkp_mmap: source PK is already mmap-backed; rewriting from mmap is not \
+             supported"
+        ),
+    };
+
+    let mut file = File::create(path).context("creating mmap pkp file")?;
+
+    // 1. Common 21-byte header (same as legacy format).
+    let hash_config = prover.maybe_hash_config();
+    let (major, minor) = PROVER_VERSION;
+    let mut header = Vec::with_capacity(HEADER_SIZE);
+    header.put(MAGIC_BYTES);
+    header.put(&PROVER_FORMAT[..]);
+    header.put_u16_le(major);
+    header.put_u16_le(minor);
+    header.put_u8(hash_config.map(|c| c.to_byte()).unwrap_or(0xff));
+    file.write_all(&header).context("writing pkp header")?;
+
+    // 2. Mmap sentinel (4 bytes).
+    file.write_all(&MMAP_SENTINEL)
+        .context("writing mmap sentinel")?;
+
+    // 3. Postcard-encoded Prover metadata, prefixed with its u64-LE length. The PK
+    //    serializes as zero bytes (Groth16PkSource::Serialize emits a unit) and
+    //    `r1cs` / `commitment_info` are skipped; they are written raw in steps 5-7.
+    let metadata = postcard::to_allocvec(prover).context("postcard-encoding Prover metadata")?;
+    file.write_all(&(metadata.len() as u64).to_le_bytes())
+        .context("writing metadata length")?;
+    file.write_all(&metadata)
+        .context("writing postcard metadata")?;
+
+    // 4. Pad to 8-byte alignment so the section table starts cleanly.
+    let cur = file.stream_position()?;
+    let aligned = (cur + 7) / 8 * 8;
+    if aligned > cur {
+        let pad = vec![0u8; (aligned - cur) as usize];
+        file.write_all(&pad).context("padding to 8-byte align")?;
+    }
+
+    // 5. Section table + section bodies (raw Montgomery layout for big arrays).
+    //    Delegated to the groth16 crate so the format-on-disk convention lives next
+    //    to `MmapProvingKey::load`.
+    provekit_groth16::mmap_pk::write_pk_sections(pk, &mut file)
+        .context("writing mmap section bodies")?;
+
+    // 6. R1CS chunk (raw byte layout — read via memcpy, no postcard decode). Lives
+    //    after the PK section bodies because the PK section writer is monolithic
+    //    and easier left untouched.
+    provekit_groth16::mmap_pk::write_r1cs_chunk(&groth16.r1cs, &mut file)
+        .context("writing mmap r1cs chunk")?;
+
+    // 7. Commitment-info chunk: convert each Groth16CommitmentInfo's Vec<usize>
+    //    fields into Vec<u64> for portability and write as raw byte layout.
+    let ci_triples: Vec<provekit_groth16::mmap_pk::CommitmentInfoTriple> = groth16
+        .commitment_info
+        .iter()
+        .map(|ci| {
+            (
+                ci.public_committed.iter().map(|&x| x as u64).collect(),
+                ci.private_committed.iter().map(|&x| x as u64).collect(),
+                ci.challenge_indices.iter().map(|&x| x as u64).collect(),
+            )
+        })
+        .collect();
+    provekit_groth16::mmap_pk::write_commitment_info_chunk(&ci_triples, &mut file)
+        .context("writing mmap commitment_info chunk")?;
+
+    file.sync_all().context("syncing mmap pkp file")?;
+    let size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
+    info!(?path, size, "Wrote mmap .pkp");
+    Ok(())
+}
+
+/// Read a `.pkp` file written by [`write_pkp_mmap`]. The file is mmap'd, the
+/// metadata is postcard-decoded out of the mmap, and the proving key is
+/// constructed as [`Groth16PkSource::Mmap`] — slices pointing at file pages,
+/// no copy. Corrupt lengths/offsets are reported as errors, not panics.
+#[instrument(fields(path = %path.display()))]
+pub fn read_pkp_mmap(path: &Path) -> Result<Prover> {
+    let file = OpenOptions::new()
+        .read(true)
+        .open(path)
+        .with_context(|| format!("opening {}", path.display()))?;
+
+    // SAFETY: `Mmap::map` requires that the file is not modified while
+    // mapped. We open read-only and never expose the mmap to writers; the
+    // file may still be modified by another process out-of-band, which is a
+    // mode rapidsnark accepts too. Document it and move on.
+    let mmap = {
+        let _s = info_span!("mmap_map").entered();
+        // SAFETY: read-only mapping, never handed out as writable; see above.
+        unsafe { Mmap::map(&file).with_context(|| format!("mmap'ing {}", path.display()))? }
+    };
+
+    // Match rapidsnark's `madvise(MADV_SEQUENTIAL)` (rapidsnark/src/fileloader.cpp:50):
+    // hint the kernel that the bulk curve-point sections will be streamed through
+    // during MSM/FFT so it prefetches aggressively and discards pages already read.
+    // Non-fatal if it fails: the worst case is the current behaviour (MADV_NORMAL).
+    #[cfg(unix)]
+    {
+        let _s = info_span!("mmap_advise_sequential").entered();
+        if let Err(e) = mmap.advise(memmap2::Advice::Sequential) {
+            tracing::debug!("mmap advise(Sequential) failed (non-fatal): {e}");
+        }
+    }
+
+    ensure!(
+        mmap.len() >= HEADER_SIZE + 4 + 8,
+        "mmap pkp: file too short for header+sentinel+metadata_len"
+    );
+
+    // 1. Validate the 21-byte common header.
+    let header = &mmap[..HEADER_SIZE];
+    ensure!(
+        &header[0..8] == MAGIC_BYTES,
+        "mmap pkp: invalid magic bytes"
+    );
+    ensure!(
+        header[8..16] == PROVER_FORMAT,
+        "mmap pkp: invalid format id"
+    );
+    let file_major = u16::from_le_bytes([header[16], header[17]]);
+    let file_minor = u16::from_le_bytes([header[18], header[19]]);
+    ensure!(
+        file_major == PROVER_VERSION.0,
+        "mmap pkp: incompatible major version (file v{}.{}, build expects v{}.{})",
+        file_major,
+        file_minor,
+        PROVER_VERSION.0,
+        PROVER_VERSION.1
+    );
+    ensure!(
+        file_minor >= PROVER_VERSION.1,
+        "mmap pkp: incompatible minor version (file v{}.{}, build requires v{}.{}+)",
+        file_major,
+        file_minor,
+        PROVER_VERSION.0,
+        PROVER_VERSION.1
+    );
+
+    // 2. Sentinel.
+    ensure!(
+        mmap[HEADER_SIZE..HEADER_SIZE + 4] == MMAP_SENTINEL,
+        "mmap pkp: missing MMAP sentinel (this is not an mmap-format .pkp)"
+    );
+    let mut pos = HEADER_SIZE + 4;
+
+    // 3. Metadata length, then postcard-decode the Prover.
+    let metadata_len = usize::try_from(u64::from_le_bytes(mmap[pos..pos + 8].try_into().unwrap()))
+        .context("mmap pkp: metadata length does not fit in usize")?;
+    pos += 8;
+    // Subtract rather than add: `pos + metadata_len` can wrap on a corrupt length.
+    ensure!(
+        metadata_len <= mmap.len() - pos,
+        "mmap pkp: metadata extends past file end"
+    );
+    let metadata_bytes = &mmap[pos..pos + metadata_len];
+    pos += metadata_len;
+    // The mmap path is Groth16-only. Each field of the inner `Groth16Prover` is
+    // decoded separately via `postcard::take_from_bytes` so its cost shows up as
+    // its own span. `r1cs` and `commitment_info` are `#[serde(skip)]`, so the
+    // wire order is: variant_tag, program, split_witness_builders,
+    // witness_generator, groth16_pk.
+    let mut prover: Prover = {
+        let _s = info_span!("postcard_decode_prover", metadata_len).entered();
+
+        let (variant_tag, rest): (u32, &[u8]) =
+            postcard::take_from_bytes(metadata_bytes).context("reading Prover variant tag")?;
+        ensure!(
+            variant_tag == 2,
+            "mmap pkp: expected Groth16 variant (tag 2), got tag {}",
+            variant_tag
+        );
+
+        let (program, rest): (Program<NoirElement>, &[u8]) = {
+            let _s = info_span!("postcard_program").entered();
+            postcard::take_from_bytes(rest).context("decoding program")?
+        };
+        let (split_witness_builders, rest): (SplitWitnessBuilders, &[u8]) = {
+            let _s = info_span!("postcard_split_witness_builders").entered();
+            postcard::take_from_bytes(rest).context("decoding split_witness_builders")?
+        };
+        let (witness_generator, rest): (NoirWitnessGenerator, &[u8]) = {
+            let _s = info_span!("postcard_witness_generator").entered();
+            postcard::take_from_bytes(rest).context("decoding witness_generator")?
+        };
+        let (groth16_pk, _): (Groth16PkSource, &[u8]) = {
+            let _s = info_span!("postcard_groth16_pk_placeholder").entered();
+            postcard::take_from_bytes(rest).context("decoding groth16_pk placeholder")?
+        };
+
+        Prover::Groth16(Groth16Prover {
+            program,
+            r1cs: R1CS::default(),
+            split_witness_builders,
+            witness_generator,
+            groth16_pk,
+            commitment_info: Vec::new(),
+        })
+    };
+
+    // 4. Pad to 8-byte alignment for the section table start.
+    pos = (pos + 7) / 8 * 8;
+
+    // 5. Read R1CS chunk from the mmap (memcpy into owned R1CS). This must happen
+    //    BEFORE `MmapProvingKey::load` consumes the mmap. The chunk starts
+    //    immediately after the PK section bodies; we walk the PK section table to
+    //    find that offset.
+    let pk_end = {
+        let _s = info_span!("pk_sections_end_offset").entered();
+        provekit_groth16::mmap_pk::pk_sections_end_offset(&mmap, pos)
+            .context("finding end of PK sections")?
+    };
+    let (r1cs_loaded, r1cs_end) = {
+        let _s = info_span!("read_r1cs_chunk").entered();
+        // `get` turns an out-of-range offset into an error instead of a slice panic.
+        let tail = mmap.get(pk_end..).context("mmap pkp: r1cs chunk starts past file end")?;
+        let (r1cs, consumed) = provekit_groth16::mmap_pk::read_r1cs_chunk(tail)
+            .context("reading r1cs chunk")?;
+        (r1cs, pk_end + consumed)
+    };
+    let ci_triples = {
+        let _s = info_span!("read_commitment_info_chunk").entered();
+        let tail = mmap.get(r1cs_end..).context("mmap pkp: commitment_info chunk starts past file end")?;
+        let (triples, _consumed) = provekit_groth16::mmap_pk::read_commitment_info_chunk(tail)
+            .context("reading commitment_info chunk")?;
+        triples
+    };
+
+    // 6. Construct the MmapProvingKey from the section bodies.
+    let mmap_pk = {
+        let _s = info_span!("mmap_pk_load").entered();
+        MmapProvingKey::load(mmap, pos).context("loading mmap-backed proving key sections")?
+    };
+
+    // 7. Replace the placeholder PK source, populate r1cs from memcpy, convert
+    //    commitment_info triples back to the prover-side type.
+    match &mut prover {
+        Prover::Groth16(g) => {
+            g.groth16_pk = Groth16PkSource::Mmap(Arc::new(mmap_pk));
+            g.r1cs = r1cs_loaded;
+            g.commitment_info = ci_triples
+                .into_iter()
+                .map(
+                    |(pub_v, priv_v, chal_v)| crate::prover_types::Groth16CommitmentInfo {
+                        public_committed:  pub_v.into_iter().map(|x| x as usize).collect(),
+                        private_committed: priv_v.into_iter().map(|x| x as usize).collect(),
+                        challenge_indices: chal_v.into_iter().map(|x| x as usize).collect(),
+                    },
+                )
+                .collect();
+        }
+        _ => bail!("mmap pkp: metadata decoded as non-Groth16 prover variant"),
+    }
+
+    Ok(prover)
+}
+
+/// Report whether the 4 bytes after the common header of the `.pkp` at `path`
+/// equal `MMAP_SENTINEL`. `Ok(false)` covers the legacy zstd/xz layout and files
+/// too short to hold a sentinel; `Err` means the file could not be opened or read.
+pub fn is_mmap_pkp(path: &Path) -> Result<bool> {
+    let mut file = File::open(path).with_context(|| format!("opening {}", path.display()))?;
+    file.seek(SeekFrom::Start(HEADER_SIZE as u64))?;
+    let mut sentinel = [0u8; 4];
+    // `read_exact` because a bare `read` may return fewer than 4 bytes before EOF.
+    match file.read_exact(&mut sentinel) {
+        Ok(()) => Ok(sentinel == MMAP_SENTINEL),
+        Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(false),
+        Err(e) => Err(e.into()),
+    }
+}
diff --git a/provekit/prover/src/prover_types.rs b/provekit/prover/src/prover_types.rs
new file mode 100644
index 000000000..33d456a43
--- /dev/null
+++ b/provekit/prover/src/prover_types.rs
@@ -0,0 +1,300 @@
+//! Backend-aware Prover enum.
+//!
+//! Lives in `provekit_prover` (not `provekit_common`) so the Groth16 variant
+//! can hold a typed `provekit_groth16::ProvingKey` directly. Common cannot
+//! import `provekit_groth16` without creating a dependency cycle, so the union
+//! type that knows about every backend is rooted here.
+
+// `MaybeHashAware` lives behind `provekit_common::file::io`, which is gated to
+// non-wasm targets. The only consumer of the `MaybeHashAware for Prover` impl
+// is `pkp_io`, which is itself non-wasm-gated, so confine both to that target.
+#[cfg(not(target_arch = "wasm32"))]
+use provekit_common::{file::MaybeHashAware, HashConfig};
+#[cfg(not(target_arch = "wasm32"))]
+use std::sync::Arc;
+use {
+    acir::circuit::Program,
+    provekit_common::{
+        witness::{NoirWitnessGenerator, SplitWitnessBuilders},
+        MavrosProver, NoirElement, NoirProver, R1CS,
+    },
+    serde::{Deserialize, Deserializer, Serialize, Serializer},
+};
+
+/// BSB22 commitment info for ProveKit's Groth16 backend.
+///
+/// One Pedersen commitment over all private w1 wires,
+/// producing multiple challenges via `hash_to_fr_multi`.
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
+pub struct Groth16CommitmentInfo {
+    /// Indices of public wires hashed with the commitment.
+    pub public_committed:  Vec<usize>,
+    /// Indices of private/internal wires committed to via Pedersen.
+    pub private_committed: Vec<usize>,
+    /// Wire indices where the derived challenge values are stored.
+    pub challenge_indices: Vec<usize>,
+}
+
+/// Source of the Groth16 proving key bytes: either an in-memory owned
+/// [`provekit_groth16::ProvingKey`] (legacy zstd `.pkp` path) or an mmap-backed
+/// [`provekit_groth16::MmapProvingKey`] that points directly at file pages
+/// (rapidsnark-style mmap `.pkp` path). Wrapping the mmap variant in `Arc`
+/// keeps `Groth16Prover: Clone` cheap (just bumps the Arc refcount, never
+/// re-mmaps).
+///
+/// Serialization for both variants emits zero bytes — the PK is always loaded
+/// out-of-band by the matching reader (`pkp_io::read_pkp` for owned,
+/// `pkp_mmap_io::read_pkp_mmap` for mmap). This preserves the existing
+/// postcard wire format byte-for-byte.
+#[derive(Debug, Clone)]
+pub enum Groth16PkSource {
+    /// Standard owned proving key. Loaded from the legacy zstd `.pkp` path,
+    /// or built fresh by `provekit_groth16::setup`.
+    Owned(provekit_groth16::ProvingKey),
+    /// Mmap-backed proving key. Loaded from the mmap `.pkp` path. Slices
+    /// borrow into the file mapping; the `Arc` ensures the mapping outlives
+    /// every clone of this prover.
+    #[cfg(not(target_arch = "wasm32"))]
+    Mmap(Arc<provekit_groth16::MmapProvingKey>),
+}
+
+impl Default for Groth16PkSource {
+    fn default() -> Self {
+        Groth16PkSource::Owned(provekit_groth16::ProvingKey::empty())
+    }
+}
+
+// Wire format: zero bytes, identical to the existing `ProvingKey` Serialize
+// impl. The actual PK bytes live elsewhere (zstd-stream tail for the legacy
+// path, mmap section bodies for the mmap path).
+impl Serialize for Groth16PkSource {
+    fn serialize<S: Serializer>(&self, ser: S) -> Result<S::Ok, S::Error> {
+        ser.serialize_unit()
+    }
+}
+
+impl<'de> Deserialize<'de> for Groth16PkSource {
+    fn deserialize<D: Deserializer<'de>>(de: D) -> Result<Self, D::Error> {
+        let _: () = Deserialize::deserialize(de)?;
+        Ok(Groth16PkSource::default())
+    }
+}
+
+/// Borrowed view of a Groth16 proving key. Returned by
+/// [`Groth16PkSource::view`] so the proving code can be written once against
+/// slices and run against either the owned or the mmap-backed source.
+pub struct Groth16PkView<'a> {
+    pub domain_size:     u64,
+    pub g1_alpha:        ark_bn254::G1Affine,
+    pub g1_beta:         ark_bn254::G1Affine,
+    pub g1_delta:        ark_bn254::G1Affine,
+    pub g2_beta:         ark_bn254::G2Affine,
+    pub g2_delta:        ark_bn254::G2Affine,
+    pub g1_a:            &'a [ark_bn254::G1Affine],
+    pub g1_b:            &'a [ark_bn254::G1Affine],
+    pub g1_k:            &'a [ark_bn254::G1Affine],
+    pub g1_z:            &'a [ark_bn254::G1Affine],
+    pub g2_b:            &'a [ark_bn254::G2Affine],
+    pub infinity_a:      &'a [bool],
+    pub infinity_b:      &'a [bool],
+    /// Precomputed wire indices where `A(τ) != 0` (the complement of
+    /// `infinity_a`). Built at setup time for owned PKs, at mmap-load time
+    /// for mmap PKs. Lets `prove_ar_bs_bs1` build the MSM input by direct
+    /// indexing instead of re-filtering on every prove call.
+    pub non_inf_a:       &'a [usize],
+    /// Twin of `non_inf_a` for the B side.
+    pub non_inf_b:       &'a [usize],
+    /// One view per BSB22 commitment. Borrows basis arrays out of either
+    /// owned `pedersen::ProvingKey`s (legacy path) or mmap'd file pages
+    /// (rapidsnark-style raw layout), with no allocation of the bases
+    /// either way. The outer `Vec` allocation is one entry per
+    /// commitment — a few hundred bytes for typical circuits.
+    pub commitment_keys: Vec<provekit_groth16::pedersen::ProvingKeyView<'a>>,
+}
+
+impl Groth16PkSource {
+    pub fn view(&self) -> Groth16PkView<'_> {
+        match self {
+            Groth16PkSource::Owned(pk) => Groth16PkView {
+                domain_size:     pk.domain_size,
+                g1_alpha:        pk.g1_alpha,
+                g1_beta:         pk.g1_beta,
+                g1_delta:        pk.g1_delta,
+                g2_beta:         pk.g2_beta,
+                g2_delta:        pk.g2_delta,
+                g1_a:            &pk.g1_a,
+                g1_b:            &pk.g1_b,
+                g1_k:            &pk.g1_k,
+                g1_z:            &pk.g1_z,
+                g2_b:            &pk.g2_b,
+                infinity_a:      &pk.infinity_a,
+                infinity_b:      &pk.infinity_b,
+                non_inf_a:       &pk.non_inf_a,
+                non_inf_b:       &pk.non_inf_b,
+                commitment_keys: pk.commitment_keys.iter().map(|ck| ck.view()).collect(),
+            },
+            #[cfg(not(target_arch = "wasm32"))]
+            Groth16PkSource::Mmap(m) => Groth16PkView {
+                domain_size:     m.domain_size,
+                g1_alpha:        m.g1_alpha,
+                g1_beta:         m.g1_beta,
+                g1_delta:        m.g1_delta,
+                g2_beta:         m.g2_beta,
+                g2_delta:        m.g2_delta,
+                g1_a:            m.g1_a(),
+                g1_b:            m.g1_b(),
+                g1_k:            m.g1_k(),
+                g1_z:            m.g1_z(),
+                g2_b:            m.g2_b(),
+                infinity_a:      m.infinity_a(),
+                infinity_b:      m.infinity_b(),
+                non_inf_a:       &m.non_inf_a,
+                non_inf_b:       &m.non_inf_b,
+                commitment_keys: m.commitment_keys.iter().map(|ck| ck.view()).collect(),
+            },
+        }
+    }
+
+    /// Borrow the inner owned `ProvingKey`, if this source owns one. Returns
+    /// `None` for the mmap-backed variant (that path uses `view()` to access
+    /// borrowed fields).
+    pub fn as_owned(&self) -> Option<&provekit_groth16::ProvingKey> {
+        match self {
+            Groth16PkSource::Owned(pk) => Some(pk),
+            #[cfg(not(target_arch = "wasm32"))]
+            Groth16PkSource::Mmap(_) => None,
+        }
+    }
+
+    /// Mutable access to the inner owned `ProvingKey`. Used by `pkp_io` when
+    /// streaming arkworks bytes into the placeholder PK created by
+    /// `Groth16PkSource::default()` after postcard returns. Returns `None` if
+    /// the source is mmap-backed.
+    pub fn as_owned_mut(&mut self) -> Option<&mut provekit_groth16::ProvingKey> {
+        match self {
+            Groth16PkSource::Owned(pk) => Some(pk),
+            #[cfg(not(target_arch = "wasm32"))]
+            Groth16PkSource::Mmap(_) => None,
+        }
+    }
+}
+
+impl From<provekit_groth16::ProvingKey> for Groth16PkSource {
+    fn from(pk: provekit_groth16::ProvingKey) -> Self {
+        Groth16PkSource::Owned(pk)
+    }
+}
+
+/// Groth16 prover: holds R1CS, witness builders, and the typed proving key.
+///
+/// `groth16_pk` is a [`Groth16PkSource`] — either an owned
+/// [`provekit_groth16::ProvingKey`] or an mmap-backed
+/// [`provekit_groth16::MmapProvingKey`]; both source variants serialize as
+/// zero bytes so the postcard wire format for that field is stable.
+///
+/// As of `PROVER_VERSION = (1, 5)`, `r1cs` and `commitment_info` are
+/// `#[serde(skip)]` and travel outside the postcard blob:
+/// - Legacy zstd path: appended as length-prefixed postcard chunks after the
+///   arkworks-encoded PK (see `pkp_io::write_pkp`).
+/// - Mmap path: raw-byte chunks following the PK section table (see
+///   `mmap_pk::write_r1cs_chunk` / `write_commitment_info_chunk`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Groth16Prover {
+    pub program:                Program<NoirElement>,
+    /// R1CS constraint matrices. As of PROVER_VERSION (1, 5) this field is
+    /// NOT included in the postcard blob. The legacy zstd path
+    /// length-prefixes its postcard bytes after the PK in the same zstd
+    /// stream. The mmap path stores its three sparse matrices + interner
+    /// as raw mmap sections (memcpy'd into owned `Vec`s on load — see
+    /// `provekit_groth16::mmap_pk`). Default during postcard deserialize
+    /// is `R1CS::new()`; both readers then populate the real value.
+    #[serde(skip)]
+    pub r1cs:                   R1CS,
+    pub split_witness_builders: SplitWitnessBuilders,
+    pub witness_generator:      NoirWitnessGenerator,
+    /// Typed Groth16 proving key (owned or mmap-backed).
+    pub groth16_pk:             Groth16PkSource,
+    /// BSB22 commitment metadata (empty if circuit has no commitments).
+    /// Same treatment as `r1cs` above: omitted from postcard, transported
+    /// by the format-specific reader (length-prefixed postcard for the
+    /// legacy zstd path; raw mmap sections for the mmap path).
+    #[serde(skip)]
+    pub commitment_info:        Vec<Groth16CommitmentInfo>,
+}
+
+// INVARIANT: Variant order is wire-format-critical (postcard uses positional
+// discriminants). Do not reorder, cfg-gate, or insert variants without
+// verifying cross-target deserialization (native <-> WASM).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum Prover {
+    Noir(NoirProver),
+    Mavros(MavrosProver),
+    Groth16(Groth16Prover),
+}
+
+impl Prover {
+    /// Convert a compilation output into the on-disk prover format.
+    pub fn from_noir_proof_scheme(scheme: provekit_common::NoirProofScheme) -> Self {
+        use provekit_common::NoirProofScheme;
+        match scheme {
+            NoirProofScheme::Noir(d) => Prover::Noir(NoirProver {
+                hash_config:            d.hash_config,
+                program:                d.program,
+                r1cs:                   d.r1cs,
+                split_witness_builders: d.split_witness_builders,
+                witness_generator:      d.witness_generator,
+                whir_for_witness:       d.whir_for_witness,
+            }),
+            NoirProofScheme::Mavros(d) => Prover::Mavros(MavrosProver {
+                abi:                d.abi,
+                num_public_inputs:  d.num_public_inputs,
+                whir_for_witness:   d.whir_for_witness,
+                witgen_binary:      d.witgen_binary,
+                ad_binary:          d.ad_binary,
+                constraints_layout: d.constraints_layout,
+                witness_layout:     d.witness_layout,
+                hash_config:        d.hash_config,
+            }),
+        }
+    }
+
+    pub fn abi(&self) -> &noirc_abi::Abi {
+        match self {
+            Prover::Noir(p) => p.witness_generator.abi(),
+            Prover::Mavros(p) => &p.abi,
+            Prover::Groth16(p) => p.witness_generator.abi(),
+        }
+    }
+
+    pub fn size(&self) -> (usize, usize) {
+        match self {
+            Prover::Noir(p) => (p.r1cs.num_constraints(), p.r1cs.num_witnesses()),
+            Prover::Mavros(p) => (
+                p.constraints_layout.algebraic_size,
+                p.witness_layout.algebraic_size,
+            ),
+            Prover::Groth16(p) => (p.r1cs.num_constraints(), p.r1cs.num_witnesses()),
+        }
+    }
+
+    /// Returns the WHIR scheme for backends that use it (Noir, Mavros).
+    /// Returns `None` for Groth16, which doesn't use WHIR.
+    pub fn whir_for_witness(&self) -> Option<&provekit_common::WhirR1CSScheme> {
+        match self {
+            Prover::Noir(p) => Some(&p.whir_for_witness),
+            Prover::Mavros(p) => Some(&p.whir_for_witness),
+            Prover::Groth16(_) => None,
+        }
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+impl MaybeHashAware for Prover {
+    fn maybe_hash_config(&self) -> Option<HashConfig> {
+        match self {
+            Prover::Noir(p) => Some(p.hash_config),
+            Prover::Mavros(p) => Some(p.hash_config),
+            Prover::Groth16(_) => None,
+        }
+    }
+}
diff --git a/provekit/prover/src/witness/witness_builder.rs b/provekit/prover/src/witness/witness_builder.rs
index 5f07e860c..fe4d5a8ae 100644
--- a/provekit/prover/src/witness/witness_builder.rs
+++ b/provekit/prover/src/witness/witness_builder.rs
@@ -165,8 +165,14 @@ impl WitnessBuilderSolver for WitnessBuilder {
                 }
             }
             WitnessBuilder::Challenge(witness_idx) => {
-                let challenge: FieldElement = transcript.verifier_message();
-                witness[*witness_idx] = Some(challenge);
+                // In Groth16 flow, BSB22 pre-sets challenge values via
+                // hash_to_fr_multi before w2 solving. These are derived from
+                // the Pedersen commitment (not the Fiat-Shamir transcript),
+                // so the values will differ. Skip if already set.
+                if witness[*witness_idx].is_none() {
+                    let challenge: FieldElement = transcript.verifier_message();
+                    witness[*witness_idx] = Some(challenge);
+                }
             }
             WitnessBuilder::LogUpDenominator(
                 witness_idx,
diff --git a/provekit/r1cs-compiler/src/binops.rs b/provekit/r1cs-compiler/src/binops.rs
index 3af2a629b..2991f88eb 100644
--- a/provekit/r1cs-compiler/src/binops.rs
+++ b/provekit/r1cs-compiler/src/binops.rs
@@ -317,10 +317,13 @@ pub(crate) fn add_combined_binop_constraints(
     );
     let multiplicities_first_witness = r1cs_compiler.add_witness_builder(multiplicities_wb);
 
-    let sz =
-        r1cs_compiler.add_witness_builder(WitnessBuilder::Challenge(r1cs_compiler.num_witnesses()));
-    let rs =
-        r1cs_compiler.add_witness_builder(WitnessBuilder::Challenge(r1cs_compiler.num_witnesses()));
+    // Draw (sz, rs) from the global BSB22 power chains so this LogUp
+    // shares the same two `Challenge` roots used by every other LogUp
+    // sub-circuit (ROM, RAM, spread, range_check). `rs_sqrd` / `rs_cubed`
+    // below are *internal* powers of this instance's `rs` (used to fold
+    // the 4-column AND/XOR table); they stay as local `add_product`s.
+    let sz = r1cs_compiler.next_sz_power();
+    let rs = r1cs_compiler.next_rs_power();
     let rs_sqrd = r1cs_compiler.add_product(rs, rs);
     let rs_cubed = r1cs_compiler.add_product(rs_sqrd, rs);
     let challenges = LookupChallenges {
diff --git a/provekit/r1cs-compiler/src/memory/ram.rs b/provekit/r1cs-compiler/src/memory/ram.rs
index 6dc88dab7..c3bb9196d 100644
--- a/provekit/r1cs-compiler/src/memory/ram.rs
+++ b/provekit/r1cs-compiler/src/memory/ram.rs
@@ -74,12 +74,16 @@ pub fn add_ram_checking(
     r1cs_compiler: &mut NoirToR1CSCompiler,
     block: &MemoryBlock,
 ) -> (u32, Vec<usize>) {
-    // Add two verifier challenges for the multiset check
-    let rs_challenge =
-        r1cs_compiler.add_witness_builder(WitnessBuilder::Challenge(r1cs_compiler.num_witnesses()));
+    // Draw this RAM block's (rs, sz) pair from the global BSB22 power
+    // chains on the compiler. Each RAM block gets a distinct `(β^k, α^k)`
+    // pair via in-circuit power chaining — the two roots `(α, β)` are the
+    // only `Challenge` builders allocated for the whole circuit.
+    // `rs_challenge_sqrd` is still computed within this LogUp (it's an
+    // *internal* power of `rs_challenge` used by the Spice multiset
+    // construction below), so it stays as an explicit `add_product`.
+    let rs_challenge = r1cs_compiler.next_rs_power();
     let rs_challenge_sqrd = r1cs_compiler.add_product(rs_challenge, rs_challenge);
-    let sz_challenge =
-        r1cs_compiler.add_witness_builder(WitnessBuilder::Challenge(r1cs_compiler.num_witnesses()));
+    let sz_challenge = r1cs_compiler.next_sz_power();
 
     // The current witnesses indices for the partial products of the read set (RS)
     // hash and the write set (WS) hash
diff --git a/provekit/r1cs-compiler/src/memory/rom.rs b/provekit/r1cs-compiler/src/memory/rom.rs
index 607520f9c..d65422d30 100644
--- a/provekit/r1cs-compiler/src/memory/rom.rs
+++ b/provekit/r1cs-compiler/src/memory/rom.rs
@@ -33,11 +33,13 @@ pub(crate) fn add_rom_checking(r1cs_compiler: &mut NoirToR1CSCompiler, block: &M
     );
     let access_counts_first_witness = r1cs_compiler.add_witness_builder(wb);
 
-    // Add two verifier challenges for the lookup
-    let rs_challenge =
-        r1cs_compiler.add_witness_builder(WitnessBuilder::Challenge(r1cs_compiler.num_witnesses()));
-    let sz_challenge =
-        r1cs_compiler.add_witness_builder(WitnessBuilder::Challenge(r1cs_compiler.num_witnesses()));
+    // Draw this ROM block's (rs, sz) pair from the global BSB22 power
+    // chains on the compiler instead of allocating two fresh `Challenge`
+    // builders. Multiple ROM blocks each get a distinct `(β^k, α^k)`
+    // through in-circuit power chaining; the two roots stay the only
+    // Fiat-Shamir samples. See [`NoirToR1CSCompiler::next_sz_power`].
+    let rs_challenge = r1cs_compiler.next_rs_power();
+    let sz_challenge = r1cs_compiler.next_sz_power();
 
     // Calculate the sum, over all reads, of 1/denominator
     let summands_for_reads = block
diff --git a/provekit/r1cs-compiler/src/noir_to_r1cs.rs b/provekit/r1cs-compiler/src/noir_to_r1cs.rs
index 0f963c16c..e8546833f 100644
--- a/provekit/r1cs-compiler/src/noir_to_r1cs.rs
+++ b/provekit/r1cs-compiler/src/noir_to_r1cs.rs
@@ -111,6 +111,36 @@ pub struct NoirToR1CSCompiler {
 
     /// The ACIR witness indices of the initial values of the memory blocks
     pub initial_memories: BTreeMap<usize, Vec<usize>>,
+
+    // BSB22 challenge power chains.
+    //
+    // Every LogUp-style sub-circuit (ROM, RAM, spread, binops, range_check)
+    // needs random challenges. Naively each instance allocates its own
+    // `WitnessBuilder::Challenge` pair, which inflates `N_CHALLENGE` and the
+    // verifier's on-chain MSM by 2 entries per instance.
+    //
+    // We instead allocate just two roots (`α = sz_root`, `β = rs_root`) once
+    // each, lazily on first use, and hand out in-circuit powers: the j-th sz
+    // draw is `α^j`, the k-th rs draw is `β^k` (j ≠ k is possible, since range
+    // checks draw only from sz). `add_product` ties each power to the last, so:
+    //
+    //   - within one instance, `(α^j, β^k)` are statistically independent (α ⊥ β as Fiat-Shamir
+    //     samples), preserving each LogUp's required `sz`/`rs` independence.
+    //   - across instances, randomness is shared via powers — gnark's `multicommit` pattern (see
+    //     gnark/std/multicommit/nativecommit.go).
+    //
+    // Effect: `N_CHALLENGE` stays at 2 (just the two roots) regardless of
+    // how many LogUp instances the circuit has.
+    /// First Challenge wire for the `sz` power chain. Lazily allocated.
+    pub sz_root_challenge: Option<usize>,
+    /// Most recently emitted `α^k` from the sz chain (or `None` before any
+    /// call). After k calls to `next_sz_power()`, this is `α^k`.
+    sz_current_power:      Option<usize>,
+    /// First Challenge wire for the `rs` power chain. Lazily allocated, an
+    /// independent Fiat-Shamir sample from `sz_root_challenge`.
+    pub rs_root_challenge: Option<usize>,
+    /// Most recently emitted `β^k` from the rs chain.
+    rs_current_power:      Option<usize>,
 }
 
 pub(crate) fn ensure_field_element_fits_num_bits(
@@ -171,9 +201,61 @@ impl NoirToR1CSCompiler {
             acir_to_r1cs_witness_map: BTreeMap::new(),
             product_cache: std::collections::HashMap::new(),
             initial_memories: BTreeMap::new(),
+            sz_root_challenge: None,
+            sz_current_power: None,
+            rs_root_challenge: None,
+            rs_current_power: None,
         }
     }
 
+    /// Hand out the next wire of the BSB22 `sz` power chain.
+    ///
+    /// The first call lazily allocates the root challenge `α` as a
+    /// `WitnessBuilder::Challenge` and returns that wire unchanged, so the
+    /// first consumer sees the raw Fiat-Shamir sample. Each later call
+    /// multiplies the previously returned wire by `α` via
+    /// [`Self::add_product`], so the k-th call returns `α^k`.
+    ///
+    /// LogUp-style sub-circuits draw their `sz_challenge` from here rather
+    /// than allocating a `Challenge` builder of their own. See the doc on
+    /// [`Self::sz_root_challenge`].
+    pub(crate) fn next_sz_power(&mut self) -> usize {
+        let alpha = if let Some(existing) = self.sz_root_challenge {
+            existing
+        } else {
+            let fresh_idx = self.num_witnesses();
+            let allocated = self.add_witness_builder(WitnessBuilder::Challenge(fresh_idx));
+            self.sz_root_challenge = Some(allocated);
+            allocated
+        };
+        let power = match self.sz_current_power {
+            Some(previous) => self.add_product(previous, alpha),
+            None => alpha,
+        };
+        self.sz_current_power = Some(power);
+        power
+    }
+
+    /// Counterpart of [`Self::next_sz_power`] for the `rs` chain: the first
+    /// call lazily allocates the root `β` as a `WitnessBuilder::Challenge`,
+    /// and the k-th call returns `β^k`. Its state is separate from the sz chain.
+    pub(crate) fn next_rs_power(&mut self) -> usize {
+        let beta = if let Some(existing) = self.rs_root_challenge {
+            existing
+        } else {
+            let fresh_idx = self.num_witnesses();
+            let allocated = self.add_witness_builder(WitnessBuilder::Challenge(fresh_idx));
+            self.rs_root_challenge = Some(allocated);
+            allocated
+        };
+        let power = match self.rs_current_power {
+            Some(previous) => self.add_product(previous, beta),
+            None => beta,
+        };
+        self.rs_current_power = Some(power);
+        power
+    }
+
     /// Returns the R1CS and the witness map
     pub fn finalize(self) -> (R1CS, Vec<Option<NonZeroU32>>, Vec<WitnessBuilder>) {
         // Convert witness map to vector
diff --git a/provekit/r1cs-compiler/src/range_check.rs b/provekit/r1cs-compiler/src/range_check.rs
index 763576ac4..422652e9c 100644
--- a/provekit/r1cs-compiler/src/range_check.rs
+++ b/provekit/r1cs-compiler/src/range_check.rs
@@ -264,6 +264,18 @@ pub fn add_range_checks(
 
     // Phase 4: For each atomic bucket, add range check constraints.
     // Choose LogUp or naive based on whichever produces fewer witnesses.
+    //
+    // LogUp instances pull their `sz_challenge` from the compiler-wide
+    // power chain: instance #k gets `α^k`, where `α = sz_root_challenge`
+    // is allocated once (lazily) on the very first call to
+    // `next_sz_power` anywhere in the compile. The pairing equation
+    // enforces `wire[αⁿ] = wire[αⁿ⁻¹] · wire[α]` via the generated
+    // `add_product` constraints. Net effect: range_check contributes 0
+    // new `Challenge` builders regardless of the bucket count.
+    //
+    // Range checks only need a single LogUp randomness (the table is
+    // 1-column: `value ∈ [0, 2^bits)`), so we only consume the sz chain
+    // here; the rs chain is untouched by this pass.
     atomic_range_checks
         .iter()
         .enumerate()
@@ -283,7 +295,8 @@ pub fn add_range_checks(
             }
             let num_bits = num_bits as u32;
             if should_use_logup(num_bits, values_to_lookup.len()) {
-                add_range_check_via_lookup(r1cs, num_bits, &values_to_lookup);
+                let sz_challenge = r1cs.next_sz_power();
+                add_range_check_via_lookup(r1cs, num_bits, &values_to_lookup, sz_challenge);
             } else {
                 values_to_lookup.iter().for_each(|value| {
                     add_naive_range_check(r1cs, num_bits, *value);
@@ -297,10 +310,17 @@ pub fn add_range_checks(
 /// Helper function which computes all the terms of the summation for
 /// each side (LHS and RHS) of the log-derivative multiset check.
 /// Uses a fused constraint to check equality of both sums directly.
+///
+/// `sz_challenge` is the wire index of the LogUp randomness, supplied by
+/// the caller. `add_range_checks` draws it from the compiler-wide `sz`
+/// power chain (`NoirToR1CSCompiler::next_sz_power`), so every bucket
+/// shares the single root Fiat-Shamir challenge; this function itself no
+/// longer allocates a `WitnessBuilder::Challenge`.
 fn add_range_check_via_lookup(
     r1cs_compiler: &mut NoirToR1CSCompiler,
     num_bits: u32,
     values_to_lookup: &[usize],
+    sz_challenge: usize,
 ) {
     // Add witnesses for the multiplicities
     let wb = WitnessBuilder::MultiplicitiesForRange(
@@ -309,10 +329,6 @@ fn add_range_check_via_lookup(
         values_to_lookup.into(),
     );
     let multiplicities_first_witness = r1cs_compiler.add_witness_builder(wb);
-    // Sample the Schwartz-Zippel challenge for the log derivative
-    // multiset check.
-    let sz_challenge =
-        r1cs_compiler.add_witness_builder(WitnessBuilder::Challenge(r1cs_compiler.num_witnesses()));
 
     // Collect table side terms: multiplicity / (X - table_value)
     // Uses fused single constraint: (X - table_value) × quotient = multiplicity
diff --git a/provekit/r1cs-compiler/src/spread.rs b/provekit/r1cs-compiler/src/spread.rs
index d0867dc52..0b1eafc2f 100644
--- a/provekit/r1cs-compiler/src/spread.rs
+++ b/provekit/r1cs-compiler/src/spread.rs
@@ -538,9 +538,19 @@ pub(crate) fn add_spread_table_constraints(
         query_inputs,
     ));
 
-    // Challenges: sz and rs
-    let sz = compiler.add_witness_builder(WitnessBuilder::Challenge(compiler.num_witnesses()));
-    let rs = compiler.add_witness_builder(WitnessBuilder::Challenge(compiler.num_witnesses()));
+    // Challenges: sz and rs.
+    //
+    // Drawn from the global BSB22 power chains on the compiler. Instance
+    // #k of the spread table sees `(α^k, β^k)`; the two chains have
+    // independent roots, so `(sz, rs)` are statistically independent
+    // within this instance — exactly what the spread LogUp's bilinear
+    // `sz - input - rs · spread` requires for soundness.
+    //
+    // Without these helpers each spread instance would have allocated two
+    // fresh `Challenge` builders, inflating `N_CHALLENGE`. See
+    // [`NoirToR1CSCompiler::next_sz_power`] and `next_rs_power`.
+    let sz = compiler.next_sz_power();
+    let rs = compiler.next_rs_power();
 
     // Query-side: for each lookup, compute 1/(sz - input - rs*spread)
     let mut logup_summands: Vec<(FieldElement, usize)> = Vec::new();
diff --git a/provekit/verifier/Cargo.toml b/provekit/verifier/Cargo.toml
index 28a593e7e..f95a9a03c 100644
--- a/provekit/verifier/Cargo.toml
+++ b/provekit/verifier/Cargo.toml
@@ -11,8 +11,11 @@ repository.workspace = true
 [dependencies]
 # Workspace crates
 provekit-common.workspace = true
+provekit-groth16.workspace = true
 
 # Cryptography and proof systems
+ark-bn254 = { version = "0.5.0", default-features = false, features = ["curve"] }
+ark-serialize.workspace = true
 ark-std.workspace = true
 whir.workspace = true
 
diff --git a/provekit/verifier/src/lib.rs b/provekit/verifier/src/lib.rs
index 1c3461fa8..72d748f44 100644
--- a/provekit/verifier/src/lib.rs
+++ b/provekit/verifier/src/lib.rs
@@ -4,6 +4,7 @@ use {
     crate::whir_r1cs::WhirR1CSVerifier,
     anyhow::{Context, Result},
     provekit_common::{NoirProof, Verifier},
+    provekit_groth16::VerifierGroth16Ext,
     tracing::instrument,
 };
 
@@ -14,13 +15,61 @@ pub trait Verify {
 impl Verify for Verifier {
     #[instrument(skip_all)]
     fn verify(&mut self, proof: &NoirProof) -> Result<()> {
-        provekit_common::register_ntt();
+        match proof {
+            NoirProof::Whir {
+                public_inputs,
+                whir_r1cs_proof,
+            } => {
+                anyhow::ensure!(
+                    self.whir_for_witness.is_some(),
+                    "cannot verify WHIR proof: verifier holds no WHIR scheme (it was prepared for \
+                     Groth16, or was already consumed by an earlier verify call)"
+                );
 
-        self.whir_for_witness
-            .take()
-            .context("Verifier has already been consumed; cannot verify twice")?
-            .verify(&proof.whir_r1cs_proof, &proof.public_inputs, &self.r1cs)?;
+                provekit_common::register_ntt();
 
-        Ok(())
+                self.whir_for_witness
+                    .take()
+                    .context("Verifier has already been consumed; cannot verify twice")?
+                    .verify(whir_r1cs_proof, public_inputs, &self.r1cs)?;
+
+                Ok(())
+            }
+            NoirProof::Groth16 {
+                public_inputs,
+                groth16_proof,
+            } => {
+                use ark_serialize::CanonicalDeserialize;
+
+                let vk = self.groth16_vk_typed()?.context(
+                    "proof/verifier backend mismatch: proof is Groth16 but verifier was prepared \
+                     for WHIR",
+                )?;
+
+                tracing::debug!(
+                    g1_k_len = vk.g1_k.len(),
+                    commitment_keys_len = vk.commitment_keys.len(),
+                    public_and_commitment_committed_len = vk.public_and_commitment_committed.len(),
+                    "deserialized Groth16 VK"
+                );
+
+                let proof: provekit_groth16::Proof =
+                    CanonicalDeserialize::deserialize_compressed(&groth16_proof[..])
+                        .context("while deserializing Groth16 proof")?;
+
+                tracing::debug!(
+                    commitments_len = proof.commitments.len(),
+                    public_witness_len = public_inputs.0.len(),
+                    "deserialized Groth16 proof"
+                );
+
+                let public_witness: Vec<ark_bn254::Fr> = public_inputs.0.clone();
+
+                provekit_groth16::verifier::verify(&proof, &vk, &public_witness)
+                    .context("Groth16 verification failed")?;
+
+                Ok(())
+            }
+        }
     }
 }
diff --git a/scripts/__pycache__/csp_benchmark_helpers.cpython-311.pyc b/scripts/__pycache__/csp_benchmark_helpers.cpython-311.pyc
new file mode 100644
index 000000000..ae54e4085
Binary files /dev/null and b/scripts/__pycache__/csp_benchmark_helpers.cpython-311.pyc differ
diff --git a/scripts/csp_benchmark_helpers.py b/scripts/csp_benchmark_helpers.py
index 3bd141a0f..33471b580 100755
--- a/scripts/csp_benchmark_helpers.py
+++ b/scripts/csp_benchmark_helpers.py
@@ -2,15 +2,17 @@
 """Helpers for scripts/run_csp_benchmarks.sh.
 
 Subcommands:
-  parse-runs <bench_dir> <circuit>  Aggregate per-run measurements for one
-                                    circuit and emit a single CSV row to stdout.
+  parse-runs <bench_dir> <circuit> <backend>
+                                    Aggregate per-run measurements for one
+                                    (circuit, backend) pair and emit a single
+                                    CSV row to stdout.
   human-to-bytes <value>            Convert a human-formatted byte string from
                                     the prover trace ("1.23 GB", "456 MB", etc.)
                                     to an integer byte count. Used by tests.
 
 Bench layout produced by run_csp_benchmarks.sh::
 
-    <bench_dir>/per_circuit/<circuit>/
+    <bench_dir>/per_circuit/<circuit>/<backend>/
         prove_<i>.time          # `/usr/bin/time -f '%e %M'` output
         prove_<i>.stderr        # provekit-cli prove stderr (span_stats trace)
         verify_<i>.time
@@ -50,6 +52,16 @@
 SCHEME_SIZE_RE = re.compile(
     r"Read Noir proof scheme\b.*?\bconstraints=(\d+)\b.*?\bwitnesses=(\d+)\b"
 )
+# Matches a span-close line like
+#   "╯ run: 76.5 ms duration, 1.35 MB peak memory, ..."
+# emitted by `tooling/cli/src/span_stats.rs`; group 1 is the value, group 2
+# the unit. The outermost span is named "run" (`#[instrument]` on
+# `Command::run`); take the LAST match, because spans close in LIFO order
+# and the outer one closes last. Sub-ms precision here beats `gtime -f '%e'`,
+# which rounds to 10 ms and collapses small Groth16 verify times to "0.00".
+RUN_DURATION_RE = re.compile(
+    r"\brun:\s*([0-9]+(?:\.[0-9]+)?)\s*(ns|[μu]s|ms|s)\s+duration\b"
+)
 
 
 def human_to_bytes(value: str) -> int:
@@ -99,6 +111,35 @@ def parse_scheme_sizes(stderr_path: Path) -> tuple[int, int]:
     return int(match.group(1)), int(match.group(2))
 
 
+def parse_run_duration_ms(stderr_path: Path) -> float:
+    """Return the outermost `run` span's duration in milliseconds.
+
+    Returns 0.0 if the trace file is missing or contains no `run:` span-close
+    line. Used in preference to `gtime -f '%e'` for verify timing — gtime
+    rounds to 10 ms which collapses sub-10ms verifiers to zero.
+    """
+    if not stderr_path.is_file():
+        return 0.0
+    text = ANSI_RE.sub("", stderr_path.read_text(encoding="utf-8", errors="replace"))
+    matches = list(RUN_DURATION_RE.finditer(text))
+    if not matches:
+        return 0.0
+    last = matches[-1]
+    value = float(last.group(1))
+    unit = last.group(2)
+    # Convert to milliseconds. tracing's human formatter emits one of
+    # {ns, μs/us, ms, s} for sub-second / multi-second ranges.
+    if unit == "ns":
+        return value / 1_000_000.0
+    if unit in ("μs", "us"):
+        return value / 1_000.0
+    if unit == "ms":
+        return value
+    if unit == "s":
+        return value * 1_000.0
+    return 0.0
+
+
 def parse_time_file(time_path: Path) -> tuple[float, int]:
     """Read `/usr/bin/time -f '%e %M'` output: (wall_seconds, max_rss_kb).
 
@@ -129,8 +170,8 @@ def read_meta(meta_path: Path) -> dict[str, str]:
     return out
 
 
-def parse_runs(bench_dir: Path, circuit: str) -> str:
-    circuit_dir = bench_dir / "per_circuit" / circuit
+def parse_runs(bench_dir: Path, circuit: str, backend: str) -> str:
+    circuit_dir = bench_dir / "per_circuit" / circuit / backend
     meta = read_meta(circuit_dir / "meta.txt")
 
     prove_runs: list[tuple[float, int, int]] = []
@@ -156,7 +197,12 @@ def parse_runs(bench_dir: Path, circuit: str) -> str:
         if not time_path.is_file():
             break
         wall, _rss = parse_time_file(time_path)
-        verify_runs.append((wall, _rss))
+        # Prefer the trace-derived duration (sub-ms precision) over gtime's
+        # 10ms-resolution wall measurement. Fall back to the time-file value
+        # only if the trace is missing or the run span isn't present.
+        trace_ms = parse_run_duration_ms(circuit_dir / f"verify_{j}.stderr")
+        duration_ms = trace_ms if trace_ms > 0 else wall * 1000.0
+        verify_runs.append((duration_ms, _rss))
         j += 1
 
     if not prove_runs:
@@ -165,7 +211,7 @@ def parse_runs(bench_dir: Path, circuit: str) -> str:
     prove_time_ms = mean(r[0] for r in prove_runs) * 1000.0
     prover_rss_kb = mean(r[1] for r in prove_runs)
     prover_heap_bytes = mean(r[2] for r in prove_runs)
-    verifier_time_ms = mean(r[0] for r in verify_runs) * 1000.0 if verify_runs else 0.0
+    verifier_time_ms = mean(r[0] for r in verify_runs) if verify_runs else 0.0
 
     pkp_size = meta.get("pkp_size_bytes", "0")
     proof_size = meta.get("proof_size_bytes", "0")
@@ -173,6 +219,7 @@ def parse_runs(bench_dir: Path, circuit: str) -> str:
     return ",".join(
         [
             circuit,
+            backend,
             str(num_constraints),
             str(num_witnesses),
             f"{prove_time_ms:.1f}",
@@ -193,6 +240,7 @@ def main() -> int:
     p = sub.add_parser("parse-runs")
     p.add_argument("bench_dir", type=Path)
     p.add_argument("circuit")
+    p.add_argument("backend")
 
     p = sub.add_parser("human-to-bytes")
     p.add_argument("value")
@@ -200,7 +248,7 @@ def main() -> int:
     args = parser.parse_args()
 
     if args.cmd == "parse-runs":
-        row = parse_runs(args.bench_dir, args.circuit)
+        row = parse_runs(args.bench_dir, args.circuit, args.backend)
         if row:
             print(row)
     elif args.cmd == "human-to-bytes":
diff --git a/scripts/run_csp_benchmarks.sh b/scripts/run_csp_benchmarks.sh
index 912b49019..43ebeae73 100755
--- a/scripts/run_csp_benchmarks.sh
+++ b/scripts/run_csp_benchmarks.sh
@@ -1,23 +1,29 @@
 #!/usr/bin/env bash
 # run_csp_benchmarks.sh
 #
-# Run prove/verify benchmarks for noir-examples/csp-benchmarks/*. Each circuit
-# is compiled and prepared once, then prove + verify are each invoked
-# BENCH_RUNS times so the helper can average wall time, peak RSS, and
-# heap-peak bytes (parsed from the prover's tracing output).
+# Run prove/verify benchmarks for noir-examples/csp-benchmarks/* across the
+# selected backends. Each circuit is compiled once; then for each backend
+# `prepare` runs once and `prove` + `verify` are each invoked BENCH_RUNS
+# times so the helper can average wall time, peak RSS, and heap-peak bytes
+# (parsed from the prover's tracing output).
 #
 # Environment variables (all optional):
-#   PROVEKIT_BIN     Path to provekit-cli (default: target/release/provekit-cli)
-#   BENCH_ROOT       Path to csp-benchmarks (default: noir-examples/csp-benchmarks)
-#   BENCH_DIR        Output directory (default: csp-bench-logs)
-#   BENCH_RUNS       Iterations to average (default: 3)
-#   TEST_FILTER      Regex on circuit name
-#   MAX_TESTS        Cap on circuits (0 = unlimited)
+#   PROVEKIT_BIN         Path to provekit-cli (default: target/release/provekit-cli)
+#   BENCH_ROOT           Path to csp-benchmarks (default: noir-examples/csp-benchmarks)
+#   BENCH_DIR            Output directory (default: csp-bench-logs)
+#   BENCH_RUNS           Iterations to average (default: 3)
+#   BENCH_BACKENDS       Space-separated list of backends to benchmark
+#                        (default: "whir groth16")
+#   BENCH_SKIP_GROTH16   Regex of circuits to skip on the groth16 backend.
+#                        Useful when a circuit's trusted-setup PK exceeds the
+#                        runner's memory budget. Default empty (skip nothing).
+#   TEST_FILTER          Regex on circuit name
+#   MAX_TESTS            Cap on circuits (0 = unlimited)
 #
-# Output: BENCH_DIR/results.csv with one row per circuit:
-#   circuit,num_constraints,num_witnesses,prover_time_ms,prover_peak_rss_kb,
-#     prover_heap_peak_bytes,verifier_time_ms,proof_size_bytes,pkp_size_bytes,
-#     runs
+# Output: BENCH_DIR/results.csv with one row per (circuit, backend):
+#   circuit,backend,num_constraints,num_witnesses,prover_time_ms,
+#     prover_peak_rss_kb,prover_heap_peak_bytes,verifier_time_ms,
+#     proof_size_bytes,pkp_size_bytes,runs
 
 set -euo pipefail
 
@@ -29,6 +35,8 @@ PROVEKIT_BIN="${PROVEKIT_BIN:-${REPO_ROOT}/target/release/provekit-cli}"
 BENCH_ROOT="${BENCH_ROOT:-${REPO_ROOT}/noir-examples/csp-benchmarks}"
 BENCH_DIR="${BENCH_DIR:-${REPO_ROOT}/csp-bench-logs}"
 BENCH_RUNS="${BENCH_RUNS:-3}"
+BENCH_BACKENDS="${BENCH_BACKENDS:-whir groth16}"
+BENCH_SKIP_GROTH16="${BENCH_SKIP_GROTH16:-}"
 TEST_FILTER="${TEST_FILTER:-}"
 MAX_TESTS="${MAX_TESTS:-0}"
 
@@ -58,22 +66,26 @@ if ! python3 -c "import tomllib" 2>/dev/null; then
   exit 1
 fi
 
-# `/usr/bin/time` is the GNU-style binary; macOS ships a different `time` shell
-# builtin so users may need `gtime` from `brew install gnu-time`. CI runs on
-# ubuntu-24.04-arm where /usr/bin/time is GNU.
+# Need GNU time (the `-f '%e %M'` format flag is GNU-specific). On Linux
+# `/usr/bin/time` is GNU; on macOS it's BSD which doesn't accept `-f`, and
+# GNU time is provided by `brew install gnu-time` as `gtime`. We probe each
+# candidate to confirm it actually accepts `-f` before picking it.
 TIME_BIN=""
-if [[ -x /usr/bin/time ]]; then
-  TIME_BIN=/usr/bin/time
-elif command -v gtime >/dev/null 2>&1; then
-  TIME_BIN="$(command -v gtime)"
-else
-  echo "ERROR: GNU /usr/bin/time not found (try: brew install gnu-time)" >&2
+for candidate in gtime /usr/bin/time; do
+  if cand_path="$(command -v "${candidate}" 2>/dev/null)" \
+     && "${cand_path}" -f '%e' true >/dev/null 2>&1; then
+    TIME_BIN="${cand_path}"
+    break
+  fi
+done
+if [[ -z "${TIME_BIN}" ]]; then
+  echo "ERROR: GNU time not found. On macOS: brew install gnu-time" >&2
   exit 1
 fi
 
 mkdir -p "${BENCH_DIR}/per_circuit"
 RESULTS_CSV="${BENCH_DIR}/results.csv"
-echo "circuit,num_constraints,num_witnesses,prover_time_ms,prover_peak_rss_kb,prover_heap_peak_bytes,verifier_time_ms,proof_size_bytes,pkp_size_bytes,runs" > "${RESULTS_CSV}"
+echo "circuit,backend,num_constraints,num_witnesses,prover_time_ms,prover_peak_rss_kb,prover_heap_peak_bytes,verifier_time_ms,proof_size_bytes,pkp_size_bytes,runs" > "${RESULTS_CSV}"
 
 shopt -s nullglob
 
@@ -109,35 +121,40 @@ except Exception:
 PY
 }
 
-attempted=0
-succeeded=0
-failed=0
+circuits_attempted=0
+rows_attempted=0
+rows_succeeded=0
+rows_failed=0
 
 for circuit in "${circuits[@]}"; do
   if [[ -n "${TEST_FILTER}" && ! "${circuit}" =~ ${TEST_FILTER} ]]; then
     continue
   fi
-  (( attempted += 1 ))
-  if [[ "${MAX_TESTS}" -gt 0 && "${attempted}" -gt "${MAX_TESTS}" ]]; then
+  (( circuits_attempted += 1 ))
+  if [[ "${MAX_TESTS}" -gt 0 && "${circuits_attempted}" -gt "${MAX_TESTS}" ]]; then
     break
   fi
 
   workdir="${BENCH_ROOT}/${circuit}"
-  out_dir="${BENCH_DIR}/per_circuit/${circuit}"
-  mkdir -p "${out_dir}"
 
   echo ""
-  echo "==> [${attempted}/${#circuits[@]}] ${circuit}"
+  echo "==> [${circuits_attempted}/${#circuits[@]}] ${circuit}"
 
   pkg_name="$(read_package_name "${workdir}")"
   if [[ -z "${pkg_name}" ]]; then
     pkg_name="${circuit}"
   fi
 
-  # 1) compile
-  if ! (cd "${workdir}" && nargo compile > "${out_dir}/compile.log" 2>&1); then
+  # 1) compile (once per circuit; shared across backends)
+  compile_log_dir="${BENCH_DIR}/per_circuit/${circuit}"
+  mkdir -p "${compile_log_dir}"
+  if ! (cd "${workdir}" && nargo compile > "${compile_log_dir}/compile.log" 2>&1); then
     echo "FAIL: nargo compile (${circuit})"
-    (( failed += 1 ))
+    # Compile failure means every backend row for this circuit is impossible.
+    for backend in ${BENCH_BACKENDS}; do
+      (( rows_attempted += 1 ))
+      (( rows_failed += 1 ))
+    done
     continue
   fi
 
@@ -149,91 +166,112 @@ for circuit in "${circuits[@]}"; do
       circuit_json="${candidate[0]}"
     else
       echo "FAIL: no compiled JSON in ${workdir}/target/"
-      (( failed += 1 ))
+      for backend in ${BENCH_BACKENDS}; do
+        (( rows_attempted += 1 ))
+        (( rows_failed += 1 ))
+      done
       continue
     fi
   fi
 
-  pkp_path="${out_dir}/prover.pkp"
-  pkv_path="${out_dir}/verifier.pkv"
-  proof_path="${out_dir}/proof.np"
+  for backend in ${BENCH_BACKENDS}; do
+    (( rows_attempted += 1 ))
 
-  # 2) prepare
-  if ! (cd "${workdir}" && "${PROVEKIT_BIN}" prepare "${circuit_json}" \
-        --pkp "${pkp_path}" --pkv "${pkv_path}") > "${out_dir}/prepare.log" 2>&1; then
-    echo "FAIL: provekit-cli prepare (${circuit})"
-    (( failed += 1 ))
-    continue
-  fi
+    if [[ "${backend}" == "groth16" && -n "${BENCH_SKIP_GROTH16}" \
+          && "${circuit}" =~ ${BENCH_SKIP_GROTH16} ]]; then
+      echo "SKIP: ${circuit} on ${backend} (matched BENCH_SKIP_GROTH16)"
+      continue
+    fi
 
-  pkp_size_bytes="$(stat -c '%s' "${pkp_path}" 2>/dev/null || stat -f '%z' "${pkp_path}")"
-
-  # 3) prove × BENCH_RUNS — write each run's stderr separately so the helper
-  #    can parse the tracing output's "peak memory" lines.
-  prove_ok=1
-  for ((i=1; i<=BENCH_RUNS; i++)); do
-    if ! (cd "${workdir}" && "${TIME_BIN}" -f '%e %M' \
-            -o "${out_dir}/prove_${i}.time" \
-            "${PROVEKIT_BIN}" prove \
-              --prover "${pkp_path}" \
-              --input "${workdir}/Prover.toml" \
-              -o "${proof_path}") 2> "${out_dir}/prove_${i}.stderr"; then
-      echo "FAIL: provekit-cli prove run ${i} (${circuit})"
-      prove_ok=0
-      break
+    out_dir="${BENCH_DIR}/per_circuit/${circuit}/${backend}"
+    mkdir -p "${out_dir}"
+
+    pkp_path="${out_dir}/prover.pkp"
+    pkv_path="${out_dir}/verifier.pkv"
+    proof_path="${out_dir}/proof.np"
+
+    echo "  -- backend: ${backend}"
+
+    # 2) prepare (with backend selection)
+    if ! (cd "${workdir}" && "${PROVEKIT_BIN}" prepare "${circuit_json}" \
+            --backend "${backend}" \
+            --pkp "${pkp_path}" --pkv "${pkv_path}") > "${out_dir}/prepare.log" 2>&1; then
+      echo "FAIL: provekit-cli prepare ${backend} (${circuit})"
+      (( rows_failed += 1 ))
+      continue
     fi
-  done
-  if [[ "${prove_ok}" -ne 1 ]]; then
-    (( failed += 1 ))
-    continue
-  fi
 
-  proof_size_bytes="$(stat -c '%s' "${proof_path}" 2>/dev/null || stat -f '%z' "${proof_path}")"
-
-  # 4) verify × BENCH_RUNS
-  verify_ok=1
-  for ((i=1; i<=BENCH_RUNS; i++)); do
-    if ! (cd "${workdir}" && "${TIME_BIN}" -f '%e %M' \
-            -o "${out_dir}/verify_${i}.time" \
-            "${PROVEKIT_BIN}" verify \
-              --verifier "${pkv_path}" \
-              --proof "${proof_path}") \
-            2> "${out_dir}/verify_${i}.stderr"; then
-      echo "FAIL: provekit-cli verify run ${i} (${circuit})"
-      verify_ok=0
-      break
+    pkp_size_bytes="$(stat -c '%s' "${pkp_path}" 2>/dev/null || stat -f '%z' "${pkp_path}")"
+
+    # 3) prove × BENCH_RUNS — write each run's stderr separately so the helper
+    #    can parse the tracing output's "peak memory" lines.
+    prove_ok=1
+    for ((i=1; i<=BENCH_RUNS; i++)); do
+      if ! (cd "${workdir}" && "${TIME_BIN}" -f '%e %M' \
+              -o "${out_dir}/prove_${i}.time" \
+              "${PROVEKIT_BIN}" prove \
+                --prover "${pkp_path}" \
+                --input "${workdir}/Prover.toml" \
+                -o "${proof_path}") 2> "${out_dir}/prove_${i}.stderr"; then
+        echo "FAIL: provekit-cli prove ${backend} run ${i} (${circuit})"
+        prove_ok=0
+        break
+      fi
+    done
+    if [[ "${prove_ok}" -ne 1 ]]; then
+      (( rows_failed += 1 ))
+      continue
     fi
-  done
-  if [[ "${verify_ok}" -ne 1 ]]; then
-    (( failed += 1 ))
-    continue
-  fi
 
-  cat > "${out_dir}/meta.txt" <<EOF
+    proof_size_bytes="$(stat -c '%s' "${proof_path}" 2>/dev/null || stat -f '%z' "${proof_path}")"
+
+    # 4) verify × BENCH_RUNS
+    verify_ok=1
+    for ((i=1; i<=BENCH_RUNS; i++)); do
+      if ! (cd "${workdir}" && "${TIME_BIN}" -f '%e %M' \
+              -o "${out_dir}/verify_${i}.time" \
+              "${PROVEKIT_BIN}" verify \
+                --verifier "${pkv_path}" \
+                --proof "${proof_path}") \
+              2> "${out_dir}/verify_${i}.stderr"; then
+        echo "FAIL: provekit-cli verify ${backend} run ${i} (${circuit})"
+        verify_ok=0
+        break
+      fi
+    done
+    if [[ "${verify_ok}" -ne 1 ]]; then
+      (( rows_failed += 1 ))
+      continue
+    fi
+
+    cat > "${out_dir}/meta.txt" <<EOF
 pkp_size_bytes=${pkp_size_bytes}
 proof_size_bytes=${proof_size_bytes}
 EOF
 
-  row="$(python3 "${HELPER}" parse-runs "${BENCH_DIR}" "${circuit}")"
-  if [[ -n "${row}" ]]; then
-    echo "${row}" >> "${RESULTS_CSV}"
-    echo "OK: ${row}"
-    (( succeeded += 1 ))
-  else
-    echo "FAIL: helper produced no row for ${circuit}"
-    (( failed += 1 ))
-  fi
+    row="$(python3 "${HELPER}" parse-runs "${BENCH_DIR}" "${circuit}" "${backend}")"
+    if [[ -n "${row}" ]]; then
+      echo "${row}" >> "${RESULTS_CSV}"
+      echo "OK: ${row}"
+      (( rows_succeeded += 1 ))
+    else
+      echo "FAIL: helper produced no row for ${circuit}/${backend}"
+      (( rows_failed += 1 ))
+    fi
+  done
 done
 
 echo ""
 echo "----- csp-benchmarks summary -----"
-echo "Discovered : ${#circuits[@]}"
-echo "Attempted  : ${attempted}"
-echo "Succeeded  : ${succeeded}"
-echo "Failed     : ${failed}"
-echo "Results    : ${RESULTS_CSV}"
-
-if [[ "${failed}" -gt 0 ]]; then
+echo "Discovered      : ${#circuits[@]}"
+echo "Circuits tried  : ${circuits_attempted}"
+echo "Backends        : ${BENCH_BACKENDS}"
+echo "Rows attempted  : ${rows_attempted}"
+echo "Rows succeeded  : ${rows_succeeded}"
+echo "Rows failed     : ${rows_failed}"
+echo "Results         : ${RESULTS_CSV}"
+
+if [[ "${rows_failed}" -gt 0 ]]; then
   exit 1
 fi
 exit 0
diff --git a/tooling/cli/Cargo.toml b/tooling/cli/Cargo.toml
index 3e9f91f67..53eec9328 100644
--- a/tooling/cli/Cargo.toml
+++ b/tooling/cli/Cargo.toml
@@ -12,20 +12,20 @@ repository.workspace = true
 # Workspace crates
 provekit-common.workspace = true
 provekit-gnark.workspace = true
+provekit-groth16.workspace = true
 provekit-prover = { workspace = true, features = ["witness-generation", "parallel"] }
 provekit-r1cs-compiler.workspace = true
 provekit-verifier.workspace = true
 
 # Noir language
 acir.workspace = true
-nargo.workspace = true
 nargo_toml.workspace = true
-noir_artifact_cli.workspace = true
 noirc_abi.workspace = true
 noirc_driver.workspace = true
 
 # Cryptography and proof systems
 ark-ff.workspace = true
+ark-serialize.workspace = true
 
 # 3rd party
 anyhow.workspace = true
diff --git a/tooling/cli/src/cmd/analyze_pkp.rs b/tooling/cli/src/cmd/analyze_pkp.rs
index 1729ba73d..b538a21fb 100644
--- a/tooling/cli/src/cmd/analyze_pkp.rs
+++ b/tooling/cli/src/cmd/analyze_pkp.rs
@@ -2,7 +2,7 @@ use {
     super::Command,
     anyhow::{Context, Result},
     argh::FromArgs,
-    provekit_common::{file::read, Prover},
+    provekit_prover::{read_pkp, Prover},
     std::path::PathBuf,
     tracing::instrument,
 };
@@ -19,10 +19,16 @@ pub struct Args {
 impl Command for Args {
     #[instrument(skip_all)]
     fn run(&self) -> Result<()> {
-        let prover: Prover = read(&self.pkp_path).context("while reading PKP file")?;
+        let prover: Prover = read_pkp(&self.pkp_path).context("while reading PKP file")?;
 
-        let Prover::Noir(p) = prover else {
-            anyhow::bail!("analyze-pkp is not currently supported for Mavros compiler");
+        let p = match prover {
+            Prover::Noir(p) => p,
+            Prover::Mavros(_) => {
+                anyhow::bail!("analyze-pkp is not currently supported for the Mavros compiler");
+            }
+            Prover::Groth16(_) => {
+                anyhow::bail!("analyze-pkp is not currently supported for the Groth16 backend");
+            }
         };
 
         let program_size = postcard::to_allocvec(&p.program)
diff --git a/tooling/cli/src/cmd/generate_gnark_inputs.rs b/tooling/cli/src/cmd/generate_gnark_inputs.rs
index d07614fe5..b18e09f5e 100644
--- a/tooling/cli/src/cmd/generate_gnark_inputs.rs
+++ b/tooling/cli/src/cmd/generate_gnark_inputs.rs
@@ -1,6 +1,6 @@
 use {
     crate::Command,
-    anyhow::{Context, Result},
+    anyhow::{bail, Context, Result},
     argh::FromArgs,
     provekit_common::{file::read, NoirProof, Verifier},
     provekit_gnark::write_gnark_parameters_to_file,
@@ -47,6 +47,20 @@ impl Command for Args {
         // Read the proof
         let proof: NoirProof = read(&self.proof_path).context("while reading proof")?;
 
+        // The gnark recursive verifier only consumes WHIR proofs; refuse a
+        // Groth16 proof up front so we don't hit the panic in
+        // `NoirProof::whir_r1cs_proof()`.
+        let (public_inputs, whir_r1cs_proof) = match &proof {
+            NoirProof::Whir {
+                public_inputs,
+                whir_r1cs_proof,
+            } => (public_inputs, whir_r1cs_proof),
+            NoirProof::Groth16 { .. } => bail!(
+                "generate-gnark-inputs requires a WHIR proof; got a Groth16 proof which the gnark \
+                 recursive verifier does not consume"
+            ),
+        };
+
         let wfw = verifier
             .whir_for_witness
             .as_ref()
@@ -54,13 +68,13 @@ impl Command for Args {
 
         write_gnark_parameters_to_file(
             &wfw.whir_witness.blinded_commitment,
-            &proof.whir_r1cs_proof,
+            whir_r1cs_proof,
             wfw.m_0,
             wfw.m,
             wfw.a_num_terms,
             wfw.num_challenges,
             wfw.w1_size,
-            &proof.public_inputs,
+            public_inputs,
             &self.params_for_recursive_verifier,
         );
 
diff --git a/tooling/cli/src/cmd/prepare.rs b/tooling/cli/src/cmd/prepare.rs
index bcfd34d91..6e089e639 100644
--- a/tooling/cli/src/cmd/prepare.rs
+++ b/tooling/cli/src/cmd/prepare.rs
@@ -1,23 +1,12 @@
 use {
     super::{util::resolve_key_path, Command},
-    anyhow::{anyhow, bail, Context as _, Result},
+    anyhow::{Context as _, Result},
     argh::FromArgs,
-    nargo::{
-        insert_all_files_for_workspace_into_file_manager,
-        ops::{check_program, collect_errors, compile_program, optimize_program, report_errors},
-        parse_all,
-    },
-    nargo_toml::{find_root, get_package_manifest, resolve_workspace_from_toml, PackageSelection},
-    noir_artifact_cli::fs::artifact::save_program_to_file,
-    noirc_driver::{CompilationResult, CompileOptions, CrateName, NOIR_ARTIFACT_VERSION_STRING},
-    provekit_common::{file::write, HashConfig, Prover, Verifier},
+    provekit_common::{file::write, HashConfig, Verifier},
+    provekit_prover::{write_pkp, write_pkp_mmap, Groth16CommitmentInfo, Groth16Prover, Prover},
     provekit_r1cs_compiler::{MavrosCompiler, NoirCompiler},
-    rayon::prelude::*,
-    std::{
-        path::{Path, PathBuf},
-        str::FromStr,
-    },
-    tracing::instrument,
+    std::{path::PathBuf, str::FromStr},
+    tracing::{info, instrument},
 };
 
 #[derive(PartialEq, Eq, Debug)]
@@ -38,6 +27,24 @@ impl argh::FromArgValue for Compiler {
     }
 }
 
+#[derive(PartialEq, Eq, Debug)]
+enum Backend {
+    Whir,    // parsed from "whir"; default for `--backend`
+    Groth16, // parsed from "groth16"
+}
+
+impl argh::FromArgValue for Backend {
+    /// Parses the `--backend` value. Matching is exact: only the literal
+    /// names `whir` and `groth16` are accepted; anything else is an error.
+    fn from_arg_value(value: &str) -> std::result::Result<Self, String> {
+        match value {
+            "whir" => Ok(Self::Whir),
+            "groth16" => Ok(Self::Groth16),
+            other => Err(format!("Unknown backend: {other}. Use \"whir\" or \"groth16\".")),
+        }
+    }
+}
+
 /// Compile a Noir program and build its prover and verifier keys.
 #[derive(FromArgs, PartialEq, Eq, Debug)]
 #[argh(subcommand, name = "prepare")]
@@ -55,49 +62,17 @@ pub struct Args {
     #[argh(option, long = "compiler", default = "Compiler::Noir")]
     compiler: Compiler,
 
-    /// name of the package to compile (noir only; default: enclosing package)
-    #[argh(option)]
-    package: Option<String>,
-
-    /// compile every package in the workspace (noir only)
-    #[argh(switch)]
-    workspace: bool,
-
-    /// override the target directory for compiled artifacts (noir only)
-    #[argh(option)]
-    target_dir: Option<PathBuf>,
-
-    /// treat warnings as errors (noir only)
-    #[argh(switch)]
-    deny_warnings: bool,
-
-    /// suppress warnings (noir only)
-    #[argh(switch)]
-    silence_warnings: bool,
-
-    /// print the ACIR for the compiled circuit (noir only)
-    #[argh(switch)]
-    print_acir: bool,
-
-    /// skip the under-constrained-values check (noir only)
-    #[argh(switch)]
-    skip_underconstrained_check: bool,
-
-    /// skip the Brillig call-constraints check (noir only)
-    #[argh(switch)]
-    skip_brillig_constraints_check: bool,
-
-    /// force a full recompilation, ignoring cached artifacts (noir only)
-    #[argh(switch)]
-    force: bool,
+    /// proof backend to use: "whir" (default) or "groth16"
+    #[argh(option, long = "backend", default = "Backend::Whir")]
+    backend: Backend,
 
     /// output path for the ProveKit Prover (PKP) key (default:
-    /// `<circuit>.pkp`)
+    /// `<circuit>.pkp` for Noir, `noir_proof_scheme.pkp` for Mavros)
     #[argh(option, long = "pkp", short = 'p')]
     pkp_path: Option<PathBuf>,
 
     /// output path for the ProveKit Verifier (PKV) key (default:
-    /// `<circuit>.pkv`)
+    /// `<circuit>.pkv` for Noir, `noir_proof_scheme.pkv` for Mavros)
     #[argh(option, long = "pkv", short = 'v')]
     pkv_path: Option<PathBuf>,
 
@@ -105,158 +80,199 @@ pub struct Args {
     /// blake3, poseidon2)
     #[argh(option, long = "hash", default = "String::from(\"skyscraper\")")]
     hash: String,
+
+    /// use the mmap-friendly .pkp layout (Groth16 only). The file uses the
+    /// same .pkp extension as the legacy zstd format; readers auto-detect.
+    /// Larger artifact, near-instant load (rapidsnark-style).
+    #[argh(switch, long = "mmap")]
+    mmap: bool,
 }
 
 impl Command for Args {
     #[instrument(skip_all)]
     fn run(&self) -> Result<()> {
-        let hash_config = HashConfig::from_str(&self.hash).map_err(|e| anyhow!("{}", e))?;
-        match self.compiler {
-            Compiler::Noir => self.run_noir(hash_config),
-            Compiler::Mavros => self.run_mavros(hash_config),
+        // Reject unsupported flag combinations up front so the user gets the
+        // error before the compile step below runs, not after it.
+        if self.mmap && self.backend != Backend::Groth16 {
+            anyhow::bail!("--mmap is only supported with --backend groth16");
+        }
+        if self.backend == Backend::Groth16 && matches!(self.compiler, Compiler::Mavros) {
+            anyhow::bail!("Groth16 backend is not supported with the Mavros compiler");
         }
-    }
-}
+
+        let hash_config = HashConfig::from_str(&self.hash).map_err(|e| anyhow::anyhow!("{}", e))?;
+        let scheme = match self.compiler {
+            Compiler::Noir => NoirCompiler::from_file(&self.program_path, hash_config)
+                .context("while compiling Noir program")?,
+            Compiler::Mavros => {
+                let r1cs_path = self
+                    .r1cs_path
+                    .as_ref()
+                    .context("--r1cs is required when using the mavros compiler")?;
+                MavrosCompiler::compile(&self.program_path, r1cs_path, hash_config)
+                    .context("while compiling with Mavros")?
+            }
+        };
+
+        // Default key paths must match what `prove` and `verify` look up by
+        // default. For Noir that's `<package>.pkp` / `<package>.pkv` derived
+        // from Nargo.toml; Mavros has no manifest, so fall back to the legacy
+        // `noir_proof_scheme.*` names.
+        let resolve_path = |opt: Option<&PathBuf>, ext: &str| -> Result<PathBuf> {
+            match (opt, &self.compiler) {
+                (Some(p), _) => Ok(p.clone()),
+                (None, Compiler::Noir) => resolve_key_path(None, ext),
+                (None, Compiler::Mavros) => Ok(PathBuf::from(format!("noir_proof_scheme.{ext}"))),
+            }
+        };
+        let pkp_path = resolve_path(self.pkp_path.as_ref(), "pkp")?;
+        let pkv_path = resolve_path(self.pkv_path.as_ref(), "pkv")?;
 
-impl Args {
-    fn run_noir(&self, hash_config: HashConfig) -> Result<()> {
-        // Canonicalize so compiled artifacts embed absolute source paths,
-        // matching `nargo compile` byte-for-byte in the `file_map` field.
-        let program_dir = std::fs::canonicalize(&self.program_path)
-            .with_context(|| format!("canonicalizing {}", self.program_path.display()))?;
-        let workspace_dir = find_root(&program_dir, true)?;
-        let package_dir = find_root(&program_dir, false)?;
+        match self.backend {
+            Backend::Whir => {
+                let prover = Prover::from_noir_proof_scheme(scheme.clone());
+                let verifier = Verifier::from_noir_proof_scheme(scheme);
 
-        let selection = self.package_selection(&workspace_dir, &package_dir)?;
-        let mut workspace = resolve_workspace_from_toml(
-            &get_package_manifest(&workspace_dir)?,
-            selection,
-            Some(NOIR_ARTIFACT_VERSION_STRING.to_owned()),
-        )?;
-        workspace.target_dir = self.target_dir.clone();
+                write_pkp(&prover, &pkp_path).context("while writing Provekit Prover")?;
+                write(&verifier, &pkv_path).context("while writing Provekit Verifier")?;
+            }
+            Backend::Groth16 => {
+                use {ark_serialize::CanonicalSerialize, provekit_common::NoirProofScheme};
 
-        let options = self.compile_options();
-        let mut file_manager = workspace.new_file_manager();
-        insert_all_files_for_workspace_into_file_manager(&workspace, &mut file_manager);
-        let parsed_files = parse_all(&file_manager);
+                // Extract R1CS and witness builders from the compiled scheme
+                let NoirProofScheme::Noir(d) = scheme else {
+                    anyhow::bail!("Groth16 backend is not supported with the Mavros compiler");
+                };
 
-        let binary_packages: Vec<_> = workspace
-            .into_iter()
-            .filter(|p| p.is_binary())
-            .cloned()
-            .collect();
+                let abi = d.witness_generator.abi.clone();
+                let mut r1cs = d.r1cs;
+                let program = d.program;
+                let split_witness_builders = d.split_witness_builders;
+                let witness_generator = d.witness_generator;
+                let w1_size = d.whir_for_witness.w1_size;
+                let challenge_offsets = d.whir_for_witness.challenge_offsets.clone();
 
-        if binary_packages.is_empty() {
-            bail!("no binary packages found in workspace");
-        }
-        if binary_packages.len() > 1 && (self.pkp_path.is_some() || self.pkv_path.is_some()) {
-            bail!("--pkp/--pkv cannot be used with multiple binary packages");
-        }
+                // The Noir compiler doesn't set num_public_inputs on the R1CS
+                // (WHIR handles public inputs separately). For Groth16, we need
+                // it to classify wires as public vs private. Compute from ABI.
+                {
+                    use noirc_abi::AbiVisibility;
+                    let mut n_public: usize = abi
+                        .parameters
+                        .iter()
+                        .filter(|p| p.is_public())
+                        .map(|p| p.typ.field_count() as usize)
+                        .sum();
+                    if let Some(ret) = &abi.return_type {
+                        if matches!(ret.visibility, AbiVisibility::Public) {
+                            n_public += ret.abi_type.field_count() as usize;
+                        }
+                    }
+                    r1cs.num_public_inputs = n_public;
+                }
+                let num_public = 1 + r1cs.num_public_inputs;
 
-        let target_dir = workspace.target_directory_path();
+                // Build BSB22 commitment info: WHIR-style, one Pedersen commitment
+                // over all private w1 wires, producing N challenges via hash_to_fr_multi.
+                let num_challenges = challenge_offsets.len();
+                let private_w1_wires: Vec<usize> = (num_public..w1_size).collect();
+                let public_committed: Vec<usize> = (1..num_public).collect();
 
-        let program_results: Vec<CompilationResult<_>> = binary_packages
-            .par_iter()
-            .map(|package| {
-                let (program, warnings) = compile_program(
-                    &file_manager,
-                    &parsed_files,
-                    &workspace,
-                    package,
-                    &options,
-                    None,
-                )?;
-                let program = optimize_program(program);
-                check_program(&program)?;
-                let artifact = program.into();
-                save_program_to_file(&artifact, &package.name, &target_dir)
-                    .expect("saving program artifact");
-                Ok((artifact, warnings))
-            })
-            .collect();
+                let (commitment_info, groth16_ci, num_challenges_per_commitment) =
+                    if num_challenges > 0 && !private_w1_wires.is_empty() {
+                        // Single commitment: any internal ordering of
+                        // `challenge_indices` is fine as long as the prover
+                        // (which iterates `ci.challenge_indices`) and the
+                        // setup (which iterates `challenge_wire_indices`)
+                        // agree. We sort by wire index for determinism.
+                        let mut sorted_challenge_indices: Vec<usize> = challenge_offsets
+                            .iter()
+                            .map(|&offset| w1_size + offset)
+                            .collect();
+                        sorted_challenge_indices.sort_unstable();
 
-        let artifacts = report_errors(
-            collect_errors(program_results),
-            &file_manager,
-            options.deny_warnings,
-            options.silence_warnings,
-        )?;
+                        let ci = Groth16CommitmentInfo {
+                            public_committed:  public_committed.clone(),
+                            private_committed: private_w1_wires.clone(),
+                            challenge_indices: sorted_challenge_indices.clone(),
+                        };
+                        let g16_ci = vec![provekit_groth16::CommitmentInfo {
+                            public_and_commitment_committed: public_committed,
+                            private_committed:               private_w1_wires.clone(),
+                            challenge_indices:               sorted_challenge_indices,
+                            nb_public_committed:             r1cs.num_public_inputs,
+                        }];
+                        let ncpc = vec![num_challenges];
+                        (vec![ci], g16_ci, ncpc)
+                    } else {
+                        (vec![], vec![], vec![])
+                    };
 
-        for (package, artifact) in binary_packages.iter().zip(artifacts) {
-            let scheme = NoirCompiler::from_program(artifact, hash_config)
-                .context("while building Noir proof scheme")?;
-            let pkp_path = self
-                .pkp_path
-                .clone()
-                .unwrap_or_else(|| format!("{}.pkp", package.name).into());
-            let pkv_path = self
-                .pkv_path
-                .clone()
-                .unwrap_or_else(|| format!("{}.pkv", package.name).into());
-            write(&Prover::from_noir_proof_scheme(scheme.clone()), &pkp_path)
-                .context("while writing prover key")?;
-            write(&Verifier::from_noir_proof_scheme(scheme), &pkv_path)
-                .context("while writing verifier key")?;
-        }
-        Ok(())
-    }
+                info!(
+                    num_challenges,
+                    num_private_committed = private_w1_wires.len(),
+                    num_public_inputs = r1cs.num_public_inputs,
+                    w1_size,
+                    "Running Groth16 trusted setup..."
+                );
+                let (pk, vk) = provekit_groth16::setup::setup(
+                    &r1cs,
+                    &groth16_ci,
+                    &num_challenges_per_commitment,
+                )
+                .context("while running Groth16 trusted setup")?;
 
-    fn run_mavros(&self, hash_config: HashConfig) -> Result<()> {
-        let r1cs_path = self
-            .r1cs_path
-            .as_ref()
-            .context("--r1cs is required when using the mavros compiler")?;
-        let scheme = MavrosCompiler::compile(&self.program_path, r1cs_path, hash_config)
-            .context("while compiling with Mavros")?;
-        let pkp_path = resolve_key_path(self.pkp_path.as_deref(), "pkp")?;
-        let pkv_path = resolve_key_path(self.pkv_path.as_deref(), "pkv")?;
-        write(&Prover::from_noir_proof_scheme(scheme.clone()), &pkp_path)
-            .context("while writing prover key")?;
-        write(&Verifier::from_noir_proof_scheme(scheme), &pkv_path)
-            .context("while writing verifier key")?;
-        Ok(())
-    }
+                // The PK is held in typed form (`provekit_groth16::ProvingKey`)
+                // and round-trips through arkworks bytes via the custom Serde
+                // adapter when the .pkp is written. Only the VK still
+                // serializes to bytes here, since `Verifier` keeps it as
+                // `Vec<u8>` for cross-language interop.
+                let mut vk_bytes = Vec::new();
+                vk.serialize_uncompressed(&mut vk_bytes)
+                    .context("while serializing Groth16 verifying key")?;
 
-    fn compile_options(&self) -> CompileOptions {
-        CompileOptions {
-            deny_warnings: self.deny_warnings,
-            silence_warnings: self.silence_warnings,
-            print_acir: self.print_acir,
-            skip_underconstrained_check: self.skip_underconstrained_check,
-            skip_brillig_constraints_check: self.skip_brillig_constraints_check,
-            force_compile: self.force,
-            ..CompileOptions::default()
-        }
-    }
+                info!(
+                    vk_size = vk_bytes.len(),
+                    vk_g1_k_len = vk.g1_k.len(),
+                    vk_commitment_keys_len = vk.commitment_keys.len(),
+                    vk_public_and_commitment_committed_len =
+                        vk.public_and_commitment_committed.len(),
+                    "Groth16 setup complete"
+                );
 
-    fn package_selection(
-        &self,
-        workspace_dir: &Path,
-        package_dir: &Path,
-    ) -> Result<PackageSelection> {
-        if self.workspace {
-            return Ok(PackageSelection::All);
-        }
-        if let Some(name) = &self.package {
-            let crate_name: CrateName = name
-                .parse()
-                .map_err(|e| anyhow!("invalid package name `{name}`: {e}"))?;
-            return Ok(PackageSelection::Selected(crate_name));
-        }
-        // When CWD is inside a sub-package of a multi-package workspace, narrow
-        // to that package rather than compiling the whole workspace.
-        if workspace_dir != package_dir {
-            let inner = resolve_workspace_from_toml(
-                &get_package_manifest(package_dir)?,
-                PackageSelection::DefaultOrAll,
-                Some(NOIR_ARTIFACT_VERSION_STRING.to_owned()),
-            )?;
-            let package = inner
-                .into_iter()
-                .next()
-                .expect("a package manifest resolves to exactly one member");
-            return Ok(PackageSelection::Selected(package.name.clone()));
+                // Build + write the Verifier first; this owns the only live
+                // copy of `r1cs`. Then move that `r1cs` out of the Verifier
+                // into the Prover via partial move — no clone. The previous
+                // version cloned `r1cs` for the Prover and kept both structs
+                // resident simultaneously, doubling peak prepare-time RAM and
+                // OOMing CI hosts on SHA-style (hundreds-of-MB-R1CS) circuits.
+                let verifier = Verifier {
+                    hash_config,
+                    r1cs,
+                    whir_for_witness: None,
+                    abi,
+                    groth16_vk: Some(vk_bytes),
+                };
+                write(&verifier, &pkv_path).context("while writing Provekit Verifier")?;
+
+                let prover = Prover::Groth16(Groth16Prover {
+                    program,
+                    r1cs: verifier.r1cs,
+                    split_witness_builders,
+                    witness_generator,
+                    groth16_pk: pk.into(),
+                    commitment_info,
+                });
+
+                if self.mmap {
+                    write_pkp_mmap(&prover, &pkp_path)
+                        .context("while writing mmap-format Provekit Prover")?;
+                } else {
+                    write_pkp(&prover, &pkp_path).context("while writing Provekit Prover")?;
+                }
+            }
         }
-        Ok(PackageSelection::DefaultOrAll)
+
+        Ok(())
     }
 }
diff --git a/tooling/cli/src/cmd/prove.rs b/tooling/cli/src/cmd/prove.rs
index 7030da6a2..49af4f4e9 100644
--- a/tooling/cli/src/cmd/prove.rs
+++ b/tooling/cli/src/cmd/prove.rs
@@ -2,11 +2,8 @@ use {
     super::{util::resolve_key_path, Command},
     anyhow::{Context, Result},
     argh::FromArgs,
-    provekit_common::{
-        file::{read, write},
-        Prover,
-    },
-    provekit_prover::Prove,
+    provekit_common::file::{read, write},
+    provekit_prover::{read_pkp, Prove, Prover},
     std::path::PathBuf,
     tracing::{info, instrument},
 };
@@ -49,7 +46,7 @@ impl Command for Args {
             .clone()
             .unwrap_or_else(|| PathBuf::from("./Prover.toml"));
 
-        let prover: Prover = read(&prover_path).context("while reading Provekit Prover")?;
+        let prover: Prover = read_pkp(&prover_path).context("while reading Provekit Prover")?;
         let (constraints, witnesses) = prover.size();
         info!(constraints, witnesses, "Read Noir proof scheme");
 
diff --git a/tooling/cli/src/cmd/show_inputs.rs b/tooling/cli/src/cmd/show_inputs.rs
index 4f238b9cc..d02771d5a 100644
--- a/tooling/cli/src/cmd/show_inputs.rs
+++ b/tooling/cli/src/cmd/show_inputs.rs
@@ -35,7 +35,7 @@ impl Command for Args {
         let proof: NoirProof = read(&self.proof_path).context("while reading proof")?;
 
         let abi = &verifier.abi;
-        let values = &proof.public_inputs.0;
+        let values = &proof.public_inputs().0;
 
         println!("Public Inputs:");
         println!("==============");
diff --git a/tooling/provekit-bench/benches/bench.rs b/tooling/provekit-bench/benches/bench.rs
index 058057dfa..dbec1c76a 100644
--- a/tooling/provekit-bench/benches/bench.rs
+++ b/tooling/provekit-bench/benches/bench.rs
@@ -3,8 +3,8 @@ use {
     anyhow::Context,
     core::hint::black_box,
     divan::Bencher,
-    provekit_common::{file::read, NoirProof, Prover, Verifier},
-    provekit_prover::Prove,
+    provekit_common::{file::read, NoirProof, Verifier},
+    provekit_prover::{read_pkp, Prove, Prover},
     provekit_verifier::Verify,
     std::path::Path,
 };
@@ -13,7 +13,7 @@ use {
 fn read_poseidon_1000(bencher: Bencher) {
     let crate_dir: &Path = "../../noir-examples/poseidon-rounds".as_ref();
     let proof_prover_path = crate_dir.join("noir-provekit-prover.pkp");
-    bencher.bench(|| read::<Prover>(&proof_prover_path));
+    bencher.bench(|| read_pkp(&proof_prover_path));
 }
 
 #[divan::bench]
@@ -21,7 +21,7 @@ fn prove_poseidon_1000(bencher: Bencher) {
     let crate_dir: &Path = "../../noir-examples/poseidon-rounds".as_ref();
     let proof_prover_path = crate_dir.join("noir-provekit-prover.pkp");
 
-    let prover: Prover = read(&proof_prover_path)
+    let prover: Prover = read_pkp(&proof_prover_path)
         .with_context(|| format!("Reading {}", proof_prover_path.display()))
         .expect("Reading prover");
 
@@ -42,7 +42,7 @@ fn prove_poseidon_1000_with_io(bencher: Bencher) {
     let witness_path = crate_dir.join("Prover.toml");
 
     bencher.bench(|| {
-        let prover: Prover = read(&proof_prover_path)
+        let prover: Prover = read_pkp(&proof_prover_path)
             .with_context(|| {
                 format!(
                     "Failed to read scheme from path: {} (working dir: {:?})",
diff --git a/tooling/provekit-bench/tests/compiler.rs b/tooling/provekit-bench/tests/compiler.rs
index d4481875b..d5513e107 100644
--- a/tooling/provekit-bench/tests/compiler.rs
+++ b/tooling/provekit-bench/tests/compiler.rs
@@ -4,8 +4,8 @@ use {
     nargo_cli::cli::compile_cmd::compile_workspace_full,
     nargo_toml::{resolve_workspace_from_toml, PackageSelection},
     noirc_driver::CompileOptions,
-    provekit_common::{HashConfig, Prover, Verifier},
-    provekit_prover::Prove,
+    provekit_common::{HashConfig, Verifier},
+    provekit_prover::{Prove, Prover},
     provekit_r1cs_compiler::NoirCompiler,
     provekit_verifier::Verify,
     serde::Deserialize,
@@ -283,7 +283,7 @@ fn test_public_input_binding_exploit() {
 
     // Tamper: the committed polynomial encodes result=16 at position 1, but we
     // claim result=42. The verifier should reject this.
-    proof.public_inputs = PublicInputs::from_vec(vec![FieldElement::from(42u64)]);
+    *proof.public_inputs_mut() = PublicInputs::from_vec(vec![FieldElement::from(42u64)]);
 
     let result = verifier.verify(&proof);
     assert!(
diff --git a/tooling/provekit-ffi/src/ffi.rs b/tooling/provekit-ffi/src/ffi.rs
index e156f5d09..8bcbef554 100644
--- a/tooling/provekit-ffi/src/ffi.rs
+++ b/tooling/provekit-ffi/src/ffi.rs
@@ -10,8 +10,8 @@ use {
         utils::c_str_to_str,
     },
     noirc_abi::input_parser::Format,
-    provekit_common::{file, HashConfig, NoirProof, Prover, Verifier},
-    provekit_prover::Prove,
+    provekit_common::{file, HashConfig, NoirProof, Verifier},
+    provekit_prover::{deserialize_pkp, read_pkp, serialize_pkp, write_pkp, Prove, Prover},
     provekit_r1cs_compiler::NoirCompiler,
     provekit_verifier::Verify,
     std::{
@@ -240,7 +240,7 @@ pub unsafe extern "C" fn pk_load_prover(path: *const c_char, out: *mut *mut PKPr
 
         let result = (|| -> Result<*mut PKProver, PKStatus> {
             let path = c_str_to_str(path)?;
-            let prover: Prover = file::read(Path::new(&path)).map_err(|e| {
+            let prover: Prover = read_pkp(Path::new(&path)).map_err(|e| {
                 set_last_error(format!("{e:#}"));
                 PKStatus::SchemeReadError
             })?;
@@ -322,7 +322,7 @@ pub unsafe extern "C" fn pk_load_prover_bytes(
             // SAFETY: ptr/len validity is guaranteed by the caller (documented in #
             // Safety).
             let data = std::slice::from_raw_parts(ptr, len);
-            let prover: Prover = file::deserialize(data).map_err(|e| {
+            let prover: Prover = deserialize_pkp(data).map_err(|e| {
                 set_last_error(format!("{e:#}"));
                 PKStatus::SchemeReadError
             })?;
@@ -401,7 +401,7 @@ pub unsafe extern "C" fn pk_save_prover(prover: *const PKProver, path: *const c_
         let result = (|| -> Result<(), PKStatus> {
             let path = c_str_to_str(path)?;
             // SAFETY: prover is guaranteed non-null and valid by caller contract.
-            file::write(&(*prover).prover, Path::new(&path)).map_err(|e| {
+            write_pkp(&(*prover).prover, Path::new(&path)).map_err(|e| {
                 set_last_error(format!("{e:#}"));
                 PKStatus::FileWriteError
             })
@@ -472,7 +472,7 @@ pub unsafe extern "C" fn pk_serialize_prover(
         *out_buf = PKBuf::empty();
 
         // SAFETY: prover is guaranteed non-null and valid by caller contract.
-        match file::serialize(&(*prover).prover) {
+        match serialize_pkp(&(*prover).prover) {
             Ok(bytes) => {
                 *out_buf = PKBuf::from_vec(bytes);
                 PKStatus::Success.into()
diff --git a/tooling/provekit-ffi/src/types.rs b/tooling/provekit-ffi/src/types.rs
index 90d471a6d..145a7a278 100644
--- a/tooling/provekit-ffi/src/types.rs
+++ b/tooling/provekit-ffi/src/types.rs
@@ -1,7 +1,8 @@
 //! Type definitions for ProveKit FFI bindings.
 
 use {
-    provekit_common::{Prover, Verifier},
+    provekit_common::Verifier,
+    provekit_prover::Prover,
     std::{os::raw::c_int, ptr},
 };
 
diff --git a/tooling/provekit-wasm/src/format.rs b/tooling/provekit-wasm/src/format.rs
index b96b36595..e66e4f224 100644
--- a/tooling/provekit-wasm/src/format.rs
+++ b/tooling/provekit-wasm/src/format.rs
@@ -4,8 +4,9 @@ use {
             HEADER_SIZE, MAGIC_BYTES, PROVER_FORMAT, PROVER_VERSION, VERIFIER_FORMAT,
             VERIFIER_VERSION, XZ_MAGIC, ZSTD_MAGIC,
         },
-        Prover, Verifier,
+        Verifier,
     },
+    provekit_prover::Prover,
     wasm_bindgen::prelude::*,
 };
 
diff --git a/tooling/provekit-wasm/src/prover.rs b/tooling/provekit-wasm/src/prover.rs
index e14d70a52..f03fa43a0 100644
--- a/tooling/provekit-wasm/src/prover.rs
+++ b/tooling/provekit-wasm/src/prover.rs
@@ -9,9 +9,9 @@ use {
     base64::{engine::general_purpose::STANDARD as BASE64, Engine as _},
     provekit_common::{
         binary_format::{HEADER_SIZE, MAGIC_BYTES},
-        NoirElement, NoirProof, Prover as ProverCore,
+        NoirElement, NoirProof,
     },
-    provekit_prover::Prove,
+    provekit_prover::{Prove, Prover as ProverCore},
     std::{cell::RefCell, collections::BTreeMap},
     wasm_bindgen::prelude::*,
 };
@@ -70,7 +70,7 @@ impl Prover {
     pub fn get_circuit(&self) -> Result<Box<[u8]>, JsError> {
         let noir_prover = match self.inner_ref()? {
             ProverCore::Noir(p) => p,
-            ProverCore::Mavros(_) => {
+            ProverCore::Mavros(_) | ProverCore::Groth16(_) => {
                 return Err(JsError::new("Only Noir provers are supported in WASM"))
             }
         };
diff --git a/tooling/verifier-server/src/services/verification.rs b/tooling/verifier-server/src/services/verification.rs
index 312f64034..f77d66edb 100644
--- a/tooling/verifier-server/src/services/verification.rs
+++ b/tooling/verifier-server/src/services/verification.rs
@@ -82,6 +82,23 @@ impl VerificationService {
             .to_str()
             .ok_or_else(|| AppError::Internal("Invalid gnark params path".to_string()))?;
 
+        // The gnark recursive verifier path only handles WHIR proofs; reject a
+        // Groth16 proof here rather than letting `whir_r1cs_proof()` panic and
+        // kill the HTTP service.
+        let (public_inputs, whir_r1cs_proof) = match proof {
+            NoirProof::Whir {
+                public_inputs,
+                whir_r1cs_proof,
+            } => (public_inputs, whir_r1cs_proof),
+            NoirProof::Groth16 { .. } => {
+                return Err(AppError::Internal(
+                    "Groth16 proofs are not supported on the gnark recursion endpoint; submit a \
+                     WHIR proof or use the Groth16 verifier directly"
+                        .to_string(),
+                ));
+            }
+        };
+
         let whir_scheme = verifier
             .whir_for_witness
             .as_ref()
@@ -89,13 +106,13 @@ impl VerificationService {
 
         write_gnark_parameters_to_file(
             &whir_scheme.whir_witness.blinded_commitment,
-            &proof.whir_r1cs_proof,
+            whir_r1cs_proof,
             whir_scheme.m_0,
             whir_scheme.m,
             whir_scheme.a_num_terms,
             whir_scheme.num_challenges,
             whir_scheme.w1_size,
-            &proof.public_inputs,
+            public_inputs,
             gnark_params_path,
         );