diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49154e0b..3f9bf8e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -206,6 +206,7 @@ if(MSCCLPP_USE_CUDA)
     else()
         set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver)
     endif()
+    list(APPEND GPU_LIBRARIES CUDA::nvml)
 else()
     set(CMAKE_HIP_STANDARD 17)
     set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")
diff --git a/docs/guide/mscclpp-torch-integration.md b/docs/guide/mscclpp-torch-integration.md
index b4e4fcdf..a3a2f808 100644
--- a/docs/guide/mscclpp-torch-integration.md
+++ b/docs/guide/mscclpp-torch-integration.md
@@ -475,7 +475,7 @@ All examples are in [`examples/torch-integration/`](../../examples/torch-integra
 
 The default algorithms use a fixed heuristic to select algorithms based on message size. For production workloads, you can achieve significantly better performance by **auto-tuning** — benchmarking every candidate algorithm, block count, and thread count for each message size at startup, then using the fastest configuration at runtime.
 
-**Full example:** [customized_comm_with_tuning.py](../../examples/torch-integration/customized_comm_with_tuning.py)
+**Reference implementation:** MSCCL++ ships a ready-to-use autotuner in [`python/mscclpp_benchmark/bench_collective.py`](../../python/mscclpp_benchmark/bench_collective.py). It benchmarks every candidate algorithm, block count, and thread count per message size, writes the winning configuration to a JSON file, and can replay that file at runtime. The sections below explain the underlying mechanism; see that benchmark for the complete, maintained implementation.
 
 ### How It Works
 
@@ -656,9 +656,20 @@ def benchmark(self, n_warmup=10, n_graph_launches=10, n_iter_per_graph=100):
                 self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
 ```
 
-### Running the Tuning Example
+### Running the Autotuner
+
+MSCCL++'s built-in autotuner benchmarks every candidate configuration and saves the best one to JSON. Run it across the ranks of your job, then reuse the generated config:
 
 ```bash
-MSCCLPP_MASTER_ADDR=<ip> MSCCLPP_MASTER_PORT=<port> \
-  torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_tuning.py
+# Autotune and save the tuned config
+mpirun -np 8 --allow-run-as-root \
+  python3 -m mscclpp_benchmark.bench_collective \
+  --collective allreduce --dtype float16 --autotune \
+  --write-config /tmp/mscclpp_tuned_configs.json
+
+# Replay the tuned config in a benchmark
+mpirun -np 8 --allow-run-as-root \
+  python3 -m mscclpp_benchmark.bench_collective \
+  --collective allreduce --dtype float16 \
+  --config-path /tmp/mscclpp_tuned_configs.json
 ```
diff --git a/examples/customized-collective-algorithm/customized_allgather.cu b/examples/customized-collective-algorithm/customized_allgather.cu
index 02df3685..13802f80 100644
--- a/examples/customized-collective-algorithm/customized_allgather.cu
+++ b/examples/customized-collective-algorithm/customized_allgather.cu
@@ -79,7 +79,7 @@ __global__ void __launch_bounds__(1024)
 
 struct Context {
   int rank;
-  int workSize;
+  int worldSize;
   int nRanksPerNode;
 
   std::vector<mscclpp::RegisteredMemory> registeredMemories;
@@ -140,7 +140,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
                                           size_t inputSize, cudaStream_t stream) {
     auto algoCtx = std::static_pointer_cast<Context>(ctx);
     int rank = algoCtx->rank;
-    int worldSize = algoCtx->workSize;
+    int worldSize = algoCtx->worldSize;
 
     int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE;
     allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputSize);
@@ -154,16 +154,16 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
                                              void* output, size_t inputSize, mscclpp::DataType dtype) {
     auto ctx = std::make_shared<Context>();
     ctx->rank = comm->bootstrap()->getRank();
-    ctx->workSize = comm->bootstrap()->getNranks();
+    ctx->worldSize = comm->bootstrap()->getNranks();
     ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
 
     // register memories
     mscclpp::RegisteredMemory inputBufRegMem =
         comm->registerMemory((void*)input, inputSize, mscclpp::Transport::CudaIpc);
     mscclpp::RegisteredMemory outputBufRegMem =
-        comm->registerMemory(output, inputSize * ctx->workSize, mscclpp::Transport::CudaIpc);
+        comm->registerMemory(output, inputSize * ctx->worldSize, mscclpp::Transport::CudaIpc);
     std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteRegMemories;
-    for (int i = 0; i < ctx->workSize; i++) {
+    for (int i = 0; i < ctx->worldSize; i++) {
       if (i == ctx->rank) continue;
       comm->sendMemory(outputBufRegMem, i, 0);
       remoteRegMemories.push_back(comm->recvMemory(i, 0));
diff --git a/examples/torch-integration/customized_allgather.cu b/examples/torch-integration/customized_allgather.cu
index 907b3ada..5ba2935f 100644
--- a/examples/torch-integration/customized_allgather.cu
+++ b/examples/torch-integration/customized_allgather.cu
@@ -47,7 +47,7 @@ __global__ void __launch_bounds__(1024)
 
 struct Context {
   int rank;
-  int workSize;
+  int worldSize;
   int nRanksPerNode;
 
   std::vector<mscclpp::RegisteredMemory> registeredMemories;
@@ -108,7 +108,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
                                           cudaStream_t stream) {
     auto algoCtx = std::static_pointer_cast<Context>(ctx);
     int rank = algoCtx->rank;
-    int worldSize = algoCtx->workSize;
+    int worldSize = algoCtx->worldSize;
 
     int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE;
     allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputBytes);
@@ -122,16 +122,16 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
                                              void* output, size_t inputBytes, mscclpp::DataType dtype) {
     auto ctx = std::make_shared<Context>();
     ctx->rank = comm->bootstrap()->getRank();
-    ctx->workSize = comm->bootstrap()->getNranks();
+    ctx->worldSize = comm->bootstrap()->getNranks();
     ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
 
     // register memories
     mscclpp::RegisteredMemory inputBufRegMem =
         comm->registerMemory((void*)input, inputBytes, mscclpp::Transport::CudaIpc);
     mscclpp::RegisteredMemory outputBufRegMem =
-        comm->registerMemory(output, inputBytes * ctx->workSize, mscclpp::Transport::CudaIpc);
+        comm->registerMemory(output, inputBytes * ctx->worldSize, mscclpp::Transport::CudaIpc);
     std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteRegMemories;
-    for (int i = 0; i < ctx->workSize; i++) {
+    for (int i = 0; i < ctx->worldSize; i++) {
       if (i == ctx->rank) continue;
       comm->sendMemory(outputBufRegMem, i, 0);
       remoteRegMemories.push_back(comm->recvMemory(i, 0));
diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
deleted file mode 100644
index b96087c2..00000000
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ /dev/null
@@ -1,476 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py
-
-import os
-import ipaddress
-
-import netifaces as ni
-import torch
-import mscclpp
-import mscclpp.ext
-import mscclpp.utils as mscclpp_utils
-
-# -- Helpers ------------------------------------------------------------------
-
-
-def _make_tensor(size_bytes: int, dtype: torch.dtype) -> torch.Tensor:
-    """Allocate a tensor backed by RawGpuBuffer (symmetric memory)."""
-    # PyTorch's from_dlpack does not support certain float8 DLPack type codes.
-    # Work around by importing as uint8 and reinterpreting via .view().
-    _DLPACK_UNSUPPORTED = (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz)
-    if dtype in _DLPACK_UNSUPPORTED:
-        dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(torch.uint8))
-        return torch.utils.dlpack.from_dlpack(dlpack).view(dtype)
-    dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(dtype))
-    return torch.utils.dlpack.from_dlpack(dlpack)
-
-
-def _load_algorithms(scratch: torch.Tensor, rank: int):
-    return mscclpp.ext.AlgorithmCollectionBuilder().build_default_algorithms(
-        scratch_buffer=scratch.data_ptr(),
-        scratch_buffer_size=scratch.nbytes,
-        rank=rank,
-    )
-
-
-def _interfaces_for_ip(ip: str):
-    target = ipaddress.ip_address(ip)
-    for iface in ni.interfaces():
-        addrs = ni.ifaddresses(iface)
-        if ni.AF_INET in addrs:
-            for link in addrs[ni.AF_INET]:
-                if "addr" in link and ipaddress.ip_address(link["addr"]) == target:
-                    return iface
-    return None
-
-
-def _to_mscclpp_op(op) -> mscclpp.ReduceOp:
-    if op == torch.distributed.ReduceOp.SUM:
-        return mscclpp.ReduceOp.SUM
-    if op == torch.distributed.ReduceOp.MIN:
-        return mscclpp.ReduceOp.MIN
-    raise ValueError(f"unsupported op: {op}")
-
-
-def _round_pow2(size: int) -> int:
-    """Round up to next power-of-2, clamped to [1024, 256 MB]."""
-    size = max(size, 1024)
-    size = min(size, 256 << 20)
-    return 1 << (size - 1).bit_length()
-
-
-# -- CustomizedComm -----------------------------------------------------------
-
-
-class CustomizedComm:
-    """Exposes all_reduce, all_gather, barrier with lazy per-size tuning."""
-
-    _TUNE_N_WARMUP = 5
-    _TUNE_N_GRAPH_LAUNCHES = 10
-    _TUNE_N_OPS_PER_GRAPH = 100
-    _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 56, 64, 128]
-    _CANDIDATE_NTHREADS = [512, 768, 1024]
-    _NBLOCKS_LIMIT = {
-        "default_allreduce_nvls_packet": 16,
-        "default_allreduce_packet": 56,
-        "default_allreduce_allpair_packet": 64,
-        "default_allreduce_fullmesh": 64,
-        "default_allgather_fullmesh2": 32,
-    }
-
-    def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
-        self.comm = comm
-        self.rank = comm.my_rank
-        self.world_size = comm.nranks
-        self.symmetric_memory = symmetric_memory
-        self._nvls = mscclpp.is_nvls_supported()
-
-        self._scratch = _make_tensor(1 << 27, torch.float16)
-        self._barrier_tensor = _make_tensor(4096, torch.float32)
-
-        algos = _load_algorithms(self._scratch, self.rank)
-        self._algos = {(a.collective, a.name): a for a in algos}
-
-        # {collective: {rounded_size: (algo, nblocks, nthreads)}}
-        self._tune_cache: dict[str, dict[int, tuple]] = {"allreduce": {}, "allgather": {}}
-        self._tune_buf = None
-        self._time_buf = None
-
-    def _algo(self, collective: str, name: str):
-        return self._algos.get((collective, name))
-
-    def _default_ar_config(self):
-        """Fallback allreduce config for barrier / timing sync."""
-        pkt = self._algo("allreduce", "default_allreduce_nvls_packet")
-        if self._nvls and pkt:
-            return (pkt, 0, 0)
-        return (self._algo("allreduce", "default_allreduce_packet"), 0, 0)
-
-    # -- low-level execute --
-
-    def _exec_ar(self, tensor, algo, nb, nt, op=mscclpp.ReduceOp.SUM, stream=None, accum_dtype=None, sym=True):
-        s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
-        ret = algo.execute(
-            comm=self.comm.communicator,
-            input_buffer=tensor.data_ptr(),
-            output_buffer=tensor.data_ptr(),
-            input_size=tensor.nbytes,
-            output_size=tensor.nbytes,
-            dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
-            op=op,
-            stream=s,
-            nblocks=nb,
-            nthreads_per_block=nt,
-            symmetric_memory=sym,
-            accum_dtype=accum_dtype,
-        )
-        if ret != 0:
-            print(f"Rank {self.rank}: {algo.name} failed ({ret})")
-        return ret
-
-    def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None):
-        if sym is None:
-            sym = self.symmetric_memory
-        s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
-        ret = algo.execute(
-            comm=self.comm.communicator,
-            input_buffer=inp.data_ptr(),
-            output_buffer=out.data_ptr(),
-            input_size=inp.nbytes,
-            output_size=out.nbytes,
-            dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(inp.dtype),
-            op=mscclpp.ReduceOp.NOP,
-            stream=s,
-            nblocks=nb,
-            nthreads_per_block=nt,
-            symmetric_memory=sym,
-        )
-        if ret != 0:
-            print(f"Rank {self.rank}: AG {algo.name} failed ({ret})")
-        return ret
-
-    def _barrier_internal(self):
-        a, nb, nt = self._default_ar_config()
-        self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True)
-
-    # -- lazy tuning --
-
-    def _ensure_tune_bufs(self):
-        if self._tune_buf is None:
-            self._tune_buf = _make_tensor(1 << 27, torch.float16)
-            self._tune_buf.normal_()
-            self._time_buf = _make_tensor(4096, torch.float32)
-        return self._tune_buf
-
-    def _ar_candidates(self, size: int):
-        out = []
-        if size <= 4 << 20:
-            a = self._algo("allreduce", "default_allreduce_nvls_packet")
-            if self._nvls and a:
-                out.append(a)
-            a = self._algo("allreduce", "default_allreduce_packet")
-            if a:
-                out.append(a)
-            a = self._algo("allreduce", "default_allreduce_allpair_packet")
-            if a:
-                out.append(a)
-        if size >= 512 << 10:
-            a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
-            if self._nvls and self.symmetric_memory and a:
-                out.append(a)
-            a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
-            if a:
-                out.append(a)
-        if torch.version.hip is not None:
-            a = self._algo("allreduce", "default_allreduce_fullmesh")
-            if a:
-                out.append(a)
-        return out
-
-    def _ag_candidates(self):
-        a = self._algo("allgather", "default_allgather_fullmesh2")
-        return [a] if a else []
-
-    def _run_tune(self, collective, algo, buf, size, nb, nt):
-        """Single tune invocation for either collective."""
-        if collective == "allreduce":
-            return algo.execute(
-                comm=self.comm.communicator,
-                input_buffer=buf.data_ptr(),
-                output_buffer=buf.data_ptr(),
-                input_size=size,
-                output_size=size,
-                dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
-                op=mscclpp.ReduceOp.SUM,
-                stream=torch.cuda.current_stream().cuda_stream,
-                nblocks=nb,
-                nthreads_per_block=nt,
-                symmetric_memory=True,
-            )
-        else:
-            total = size * self.world_size
-            out_ptr = buf.data_ptr()
-            return algo.execute(
-                comm=self.comm.communicator,
-                input_buffer=out_ptr + self.rank * size,
-                output_buffer=out_ptr,
-                input_size=size,
-                output_size=total,
-                dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
-                op=mscclpp.ReduceOp.NOP,
-                stream=torch.cuda.current_stream().cuda_stream,
-                nblocks=nb,
-                nthreads_per_block=nt,
-                symmetric_memory=False,
-            )
-
-    def _tune_size(self, collective: str, target_size: int):
-        """Auto-tune one (collective, target_size) pair and cache result."""
-        buf = self._ensure_tune_bufs()
-        cands = self._ar_candidates(target_size) if collective == "allreduce" else self._ag_candidates()
-
-        best_time, best_cfg = float("inf"), None
-        used = set()
-        run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt)
-
-        for algo in cands:
-            nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128)
-            for nb in self._CANDIDATE_NBLOCKS:
-                if nb > nb_limit:
-                    continue
-                for nt in self._CANDIDATE_NTHREADS:
-                    # Feasibility — sync result across ranks so all agree
-                    ret = run(algo, nb, nt)
-                    torch.cuda.synchronize()
-                    self._time_buf[0] = float(ret)
-                    self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=True)
-                    if self._time_buf[0].item() != 0:
-                        continue
-                    used.add(algo)
-
-                    # Warmup
-                    for _ in range(self._TUNE_N_WARMUP):
-                        run(algo, nb, nt)
-
-                    # CUDA-graph timed benchmark
-                    cs = torch.cuda.Stream()
-                    cs.wait_stream(torch.cuda.current_stream())
-                    g = torch.cuda.CUDAGraph()
-                    with torch.cuda.graph(g, stream=cs):
-                        for _ in range(self._TUNE_N_OPS_PER_GRAPH):
-                            run(algo, nb, nt)
-
-                    start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
-                    start.record(cs)
-                    with torch.cuda.stream(cs):
-                        for _ in range(self._TUNE_N_GRAPH_LAUNCHES):
-                            g.replay()
-                    end.record(cs)
-                    end.synchronize()
-                    elapsed = start.elapsed_time(end)
-
-                    # Cross-rank timing sync
-                    self._time_buf.fill_(elapsed)
-                    torch.cuda.current_stream().wait_stream(cs)
-                    self._exec_ar(self._time_buf, *self._default_ar_config(), sym=True)
-                    avg = self._time_buf[self.rank].item() / self.world_size
-
-                    if avg < best_time:
-                        best_time, best_cfg = avg, (algo, nb, nt)
-
-        if best_cfg:
-            self._tune_cache[collective][target_size] = best_cfg
-            if self.rank == 0:
-                n = self._TUNE_N_GRAPH_LAUNCHES * self._TUNE_N_OPS_PER_GRAPH
-                print(
-                    f"[tune] {collective} size={target_size}: {best_cfg[0].name} "
-                    f"nb={best_cfg[1]} nt={best_cfg[2]} time={best_time / n * 1000:.2f}us",
-                    flush=True,
-                )
-        else:
-            fb = (
-                self._default_ar_config()
-                if collective == "allreduce"
-                else ((self._ag_candidates()[0], 32, 512) if self._ag_candidates() else None)
-            )
-            self._tune_cache[collective][target_size] = fb
-
-        torch.cuda.synchronize()
-        self._barrier_internal()
-        for a in used:
-            a.reset()
-
-    # -- public API --
-
-    def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, accum_dtype=None):
-        sz = _round_pow2(tensor.nbytes)
-        if sz not in self._tune_cache["allreduce"]:
-            self._tune_size("allreduce", sz)
-        a, nb, nt = self._tune_cache["allreduce"][sz]
-        self._exec_ar(
-            tensor, a, nb, nt, op=_to_mscclpp_op(op), stream=stream, accum_dtype=accum_dtype, sym=self.symmetric_memory
-        )
-
-    def all_gather(self, output_tensor, input_tensor, stream=None):
-        sz = _round_pow2(input_tensor.nbytes)
-        if sz not in self._tune_cache["allgather"]:
-            self._tune_size("allgather", sz)
-        a, nb, nt = self._tune_cache["allgather"][sz]
-        self._exec_ag(input_tensor, output_tensor, a, nb, nt, stream=stream, sym=self.symmetric_memory)
-
-    def barrier(self):
-        self._barrier_internal()
-
-    def destroy(self):
-        self._algos.clear()
-        self._tune_cache = {"allreduce": {}, "allgather": {}}
-        self._tune_buf = self._time_buf = self._barrier_tensor = self._scratch = self.comm = None
-
-
-# -- Benchmarks (standalone) --------------------------------------------------
-
-
-def _bench_sizes(low=5 * 1024, high=80 << 20):
-    sizes, c = [], low
-    while c <= high:
-        sizes.append(c)
-        c *= 2
-    return sizes
-
-
-def benchmark_allreduce(
-    comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=10, n_graph_launches=10, n_iter=100
-):
-    sizes = _bench_sizes()
-    if comm.rank == 0:
-        print(f"\n{'='*60}\nAllreduce Benchmark\n{'='*60}")
-        print(f"{'Nelements':<18} {'Size(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
-
-    cs = torch.cuda.Stream()
-    buf = _make_tensor(1 << 27, dtype)
-    buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0)
-
-    for size in sizes:
-        nelems = size // buf.element_size()
-        t = buf[: size // buf.element_size()]
-        comm.all_reduce(t, accum_dtype=accum_dtype)
-        torch.cuda.synchronize()
-
-        cs.wait_stream(torch.cuda.current_stream())
-        g = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(g, stream=cs):
-            for _ in range(n_iter):
-                comm.all_reduce(t, accum_dtype=accum_dtype)
-        with torch.cuda.stream(cs):
-            for _ in range(n_warmup):
-                g.replay()
-            comm.barrier()
-        cs.synchronize()
-
-        s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
-        s.record(cs)
-        with torch.cuda.stream(cs):
-            for _ in range(n_graph_launches):
-                g.replay()
-        e.record(cs)
-        e.synchronize()
-
-        ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
-        if comm.rank == 0:
-            print(f"{nelems:<18} {size:<18} {ms*1000:<18.2f} {size/(ms*1e-3)/1e9:<18.2f}")
-
-
-def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, n_graph_launches=10, n_iter=100):
-    sizes = _bench_sizes()
-    if comm.rank == 0:
-        print(f"\n{'='*60}\nAllgather Benchmark\n{'='*60}")
-        print(f"{'PerRank(B)':<18} {'Total(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
-
-    cs = torch.cuda.Stream()
-    buf = _make_tensor(1 << 27, dtype)
-    buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0)
-
-    for prs in sizes:
-        total = prs * comm.world_size
-        if total > buf.nbytes:
-            break
-        nt = total // buf.element_size()
-        npr = prs // buf.element_size()
-        out = buf[:nt]
-        inp = out[comm.rank * npr : (comm.rank + 1) * npr]
-
-        comm.all_gather(out, inp)
-        torch.cuda.synchronize()
-
-        cs.wait_stream(torch.cuda.current_stream())
-        g = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(g, stream=cs):
-            for _ in range(n_iter):
-                comm.all_gather(out, inp)
-        with torch.cuda.stream(cs):
-            for _ in range(n_warmup):
-                g.replay()
-            comm.barrier()
-        cs.synchronize()
-
-        s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
-        s.record(cs)
-        with torch.cuda.stream(cs):
-            for _ in range(n_graph_launches):
-                g.replay()
-        e.record(cs)
-        e.synchronize()
-
-        ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
-        if comm.rank == 0:
-            print(f"{prs:<18} {total:<18} {ms*1000:<18.2f} {total/(ms*1e-3)/1e9:<18.2f}")
-
-
-# -- Bootstrap & main ---------------------------------------------------------
-
-
-def init_dist() -> mscclpp.CommGroup:
-    addr = os.environ.get("MSCCLPP_MASTER_ADDR")
-    if addr:
-        rank, world = int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"])
-        port = os.environ["MSCCLPP_MASTER_PORT"]
-        iface = _interfaces_for_ip(addr)
-        if not iface:
-            raise ValueError(f"No interface for {addr}")
-        return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world)
-    import torch.distributed as dist
-
-    dist.init_process_group(backend="gloo")
-    return mscclpp.CommGroup(torch_group=dist.group.WORLD)
-
-
-def main():
-    local = int(os.environ["LOCAL_RANK"])
-    torch.cuda.set_device(local)
-
-    dtype_str = os.environ.get("DTYPE", "float16")
-    dtype = getattr(torch, dtype_str, torch.float16)
-    accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16}
-    accum_str = os.environ.get("ACCUM_DTYPE")
-    accum_dtype = accum_map.get(accum_str) if accum_str else None
-
-    comm_group = init_dist()
-    cc = CustomizedComm(comm_group)
-
-    print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...")
-    benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype)
-    cc.barrier()
-    torch.cuda.synchronize()
-
-    benchmark_allgather(cc, dtype=dtype)
-    cc.barrier()
-    torch.cuda.synchronize()
-
-    cc.destroy()
-    print(f"rank {local} completed successfully.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 45b56bcc..4c14f1ee 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -46,6 +46,10 @@ class Bootstrap {
   /// @return The total number of ranks per node.
   virtual int getNranksPerNode() const = 0;
 
+  /// Return the number of ranks in this rank's GPU IPC domain.
+  /// @return The number of ranks in the GPU IPC domain.
+  virtual int getNranksPerIpcDomain() const;
+
   /// Send arbitrary data to another process.
   ///
   /// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size,
@@ -144,6 +148,9 @@ class TcpBootstrap : public Bootstrap {
   /// Return the total number of ranks per node.
   int getNranksPerNode() const override;
 
+  /// Return the number of ranks in this rank's GPU IPC domain.
+  int getNranksPerIpcDomain() const override;
+
   /// Send arbitrary data to another process.
   ///
   /// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size,
diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp
index a6dd306b..7415119f 100644
--- a/include/mscclpp/env.hpp
+++ b/include/mscclpp/env.hpp
@@ -37,7 +37,7 @@ class Env {
   const std::string logLevel;
 
   /// Env name: `MSCCLPP_LOG_SUBSYS`. The log subsystem, a comma-separated list of subsystems to enable
-  /// logging for. Possible values are ENV, GPU, NET, CONN, EXEC, NCCL, ALL (default).
+  /// logging for. Possible values are ENV, GPU, NET, CONN, EXEC, NCCL, ALGO, ALL (default).
   /// If the first character is '^', it inverts the mask, i.e., enables all subsystems except those specified.
   /// For example, "^NET,CONN" enables all subsystems except NET and CONN.
   const std::string logSubsys;
diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp
index b8d096e2..b289bd4d 100644
--- a/include/mscclpp/gpu.hpp
+++ b/include/mscclpp/gpu.hpp
@@ -31,6 +31,7 @@ using CUmemorytype = hipMemoryType;
 constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled;
 constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed;
 constexpr auto cudaErrorInvalidDevice = hipErrorInvalidDevice;
+constexpr auto cudaErrorInvalidValue = hipErrorInvalidValue;
 constexpr auto cudaSuccess = hipSuccess;
 constexpr auto cudaErrorNotSupported = hipErrorNotSupported;
 constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp
index 82fa3ec0..926bd6ca 100644
--- a/include/mscclpp/gpu_utils.hpp
+++ b/include/mscclpp/gpu_utils.hpp
@@ -1,10 +1,11 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #ifndef MSCCLPP_GPU_UTILS_HPP_
 #define MSCCLPP_GPU_UTILS_HPP_
 
 #include <memory>
+#include <optional>
 #include <unordered_map>
 
 #include "env.hpp"
@@ -403,6 +404,96 @@ class GpuBuffer {
   std::shared_ptr<T> memory_;
 };
 
+namespace detail {
+
+class GpuBufferPoolStorage;
+
+}  // namespace detail
+
+/// A deterministic sub-allocation pool for GPU communication buffers.
+///
+/// The pool allocates one `GpuBuffer<char>` slab and returns reference-counted buffers for sub-ranges in that slab.
+/// If all ranks create pools with the same size and make the same allocation calls, returned buffers have identical
+/// offsets from the slab base pointer. Python bindings use this property to build symmetric memory buffers while
+/// relying on normal Python reference counting to return buffers to the local pool.
+///
+class GpuBufferPool {
+ public:
+  /// Represents one buffer returned by `GpuBufferPool`.
+  ///
+  /// This object owns a sub-range of a pool allocation. Destroying the last reference to this object returns the
+  /// sub-range to the pool. Destruction only updates local pool bookkeeping and does not perform any
+  /// cross-rank synchronization.
+  class Buffer {
+   public:
+    Buffer(const Buffer&) = delete;
+    Buffer& operator=(const Buffer&) = delete;
+    Buffer(Buffer&&) = delete;
+    Buffer& operator=(Buffer&&) = delete;
+
+    /// Destructor. Returns this buffer's sub-range to the owning pool when the last reference is destroyed.
+    ~Buffer();
+
+    /// Returns the number of bytes requested for this buffer.
+    /// @return Number of bytes in this buffer.
+    size_t bytes() const;
+
+    /// Returns this buffer's byte offset from the pool base pointer.
+    /// @return Byte offset from the pool base pointer.
+    size_t offset() const;
+
+    /// Returns the device pointer to this buffer.
+    /// @return Device pointer to this buffer.
+    char* data() const;
+
+    /// Returns the device id of the underlying pool allocation.
+    /// @return Device id of the underlying pool allocation.
+    int deviceId() const;
+
+   private:
+    friend class detail::GpuBufferPoolStorage;
+    Buffer(std::shared_ptr<detail::GpuBufferPoolStorage> storage, size_t offset, size_t bytes);
+
+    std::shared_ptr<detail::GpuBufferPoolStorage> storage_;
+    size_t offset_;
+    size_t bytes_;
+  };
+
+  /// Constructs a pool backed by a single `GpuBuffer<char>`.
+  /// @param bytes Number of bytes to reserve in the pool.
+  /// @param granularity Granularity used to size the underlying `GpuBuffer`.
+  explicit GpuBufferPool(size_t bytes, GpuBufferGranularity granularity = GpuBufferGranularity::MultiCastMinimum);
+
+  /// Allocates a sub-range from the pool.
+  /// @param bytes Number of bytes to allocate.
+  /// @param alignment Alignment in bytes for the returned offset.
+  /// @return A reference-counted pooled buffer.
+  std::shared_ptr<Buffer> allocate(size_t bytes, size_t alignment = 256);
+
+  /// Returns the number of bytes in the underlying pool allocation.
+  /// @return Number of bytes in the pool.
+  size_t bytes() const;
+
+  /// Returns the number of bytes that are available for new buffers.
+  /// @return Number of free bytes.
+  size_t freeBytes() const;
+
+  /// Returns the number of bytes currently held by active pooled buffers.
+  /// @return Number of active bytes.
+  size_t activeBytes() const;
+
+  /// Returns the device pointer to the pool base.
+  /// @return Device pointer to the pool base.
+  char* data();
+
+  /// Returns the device id of the underlying pool allocation.
+  /// @return Device id of the underlying pool allocation.
+  int deviceId() const;
+
+ private:
+  std::shared_ptr<detail::GpuBufferPoolStorage> storage_;
+};
+
 }  // namespace mscclpp
 
 #endif  // MSCCLPP_GPU_UTILS_HPP_
diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp
index b52b6572..fcdd7fdd 100644
--- a/include/mscclpp/switch_channel_device.hpp
+++ b/include/mscclpp/switch_channel_device.hpp
@@ -37,7 +37,10 @@ struct SwitchChannelDeviceHandle {
     SwitchChannelDeviceHandle::multimemStore(val, reinterpret_cast<T*>(mcPtr) + index);
   }
 
-  template <typename VectorType>
+  /// Vectorized multimem load+reduce. The optional `AccumT` template parameter selects the
+  /// accumulator: when `AccumT == __half` and `VectorType` is an FP8 vector type, the
+  /// `.acc::f16` variant of the instruction is used. For all other types `AccumT` is ignored.
+  template <typename VectorType, typename AccumT = void>
   MSCCLPP_DEVICE_INLINE static VectorType multimemLoadReduce(VectorType* ptr) {
     VectorType val;
     if constexpr (std::is_same_v<VectorType, i32x1>) {
@@ -80,32 +83,78 @@ struct SwitchChannelDeviceHandle {
           : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
           : "l"(ptr)
           : "memory");
-    } else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
+    }
+#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) && (__CUDA_ARCH__ >= 1000)
+    else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.e4m3x4 %0, [%1];"
+            : "=r"(val.words[0])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
+      }
     } else if constexpr (std::is_same_v<VectorType, f8_e4m3x8>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];"
-          : "=r"(val.words[0]), "=r"(val.words[1])
-          : "l"(ptr)
-          : "memory");
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v2.e4m3x4 {%0,%1}, [%2];"
+            : "=r"(val.words[0]), "=r"(val.words[1])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];"
+            : "=r"(val.words[0]), "=r"(val.words[1])
+            : "l"(ptr)
+            : "memory");
+      }
     } else if constexpr (std::is_same_v<VectorType, f8_e4m3x16>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];"
-          : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
-          : "l"(ptr)
-          : "memory");
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v4.e4m3x4 {%0,%1,%2,%3}, [%4];"
+            : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];"
+            : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
+            : "l"(ptr)
+            : "memory");
+      }
     } else if constexpr (std::is_same_v<VectorType, f8_e5m2x4>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.e5m2x4 %0, [%1];"
+            : "=r"(val.words[0])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
+      }
     } else if constexpr (std::is_same_v<VectorType, f8_e5m2x8>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];"
-          : "=r"(val.words[0]), "=r"(val.words[1])
-          : "l"(ptr)
-          : "memory");
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v2.e5m2x4 {%0,%1}, [%2];"
+            : "=r"(val.words[0]), "=r"(val.words[1])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];"
+            : "=r"(val.words[0]), "=r"(val.words[1])
+            : "l"(ptr)
+            : "memory");
+      }
     } else if constexpr (std::is_same_v<VectorType, f8_e5m2x16>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];"
-          : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
-          : "l"(ptr)
-          : "memory");
-    } else {
-      static_assert(dependentFalse<VectorType>, "Not supported type");
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v4.e5m2x4 {%0,%1,%2,%3}, [%4];"
+            : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];"
+            : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
+            : "l"(ptr)
+            : "memory");
+      }
+    }
+#endif
+    else {
+      static_assert(dependentFalse<VectorType>, "Unsupported vector type for multimemLoadReduce");
     }
     return val;
   };
@@ -148,7 +197,9 @@ struct SwitchChannelDeviceHandle {
       asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]),
                    "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3])
                    : "memory");
-    } else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
+    }
+#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) && (__CUDA_ARCH__ >= 1000)
+    else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
       asm volatile("multimem.st.relaxed.sys.global.e4m3x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory");
     } else if constexpr (std::is_same_v<VectorType, f8_e4m3x8>) {
       asm volatile("multimem.st.relaxed.sys.global.v2.e4m3x4  [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]),
@@ -168,8 +219,10 @@ struct SwitchChannelDeviceHandle {
       asm volatile("multimem.st.relaxed.sys.global.v4.e5m2x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]),
                    "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3])
                    : "memory");
-    } else {
-      static_assert(dependentFalse<VectorType>, "Not supported type");
+    }
+#endif
+    else {
+      static_assert(dependentFalse<VectorType>, "Unsupported vector type for multimemStore");
     }
   };
 
@@ -194,7 +247,7 @@ struct SwitchChannelDeviceHandle {
     } else if constexpr (std::is_same_v<TValue, uint1> && std::is_same_v<T, __half2>) {
       asm volatile("multimem.red.relaxed.sys.global.add.f16x2 [%0], {%1};" ::"l"(ptr), "r"(val.x) : "memory");
     } else {
-      static_assert(dependentFalse<T>, "Not supported type");
+      static_assert(dependentFalse<TValue>, "Unsupported vector type for multimemStoreReduce");
     }
   };
 #endif  // defined(MSCCLPP_DEVICE_CUDA)
diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp
index a94f9863..7e9af6c1 100644
--- a/python/csrc/core_py.cpp
+++ b/python/csrc/core_py.cpp
@@ -56,6 +56,7 @@ void register_core(nb::module_& m) {
       .def("get_rank", &Bootstrap::getRank)
       .def("get_n_ranks", &Bootstrap::getNranks)
       .def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode)
+      .def("get_n_ranks_per_ipc_domain", &Bootstrap::getNranksPerIpcDomain)
       .def(
           "send",
           [](Bootstrap* self, uintptr_t ptr, size_t size, int peer, int tag) {
diff --git a/python/csrc/gpu_utils_py.cpp b/python/csrc/gpu_utils_py.cpp
index d6527502..ba7ca9e9 100644
--- a/python/csrc/gpu_utils_py.cpp
+++ b/python/csrc/gpu_utils_py.cpp
@@ -1,12 +1,14 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <dlpack/dlpack.h>
 #include <nanobind/nanobind.h>
+#include <nanobind/stl/optional.h>
 #include <nanobind/stl/shared_ptr.h>
 #include <nanobind/stl/string.h>
 #include <nanobind/stl/vector.h>
 
+#include <memory>
 #include <mscclpp/gpu_data_types.hpp>
 #include <mscclpp/gpu_utils.hpp>
 
@@ -15,6 +17,13 @@ using namespace mscclpp;
 
 constexpr int BYTE_BITS = 8;
 
+struct DlpackContext {
+  DLManagedTensor managedTensor{};
+  std::vector<int64_t> shape;
+  std::vector<int64_t> strides;
+  std::shared_ptr<void> owner;
+};
+
 static DLDeviceType getDeviceType() {
 #if defined(MSCCLPP_USE_ROCM)
   return kDLROCM;
@@ -52,65 +61,77 @@ static DLDataType getDlType(std::string type) {
   }
 }
 
-static nb::capsule toDlpack(GpuBuffer<char> buffer, std::string dataType, std::vector<int64_t>& shape,
-                            std::vector<int64_t>& strides) {
+static void dlpackDeleter(DLManagedTensor* self) { delete static_cast<DlpackContext*>(self->manager_ctx); }
+
+static void dlpackCapsuleDestructor(PyObject* capsule) {
+  if (PyCapsule_IsValid(capsule, "used_dltensor")) {
+    return;
+  }
+  if (!PyCapsule_IsValid(capsule, "dltensor")) {
+    return;
+  }
+  DLManagedTensor* managedTensor = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule, "dltensor"));
+  if (managedTensor == nullptr) {
+    return;
+  }
+  if (managedTensor->deleter) {
+    managedTensor->deleter(managedTensor);
+  }
+}
+
+static nb::capsule makeDlpack(void* data, size_t bytes, int deviceId, std::shared_ptr<void> owner, std::string dataType,
+                              std::vector<int64_t> shape, std::vector<int64_t> strides) {
   DLDataType dtype = getDlType(dataType);
-  int64_t* tensorShape = shape.size() > 0 ? new int64_t[shape.size()] : new int64_t[1];
-  int64_t* tensorStrides = strides.size() > 0 ? new int64_t[strides.size()] : nullptr;
-  if (shape.size() == 0) {
-    tensorShape[0] = (int64_t)(buffer.nelems() / ((dtype.bits * dtype.lanes + 7) / BYTE_BITS));
-  } else {
-    for (size_t i = 0; i < shape.size(); ++i) {
-      tensorShape[i] = shape[i];
+  auto ctx = std::make_unique<DlpackContext>();
+  size_t elementBytes = (dtype.bits * dtype.lanes + 7) / BYTE_BITS;
+  if (shape.empty()) {
+    if (bytes % elementBytes != 0) {
+      throw Error("DLPack buffer size must be divisible by the element size.", ErrorCode::InvalidUsage);
     }
+    ctx->shape.push_back((int64_t)(bytes / elementBytes));
+  } else {
+    ctx->shape = std::move(shape);
   }
-  for (size_t i = 0; i < strides.size(); ++i) {
-    tensorStrides[i] = strides[i];
+  ctx->strides = std::move(strides);
+  if (!ctx->strides.empty() && ctx->strides.size() != ctx->shape.size()) {
+    throw Error("DLPack strides must have the same length as shape.", ErrorCode::InvalidUsage);
   }
+  ctx->owner = std::move(owner);
 
-  DLManagedTensor* dlManagedTensor = new DLManagedTensor();
-  dlManagedTensor->dl_tensor.data = buffer.data();
+  DLManagedTensor* dlManagedTensor = &ctx->managedTensor;
+  dlManagedTensor->dl_tensor.data = data;
   dlManagedTensor->dl_tensor.device.device_type = getDeviceType();
-  dlManagedTensor->dl_tensor.device.device_id = buffer.deviceId();
-  dlManagedTensor->dl_tensor.ndim = shape.size() == 0 ? 1 : shape.size();
-  dlManagedTensor->dl_tensor.strides = tensorStrides;
-  dlManagedTensor->dl_tensor.shape = tensorShape;
+  dlManagedTensor->dl_tensor.device.device_id = deviceId;
+  dlManagedTensor->dl_tensor.ndim = static_cast<int>(ctx->shape.size());
+  dlManagedTensor->dl_tensor.strides = ctx->strides.empty() ? nullptr : ctx->strides.data();
+  dlManagedTensor->dl_tensor.shape = ctx->shape.data();
   dlManagedTensor->dl_tensor.byte_offset = 0;
   dlManagedTensor->dl_tensor.dtype = dtype;
-  dlManagedTensor->manager_ctx = new GpuBuffer<char>(buffer);
-  dlManagedTensor->deleter = [](DLManagedTensor* self) {
-    delete static_cast<GpuBuffer<char>*>(self->manager_ctx);
-    self->manager_ctx = nullptr;
-    self->dl_tensor.data = nullptr;
-    if (self->dl_tensor.shape != nullptr) {
-      delete[] self->dl_tensor.shape;
-      self->dl_tensor.shape = nullptr;
-      if (self->dl_tensor.strides) {
-        delete[] self->dl_tensor.strides;
-        self->dl_tensor.strides = nullptr;
-      }
-    }
-    delete self;
-  };
+  dlManagedTensor->manager_ctx = ctx.get();
+  dlManagedTensor->deleter = dlpackDeleter;
 
-  PyObject* dlCapsule = PyCapsule_New(static_cast<void*>(dlManagedTensor), "dltensor", [](PyObject* capsule) {
-    if (PyCapsule_IsValid(capsule, "used_dltensor")) {
-      return;
-    }
-    if (!PyCapsule_IsValid(capsule, "dltensor")) {
-      return;
-    }
-    DLManagedTensor* managedTensor = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule, "dltensor"));
-    if (managedTensor == nullptr) {
-      return;
-    }
-    if (managedTensor->deleter) {
-      managedTensor->deleter(managedTensor);
-    }
-  });
+  PyObject* dlCapsule = PyCapsule_New(static_cast<void*>(dlManagedTensor), "dltensor", dlpackCapsuleDestructor);
+  if (dlCapsule == nullptr) {
+    throw Error("Failed to create DLPack capsule.", ErrorCode::InvalidUsage);
+  }
+  ctx.release();
   return nb::steal<nb::capsule>(dlCapsule);
 }
 
+static nb::capsule toDlpack(GpuBuffer<char> buffer, std::string dataType, std::vector<int64_t>& shape,
+                            std::vector<int64_t>& strides) {
+  auto owner = std::make_shared<GpuBuffer<char>>(buffer);
+  return makeDlpack(buffer.data(), buffer.nelems(), buffer.deviceId(), std::move(owner), dataType, shape, strides);
+}
+
+static nb::capsule toDlpack(std::shared_ptr<GpuBufferPool::Buffer> buffer, std::string dataType,
+                            std::vector<int64_t>& shape, std::vector<int64_t>& strides) {
+  void* data = buffer->data();
+  size_t bytes = buffer->bytes();
+  int deviceId = buffer->deviceId();
+  return makeDlpack(data, bytes, deviceId, std::move(buffer), dataType, shape, strides);
+}
+
 void register_gpu_utils(nb::module_& m) {
   m.def("is_nvls_supported", &isNvlsSupported);
 
@@ -131,4 +152,25 @@ void register_gpu_utils(nb::module_& m) {
             return toDlpack(self, dataType, shape, strides);
           },
           nb::arg("data_type"), nb::arg("shape") = std::vector<int64_t>(), nb::arg("strides") = std::vector<int64_t>());
+
+  nb::class_<GpuBufferPool::Buffer>(m, "CppRawGpuBufferPoolBuffer")
+      .def("bytes", &GpuBufferPool::Buffer::bytes)
+      .def("offset", &GpuBufferPool::Buffer::offset)
+      .def("data", [](GpuBufferPool::Buffer& self) { return reinterpret_cast<uintptr_t>(self.data()); })
+      .def("device_id", &GpuBufferPool::Buffer::deviceId)
+      .def(
+          "to_dlpack",
+          [](std::shared_ptr<GpuBufferPool::Buffer> self, std::string dataType, std::vector<int64_t> shape,
+             std::vector<int64_t> strides) { return toDlpack(std::move(self), dataType, shape, strides); },
+          nb::arg("data_type"), nb::arg("shape") = std::vector<int64_t>(), nb::arg("strides") = std::vector<int64_t>());
+
+  nb::class_<GpuBufferPool>(m, "CppRawGpuBufferPool")
+      .def(nb::init<size_t, GpuBufferGranularity>(), nb::arg("bytes"),
+           nb::arg("granularity") = GpuBufferGranularity::MultiCastMinimum)
+      .def("bytes", &GpuBufferPool::bytes)
+      .def("free_bytes", &GpuBufferPool::freeBytes)
+      .def("active_bytes", &GpuBufferPool::activeBytes)
+      .def("data", [](GpuBufferPool& self) { return reinterpret_cast<uintptr_t>(self.data()); })
+      .def("device_id", &GpuBufferPool::deviceId)
+      .def("allocate", &GpuBufferPool::allocate, nb::arg("bytes"), nb::arg("alignment") = 256);
 }
diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py
index 09408171..0a414937 100644
--- a/python/mscclpp/__init__.py
+++ b/python/mscclpp/__init__.py
@@ -50,6 +50,8 @@
     CppExecutionPlan as ExecutionPlan,
     CppPacketType as PacketType,
     CppRawGpuBuffer as RawGpuBuffer,
+    CppRawGpuBufferPool as RawGpuBufferPool,
+    CppRawGpuBufferPoolBuffer as RawGpuBufferPoolBuffer,
     CppReduceOp as ReduceOp,
     env,
     is_nvls_supported,
@@ -84,6 +86,8 @@
     "ExecutionPlan",
     "PacketType",
     "RawGpuBuffer",
+    "RawGpuBufferPool",
+    "RawGpuBufferPoolBuffer",
     "ReduceOp",
     "env",
     "version",
@@ -100,6 +104,7 @@
     "AlgorithmCollection",
     "CommGroup",
     "GpuBuffer",
+    "GpuBufferPool",
     "GpuBufferGranularity",
 ]
 
diff --git a/python/mscclpp/_core/buffer.py b/python/mscclpp/_core/buffer.py
index e07424f5..33d630b7 100644
--- a/python/mscclpp/_core/buffer.py
+++ b/python/mscclpp/_core/buffer.py
@@ -6,9 +6,14 @@
 
 import cupy as cp
 import numpy as np
-from mscclpp._mscclpp import CppRawGpuBuffer, CppGpuBufferGranularity
+from mscclpp._mscclpp import (
+    CppRawGpuBuffer,
+    CppRawGpuBufferPool,
+    CppRawGpuBufferPoolBuffer,
+    CppGpuBufferGranularity,
+)
 
-__all__ = ["GpuBuffer", "GpuBufferGranularity"]
+__all__ = ["GpuBuffer", "GpuBufferPool", "GpuBufferGranularity"]
 
 GpuBufferGranularity = CppGpuBufferGranularity
 
@@ -35,3 +40,62 @@ def __new__(
         buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize, granularity)
         memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
         return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)
+
+
+class GpuBufferPool:
+    """A GPU buffer pool that returns raw buffers backed by one communication-friendly allocation.
+
+    All ranks should create the same-sized pool and call :meth:`allocate` in the same order to get matching offsets.
+    """
+
+    def __init__(
+        self,
+        nbytes: int,
+        granularity: CppGpuBufferGranularity = CppGpuBufferGranularity.MultiCastMinimum,
+    ):
+        if nbytes <= 0:
+            raise ValueError("Pool size must be positive.")
+        self._pool = CppRawGpuBufferPool(int(nbytes), granularity)
+
+    @property
+    def bytes(self) -> int:
+        """Number of bytes in the underlying pool allocation."""
+        return self._pool.bytes()
+
+    @property
+    def free_bytes(self) -> int:
+        """Number of bytes available for new buffers."""
+        return self._pool.free_bytes()
+
+    @property
+    def active_bytes(self) -> int:
+        """Number of bytes currently held by live raw buffers."""
+        return self._pool.active_bytes()
+
+    @property
+    def data(self) -> int:
+        """Device pointer to the pool base allocation."""
+        return self._pool.data()
+
+    @property
+    def device_id(self) -> int:
+        """CUDA/HIP device id of the pool allocation."""
+        return self._pool.device_id()
+
+    def allocate(
+        self,
+        nbytes: int,
+        alignment: int = 256,
+    ) -> CppRawGpuBufferPoolBuffer:
+        """Allocate a raw buffer from the pool.
+
+        Args:
+            nbytes: Number of bytes to allocate.
+            alignment: Required byte alignment of the allocation offset from the pool base.
+        """
+        if nbytes <= 0:
+            raise ValueError("Buffer size must be positive.")
+        if alignment <= 0:
+            raise ValueError("Alignment must be positive.")
+
+        return self._pool.allocate(int(nbytes), int(alignment))
diff --git a/python/mscclpp/_core/comm.py b/python/mscclpp/_core/comm.py
index d42349dd..875e07f1 100644
--- a/python/mscclpp/_core/comm.py
+++ b/python/mscclpp/_core/comm.py
@@ -73,6 +73,7 @@ def __init__(
         self.my_rank = self.bootstrap.get_rank()
         self.nranks = self.bootstrap.get_n_ranks()
         self.nranks_per_node = self.bootstrap.get_n_ranks_per_node()
+        self.ipc_domain_n_ranks = self.bootstrap.get_n_ranks_per_ipc_domain()
 
     def barrier(self):
         self.bootstrap.barrier()
diff --git a/python/mscclpp_benchmark/bench_collective.py b/python/mscclpp_benchmark/bench_collective.py
index c526438d..21b61152 100644
--- a/python/mscclpp_benchmark/bench_collective.py
+++ b/python/mscclpp_benchmark/bench_collective.py
@@ -199,7 +199,6 @@ def _candidate_specs(collective: str, *, symmetric_memory: bool = False) -> tupl
             "default_allreduce_nvls_packet",
             max_message_size=512 * 1024,
             max_nblocks=16,
-            supported_skus=("H100", "GB300"),
             requires_nvls=True,
         ),
         CandidateSpec(
@@ -216,6 +215,10 @@ def _candidate_specs(collective: str, *, symmetric_memory: bool = False) -> tupl
             "default_allreduce_rsag_zero_copy",
             min_message_size=512 * 1024 + 1,
         ),
+        CandidateSpec(
+            "default_allreduce_rsag",
+            min_message_size=512 * 1024 + 1,
+        ),
         CandidateSpec(
             "default_allreduce_fullmesh",
             min_message_size=512 * 1024 + 1,
@@ -228,7 +231,6 @@ def _candidate_specs(collective: str, *, symmetric_memory: bool = False) -> tupl
             CandidateSpec(
                 "default_allreduce_nvls_zero_copy",
                 max_nblocks=32,
-                supported_skus=("H100", "GB300"),
                 requires_nvls=True,
                 requires_symmetric_memory=True,
             ),
diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py
index 6b3119cb..ecb52d9d 100644
--- a/python/test/test_mscclpp.py
+++ b/python/test/test_mscclpp.py
@@ -1,5 +1,5 @@
 # Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
+# Licensed under the MIT License.
 
 from concurrent.futures import ThreadPoolExecutor
 import os
@@ -30,7 +30,7 @@
     Device,
     DeviceType,
 )
-from mscclpp import CommGroup, GpuBuffer
+from mscclpp import CommGroup, GpuBuffer, GpuBufferPool
 from mscclpp.utils import KernelBuilder, pack
 from ._cpp import _ext
 from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
@@ -60,6 +60,61 @@ def all_ranks_on_the_same_node(mpi_group: MpiGroup):
     return last_rank_ip == root_ip
 
 
+@parametrize_mpi_groups(1)
+def test_gpu_buffer_pool(mpi_group: MpiGroup):
+    pool = GpuBufferPool(4096)
+    base_ptr = pool.data
+    buf = pool.allocate(16 * np.dtype(np.int32).itemsize, alignment=512)
+    offset = buf.data() - base_ptr
+    assert offset % 512 == 0
+    assert pool.active_bytes == buf.bytes()
+    del buf
+    assert pool.active_bytes == 0
+    assert pool.free_bytes == pool.bytes
+
+    padding_pool = GpuBufferPool(1024)
+    padding_first = padding_pool.allocate(100, alignment=1)
+    padding_second = padding_pool.allocate(100, alignment=256)
+    padding_third = padding_pool.allocate(1, alignment=1)
+    assert padding_first.offset() == 0
+    assert padding_second.offset() == 256
+    assert padding_third.offset() == 356
+
+
+@parametrize_mpi_groups(1)
+def test_gpu_buffer_pool_to_torch_dlpack(mpi_group: MpiGroup):
+    torch = pytest.importorskip("torch")
+
+    pool = GpuBufferPool(4096)
+    buf = pool.allocate(16 * torch.empty((), dtype=torch.float32).element_size(), alignment=512)
+    tensor = torch.utils.dlpack.from_dlpack(buf.to_dlpack(data_type=str(torch.float32), shape=[16]))
+    assert tensor.data_ptr() == buf.data()
+    assert tensor.numel() == 16
+    assert tensor.dtype == torch.float32
+    del buf
+    assert pool.active_bytes == tensor.numel() * tensor.element_size()
+    del tensor
+    assert pool.active_bytes == 0
+
+
+@parametrize_mpi_groups(2, 4, 8)
+def test_gpu_buffer_pool_symmetric_offsets(mpi_group: MpiGroup):
+    pool = GpuBufferPool(8192)
+    offsets = []
+
+    first = pool.allocate(64, alignment=256)
+    offsets.append(first.offset())
+    second = pool.allocate(128, alignment=512)
+    offsets.append(second.offset())
+    del first
+
+    reused = pool.allocate(32, alignment=128)
+    offsets.append(reused.offset())
+
+    gathered_offsets = mpi_group.comm.allgather(offsets)
+    assert all(rank_offsets == gathered_offsets[0] for rank_offsets in gathered_offsets)
+
+
 @parametrize_mpi_groups(2, 4, 8, 16)
 @pytest.mark.parametrize("ifIpPortTrio", [f"{ethernet_interface_name}:localhost:50000", ethernet_interface_name, ""])
 def test_group_with_ip(mpi_group: MpiGroup, ifIpPortTrio: str):
diff --git a/src/core/algorithm.cc b/src/core/algorithm.cc
index c0713daa..c0d34188 100644
--- a/src/core/algorithm.cc
+++ b/src/core/algorithm.cc
@@ -52,8 +52,16 @@ CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const vo
   AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype, symmetricMemory);
   auto it = contexts_.find(ctxKey);
   if (it == contexts_.end()) {
+    INFO(ALGO, name_, " context cache MISS (creating new context, this triggers collective setup): rank=",
+         comm->bootstrap()->getRank(), ", baseSendBuff=", ctxKey.baseSendBuff, ", baseRecvBuff=", ctxKey.baseRecvBuff,
+         ", baseSendSize=", ctxKey.baseSendSize, ", baseRecvSize=", ctxKey.baseRecvSize,
+         ", numContexts(before)=", contexts_.size());
     auto ctx = contextInitFunc_(comm, input, output, inputSize, outputSize, dtype);
     contexts_[ctxKey] = ctx;
+  } else {
+    INFO(ALGO, name_, " context cache HIT (reusing context, no collective setup): rank=", comm->bootstrap()->getRank(),
+         ", baseSendBuff=", ctxKey.baseSendBuff, ", baseRecvBuff=", ctxKey.baseRecvBuff,
+         ", numContexts=", contexts_.size());
   }
   return kernelLaunchFunc_(contexts_[ctxKey], input, output, inputSize, outputSize, dtype, op, stream, nBlocks,
                            nThreadsPerBlock, extras, accumDtype);
diff --git a/src/core/bootstrap/bootstrap.cc b/src/core/bootstrap/bootstrap.cc
index b3032e50..ffdd9c1c 100644
--- a/src/core/bootstrap/bootstrap.cc
+++ b/src/core/bootstrap/bootstrap.cc
@@ -50,6 +50,8 @@ MSCCLPP_API_CPP void Bootstrap::groupBarrier(const std::vector<int>& ranks) {
   }
 }
 
+MSCCLPP_API_CPP int Bootstrap::getNranksPerIpcDomain() const { return getNranksPerNode(); }
+
 MSCCLPP_API_CPP void Bootstrap::send(const std::vector<char>& data, int peer, int tag) {
   size_t size = data.size();
   send((void*)&size, sizeof(size_t), peer, tag);
@@ -83,6 +85,7 @@ class TcpBootstrap::Impl {
   int getRank();
   int getNranks();
   int getNranksPerNode();
+  int getNranksPerIpcDomain();
   void allGather(void* allData, int size);
   void broadcast(void* data, int size, int root);
   void send(void* data, int size, int peer, int tag);
@@ -95,6 +98,7 @@ class TcpBootstrap::Impl {
   int rank_;
   int nRanks_;
   int nRanksPerNode_;
+  int nRanksPerIpcDomain_;
   bool netInitialized;
   std::unique_ptr<Socket> listenSockRoot_;
   std::unique_ptr<Socket> listenSock_;
@@ -148,6 +152,7 @@ TcpBootstrap::Impl::Impl(int rank, int nRanks)
     : rank_(rank),
       nRanks_(nRanks),
       nRanksPerNode_(0),
+      nRanksPerIpcDomain_(0),
       netInitialized(false),
       peerCommAddresses_(nRanks, SocketAddress()),
       barrierArr_(nRanks, 0),
@@ -451,6 +456,24 @@ int TcpBootstrap::Impl::getNranksPerNode() {
   return nRanksPerNode_;
 }
 
+int TcpBootstrap::Impl::getNranksPerIpcDomain() {
+  if (nRanksPerIpcDomain_ > 0) return nRanksPerIpcDomain_;
+  std::vector<uint64_t> ipcDomainHashes(nRanks_);
+  ipcDomainHashes[rank_] = getIpcDomainHash();
+  allGather(ipcDomainHashes.data(), sizeof(uint64_t));
+
+  int nRanksPerIpcDomain = 0;
+  for (int i = 0; i < nRanks_; ++i) {
+    if (ipcDomainHashes[i] == ipcDomainHashes[rank_]) {
+      ++nRanksPerIpcDomain;
+    }
+  }
+  INFO(MSCCLPP_INIT, "rank %d IPC domain fabric hash 0x%016llx nRanksPerIpcDomain %d", rank_,
+       static_cast<unsigned long long>(ipcDomainHashes[rank_]), nRanksPerIpcDomain);
+  nRanksPerIpcDomain_ = nRanksPerIpcDomain;
+  return nRanksPerIpcDomain_;
+}
+
 void TcpBootstrap::Impl::allGather(void* allData, int size) {
   char* data = static_cast<char*>(allData);
   int rank = rank_;
@@ -592,6 +615,8 @@ MSCCLPP_API_CPP int TcpBootstrap::getNranks() const { return pimpl_->getNranks()
 
 MSCCLPP_API_CPP int TcpBootstrap::getNranksPerNode() const { return pimpl_->getNranksPerNode(); }
 
+MSCCLPP_API_CPP int TcpBootstrap::getNranksPerIpcDomain() const { return pimpl_->getNranksPerIpcDomain(); }
+
 MSCCLPP_API_CPP void TcpBootstrap::send(void* data, int size, int peer, int tag) {
   pimpl_->send(data, size, peer, tag);
 }
diff --git a/src/core/executor/executor.cc b/src/core/executor/executor.cc
index fcecc4dd..15c6af4e 100644
--- a/src/core/executor/executor.cc
+++ b/src/core/executor/executor.cc
@@ -389,6 +389,7 @@ struct Executor::Impl {
           nvlsConnection->bindAllocatedMemory((CUdeviceptr)bufferInfo.first, bufferInfo.second);
       context.nvlsChannels.push_back(switchChannel);
     }
+    this->comm->bootstrap()->barrier();
   }
 
   void setupSemaphores(ExecutionContext& context, const ExecutionPlan& plan) {
diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc
index 1ce61322..f599325b 100644
--- a/src/core/gpu_utils.cc
+++ b/src/core/gpu_utils.cc
@@ -2,8 +2,13 @@
 // Licensed under the MIT License.
 
 #include <cstring>
+#include <iterator>
+#include <map>
 #include <mscclpp/gpu.hpp>
 #include <mscclpp/gpu_utils.hpp>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
 
 #include "gpu_utils_internal.hpp"
 
@@ -274,8 +279,171 @@ void gpuMemset(void* ptr, int value, size_t bytes) {
   MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
 }
 
+class GpuBufferPoolStorage : public std::enable_shared_from_this<GpuBufferPoolStorage> {
+ public:
+  GpuBufferPoolStorage(size_t bytes, GpuBufferGranularity granularity);
+  std::shared_ptr<GpuBufferPool::Buffer> allocate(size_t bytes, size_t alignment);
+  void release(size_t offset) noexcept;
+  size_t bytes() const;
+  size_t freeBytes() const;
+  size_t activeBytes() const;
+  char* data();
+  int deviceId() const;
+
+ private:
+  struct Block {
+    size_t offset;
+    size_t bytes;
+    size_t reservedOffset;
+    size_t reservedBytes;
+  };
+
+  static size_t alignUp(size_t offset, size_t alignment);
+  Block reserveBlock(size_t bytes, size_t alignment);
+  void releaseBlock(size_t offset, size_t bytes) noexcept;
+
+  mutable std::mutex mutex_;
+  GpuBuffer<char> buffer_;
+  std::map<size_t, size_t> freeBlocks_;
+  std::unordered_map<size_t, Block> activeBlocks_;
+};
+
+GpuBufferPoolStorage::GpuBufferPoolStorage(size_t bytes, GpuBufferGranularity granularity)
+    : buffer_(bytes, granularity) {
+  if (bytes == 0) {
+    throw Error("GpuBufferPool size must be positive.", ErrorCode::InvalidUsage);
+  }
+  freeBlocks_[0] = buffer_.bytes();
+}
+
+size_t GpuBufferPoolStorage::alignUp(size_t offset, size_t alignment) {
+  if (alignment == 0) {
+    throw Error("GpuBufferPool allocation alignment must be positive.", ErrorCode::InvalidUsage);
+  }
+  size_t remainder = offset % alignment;
+  if (remainder == 0) {
+    return offset;
+  }
+  return offset + alignment - remainder;
+}
+
+GpuBufferPoolStorage::Block GpuBufferPoolStorage::reserveBlock(size_t bytes, size_t alignment) {
+  if (bytes == 0) {
+    throw Error("GpuBufferPool allocation size must be positive.", ErrorCode::InvalidUsage);
+  }
+  for (auto it = freeBlocks_.begin(); it != freeBlocks_.end(); ++it) {
+    size_t blockOffset = it->first;
+    size_t blockBytes = it->second;
+    size_t alignedOffset = alignUp(blockOffset, alignment);
+    size_t prefixBytes = alignedOffset - blockOffset;
+    if (prefixBytes > blockBytes || bytes > blockBytes - prefixBytes) {
+      continue;
+    }
+
+    size_t suffixOffset = alignedOffset + bytes;
+    size_t reservedBytes = prefixBytes + bytes;
+    size_t suffixBytes = blockBytes - reservedBytes;
+    freeBlocks_.erase(it);
+    if (suffixBytes > 0) {
+      freeBlocks_[suffixOffset] = suffixBytes;
+    }
+    Block block{alignedOffset, bytes, blockOffset, reservedBytes};
+    activeBlocks_[alignedOffset] = block;
+    return block;
+  }
+  throw Error("GpuBufferPool does not have enough free memory for the requested allocation.", ErrorCode::InvalidUsage);
+}
+
+void GpuBufferPoolStorage::releaseBlock(size_t offset, size_t bytes) noexcept {
+  auto next = freeBlocks_.lower_bound(offset);
+  if (next != freeBlocks_.begin()) {
+    auto prev = std::prev(next);
+    if (prev->first + prev->second == offset) {
+      offset = prev->first;
+      bytes += prev->second;
+      next = freeBlocks_.erase(prev);
+    }
+  }
+  if (next != freeBlocks_.end() && offset + bytes == next->first) {
+    bytes += next->second;
+    freeBlocks_.erase(next);
+  }
+  freeBlocks_[offset] = bytes;
+}
+
+std::shared_ptr<GpuBufferPool::Buffer> GpuBufferPoolStorage::allocate(size_t bytes, size_t alignment) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  Block block = reserveBlock(bytes, alignment);
+  return std::shared_ptr<GpuBufferPool::Buffer>(new GpuBufferPool::Buffer(shared_from_this(), block.offset, bytes));
+}
+
+void GpuBufferPoolStorage::release(size_t offset) noexcept {
+  std::lock_guard<std::mutex> lock(mutex_);
+  auto active = activeBlocks_.find(offset);
+  if (active == activeBlocks_.end()) {
+    return;
+  }
+  Block block = active->second;
+  activeBlocks_.erase(active);
+  releaseBlock(block.reservedOffset, block.reservedBytes);
+}
+
+size_t GpuBufferPoolStorage::bytes() const { return buffer_.bytes(); }
+
+size_t GpuBufferPoolStorage::freeBytes() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  size_t freeBytes = 0;
+  for (auto const& block : freeBlocks_) {
+    freeBytes += block.second;
+  }
+  return freeBytes;
+}
+
+size_t GpuBufferPoolStorage::activeBytes() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  size_t activeBytes = 0;
+  for (auto const& block : activeBlocks_) {
+    activeBytes += block.second.bytes;
+  }
+  return activeBytes;
+}
+
+char* GpuBufferPoolStorage::data() { return buffer_.data(); }
+
+int GpuBufferPoolStorage::deviceId() const { return buffer_.deviceId(); }
+
 }  // namespace detail
 
+GpuBufferPool::Buffer::Buffer(std::shared_ptr<detail::GpuBufferPoolStorage> storage, size_t offset, size_t bytes)
+    : storage_(std::move(storage)), offset_(offset), bytes_(bytes) {}
+
+GpuBufferPool::Buffer::~Buffer() { storage_->release(offset_); }
+
+size_t GpuBufferPool::Buffer::bytes() const { return bytes_; }
+
+size_t GpuBufferPool::Buffer::offset() const { return offset_; }
+
+char* GpuBufferPool::Buffer::data() const { return storage_->data() + offset_; }
+
+int GpuBufferPool::Buffer::deviceId() const { return storage_->deviceId(); }
+
+GpuBufferPool::GpuBufferPool(size_t bytes, GpuBufferGranularity granularity)
+    : storage_(std::make_shared<detail::GpuBufferPoolStorage>(bytes, granularity)) {}
+
+std::shared_ptr<GpuBufferPool::Buffer> GpuBufferPool::allocate(size_t bytes, size_t alignment) {
+  return storage_->allocate(bytes, alignment);
+}
+
+size_t GpuBufferPool::bytes() const { return storage_->bytes(); }
+
+size_t GpuBufferPool::freeBytes() const { return storage_->freeBytes(); }
+
+size_t GpuBufferPool::activeBytes() const { return storage_->activeBytes(); }
+
+char* GpuBufferPool::data() { return storage_->data(); }
+
+int GpuBufferPool::deviceId() const { return storage_->deviceId(); }
+
 bool isNvlsSupported() {
   if (env()->forceDisableNvls) {
     return false;
diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp
index cb808bc8..e9095ada 100644
--- a/src/core/include/execution_kernel.hpp
+++ b/src/core/include/execution_kernel.hpp
@@ -525,7 +525,15 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(const Operation& op, uint3
   if constexpr (std::is_same_v<T, uint8_t>) {
     assert(false && "MULTI_LOAD_REDUCE_STORE is not supported for uint8_t data type");
     return;
-  } else {
+  }
+#if defined(__FP8_TYPES_EXIST__) && \
+    (!(defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000))
+  else if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
+    assert(false && "FP8 MULTI_LOAD_REDUCE_STORE requires sm_100a or newer");
+    return;
+  }
+#endif
+  else {
     static_assert(sizeof(T) <= 8, "Only support type with size <= 8 bytes");
     const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize);
     if (size <= 0) {
diff --git a/src/core/include/utils_internal.hpp b/src/core/include/utils_internal.hpp
index c5c67e26..c6934194 100644
--- a/src/core/include/utils_internal.hpp
+++ b/src/core/include/utils_internal.hpp
@@ -37,6 +37,7 @@ int64_t busIdToInt64(const std::string busId);
 uint64_t getHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();
+uint64_t getIpcDomainHash();
 void getRandomData(void* buffer, size_t bytes);
 
 struct netIf {
diff --git a/src/core/logger.cc b/src/core/logger.cc
index e13ba1b9..f33ea81b 100644
--- a/src/core/logger.cc
+++ b/src/core/logger.cc
@@ -61,6 +61,8 @@ static LogSubsysSet stringToLogSubsysSet(const std::string& subsysStr) {
       set.set(static_cast<size_t>(LogSubsys::EXEC));
     } else if (token == "NCCL") {
       set.set(static_cast<size_t>(LogSubsys::NCCL));
+    } else if (token == "ALGO") {
+      set.set(static_cast<size_t>(LogSubsys::ALGO));
     } else if (token == "ALL") {
       set.set();  // all bits
     }
diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc
index 8cc55430..adbf8e5b 100644
--- a/src/core/utils_internal.cc
+++ b/src/core/utils_internal.cc
@@ -6,6 +6,10 @@
 #include <signal.h>
 #include <unistd.h>
 
+#if defined(MSCCLPP_USE_CUDA)
+#include <nvml.h>
+#endif
+
 #include <cstring>
 #include <fstream>
 #include <iostream>
@@ -175,6 +179,67 @@ uint64_t getPidHash(void) {
   return *pidHash;
 }
 
+#if defined(MSCCLPP_USE_CUDA) && defined(NVML_GPU_FABRIC_UUID_LEN)
+namespace {
+
+class NvmlState {
+ public:
+  NvmlState() : initialized_(nvmlInit_v2() == NVML_SUCCESS) {}
+
+  ~NvmlState() {
+    if (initialized_) {
+      (void)nvmlShutdown();
+    }
+  }
+
+  bool isInitialized() const { return initialized_; }
+
+ private:
+  bool initialized_ = false;
+};
+
+uint64_t getFabricHash(const nvmlGpuFabricInfo_t& fabricInfo) {
+  char hashData[NVML_GPU_FABRIC_UUID_LEN + sizeof(fabricInfo.cliqueId)];
+  std::memcpy(hashData, fabricInfo.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
+  std::memcpy(hashData + NVML_GPU_FABRIC_UUID_LEN, &fabricInfo.cliqueId, sizeof(fabricInfo.cliqueId));
+  return getHash(hashData, sizeof(hashData));
+}
+
+bool tryGetNvmlIpcDomainHash(uint64_t& ipcDomainHash) {
+  // Use the current CUDA device; callers must set the rank's device before querying.
+  int deviceId;
+  char pciBusId[] = "00000000:00:00.0";
+  if (cudaGetDevice(&deviceId) != cudaSuccess ||
+      cudaDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), deviceId) != cudaSuccess) {
+    return false;
+  }
+
+  static NvmlState nvml;
+  nvmlDevice_t nvmlDevice;
+  nvmlGpuFabricInfo_t fabricInfo = {};
+  if (!nvml.isInitialized() || nvmlDeviceGetHandleByPciBusId_v2(pciBusId, &nvmlDevice) != NVML_SUCCESS ||
+      nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfo) != NVML_SUCCESS ||
+      fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfo.status != NVML_SUCCESS) {
+    return false;
+  }
+
+  ipcDomainHash = getFabricHash(fabricInfo);
+  return true;
+}
+
+}  // namespace
+#endif
+
+uint64_t getIpcDomainHash(void) {
+#if defined(MSCCLPP_USE_CUDA) && defined(NVML_GPU_FABRIC_UUID_LEN)
+  uint64_t ipcDomainHash;
+  if (tryGetNvmlIpcDomainHash(ipcDomainHash)) {
+    return ipcDomainHash;
+  }
+#endif
+  return getHostHash();
+}
+
 int parseStringList(const char* string, netIf* ifList, int maxList) {
   if (!string) return 0;
 
diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu
index d1b4e731..49688f47 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh.cu
@@ -16,8 +16,8 @@ constexpr int kMaxThreadsPerBlock = 1024;
 template <bool IsOutOfPlace>
 __global__ void __launch_bounds__(1024, 1)
     allgatherFullmesh(void* buff, void* scratch, void* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
-                      int rank, int nRanksPerNode, [[maybe_unused]] int worldSize, size_t nelems) {
-  const int nPeer = nRanksPerNode - 1;
+                      int rank, int nRanksPerIpcDomain, [[maybe_unused]] int worldSize, size_t nelems) {
+  const int nPeer = nRanksPerIpcDomain - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
   // assume (nelems * sizeof(T)) is divisible by 16
   const size_t nInt4 = nelems * sizeof(int) / sizeof(int4);
@@ -33,10 +33,11 @@ __global__ void __launch_bounds__(1024, 1)
   const size_t restNInt4 = nInt4 % nInt4PerChunk;
   const size_t scratchChunkRankOffset = nInt4PerChunk * rank;
 
-  __shared__ DeviceHandle<MemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
+  __shared__ DeviceHandle<MemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = threadIdx.x % WARP_SIZE;
-  if (lid < nPeer) {
-    channels[lid] = memoryChans[lid];
+  // Peer count may exceed WARP_SIZE on MNNVL.
+  for (int i = lid; i < nPeer; i += WARP_SIZE) {
+    channels[i] = memoryChans[i];
   }
   __syncwarp();
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
@@ -138,11 +139,11 @@ CommResult AllgatherFullmesh::allgatherKernelFunc(const std::shared_ptr<void> ct
   if ((char*)input == (char*)output + rank * inputSize) {
     allgatherFullmesh<false><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
         (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank,
-        ctx->nRanksPerNode, ctx->workSize, nElem);
+        ctx->nRanksPerIpcDomain, ctx->worldSize, nElem);
   } else {
     allgatherFullmesh<true><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
         (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank,
-        ctx->nRanksPerNode, ctx->workSize, nElem);
+        ctx->nRanksPerIpcDomain, ctx->worldSize, nElem);
   }
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
@@ -156,8 +157,8 @@ std::shared_ptr<void> AllgatherFullmesh::initAllgatherContext(std::shared_ptr<Co
                                                               void*, size_t inputSize, DataType) {
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup semaphores
   ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, kMaxBlocks);
diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
index 3500c0c4..11f211ca 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
@@ -11,12 +11,12 @@ namespace collective {
 template <bool IsOutOfPlace>
 __global__ void __launch_bounds__(1024, 1)
     allgatherFullmesh2(void* sendbuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels,
-                       size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode,
-                       size_t nelemsPerGPU) {
+                       size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize,
+                       size_t nRanksPerIpcDomain, size_t nelemsPerGPU) {
   const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   const size_t lid = tid % WARP_SIZE;
   const size_t wid = tid / WARP_SIZE;
-  const size_t nPeer = nRanksPerNode - 1;
+  const size_t nPeer = nRanksPerIpcDomain - 1;
 
   // Round down to multiple of peer count.
   const size_t nThread = (blockDim.x * gridDim.x) / WARP_SIZE / nPeer * nPeer * WARP_SIZE;
@@ -162,12 +162,12 @@ CommResult AllgatherFullmesh2::allgatherKernelFunc(const std::shared_ptr<void> c
   size_t channelOutOffset = *static_cast<size_t*>(ctx->extras["channel_out_offset"].get());
   if ((char*)input == (char*)output + rank * inputSize) {
     allgatherFullmesh2<false><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
-        (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize,
-        ctx->nRanksPerNode, nElem);
+        (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->worldSize,
+        ctx->nRanksPerIpcDomain, nElem);
   } else {
     allgatherFullmesh2<true><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
-        (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize,
-        ctx->nRanksPerNode, nElem);
+        (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->worldSize,
+        ctx->nRanksPerIpcDomain, nElem);
   }
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
@@ -181,8 +181,8 @@ std::shared_ptr<void> AllgatherFullmesh2::initAllgatherContext(std::shared_ptr<m
                                                                void* output, size_t inputSize, mscclpp::DataType) {
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup semaphores
   ctx->memorySemaphores = this->memorySemaphores_;
diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
index 49058f59..0e34be71 100644
--- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
@@ -14,14 +14,11 @@ namespace collective {
 
 template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
-                                  size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode,
+                                  size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerIpcDomain,
                                   int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags,
                                   uint32_t flagSize) {
-  // This version of allreduce only works for single nodes
-  if (worldSize != nRanksPerNode) return;
-
   if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int);
-  const int nPeers = nRanksPerNode - 1;
+  const int nPeers = nRanksPerIpcDomain - 1;
 
   uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
   size_t scratchBaseOffset = (flag % numScratchBuff) ? (scratchBufferSize / numScratchBuff) : 0;
@@ -71,25 +68,25 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand
   }
 }
 
-inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int worldSize) {
-  if (inputSize < worldSize * sizeof(int)) {
-    return {worldSize - 1, (worldSize - 1) * WARP_SIZE};
+inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int nRanksPerIpcDomain) {
+  if (inputSize < nRanksPerIpcDomain * sizeof(int)) {
+    return {nRanksPerIpcDomain - 1, (nRanksPerIpcDomain - 1) * WARP_SIZE};
   }
-  return {(worldSize - 1) * 4, 512};
+  return {(nRanksPerIpcDomain - 1) * 4, 512};
 }
 
 template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllpairAdapter {
   static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
-                          size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize,
                           cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0,
                           int nThreadsPerBlock = 0) {
     using ChannelType = DeviceHandle<MemoryChannel>;
     const size_t nelems = inputSize / sizeof(T);
     allreduceAllPairs<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
-        nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize);
+        nRanksPerIpcDomain, worldSize, nelems, numScratchBuff, flags, flagSize);
     return cudaGetLastError();
   }
 };
@@ -108,16 +105,22 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<voi
                                                        const std::unordered_map<std::string, uintptr_t>&,
                                                        DataType accumDtype) {
   auto algoCtx = std::static_pointer_cast<AlgorithmCtx>(ctx);
+  if (algoCtx->worldSize != algoCtx->nRanksPerIpcDomain) {
+    WARN(ALGO,
+         "AllreduceAllpairPacket requires worldSize to match nRanksPerIpcDomain, got worldSize=", algoCtx->worldSize,
+         ", nRanksPerIpcDomain=", algoCtx->nRanksPerIpcDomain);
+    return CommResult::CommInvalidArgument;
+  }
   std::pair<int, int> blockAndThreadNum{nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize);
+    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->nRanksPerIpcDomain);
   }
   if (blockAndThreadNum.first > maxBlockNum_) {
     WARN(ALGO, "Requested block number ", blockAndThreadNum.first, " exceeds the maximum supported block number ",
          maxBlockNum_, ".");
     return CommResult::CommInvalidArgument;
   }
-  const int nPeers = algoCtx->nRanksPerNode - 1;
+  const int nPeers = algoCtx->nRanksPerIpcDomain - 1;
   // The kernel maps peer sends by warpId, so every peer needs a full warp.
   if (blockAndThreadNum.second % WARP_SIZE != 0 || blockAndThreadNum.second / WARP_SIZE < nPeers) {
     WARN(ALGO,
@@ -138,8 +141,8 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<voi
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr,
-                nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerNode,
-                algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_,
+                nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerIpcDomain,
+                algoCtx->worldSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_,
                 this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "AllreducePacket failed with error: ", cudaGetErrorString(error));
@@ -153,8 +156,8 @@ std::shared_ptr<void> AllreduceAllpairPacket::initAllreduceContext(std::shared_p
   auto ctx = std::make_shared<AlgorithmCtx>();
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
index 24d2a31c..eb872624 100644
--- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu
+++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
@@ -9,12 +9,23 @@
 namespace mscclpp {
 namespace collective {
 
+namespace {
+// Per-context cache of input-side MemoryChannels keyed by input pointer.
+// Lifetime is tied to AlgorithmCtx, so entries are released when the ctx is
+// evicted from the framework's context cache (avoids unbounded growth across
+// allreduce calls that pass different input buffers).
+using InputChannelsCache =
+    std::unordered_map<const void*,
+                       std::pair<std::vector<MemoryChannel>, std::shared_ptr<DeviceHandle<MemoryChannel>>>>;
+constexpr const char* kInputChannelsExtraKey = "inputChannels";
+}  // namespace
+
 template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(512, 1)
     allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
                       DeviceHandle<MemoryChannel>* memoryOutChannels, size_t channelOutDataOffset, int rank,
-                      int nRanksPerNode, int worldSize, size_t nelems) {
-  const int nPeer = nRanksPerNode - 1;
+                      int nRanksPerIpcDomain, int worldSize, size_t nelems) {
+  const int nPeer = nRanksPerIpcDomain - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
   // assume (nelems * sizeof(T)) is divisible by (16 * worldSize)
   const size_t nInt4 = nelems * sizeof(T) / sizeof(int4);
@@ -49,12 +60,13 @@ __global__ void __launch_bounds__(512, 1)
   const size_t blockOffset = nInt4PerChunk * blockIdx.x;
   const size_t scratchChunkRankOffset = chunkSizePerRank * rank;
 
-  __shared__ DeviceHandle<MemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
-  __shared__ DeviceHandle<MemoryChannel> outChannels[MAX_NRANKS_PER_NODE - 1];
+  __shared__ DeviceHandle<MemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
+  __shared__ DeviceHandle<MemoryChannel> outChannels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = threadIdx.x % WARP_SIZE;
-  if (lid < nPeer) {
-    channels[lid] = memoryChans[lid];
-    outChannels[lid] = memoryOutChans[lid];
+  // Peer count may exceed WARP_SIZE on MNNVL.
+  for (int i = lid; i < nPeer; i += WARP_SIZE) {
+    channels[i] = memoryChans[i];
+    outChannels[i] = memoryOutChans[i];
   }
   __syncwarp();
 
@@ -156,7 +168,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceAllconnectAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t,
-                          size_t channelOutDataOffset, size_t, int rank, int nRanksPerNode, int worldSize,
+                          size_t channelOutDataOffset, size_t, int rank, int nRanksPerIpcDomain, int worldSize,
                           size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks,
                           int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<MemoryChannel>;
@@ -165,7 +177,7 @@ struct AllreduceAllconnectAdapter {
     if (nThreadsPerBlock == 0) nThreadsPerBlock = 512;
     allreduceFullmesh<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels,
-        channelOutDataOffset, rank, nRanksPerNode, worldSize, nelems);
+        channelOutDataOffset, rank, nRanksPerIpcDomain, worldSize, nelems);
     return cudaGetLastError();
   }
 };
@@ -194,17 +206,17 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(
     MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
     channelOutOffset = (char*)output - (char*)recvBasePtr;
   }
-  std::shared_ptr<DeviceHandle<MemoryChannel>> inputChannelHandles;
-  if (this->memoryChannelsMap_.find(input) != this->memoryChannelsMap_.end()) {
-    inputChannelHandles = this->memoryChannelsMap_[input].second;
-  } else {
+  auto& inputChannelsCache = *static_cast<InputChannelsCache*>(ctx->extras.at(kInputChannelsExtraKey).get());
+  auto it = inputChannelsCache.find(input);
+  if (it == inputChannelsCache.end()) {
     RegisteredMemory localMemory = comm_->registerMemory(const_cast<void*>(input), inputSize, Transport::CudaIpc);
     std::vector<MemoryChannel> channels =
         setupMemoryChannels(this->conns_, this->inputScratchSemaphores_, this->remoteScratchMemories_, localMemory,
                             nChannelsPerConnection_);
-    this->memoryChannelsMap_[input] = std::make_pair(channels, setupMemoryChannelDeviceHandles(channels));
+    auto handles = setupMemoryChannelDeviceHandles(channels);
+    it = inputChannelsCache.emplace(input, std::make_pair(std::move(channels), std::move(handles))).first;
   }
-  inputChannelHandles = this->memoryChannelsMap_[input].second;
+  std::shared_ptr<DeviceHandle<MemoryChannel>> inputChannelHandles = it->second.second;
 
   AllreduceFunc allreduce = dispatch<AllreduceAllconnectAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
@@ -222,7 +234,7 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(),
-                nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize,
+                nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerIpcDomain, ctx->worldSize, inputSize,
                 stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error));
@@ -248,8 +260,8 @@ std::shared_ptr<void> AllreduceFullmesh::initAllreduceContext(std::shared_ptr<Co
                                                               void* output, size_t size, DataType) {
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup semaphores
   ctx->memorySemaphores = this->outputSemaphores_;
@@ -266,6 +278,7 @@ std::shared_ptr<void> AllreduceFullmesh::initAllreduceContext(std::shared_ptr<Co
   ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory,
                                             nChannelsPerConnection_);
   ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
+  ctx->extras.insert({kInputChannelsExtraKey, std::make_shared<InputChannelsCache>()});
   return ctx;
 }
 
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
index 2d71cd63..ea34ce69 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -20,15 +20,15 @@ __global__ void __launch_bounds__(1024, 1)
                                [[maybe_unused]] DeviceHandle<BaseMemoryChannel>* memoryChannels,
                                [[maybe_unused]] DeviceHandle<SwitchChannel>* switchChannels,
                                [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize,
-                               [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) {
+                               [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerIpcDomain) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
   constexpr int alignment = 16;
-  int nPeers = nRanksPerNode - 1;
-  int nBlocksForCopy = nRanksPerNode * 2;
-  int nBlocksForReduce = nRanksPerNode;
+  int nPeers = nRanksPerIpcDomain - 1;
+  int nBlocksForCopy = nRanksPerIpcDomain * 2;
+  int nBlocksForReduce = nRanksPerIpcDomain;
   int copyReduceRatio = nBlocksForCopy / nBlocksForReduce;
-  size_t scratchSizePerRank = scratchBufferSize / nRanksPerNode;
-  size_t sizePerRank = size / nRanksPerNode;
+  size_t scratchSizePerRank = scratchBufferSize / nRanksPerIpcDomain;
+  size_t sizePerRank = size / nRanksPerIpcDomain;
   assert(sizePerRank % alignment == 0);
   uint32_t sizePerBlock =
       ((sizePerRank + (nBlocksForCopy - 1)) / nBlocksForCopy + alignment - 1) / alignment * alignment;
@@ -68,7 +68,7 @@ __global__ void __launch_bounds__(1024, 1)
         deviceSemaphore[bid + 2 * nBlocksForCopy].acquire();
       }
       __syncthreads();
-      for (int i = 0; i < nRanksPerNode; i++) {
+      for (int i = 0; i < nRanksPerIpcDomain; i++) {
         size_t blockOffset = it * unitSize + bid * sizePerBlock + i * sizePerRank;
         uint32_t scratchOffset = scratchIt * unitSize + bid * scratchSizePerBlock + i * scratchSizePerRank;
         char* srcData = (char*)src + blockOffset;
@@ -125,7 +125,7 @@ __global__ void __launch_bounds__(1024, 1)
         channels->wait();
       }
       __syncthreads();
-      for (int i = 0; i < nRanksPerNode; i++) {
+      for (int i = 0; i < nRanksPerIpcDomain; i++) {
         size_t blockOffset = it * unitSize + (bid - nBlocksForCopy - nBlocksForReduce) * sizePerBlock + i * sizePerRank;
         uint32_t scratchOffset = scratchIt * unitSize +
                                  (bid - nBlocksForCopy - nBlocksForReduce) * scratchSizePerBlock +
@@ -150,7 +150,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct NvlsBlockPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
@@ -166,9 +166,9 @@ struct NvlsBlockPipelineAdapter {
 #endif
       {
         using ChannelType = DeviceHandle<BaseMemoryChannel>;
-        allreduceNvlsBlockPipeline<T>
-            <<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
-                                                       nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode);
+        allreduceNvlsBlockPipeline<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+            input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank,
+            nRanksPerIpcDomain);
         return cudaGetLastError();
       }
   }
@@ -176,7 +176,10 @@ struct NvlsBlockPipelineAdapter {
 
 void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = 8;
-  int nBaseChannels = 64;
+  fp8NvlsSupported_ = isFp8NvlsSupported();
+  int nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
+  // Per-peer channel allocation must hold up to 4 * nRanksPerIpcDomain entries (see kernel).
+  int nBaseChannels = std::max(64, 4 * nRanksPerIpcDomain);
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores =
@@ -187,12 +190,15 @@ void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm)
   this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
 }
 
-CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input,
-                                                           void* output, size_t inputSize, DataType dtype, ReduceOp op,
-                                                           cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                                           const std::unordered_map<std::string, uintptr_t>& extras,
-                                                           DataType accumDtype) {
+CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(
+    const std::shared_ptr<void> ctx_void, const void* input, void* output, size_t inputSize, DataType dtype,
+    ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
+    [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
+  if (isNativeFp8DataType(dtype) && !fp8NvlsSupported_) {
+    WARN("FP8 NVLS allreduce requires device support for FP8 multimem reduction.");
+    return CommResult::CommInvalidArgument;
+  }
   AllreduceFunc allreduce = dispatch<NvlsBlockPipelineAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
@@ -200,11 +206,11 @@ CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr
   }
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = {ctx->nRanksPerNode * 5, 1024};
+    blockAndThreadNum = {ctx->nRanksPerIpcDomain * 5, 1024};
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                 ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
-                                ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0,
+                                ctx->rank, ctx->nRanksPerIpcDomain, ctx->worldSize, inputSize, stream, nullptr, 0, 0,
                                 blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN("AllreduceNvlsBlockPipeline failed with error: %s", cudaGetErrorString(error));
@@ -221,12 +227,12 @@ std::shared_ptr<void> AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar
                                                                        void*, size_t, DataType) {
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup channels
   ctx->switchChannels =
-      setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_);
+      setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_);
   ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
   return ctx;
 }
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
index a616485e..98d9e1a3 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
@@ -82,7 +82,7 @@ void AllreduceNvlsPacket::initialize(std::shared_ptr<Communicator> comm) {
   int nSwitchChannels = 1;
   this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels);
   this->switchChannels_ =
-      setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels);
+      setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels);
 }
 
 AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
@@ -93,8 +93,8 @@ std::shared_ptr<void> AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr<
                                                                 size_t, DataType) {
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup channels
   ctx->switchChannels = this->switchChannels_;
@@ -123,7 +123,7 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<void>
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr,
-                0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream,
+                0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerIpcDomain, ctx->worldSize, inputSize, stream,
                 (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "AllreduceNvlsPacket failed with error: ", cudaGetErrorString(error));
@@ -154,4 +154,4 @@ std::shared_ptr<mscclpp::Algorithm> AllreduceNvlsPacket::build() {
       });
 }
 }  // namespace collective
-}  // namespace mscclpp
\ No newline at end of file
+}  // namespace mscclpp
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index 3bb054da..b11fef8e 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -18,15 +18,15 @@ __global__ void __launch_bounds__(1024, 1)
                               [[maybe_unused]] DeviceHandle<BaseMemoryChannel>* memoryChannels,
                               [[maybe_unused]] DeviceHandle<SwitchChannel>* multicast, [[maybe_unused]] size_t size,
                               [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank,
-                              [[maybe_unused]] int nRanksPerNode) {
+                              [[maybe_unused]] int nRanksPerIpcDomain) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
   constexpr int alignment = 16;
-  int nPeers = nRanksPerNode - 1;
+  int nPeers = nRanksPerIpcDomain - 1;
   int nBlocks = gridDim.x;
   int nBlocksPerNvlsConn = nBlocks / NUM_NVLS_CONNECTION;
   int bid = blockIdx.x;
-  size_t sizePerRank = size / nRanksPerNode;
-  size_t scratchSizePerRank = scratchBufferSize / nRanksPerNode;
+  size_t sizePerRank = size / nRanksPerIpcDomain;
+  size_t scratchSizePerRank = scratchBufferSize / nRanksPerIpcDomain;
   const size_t maxSizePerBlock = ((sizePerRank + nBlocks - 1) / nBlocks + alignment - 1) / alignment * alignment;
   size_t start = bid * maxSizePerBlock;
   size_t end = min(start + maxSizePerBlock, sizePerRank);
@@ -53,19 +53,20 @@ __global__ void __launch_bounds__(1024, 1)
     lastIterSize = sizePerBlock % copyPerIter;
   }
 
-  const size_t chanOffset = (nRanksPerNode - 1) * blockIdx.x * 2;
+  const size_t chanOffset = (nRanksPerIpcDomain - 1) * blockIdx.x * 2;
   auto memoryChans = memoryChannels + chanOffset;
-  __shared__ DeviceHandle<BaseMemoryChannel> channels[(MAX_NRANKS_PER_NODE - 1) * 2];
+  __shared__ DeviceHandle<BaseMemoryChannel> channels[(MAX_IPC_DOMAIN_NRANKS - 1) * 2];
   const int lid = threadIdx.x % WARP_SIZE;
-  if (lid < nPeers * 2) {
-    channels[lid] = memoryChans[lid];
+  // Peer count may exceed WARP_SIZE on MNNVL.
+  for (int i = lid; i < nPeers * 2; i += WARP_SIZE) {
+    channels[i] = memoryChans[i];
   }
   __syncwarp();
   for (int it = 0; it < nIter; it++) {
     const size_t iterSize = (it == nIter - 1) ? lastIterSize : copyPerIter;
     if (warpId < endCopyWid) {
       int tidInCopy = threadIdx.x;
-      for (int i = 0; i < nRanksPerNode; i++) {
+      for (int i = 0; i < nRanksPerIpcDomain; i++) {
         size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter;
         size_t offsetScratch =
             i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock;
@@ -96,7 +97,7 @@ __global__ void __launch_bounds__(1024, 1)
         channels[tidInRecvCopy + nPeers].wait();
       }
       asm volatile("bar.sync %0, %1;" ::"r"(3), "r"((NRECV_COPY_WARPS)*WARP_SIZE) : "memory");
-      for (int i = 0; i < nRanksPerNode; i++) {
+      for (int i = 0; i < nRanksPerIpcDomain; i++) {
         size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter;
         size_t offsetScratch =
             i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock;
@@ -113,7 +114,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct NvlsWarpPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
@@ -129,17 +130,20 @@ struct NvlsWarpPipelineAdapter {
 #endif
       {
         using ChannelType = DeviceHandle<BaseMemoryChannel>;
-        allreduceNvlsWarpPipeline<T>
-            <<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
-                                                       nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode);
+        allreduceNvlsWarpPipeline<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+            input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank,
+            nRanksPerIpcDomain);
         return cudaGetLastError();
       }
   }
 };
 
 void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr<Communicator> comm) {
-  nSwitchChannels_ = 8;
-  int nBaseChannels = 64;
+  nSwitchChannels_ = NUM_NVLS_CONNECTION;
+  fp8NvlsSupported_ = isFp8NvlsSupported();
+  int nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
+  // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * nRanksPerIpcDomain.
+  int nBaseChannels = std::max(64, 8 * nRanksPerIpcDomain);
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores =
@@ -155,6 +159,10 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc(
     ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
     [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
+  if (isNativeFp8DataType(dtype) && !fp8NvlsSupported_) {
+    WARN("FP8 NVLS allreduce requires device support for FP8 multimem reduction.");
+    return CommResult::CommInvalidArgument;
+  }
   AllreduceFunc allreduce = dispatch<NvlsWarpPipelineAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
@@ -162,11 +170,11 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc(
   }
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = {ctx->nRanksPerNode * 4, 1024};
+    blockAndThreadNum = {ctx->nRanksPerIpcDomain * 4, 1024};
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                 ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
-                                ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0,
+                                ctx->rank, ctx->nRanksPerIpcDomain, ctx->worldSize, inputSize, stream, nullptr, 0, 0,
                                 blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN("AllreduceNvlsWarpPipeline failed with error: %s", cudaGetErrorString(error));
@@ -183,12 +191,12 @@ std::shared_ptr<void> AllreduceNvlsWarpPipeline::initAllreduceContext(std::share
                                                                       void*, size_t, DataType) {
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup channels
   ctx->switchChannels =
-      setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_);
+      setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_);
   ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
   return ctx;
 }
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index e7f2028f..57858c17 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -6,25 +6,25 @@
 #include "allreduce/allreduce_nvls_zero_copy.hpp"
 #include "allreduce/common.hpp"
 #include "collective_utils.hpp"
-#include "debug.h"
+#include "logger.hpp"
 
 namespace mscclpp {
 namespace collective {
 
 constexpr int MAX_NBLOCKS = 32;
 
-template <typename T>
+template <typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceNvls([[maybe_unused]] mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>* memoryChannels,
                   [[maybe_unused]] mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicast,
                   [[maybe_unused]] mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastOut,
                   [[maybe_unused]] size_t channelInOffset, [[maybe_unused]] size_t channelOutOffset,
-                  [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) {
+                  [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerIpcDomain) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
-  int nPeers = nRanksPerNode - 1;
+  int nPeers = nRanksPerIpcDomain - 1;
   int nBlocks = gridDim.x;
   int bid = blockIdx.x;
-  size_t sizePerRank = size / nRanksPerNode;
+  size_t sizePerRank = size / nRanksPerIpcDomain;
   const size_t minAlign = 16;
   // Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore
   size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks;
@@ -40,12 +40,13 @@ __global__ void __launch_bounds__(1024, 1)
   mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastPtr = multicast + bid;
   mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastOutPtr = multicastOut + bid;
 
-  const size_t chanOffset = (nRanksPerNode - 1) * blockIdx.x;
+  const size_t chanOffset = (nRanksPerIpcDomain - 1) * blockIdx.x;
   auto memoryChans = memoryChannels + chanOffset;
-  __shared__ mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
+  __shared__ mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = threadIdx.x % WARP_SIZE;
-  if (lid < nRanksPerNode - 1) {
-    channels[lid] = memoryChans[lid];
+  // Peer count may exceed WARP_SIZE on MNNVL.
+  for (int i = lid; i < nRanksPerIpcDomain - 1; i += WARP_SIZE) {
+    channels[i] = memoryChans[i];
   }
   __syncwarp();
   if (threadIdx.x < nPeers) {
@@ -56,8 +57,8 @@ __global__ void __launch_bounds__(1024, 1)
   T* src = (T*)multicastPtr->mcPtr;
   T* dst = (T*)multicastOutPtr->mcPtr;
   if (curBlockSize > 0) {
-    handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, curBlockSize,
-                               threadIdx.x, blockDim.x);
+    handleMultiLoadReduceStore<T, AccumT>(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset,
+                                          curBlockSize, threadIdx.x, blockDim.x);
   }
   __syncthreads();
   if (threadIdx.x < nPeers) {
@@ -72,7 +73,7 @@ struct NvlsAdapter {
   static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*,
                           mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
                           mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsOutChannels, size_t channelInOffset,
-                          size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t inputSize,
+                          size_t channelOutOffset, size_t, int rank, int nRanksPerIpcDomain, int, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
@@ -80,17 +81,11 @@ struct NvlsAdapter {
     } else if constexpr (std::is_same_v<T, __fp8_e4m3b15>) {
       // fp8_e4m3b15 is a software-only type with no hardware NVLS support.
       return cudaErrorNotSupported;
-    } else
-#if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000)
-        if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
-      return cudaErrorNotSupported;
-    } else
-#endif
-    {
+    } else {
       using ChannelType = DeviceHandle<mscclpp::BaseMemoryChannel>;
-      allreduceNvls<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels,
-                                                                 nvlsOutChannels, channelInOffset, channelOutOffset,
-                                                                 inputSize, rank, nRanksPerNode);
+      allreduceNvls<T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+          (ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, inputSize,
+          rank, nRanksPerIpcDomain);
       return cudaGetLastError();
     }
   }
@@ -102,6 +97,7 @@ void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm) {
   cudaDeviceProp deviceProp;
   MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device));
   computeCapabilityMajor_ = deviceProp.major;
+  fp8NvlsSupported_ = isFp8NvlsSupported();
   nSwitchChannels_ = 32;
   this->conns_ = setupConnections(comm);
   // setup semaphores
@@ -120,13 +116,17 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
                                               [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras,
                                               mscclpp::DataType accumDtype) {
   if (!symmetricMemory_) {
-    WARN("AllreduceNvls requires symmetric memory for now.");
+    WARN(ALGO, "AllreduceNvls requires symmetric memory.");
     return CommResult::CommInvalidArgument;
   }
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
+  if (isNativeFp8DataType(dtype) && !fp8NvlsSupported_) {
+    WARN(ALGO, "FP8 NVLS allreduce requires device support for FP8 multimem reduction.");
+    return CommResult::CommInvalidArgument;
+  }
   AllreduceFunc allreduce = dispatch<NvlsAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
-    WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
+    WARN(ALGO, "Unsupported operation or data type for allreduce, dtype=", static_cast<int>(dtype));
     return CommResult::CommInvalidArgument;
   }
   size_t sendBytes, recvBytes;
@@ -142,24 +142,28 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
   }
   std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
   if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) {
-    numBlocksAndThreads = {::min(ctx->nRanksPerNode, MAX_NBLOCKS), 1024};
+    numBlocksAndThreads = {::min(ctx->nRanksPerIpcDomain, MAX_NBLOCKS), 1024};
     // For GB200 devices with MNNVLS (Multi-Node NVLink Sharp), scale the number of blocks inversely with
     // the number of GPUs. Empirically, 32 blocks works well for 4 GPUs and 16 for 8 GPUs, which
     // follows the formula 128 / nGPUs, clamped to [1, MAX_NBLOCKS].
     if (computeCapabilityMajor_ == 10) {
-      numBlocksAndThreads.first = ::max(1, ::min(128 / ctx->workSize, MAX_NBLOCKS));
+      numBlocksAndThreads.first = ::max(1, ::min(128 / ctx->worldSize, MAX_NBLOCKS));
     }
   }
   if (numBlocksAndThreads.first > MAX_NBLOCKS) {
-    WARN("Number of blocks exceeds maximum supported value of %d", MAX_NBLOCKS);
+    WARN(ALGO, "Number of blocks exceeds maximum supported value of ", MAX_NBLOCKS);
     return CommResult::CommInvalidArgument;
   }
-  cudaError_t error =
-      allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels,
-                nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize,
-                inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
+  cudaError_t error = allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr,
+                                nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank,
+                                ctx->nRanksPerIpcDomain, ctx->worldSize, inputSize, stream, nullptr, 0, 0,
+                                numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
-    WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error));
+    if (error == cudaErrorNotSupported) {
+      WARN(ALGO, "AllreduceNvls does not support the requested data type.");
+      return CommResult::CommInvalidArgument;
+    }
+    WARN(ALGO, "AllreduceNvls failed with error: ", cudaGetErrorString(error));
     return CommResult::CommUnhandledCudaError;
   }
   return CommResult::CommSuccess;
@@ -179,20 +183,24 @@ std::shared_ptr<void> AllreduceNvls::initAllreduceContext(std::shared_ptr<mscclp
                                                           const void* input, void* output, size_t, mscclpp::DataType) {
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   size_t sendBytes, recvBytes;
   CUdeviceptr sendBasePtr, recvBasePtr;
   MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
   MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
+  INFO(ALGO, "AllreduceNvls init context: rank=", ctx->rank, ", sendBasePtr=", (void*)sendBasePtr,
+       ", recvBasePtr=", (void*)recvBasePtr, ", sendBytes=", sendBytes, ", recvBytes=", recvBytes,
+       ", inputOffset=", (char*)input - (char*)sendBasePtr, ", outputOffset=", (char*)output - (char*)recvBasePtr);
 
   // setup channels
-  ctx->switchChannels = setupNvlsChannels(this->nvlsConnections_, (void*)sendBasePtr, sendBytes, nSwitchChannels_);
+  ctx->switchChannels =
+      setupNvlsChannels(comm, this->nvlsConnections_, (void*)sendBasePtr, sendBytes, nSwitchChannels_);
   if (input != output) {
     auto nvlsOutConnections = this->nvlsOutConnections_;
     std::vector<mscclpp::SwitchChannel> outChannels =
-        setupNvlsChannels(this->nvlsOutConnections_, (void*)recvBasePtr, recvBytes, nSwitchChannels_);
+        setupNvlsChannels(comm, this->nvlsOutConnections_, (void*)recvBasePtr, recvBytes, nSwitchChannels_);
     ctx->switchChannels.insert(ctx->switchChannels.end(), outChannels.begin(), outChannels.end());
   }
 
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index 414c2b1f..696bdc3d 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -15,7 +15,7 @@ namespace collective {
 template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels,
-                    size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize,
+                    size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize,
                     size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff
 #if defined(ENABLE_NPKIT)
                     ,
@@ -23,9 +23,6 @@ __global__ void __launch_bounds__(1024, 1)
 #else
     ) {
 #endif
-  // This version of allreduce only works for single nodes
-  if (worldSize != nRanksPerNode) return;
-
 #if defined(ENABLE_NPKIT)
   extern __shared__ int4 NpkitSharedMem[];
   NpKitEvent* event_buffer = (NpKitEvent*)((char*)NpkitSharedMem);
@@ -56,7 +53,7 @@ __global__ void __launch_bounds__(1024, 1)
   else
     nelems = nelems / (sizeof(int) / sizeof(T));
 
-  const int nPeers = nRanksPerNode - 1;
+  const int nPeers = nRanksPerIpcDomain - 1;
   const size_t nPkts = nelems / 2;
 
   uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
@@ -81,10 +78,11 @@ __global__ void __launch_bounds__(1024, 1)
   uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));
 
   // Put channels into shared memory, read channel info from global memory is unexpectable slow.
-  __shared__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
+  __shared__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = tid % WARP_SIZE;
-  if (lid < nPeers) {
-    channels[lid] = memoryChannels[lid];
+  // Peer count may exceed WARP_SIZE on MNNVL.
+  for (int i = lid; i < nPeers; i += WARP_SIZE) {
+    channels[i] = memoryChannels[i];
   }
   __syncwarp();
   // step 1: write to scratch buffer
@@ -156,31 +154,32 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct PacketAdapter {
   static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
-                          size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize,
                           cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff,
                           int nBlocks = 0, int nThreadsPerBlock = 0) {
     using ChannelType = DeviceHandle<MemoryChannel>;
     const size_t nelems = inputSize / sizeof(T);
-    // Optimize the number of blocks to be multiple of (worldSize - 1)
-    nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1);
+    // Optimize the number of blocks to be multiple of the IPC-domain peer count.
+    const int nPeers = nRanksPerIpcDomain - 1;
+    nBlocks = nBlocks / nPeers * nPeers;
 #if defined(ENABLE_NPKIT)
     size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS;
     allreducePacket<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, sharedMemSize, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
-        nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(),
-        NpKit::GetCpuTimestamp());
+        nRanksPerIpcDomain, worldSize, nelems, flags, flagBufferSize, numScratchBuff,
+        NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
 #else
     allreducePacket<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
-        nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff);
+        nRanksPerIpcDomain, worldSize, nelems, flags, flagBufferSize, numScratchBuff);
 #endif
     return cudaGetLastError();
   }
 };
 
-inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int nRanksPerNode, int worldSize,
+inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int nRanksPerIpcDomain, int worldSize,
                                                           [[maybe_unused]] DataType dtype) {
-  int nBlocks = (nRanksPerNode - 1) * 4;
+  int nBlocks = (nRanksPerIpcDomain - 1) * 4;
   int nThreadsPerBlock = 1024;
   if (inputSize >= 32768) {
     nBlocks = (worldSize - 1) * 8;
@@ -198,12 +197,7 @@ inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int
 
   // FP8-specific tuning for 32KB-256KB range
   {
-    bool isFp8 = dtype == DataType::FLOAT8_E4M3B15;
-#if defined(__FP8_TYPES_EXIST__)
-    isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ ||
-            dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ;
-#endif
-    if (isFp8) {
+    if (isFp8DataType(dtype)) {
       if (inputSize < (64 << 10)) {
         nThreadsPerBlock = 64;
       } else if (inputSize >= (64 << 10) && inputSize <= (128 << 10)) {
@@ -231,9 +225,19 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
                                                 const std::unordered_map<std::string, uintptr_t>&,
                                                 DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
+  if (ctx->worldSize != ctx->nRanksPerIpcDomain) {
+    WARN(ALGO, "AllreducePacket requires worldSize to match nRanksPerIpcDomain, got worldSize=", ctx->worldSize,
+         ", nRanksPerIpcDomain=", ctx->nRanksPerIpcDomain);
+    return CommResult::CommInvalidArgument;
+  }
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->workSize, ctx->nRanksPerNode, dtype);
+    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->nRanksPerIpcDomain, ctx->worldSize, dtype);
+  } else {
+    const int nPeers = ctx->nRanksPerIpcDomain - 1;
+    if (blockAndThreadNum.first < nPeers) {
+      return CommResult::CommInvalidArgument;
+    }
   }
   if (blockAndThreadNum.first > maxBlockNum_) {
     WARN(ALGO, "Requested block number ", blockAndThreadNum.first, " exceeds the maximum supported block number ",
@@ -261,8 +265,8 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr,
-                channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize,
-                stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_,
+                channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerIpcDomain, ctx->worldSize,
+                inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_,
                 blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "AllreducePacket failed with error: ", cudaGetErrorString(error));
@@ -276,8 +280,8 @@ std::shared_ptr<void> AllreducePacket::initAllreduceContext(std::shared_ptr<Comm
   auto ctx = std::make_shared<AlgorithmCtx>();
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
index db471b93..2d6d4dec 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
+#include <type_traits>
+
 #include "allreduce/allreduce_rsag.hpp"
 #include "allreduce/common.hpp"
 #include "collective_utils.hpp"
@@ -28,21 +30,21 @@ namespace collective {
 //
 // Data is processed in int4-sized (16-byte) units for coalesced memory access,
 // with special handling for any remainder elements at the tail.
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAg(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
-                  DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank, int nRanksPerNode,
+                  DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank, int nRanksPerIpcDomain,
                   int worldSize, size_t nelems) {
   int blockId = blockIdx.x;
-  uint32_t nPeers = nRanksPerNode - 1;
+  uint32_t nPeers = nRanksPerIpcDomain - 1;
 
   assert((uintptr_t)buff % sizeof(int4) == 0);
   assert((uintptr_t)resultBuff % sizeof(int4) == 0);
 
   constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T);
-  uint32_t alignedNelems = ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 *
-                           nelemsPerInt4 * nRanksPerNode;
-  uint32_t nelemsPerRank = alignedNelems / nRanksPerNode;
+  uint32_t alignedNelems = ((nelems + nRanksPerIpcDomain - 1) / nRanksPerIpcDomain + nelemsPerInt4 - 1) /
+                           nelemsPerInt4 * nelemsPerInt4 * nRanksPerIpcDomain;
+  uint32_t nelemsPerRank = alignedNelems / nRanksPerIpcDomain;
   uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4;
   uint32_t lastInt4Index = nelems / nelemsPerInt4;
   uint32_t remainder = nelems % nelemsPerInt4;
@@ -51,6 +53,7 @@ __global__ void __launch_bounds__(1024, 1)
   int4* resultBuff4 = reinterpret_cast<int4*>((char*)resultBuff);
   int4* buff4 = reinterpret_cast<int4*>((char*)buff);
   DeviceHandle<BaseMemoryChannel>* memoryChannelsLocal = memoryChannels + blockId * nPeers;
+  using AccumVec = std::conditional_t<std::is_same_v<T, AccumT>, int4, mscclpp::VectorType<AccumT, nelemsPerInt4>>;
 
   uint32_t nInt4PerBlock = nInt4PerRank / gridDim.x;
   uint32_t remainderForBlock = nInt4PerRank % gridDim.x;
@@ -59,7 +62,7 @@ __global__ void __launch_bounds__(1024, 1)
     nInt4PerBlock += remainderForBlock;
   }
   if (nInt4PerBlock == 0) return;
-  uint32_t nInt4ForCopy = nInt4PerBlock * nRanksPerNode;
+  uint32_t nInt4ForCopy = nInt4PerBlock * nRanksPerIpcDomain;
 
   for (uint32_t idx = threadIdx.x; idx < nInt4ForCopy; idx += blockDim.x) {
     int rankIdx = idx / nInt4PerBlock;
@@ -82,15 +85,16 @@ __global__ void __launch_bounds__(1024, 1)
   for (uint32_t idx = threadIdx.x; idx < nInt4PerBlock; idx += blockDim.x) {
     uint32_t offset = idx + offset4 + rank * nInt4PerRank;
     if (offset > lastInt4Index) continue;
-    int4 tmp = scratch4[offset];
+    AccumVec acc = mscclpp::upcastVector<T, AccumT, AccumVec>(scratch4[offset]);
     for (uint32_t i = 0; i < nPeers; i++) {
-      int rankIdx = (rank + i + 1) % nRanksPerNode;
+      int rankIdx = (rank + i + 1) % nRanksPerIpcDomain;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       int4 data = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
-      tmp = calVector<T, OpType>(data, tmp);
+      acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccumVec>(acc, data);
     }
+    int4 tmp = mscclpp::downcastVector<T, AccumT, int4>(acc);
     for (uint32_t i = 0; i < nPeers; i++) {
-      int rankIdx = (rank + i + 1) % nRanksPerNode;
+      int rankIdx = (rank + i + 1) % nRanksPerIpcDomain;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       mscclpp::write<int4>(((void**)remoteMemories)[peerIdx], offset, tmp);
     }
@@ -127,17 +131,17 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream,
-                          void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
+                          size_t, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize,
+                          cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<BaseMemoryChannel>;
     size_t nelems = inputSize / sizeof(T);
     if (nBlocks == 0 || nThreadsPerBlock == 0) {
       nThreadsPerBlock = 1024;
       nBlocks = 64;
     }
-    allreduceRsAg<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+    allreduceRsAg<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank,
-        nRanksPerNode, worldSize, nelems);
+        nRanksPerIpcDomain, worldSize, nelems);
     return cudaGetLastError();
   }
 };
@@ -179,9 +183,13 @@ CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr<void> ctx, c
     return CommResult::CommInvalidArgument;
   }
   std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
+  if (numBlocksAndThreads.first > nChannelsPerConnection_) {
+    WARN(ALGO, "Block number ", numBlocksAndThreads.first, " exceeds the maximum limit ", nChannelsPerConnection_);
+    return CommResult::CommInvalidArgument;
+  }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(),
                                 this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, 0, algoCtx->rank,
-                                algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0, 0,
+                                algoCtx->nRanksPerIpcDomain, algoCtx->worldSize, inputSize, stream, nullptr, 0, 0,
                                 numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error));
@@ -198,8 +206,8 @@ std::shared_ptr<void> AllreduceRsAg::initAllreduceContext(std::shared_ptr<Commun
                                                           size_t, DataType) {
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
index eabe3dc5..4b243444 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
@@ -86,15 +86,15 @@ template <ReduceOp OpType, typename T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAgPipeline(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
                           DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank,
-                          int nRanksPerNode, int worldSize, size_t nelems, size_t scratchSize, uint32_t nblocksForPut,
-                          uint32_t nblocksForReduce, uint32_t nblocksForRecv) {
+                          int nRanksPerIpcDomain, int worldSize, size_t nelems, size_t scratchSize,
+                          uint32_t nblocksForPut, uint32_t nblocksForReduce, uint32_t nblocksForRecv) {
   uint32_t bid = blockIdx.x;
   constexpr uint32_t nStepsPerIter = 4;
   uint32_t nInt4 = (nelems * sizeof(T) + sizeof(int4) - 1) / sizeof(int4);
   uint32_t nInt4PerIter = nblocksForReduce * blockDim.x * nStepsPerIter;
   const uint32_t chunkSize = nInt4PerIter * worldSize;
   uint32_t nIters = (nInt4 + chunkSize - 1) / chunkSize;
-  uint32_t nPeers = nRanksPerNode - 1;
+  uint32_t nPeers = nRanksPerIpcDomain - 1;
   int4* scratch4 = reinterpret_cast<int4*>((char*)scratch);
   const uint32_t scratchIterStride = 2 * chunkSize;  // one for AS, one for AG
   const uint32_t pipelineDepth = scratchSize / sizeof(int4) / scratchIterStride;
@@ -111,7 +111,7 @@ __global__ void __launch_bounds__(1024, 1)
       __syncthreads();
       uint32_t threadIdInPut = bid * blockDim.x + threadIdx.x;
       for (uint32_t peer = 0; peer < nPeers; peer++) {
-        int remoteRankId = (rank + peer + 1) % nRanksPerNode;
+        int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain;
         int peerId = remoteRankId < rank ? remoteRankId : remoteRankId - 1;
         // Read chunk[remoteRankId] from local buff, write to peer's scratch[rank] (sender's slot)
         uint32_t srcOffset = iter * chunkSize + remoteRankId * nInt4PerIter;
@@ -164,7 +164,7 @@ __global__ void __launch_bounds__(1024, 1)
         int4 tmp = loadVec(buff, myChunkOffset, nelems);
         // Add data from each peer's slot in scratch (peer sent their chunk[rank] to our scratch[peer])
         for (uint32_t peer = 0; peer < nPeers; peer++) {
-          int remoteRankId = (rank + peer + 1) % nRanksPerNode;
+          int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain;
           uint32_t peerSlotOffset =
               baseOffset + remoteRankId * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut;
           int4 data = scratch4[peerSlotOffset];
@@ -175,7 +175,7 @@ __global__ void __launch_bounds__(1024, 1)
         uint32_t dstOffset =
             baseOffset + chunkSize + rank * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut;
         for (uint32_t i = 0; i < nPeers; i++) {
-          int peerIdx = (rank + i + 1) % nRanksPerNode;
+          int peerIdx = (rank + i + 1) % nRanksPerIpcDomain;
           int index = peerIdx < rank ? peerIdx : peerIdx - 1;
           mscclpp::write<int4>(((void**)remoteMemories)[index], dstOffset, tmp);
         }
@@ -203,7 +203,7 @@ __global__ void __launch_bounds__(1024, 1)
       __syncthreads();
       // Copy other ranks' reduced chunks from scratch to result
       for (uint32_t peer = 0; peer < nPeers; peer++) {
-        int remoteRankId = (rank + peer + 1) % nRanksPerNode;
+        int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain;
         for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) {
           uint32_t offset = baseOffset + chunkSize + remoteRankId * nInt4PerIter + threadIdInRecv +
                             step * blockDim.x * nblocksForRecv;
@@ -224,7 +224,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t scratchSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize,
+                          size_t scratchSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<BaseMemoryChannel>;
     size_t nelems = inputSize / sizeof(T);
@@ -248,7 +248,7 @@ struct AllreduceRsAgPipelineAdapter {
     }
     allreduceRsAgPipeline<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank,
-        nRanksPerNode, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv);
+        nRanksPerIpcDomain, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv);
     return cudaGetLastError();
   }
 };
@@ -288,8 +288,8 @@ CommResult AllreduceRsAgPipeline::allreduceKernelFunc(
   std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(),
                                 this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, this->scratchBufferSize_,
-                                algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0,
-                                0, numBlocksAndThreads.first, numBlocksAndThreads.second);
+                                algoCtx->rank, algoCtx->nRanksPerIpcDomain, algoCtx->worldSize, inputSize, stream,
+                                nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error));
     return CommResult::CommUnhandledCudaError;
@@ -305,8 +305,8 @@ std::shared_ptr<void> AllreduceRsAgPipeline::initAllreduceContext(std::shared_pt
                                                                   void*, size_t, DataType) {
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index f95ba7e3..e7ed0cab 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -35,10 +35,10 @@ __device__ mscclpp::DeviceSyncer globalSyncer;
 //
 // This approach requires registering both input and output buffers as remote
 // memories (2 * nPeers handles), but avoids scratch buffer allocation and
-// the extra copy steps of the standard RSAG. The NRanksPerNode template
+// the extra copy steps of the standard RSAG. The NRanks template
 // parameter enables compile-time unrolling of peer loops (supports 4 or 8).
 
-template <int NRanksPerNode, ReduceOp OpType, typename T, typename AccumT = T>
+template <int NRanks, ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
                           DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank, int worldSize,
@@ -48,12 +48,12 @@ __global__ void __launch_bounds__(1024, 1)
   assert((uintptr_t)buff % sizeof(int4) == 0);
   assert((uintptr_t)resultBuff % sizeof(int4) == 0);
 
-  constexpr int NPeers = NRanksPerNode - 1;
+  constexpr int NPeers = NRanks - 1;
   constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T);
-  const uint32_t outputRemoteBufferOffset = NRanksPerNode - 1;
-  uint32_t alignedNelems = ((nelems + NRanksPerNode - 1) / NRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 *
-                           nelemsPerInt4 * NRanksPerNode;
-  uint32_t nelemsPerRank = alignedNelems / NRanksPerNode;
+  constexpr uint32_t outputRemoteBufferOffset = NPeers;
+  uint32_t alignedNelems =
+      ((nelems + NRanks - 1) / NRanks + nelemsPerInt4 - 1) / nelemsPerInt4 * nelemsPerInt4 * NRanks;
+  uint32_t nelemsPerRank = alignedNelems / NRanks;
   uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4;
   uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4;
 
@@ -69,7 +69,7 @@ __global__ void __launch_bounds__(1024, 1)
   }
   if (nInt4PerBlock == 0) return;
 
-  if (threadIdx.x < NPeers) {
+  if ((int)threadIdx.x < NPeers) {
     memoryChannelsLocal[threadIdx.x].relaxedSignal();
     memoryChannelsLocal[threadIdx.x].relaxedWait();
   }
@@ -86,18 +86,19 @@ __global__ void __launch_bounds__(1024, 1)
     int4 tmp_raw = buff4[offset];
 #pragma unroll
     for (int i = 0; i < NPeers; i++) {
-      int rankIdx = (rank + i + 1) % NRanksPerNode;
+      int rankIdx = (rank + i + 1) % NRanks;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       data[i] = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
     }
     AccumVec acc = mscclpp::upcastVector<T, AccumT, AccumVec>(tmp_raw);
+#pragma unroll
     for (int i = 0; i < NPeers; i++) {
       acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccumVec>(acc, data[i]);
     }
     int4 tmp = mscclpp::downcastVector<T, AccumT, int4>(acc);
 #pragma unroll
     for (int i = 0; i < NPeers; i++) {
-      int rankIdx = (rank + i + 1) % NRanksPerNode;
+      int rankIdx = (rank + i + 1) % NRanks;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       mscclpp::write<int4>(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp);
     }
@@ -105,7 +106,7 @@ __global__ void __launch_bounds__(1024, 1)
   }
   // Use device barrier gives better performance here.
   globalSyncer.sync(gridDim.x);
-  if (blockIdx.x == 0 && threadIdx.x < NPeers) {
+  if (blockIdx.x == 0 && (int)threadIdx.x < NPeers) {
     memoryChannelsLocal[threadIdx.x].signal();
     memoryChannelsLocal[threadIdx.x].wait();
   }
@@ -115,8 +116,8 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgZeroCopyAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream,
-                          void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
+                          size_t, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize,
+                          cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<BaseMemoryChannel>;
     size_t nelems = inputSize / sizeof(T);
     if (nBlocks == 0 || nThreadsPerBlock == 0) {
@@ -126,16 +127,17 @@ struct AllreduceRsAgZeroCopyAdapter {
         nBlocks = 128;
       }
     }
-    if (nRanksPerNode == 4) {
+    if (nRanksPerIpcDomain == 4) {
       allreduceRsAgZeroCopy<4, OpType, T, AccumT>
           <<<nBlocks, nThreadsPerBlock, 0, stream>>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels,
                                                      switchChannel, remoteMemories, rank, worldSize, nelems);
-    } else if (nRanksPerNode == 8) {
+    } else if (nRanksPerIpcDomain == 8) {
       allreduceRsAgZeroCopy<8, OpType, T, AccumT>
           <<<nBlocks, nThreadsPerBlock, 0, stream>>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels,
                                                      switchChannel, remoteMemories, rank, worldSize, nelems);
     } else {
-      THROW(ALGO, Error, ErrorCode::InvalidUsage, "Unsupported number of ranks per node: ", nRanksPerNode);
+      WARN(ALGO, "AllreduceRsAgZeroCopy only supports nRanksPerIpcDomain of 4 or 8, got: ", nRanksPerIpcDomain);
+      return cudaErrorInvalidValue;
     }
     return cudaGetLastError();
   }
@@ -164,11 +166,19 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr<void
     return CommResult::CommInvalidArgument;
   }
   std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
+  if (numBlocksAndThreads.first > nChannelsPerConnection_) {
+    WARN(ALGO, "Block number ", numBlocksAndThreads.first, " exceeds the maximum limit ", nChannelsPerConnection_);
+    return CommResult::CommInvalidArgument;
+  }
   cudaError_t error =
       allreduce(input, nullptr, output, this->baseMemoryChannelHandles_.get(), algoCtx->remoteMemoryHandles.get(),
-                nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream,
-                nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
+                nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerIpcDomain, algoCtx->worldSize, inputSize,
+                stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
+    if (error == cudaErrorInvalidValue) {
+      WARN(ALGO, "AllreduceRsAgZeroCopy received invalid launch arguments: ", cudaGetErrorString(error));
+      return CommResult::CommInvalidArgument;
+    }
     WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error));
     return CommResult::CommUnhandledCudaError;
   }
@@ -193,16 +203,14 @@ std::shared_ptr<void> AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt
                                                                   void* output, size_t size, DataType) {
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
-  ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->worldSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   ctx->memorySemaphores = this->semaphores_;
 
   // register input and output memories
   RegisteredMemory inputMemory = comm->registerMemory((void*)input, size, Transport::CudaIpc);
   RegisteredMemory outputMemory = comm->registerMemory(output, size, Transport::CudaIpc);
-  this->inputMemories_.push_back(inputMemory);
-  this->outputMemories_.push_back(outputMemory);
 
   auto remoteInputMemories = setupRemoteMemories(comm, ctx->rank, inputMemory);
   auto remoteOutputMemories = setupRemoteMemories(comm, ctx->rank, outputMemory);
diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cu
similarity index 77%
rename from src/ext/collectives/collective_utils.cc
rename to src/ext/collectives/collective_utils.cu
index 016c4a5c..6d4b005a 100644
--- a/src/ext/collectives/collective_utils.cc
+++ b/src/ext/collectives/collective_utils.cu
@@ -1,16 +1,93 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
-#include "collective_utils.hpp"
-
 #include <algorithm>
 #include <mscclpp/algorithm.hpp>
 #include <mscclpp/core.hpp>
+#include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/memory_channel.hpp>
 #include <mscclpp/switch_channel.hpp>
 
+#include "collective_utils.hpp"
+
 namespace mscclpp {
 namespace collective {
+
+namespace {
+
+#if !defined(MSCCLPP_DEVICE_HIP)
+__global__ void fp8NvlsSupportProbeKernel(int* supported) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
+    (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__))
+  *supported = 1;
+#else
+  *supported = 0;
+#endif
+}
+
+bool detectFp8NvlsSupport() {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  auto supportedDevice = mscclpp::detail::gpuCallocUnique<int>();
+  int supportedHost = 0;
+  auto stream = gpuStreamPool()->getStream();
+
+  fp8NvlsSupportProbeKernel<<<1, 1, 0, stream>>>(supportedDevice.get());
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    return false;
+  }
+
+  MSCCLPP_CUDATHROW(
+      cudaMemcpyAsync(&supportedHost, supportedDevice.get(), sizeof(supportedHost), cudaMemcpyDeviceToHost, stream));
+  err = cudaStreamSynchronize(stream);
+  if (err != cudaSuccess) {
+    (void)cudaGetLastError();
+    return false;
+  }
+  return supportedHost != 0;
+}
+#endif
+
+}  // namespace
+
+bool isFp8DataType(DataType dtype) {
+  return dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ || dtype == DataType::FLOAT8_E5M2 ||
+         dtype == DataType::FLOAT8_E5M2FNUZ || dtype == DataType::FLOAT8_E4M3B15;
+}
+
+bool isNativeFp8DataType(DataType dtype) {
+#if defined(__FP8_TYPES_EXIST__)
+#if defined(__FP8_E4M3_IS_FNUZ__)
+  if (dtype == DataType::FLOAT8_E4M3FNUZ) {
+    return true;
+  }
+#else
+  if (dtype == DataType::FLOAT8_E4M3FN) {
+    return true;
+  }
+#endif
+#if defined(__FP8_E5M2_IS_FNUZ__)
+  if (dtype == DataType::FLOAT8_E5M2FNUZ) {
+    return true;
+  }
+#else
+  if (dtype == DataType::FLOAT8_E5M2) {
+    return true;
+  }
+#endif
+#endif
+  return false;
+}
+
+bool isFp8NvlsSupported() {
+#if defined(MSCCLPP_DEVICE_HIP)
+  return false;
+#else
+  static const bool supported = detectFp8NvlsSupport();
+  return supported;
+#endif
+}
+
 std::vector<mscclpp::RegisteredMemory> setupRemoteMemories(std::shared_ptr<mscclpp::Communicator> comm, int rank,
                                                            mscclpp::RegisteredMemory localMemory) {
   std::vector<mscclpp::RegisteredMemory> remoteMemories;
@@ -98,7 +175,8 @@ std::vector<std::shared_ptr<mscclpp::NvlsConnection>> setupNvlsConnections(std::
   return nvlsConnections;
 }
 
-std::vector<mscclpp::SwitchChannel> setupNvlsChannels(std::vector<std::shared_ptr<mscclpp::NvlsConnection>> conns,
+std::vector<mscclpp::SwitchChannel> setupNvlsChannels(std::shared_ptr<mscclpp::Communicator> comm,
+                                                      std::vector<std::shared_ptr<mscclpp::NvlsConnection>> conns,
                                                       void* buffer, size_t bufferSize, int nSwitchChannels) {
   std::vector<mscclpp::SwitchChannel> channels;
 
@@ -107,6 +185,8 @@ std::vector<mscclpp::SwitchChannel> setupNvlsChannels(std::vector<std::shared_pt
     mscclpp::SwitchChannel switchChannel = nvlsConnection->bindAllocatedMemory((CUdeviceptr)buffer, bufferSize);
     channels.push_back(switchChannel);
   }
+  // Synchronize to make sure all ranks have their NVLS channels set up before any rank starts using them.
+  comm->bootstrap()->barrier();
   return channels;
 }
 
@@ -153,4 +233,4 @@ std::shared_ptr<mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>> setupBaseMemo
 
 }  // namespace collective
 
-}  // namespace mscclpp
\ No newline at end of file
+}  // namespace mscclpp
diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
index 64f5ec54..bba82ee5 100644
--- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
@@ -4,6 +4,7 @@
 #include <mscclpp/algorithm.hpp>
 
 #include "allreduce/common.hpp"
+#include "collective_utils.hpp"
 
 namespace mscclpp {
 namespace collective {
diff --git a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp
index a54352b3..e0c63a3d 100644
--- a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp
@@ -30,8 +30,6 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder {
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> inputScratchSemaphores_;
   std::vector<RegisteredMemory> remoteScratchMemories_;
   RegisteredMemory localScratchMemory_;
-  std::unordered_map<const void*, std::pair<std::vector<MemoryChannel>, std::shared_ptr<DeviceHandle<MemoryChannel>>>>
-      memoryChannelsMap_;
   bool symmetricMemory_ = false;
 };
 }  // namespace collective
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
index 81b74add..b408c64c 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
@@ -33,6 +33,7 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder {
   std::vector<BaseMemoryChannel> baseChannels_;
   std::vector<Connection> conns_;
   std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections_;
+  bool fp8NvlsSupported_{false};
 };
 }  // namespace collective
 }  // namespace mscclpp
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
index 8f02a873..2ce3a4fb 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
@@ -33,6 +33,7 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder {
   std::vector<BaseMemoryChannel> baseChannels_;
   std::vector<Connection> conns_;
   std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections_;
+  bool fp8NvlsSupported_{false};
 };
 }  // namespace collective
 }  // namespace mscclpp
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
index d53ea180..ec64e967 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
@@ -36,9 +36,10 @@ class AllreduceNvls : public AlgorithmBuilder {
   std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections_;
   std::vector<std::shared_ptr<NvlsConnection>> nvlsOutConnections_;
   int computeCapabilityMajor_{0};
+  bool fp8NvlsSupported_{false};
 };
 
 }  // namespace collective
 }  // namespace mscclpp
 
-#endif  // MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_
\ No newline at end of file
+#endif  // MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_
diff --git a/src/ext/collectives/include/allreduce/allreduce_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_packet.hpp
index de7ca471..771126c9 100644
--- a/src/ext/collectives/include/allreduce/allreduce_packet.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_packet.hpp
@@ -29,7 +29,7 @@ class AllreducePacket : public AlgorithmBuilder {
   void* scratchBuffer_;
   size_t scratchBufferSize_;
   const int nSegmentsForScratchBuffer_ = 2;
-  const int maxBlockNum_ = 56;
+  const int maxBlockNum_ = 112;
   std::vector<Connection> conns_;
   uintptr_t flagBuffer_;
   size_t flagBufferSize_;
@@ -37,4 +37,4 @@ class AllreducePacket : public AlgorithmBuilder {
   std::vector<RegisteredMemory> registeredMemories_;
 };
 }  // namespace collective
-}  // namespace mscclpp
\ No newline at end of file
+}  // namespace mscclpp
diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp
index 05bf2ef3..528d9708 100644
--- a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp
@@ -27,8 +27,6 @@ class AllreduceRsAgZeroCopy : public mscclpp::AlgorithmBuilder {
   int nChannelsPerConnection_;
   std::vector<Connection> conns_;
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> semaphores_;
-  std::vector<RegisteredMemory> inputMemories_;
-  std::vector<RegisteredMemory> outputMemories_;
 
   std::vector<BaseMemoryChannel> baseChannels_;
   std::shared_ptr<DeviceHandle<BaseMemoryChannel>> baseMemoryChannelHandles_;
diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp
index 93b18e26..5d593449 100644
--- a/src/ext/collectives/include/allreduce/common.hpp
+++ b/src/ext/collectives/include/allreduce/common.hpp
@@ -36,36 +36,46 @@ MSCCLPP_DEVICE_INLINE constexpr std::size_t calcVectorSize() {
   }
 }
 
-template <typename T>
+template <typename T, typename AccumT = T>
 MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t srcOffset, size_t dstOffset, size_t size,
                                                       int tid, int nThreads) {
-  // nvls can only handle 4 bytes alignment
-  MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned");
-  constexpr size_t nElem = calcVectorSize<T>();
-  // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations
-  constexpr size_t vecSize = (std::is_same_v<T, int> || std::is_same_v<T, int32_t> || std::is_same_v<T, unsigned int> ||
-                              std::is_same_v<T, uint32_t>)
-                                 ? 1
-                                 : nElem;
-  using vectorType = mscclpp::VectorType<T, vecSize>;
-  const size_t nVec = size / sizeof(vectorType);
-  const size_t srcOffset4 = srcOffset / sizeof(vectorType);
-  const size_t dstOffset4 = dstOffset / sizeof(vectorType);
-  vectorType* src4 = (vectorType*)src;
-  vectorType* dst4 = (vectorType*)dst;
-  for (size_t idx = tid; idx < nVec; idx += nThreads) {
-    auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce(src4 + srcOffset4 + idx);
-    mscclpp::SwitchChannelDeviceHandle::multimemStore(val, dst4 + dstOffset4 + idx);
-  }
-  // handle rest of data
-  size_t processed = nVec * sizeof(vectorType);
-  constexpr size_t nRestElem = 4 / sizeof(T);
-  using restVectorType = mscclpp::VectorType<T, nRestElem>;
-  const size_t startIdx = (srcOffset + processed) / sizeof(restVectorType);
-  const size_t endIdx = (srcOffset + size) / sizeof(restVectorType);
-  for (size_t idx = tid + startIdx; idx < endIdx; idx += nThreads) {
-    auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce((restVectorType*)src + idx);
-    mscclpp::SwitchChannelDeviceHandle::multimemStore(val, (restVectorType*)dst + idx);
+#if defined(__FP8_TYPES_EXIST__) && \
+    (!(defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000))
+  if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
+    assert(false && "FP8 NVLS multimem requires sm_100a or newer");
+    return;
+  } else
+#endif
+  {
+    // nvls can only handle 4 bytes alignment
+    MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned");
+    constexpr size_t nElem = calcVectorSize<T>();
+    // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations
+    constexpr size_t vecSize = (std::is_same_v<T, int> || std::is_same_v<T, int32_t> ||
+                                std::is_same_v<T, unsigned int> || std::is_same_v<T, uint32_t>)
+                                   ? 1
+                                   : nElem;
+    using vectorType = mscclpp::VectorType<T, vecSize>;
+    const size_t nVec = size / sizeof(vectorType);
+    const size_t srcOffset4 = srcOffset / sizeof(vectorType);
+    const size_t dstOffset4 = dstOffset / sizeof(vectorType);
+    vectorType* src4 = (vectorType*)src;
+    vectorType* dst4 = (vectorType*)dst;
+    for (size_t idx = tid; idx < nVec; idx += nThreads) {
+      auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce<vectorType, AccumT>(src4 + srcOffset4 + idx);
+      mscclpp::SwitchChannelDeviceHandle::multimemStore(val, dst4 + dstOffset4 + idx);
+    }
+    // handle rest of data
+    size_t processed = nVec * sizeof(vectorType);
+    constexpr size_t nRestElem = 4 / sizeof(T);
+    using restVectorType = mscclpp::VectorType<T, nRestElem>;
+    const size_t startIdx = (srcOffset + processed) / sizeof(restVectorType);
+    const size_t endIdx = (srcOffset + size) / sizeof(restVectorType);
+    for (size_t idx = tid + startIdx; idx < endIdx; idx += nThreads) {
+      auto val =
+          mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce<restVectorType, AccumT>((restVectorType*)src + idx);
+      mscclpp::SwitchChannelDeviceHandle::multimemStore(val, (restVectorType*)dst + idx);
+    }
   }
 }
 #endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index f705a9d1..7dd6158a 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -26,11 +26,22 @@ namespace mscclpp {
 
 namespace collective {
 constexpr int NUM_NVLS_CONNECTION = 8;
-constexpr int NUM_SEMAPHORES = 64;
+// Sized to cover MAX_IPC_DOMAIN_NRANKS-scale allreduce algos whose device-side
+// semaphore indices grow as O(nRanksPerIpcDomain) (e.g. nvls_block_pipeline uses
+// up to ~5 * nRanksPerIpcDomain entries).
+constexpr int NUM_SEMAPHORES = 512;
 
-constexpr int MAX_NRANKS_PER_NODE = 8;
+// Upper bound on the number of NVLink-reachable ranks that participate in a
+// single collective. Sized to cover Multi-Node NVLink (MNNVL) domains up to
+// GB200 NVL72 (72 GPUs sharing one NVLink fabric). Drives compile-time sizing
+// of shared-memory channel arrays in the allreduce/allgather kernels.
+constexpr int MAX_IPC_DOMAIN_NRANKS = 72;
 
-constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
+constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // Two 70 MiB buffers for double-buffered packet scratch space.
+
+bool isFp8DataType(DataType dtype);
+bool isNativeFp8DataType(DataType dtype);
+bool isFp8NvlsSupported();
 
 std::vector<RegisteredMemory> setupRemoteMemories(std::shared_ptr<Communicator> comm, int rank,
                                                   RegisteredMemory localMemory);
@@ -50,7 +61,8 @@ std::shared_ptr<DeviceHandle<MemoryChannel>> setupMemoryChannelDeviceHandles(
 std::vector<std::shared_ptr<NvlsConnection>> setupNvlsConnections(std::shared_ptr<Communicator> comm, size_t size,
                                                                   int numConnections);
 
-std::vector<SwitchChannel> setupNvlsChannels(std::vector<std::shared_ptr<NvlsConnection>> conns, void* buffer,
+std::vector<SwitchChannel> setupNvlsChannels(std::shared_ptr<Communicator> comm,
+                                             std::vector<std::shared_ptr<NvlsConnection>> conns, void* buffer,
                                              size_t bufferSize, int nSwitchChannels);
 
 std::shared_ptr<DeviceHandle<SwitchChannel>> setupNvlsChannelDeviceHandles(
@@ -71,8 +83,9 @@ std::shared_ptr<DeviceHandle<BaseMemoryChannel>> setupBaseMemoryChannelDeviceHan
 class AlgorithmCtx {
  public:
   int rank;
-  int workSize;
+  int worldSize;
   int nRanksPerNode;
+  int nRanksPerIpcDomain;
 
   std::vector<RegisteredMemory> registeredMemories;
   std::vector<MemoryChannel> memoryChannels;
@@ -89,4 +102,4 @@ class AlgorithmCtx {
 
 }  // namespace collective
 }  // namespace mscclpp
-#endif  // MSCCLPP_EXT_COLLECTIVE_UTILS_HPP_
\ No newline at end of file
+#endif  // MSCCLPP_EXT_COLLECTIVE_UTILS_HPP_
diff --git a/src/ext/nccl/CMakeLists.txt b/src/ext/nccl/CMakeLists.txt
index 9767e66f..463b7550 100644
--- a/src/ext/nccl/CMakeLists.txt
+++ b/src/ext/nccl/CMakeLists.txt
@@ -13,6 +13,7 @@ target_include_directories(mscclpp_nccl PRIVATE
     include 
     ${PROJECT_SOURCE_DIR}/include
     ${PROJECT_SOURCE_DIR}/src/core/include
+    ${PROJECT_SOURCE_DIR}/src/ext/collectives/include
     ${GPU_INCLUDE_DIRS}
 )
 target_link_libraries(mscclpp_nccl PUBLIC mscclpp mscclpp_collectives)
diff --git a/src/ext/nccl/algorithm_selector.cc b/src/ext/nccl/algorithm_selector.cc
index c94aab34..1ccac65d 100644
--- a/src/ext/nccl/algorithm_selector.cc
+++ b/src/ext/nccl/algorithm_selector.cc
@@ -6,6 +6,7 @@
 #include <mscclpp/env.hpp>
 #include <mscclpp/utils.hpp>
 
+#include "collective_utils.hpp"
 #include "debug.h"
 
 namespace mscclpp {
@@ -20,24 +21,15 @@ static bool isNvlsSupportedForDataType(const AlgorithmSelectorConfig& config, Da
     return false;
   }
 
-  const bool isFp8 = dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ ||
-                     dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ;
-
-  if (!isFp8) {
+  if (!collective::isFp8DataType(dtype)) {
     return nvlsSupported;
   }
 
-  // FP8 handling
 #if !defined(__HIP_PLATFORM_AMD__)
-  // NVLS does not support FP8 on devices with compute capability < 10
-  if (config.computeCapability.first < 10) {
+  if (!collective::isNativeFp8DataType(dtype)) {
     return false;
   }
-#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__))
-  return true;
-#else
-  return false;
-#endif
+  return nvlsSupported && config.fp8NvlsSupported;
 #else
   return nvlsSupported;
 #endif
diff --git a/src/ext/nccl/algorithm_selector.hpp b/src/ext/nccl/algorithm_selector.hpp
index c8705f8b..2048ea05 100644
--- a/src/ext/nccl/algorithm_selector.hpp
+++ b/src/ext/nccl/algorithm_selector.hpp
@@ -16,6 +16,7 @@ namespace nccl {
 struct AlgorithmSelectorConfig {
   bool symmetricMemory;
   bool nvlsSupported;
+  bool fp8NvlsSupported;
   bool isCuMemMapAllocated;
   bool inCaptureMode;
   std::pair<int, int> computeCapability;
diff --git a/src/ext/nccl/nccl.cc b/src/ext/nccl/nccl.cc
index 2d6c5f9d..8fcc1bb1 100644
--- a/src/ext/nccl/nccl.cc
+++ b/src/ext/nccl/nccl.cc
@@ -20,6 +20,7 @@
 #include <mscclpp/algorithm.hpp>
 
 #include "algorithm_selector.hpp"
+#include "collective_utils.hpp"
 #include "datatype_conversion.hpp"
 
 static constexpr auto MSCCLPP_NCCL = mscclpp::LogSubsys::NCCL;
@@ -239,6 +240,8 @@ static std::shared_ptr<mscclpp::Algorithm> algoSelector(
   static const bool isNvlsSupported = mscclpp::isNvlsSupported();
   static const std::pair<int, int> deviceComputeCapability = getDeviceComputeCapability();
   static const bool ncclSymmetricMemory = mscclpp::env()->ncclSymmetricMemory;
+  const bool fp8NvlsSupported =
+      mscclpp::collective::isNativeFp8DataType(request.dtype) ? mscclpp::collective::isFp8NvlsSupported() : false;
 
   const bool isCuMemMapAllocated = mscclpp::isCuMemMapAllocated(const_cast<void*>(request.inputBuffer)) &&
                                    mscclpp::isCuMemMapAllocated(request.outputBuffer);
@@ -249,6 +252,7 @@ static std::shared_ptr<mscclpp::Algorithm> algoSelector(
 
   mscclpp::nccl::AlgorithmSelectorConfig config{.symmetricMemory = ncclSymmetricMemory,
                                                 .nvlsSupported = isNvlsSupported,
+                                                .fp8NvlsSupported = fp8NvlsSupported,
                                                 .isCuMemMapAllocated = isCuMemMapAllocated,
                                                 .inCaptureMode = inCaptureMode,
                                                 .computeCapability = deviceComputeCapability,
diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc
index c28087a4..eb6985a8 100644
--- a/test/mp_unit/bootstrap_tests.cc
+++ b/test/mp_unit/bootstrap_tests.cc
@@ -127,6 +127,7 @@ class MPIBootstrap : public mscclpp::Bootstrap {
     MPI_Comm_size(shmcomm, &shmrank);
     return shmrank;
   }
+  int getNranksPerIpcDomain() const override { return getNranksPerNode(); }
   void allGather(void* sendbuf, int size) override {
     MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, sendbuf, size, MPI_BYTE, MPI_COMM_WORLD);
   }
diff --git a/test/unit/gpu_utils_tests.cc b/test/unit/gpu_utils_tests.cc
index 977314e9..0b5de4cb 100644
--- a/test/unit/gpu_utils_tests.cc
+++ b/test/unit/gpu_utils_tests.cc
@@ -60,3 +60,79 @@ TEST(GpuUtilsTest, Memcpy) {
     EXPECT_EQ(hostBuff[i], hostBuffTmp[i]);
   }
 }
+
+TEST(GpuUtilsTest, BufferPoolBasic) {
+  mscclpp::GpuBufferPool pool(4096);
+
+  auto first = pool.allocate(64, 256);
+  EXPECT_EQ(first->bytes(), size_t(64));
+  EXPECT_EQ(first->offset() % 256, size_t(0));
+  EXPECT_EQ(first->data(), pool.data() + first->offset());
+  EXPECT_EQ(first->deviceId(), pool.deviceId());
+  EXPECT_EQ(pool.activeBytes(), size_t(64));
+
+  auto second = pool.allocate(128, 512);
+  EXPECT_EQ(second->bytes(), size_t(128));
+  EXPECT_EQ(second->offset() % 512, size_t(0));
+  EXPECT_EQ(second->data(), pool.data() + second->offset());
+  EXPECT_EQ(pool.activeBytes(), size_t(64 + 128));
+
+  first.reset();
+  EXPECT_EQ(pool.activeBytes(), size_t(128));
+  second.reset();
+  EXPECT_EQ(pool.activeBytes(), size_t(0));
+  EXPECT_EQ(pool.freeBytes(), pool.bytes());
+}
+
+TEST(GpuUtilsTest, BufferPoolReservesAlignmentPadding) {
+  mscclpp::GpuBufferPool pool(1024);
+
+  auto first = pool.allocate(100, 1);
+  auto second = pool.allocate(100, 256);
+  auto third = pool.allocate(1, 1);
+
+  EXPECT_EQ(first->offset(), size_t(0));
+  EXPECT_EQ(second->offset(), size_t(256));
+  EXPECT_EQ(third->offset(), size_t(356));
+}
+
+TEST(GpuUtilsTest, BufferPoolReuseAfterRelease) {
+  mscclpp::GpuBufferPool pool(1024);
+
+  auto first = pool.allocate(128, 1);
+  auto firstOffset = first->offset();
+  first.reset();
+
+  auto second = pool.allocate(128, 1);
+  EXPECT_EQ(second->offset(), firstOffset);
+  second.reset();
+  EXPECT_EQ(pool.freeBytes(), pool.bytes());
+}
+
+TEST(GpuUtilsTest, BufferPoolThrowsOnInvalidAllocation) {
+  mscclpp::GpuBufferPool pool(1024);
+
+  bool zeroSizeThrows = false;
+  try {
+    (void)pool.allocate(0);
+  } catch (const mscclpp::Error&) {
+    zeroSizeThrows = true;
+  }
+  EXPECT_TRUE(zeroSizeThrows);
+
+  bool zeroAlignmentThrows = false;
+  try {
+    (void)pool.allocate(1, 0);
+  } catch (const mscclpp::Error&) {
+    zeroAlignmentThrows = true;
+  }
+  EXPECT_TRUE(zeroAlignmentThrows);
+
+  bool outOfMemoryThrows = false;
+  try {
+    (void)pool.allocate(pool.bytes() + 1);
+  } catch (const mscclpp::Error&) {
+    outOfMemoryThrows = true;
+  }
+  EXPECT_TRUE(outOfMemoryThrows);
+}