diff --git a/CMakeLists.txt b/CMakeLists.txt index 49154e0b..3f9bf8e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -206,6 +206,7 @@ if(MSCCLPP_USE_CUDA) else() set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver) endif() + list(APPEND GPU_LIBRARIES CUDA::nvml) else() set(CMAKE_HIP_STANDARD 17) set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra") diff --git a/docs/guide/mscclpp-torch-integration.md b/docs/guide/mscclpp-torch-integration.md index b4e4fcdf..a3a2f808 100644 --- a/docs/guide/mscclpp-torch-integration.md +++ b/docs/guide/mscclpp-torch-integration.md @@ -475,7 +475,7 @@ All examples are in [`examples/torch-integration/`](../../examples/torch-integra The default algorithms use a fixed heuristic to select algorithms based on message size. For production workloads, you can achieve significantly better performance by **auto-tuning** — benchmarking every candidate algorithm, block count, and thread count for each message size at startup, then using the fastest configuration at runtime. -**Full example:** [customized_comm_with_tuning.py](../../examples/torch-integration/customized_comm_with_tuning.py) +**Reference implementation:** MSCCL++ ships a ready-to-use autotuner in [`python/mscclpp_benchmark/bench_collective.py`](../../python/mscclpp_benchmark/bench_collective.py). It benchmarks every candidate algorithm, block count, and thread count per message size, writes the winning configuration to a JSON file, and can replay that file at runtime. The sections below explain the underlying mechanism; see that benchmark for the complete, maintained implementation. ### How It Works @@ -656,9 +656,20 @@ def benchmark(self, n_warmup=10, n_graph_launches=10, n_iter_per_graph=100): self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM) ``` -### Running the Tuning Example +### Running the Autotuner + +MSCCL++'s built-in autotuner benchmarks every candidate configuration and saves the best one to JSON. Run it across the ranks of your job, then reuse the generated config: ```bash -MSCCLPP_MASTER_ADDR= MSCCLPP_MASTER_PORT= \ - torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_tuning.py +# Autotune and save the tuned config +mpirun -np 8 --allow-run-as-root \ + python3 -m mscclpp_benchmark.bench_collective \ + --collective allreduce --dtype float16 --autotune \ + --write-config /tmp/mscclpp_tuned_configs.json + +# Replay the tuned config in a benchmark +mpirun -np 8 --allow-run-as-root \ + python3 -m mscclpp_benchmark.bench_collective \ + --collective allreduce --dtype float16 \ + --config-path /tmp/mscclpp_tuned_configs.json ``` diff --git a/examples/customized-collective-algorithm/customized_allgather.cu b/examples/customized-collective-algorithm/customized_allgather.cu index 02df3685..13802f80 100644 --- a/examples/customized-collective-algorithm/customized_allgather.cu +++ b/examples/customized-collective-algorithm/customized_allgather.cu @@ -79,7 +79,7 @@ __global__ void __launch_bounds__(1024) struct Context { int rank; - int workSize; + int worldSize; int nRanksPerNode; std::vector registeredMemories; @@ -140,7 +140,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { size_t inputSize, cudaStream_t stream) { auto algoCtx = std::static_pointer_cast(ctx); int rank = algoCtx->rank; - int worldSize = algoCtx->workSize; + int worldSize = algoCtx->worldSize; int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE; allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputSize); @@ -154,16 +154,16 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { void* output, size_t inputSize, mscclpp::DataType dtype) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); + ctx->worldSize = comm->bootstrap()->getNranks(); ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); // register memories mscclpp::RegisteredMemory inputBufRegMem = comm->registerMemory((void*)input, inputSize, mscclpp::Transport::CudaIpc); mscclpp::RegisteredMemory outputBufRegMem = - comm->registerMemory(output, inputSize * ctx->workSize, mscclpp::Transport::CudaIpc); + comm->registerMemory(output, inputSize * ctx->worldSize, mscclpp::Transport::CudaIpc); std::vector> remoteRegMemories; - for (int i = 0; i < ctx->workSize; i++) { + for (int i = 0; i < ctx->worldSize; i++) { if (i == ctx->rank) continue; comm->sendMemory(outputBufRegMem, i, 0); remoteRegMemories.push_back(comm->recvMemory(i, 0)); diff --git a/examples/torch-integration/customized_allgather.cu b/examples/torch-integration/customized_allgather.cu index 907b3ada..5ba2935f 100644 --- a/examples/torch-integration/customized_allgather.cu +++ b/examples/torch-integration/customized_allgather.cu @@ -47,7 +47,7 @@ __global__ void __launch_bounds__(1024) struct Context { int rank; - int workSize; + int worldSize; int nRanksPerNode; std::vector registeredMemories; @@ -108,7 +108,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { cudaStream_t stream) { auto algoCtx = std::static_pointer_cast(ctx); int rank = algoCtx->rank; - int worldSize = algoCtx->workSize; + int worldSize = algoCtx->worldSize; int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE; allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputBytes); @@ -122,16 +122,16 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { void* output, size_t inputBytes, mscclpp::DataType dtype) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); + ctx->worldSize = comm->bootstrap()->getNranks(); ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); // register memories mscclpp::RegisteredMemory inputBufRegMem = comm->registerMemory((void*)input, inputBytes, mscclpp::Transport::CudaIpc); mscclpp::RegisteredMemory outputBufRegMem = - comm->registerMemory(output, inputBytes * ctx->workSize, mscclpp::Transport::CudaIpc); + comm->registerMemory(output, inputBytes * ctx->worldSize, mscclpp::Transport::CudaIpc); std::vector> remoteRegMemories; - for (int i = 0; i < ctx->workSize; i++) { + for (int i = 0; i < ctx->worldSize; i++) { if (i == ctx->rank) continue; comm->sendMemory(outputBufRegMem, i, 0); remoteRegMemories.push_back(comm->recvMemory(i, 0)); diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py deleted file mode 100644 index b96087c2..00000000 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ /dev/null @@ -1,476 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py - -import os -import ipaddress - -import netifaces as ni -import torch -import mscclpp -import mscclpp.ext -import mscclpp.utils as mscclpp_utils - -# -- Helpers ------------------------------------------------------------------ - - -def _make_tensor(size_bytes: int, dtype: torch.dtype) -> torch.Tensor: - """Allocate a tensor backed by RawGpuBuffer (symmetric memory).""" - # PyTorch's from_dlpack does not support certain float8 DLPack type codes. - # Work around by importing as uint8 and reinterpreting via .view(). - _DLPACK_UNSUPPORTED = (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz) - if dtype in _DLPACK_UNSUPPORTED: - dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(torch.uint8)) - return torch.utils.dlpack.from_dlpack(dlpack).view(dtype) - dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(dtype)) - return torch.utils.dlpack.from_dlpack(dlpack) - - -def _load_algorithms(scratch: torch.Tensor, rank: int): - return mscclpp.ext.AlgorithmCollectionBuilder().build_default_algorithms( - scratch_buffer=scratch.data_ptr(), - scratch_buffer_size=scratch.nbytes, - rank=rank, - ) - - -def _interfaces_for_ip(ip: str): - target = ipaddress.ip_address(ip) - for iface in ni.interfaces(): - addrs = ni.ifaddresses(iface) - if ni.AF_INET in addrs: - for link in addrs[ni.AF_INET]: - if "addr" in link and ipaddress.ip_address(link["addr"]) == target: - return iface - return None - - -def _to_mscclpp_op(op) -> mscclpp.ReduceOp: - if op == torch.distributed.ReduceOp.SUM: - return mscclpp.ReduceOp.SUM - if op == torch.distributed.ReduceOp.MIN: - return mscclpp.ReduceOp.MIN - raise ValueError(f"unsupported op: {op}") - - -def _round_pow2(size: int) -> int: - """Round up to next power-of-2, clamped to [1024, 256 MB].""" - size = max(size, 1024) - size = min(size, 256 << 20) - return 1 << (size - 1).bit_length() - - -# -- CustomizedComm ----------------------------------------------------------- - - -class CustomizedComm: - """Exposes all_reduce, all_gather, barrier with lazy per-size tuning.""" - - _TUNE_N_WARMUP = 5 - _TUNE_N_GRAPH_LAUNCHES = 10 - _TUNE_N_OPS_PER_GRAPH = 100 - _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 56, 64, 128] - _CANDIDATE_NTHREADS = [512, 768, 1024] - _NBLOCKS_LIMIT = { - "default_allreduce_nvls_packet": 16, - "default_allreduce_packet": 56, - "default_allreduce_allpair_packet": 64, - "default_allreduce_fullmesh": 64, - "default_allgather_fullmesh2": 32, - } - - def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): - self.comm = comm - self.rank = comm.my_rank - self.world_size = comm.nranks - self.symmetric_memory = symmetric_memory - self._nvls = mscclpp.is_nvls_supported() - - self._scratch = _make_tensor(1 << 27, torch.float16) - self._barrier_tensor = _make_tensor(4096, torch.float32) - - algos = _load_algorithms(self._scratch, self.rank) - self._algos = {(a.collective, a.name): a for a in algos} - - # {collective: {rounded_size: (algo, nblocks, nthreads)}} - self._tune_cache: dict[str, dict[int, tuple]] = {"allreduce": {}, "allgather": {}} - self._tune_buf = None - self._time_buf = None - - def _algo(self, collective: str, name: str): - return self._algos.get((collective, name)) - - def _default_ar_config(self): - """Fallback allreduce config for barrier / timing sync.""" - pkt = self._algo("allreduce", "default_allreduce_nvls_packet") - if self._nvls and pkt: - return (pkt, 0, 0) - return (self._algo("allreduce", "default_allreduce_packet"), 0, 0) - - # -- low-level execute -- - - def _exec_ar(self, tensor, algo, nb, nt, op=mscclpp.ReduceOp.SUM, stream=None, accum_dtype=None, sym=True): - s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream - ret = algo.execute( - comm=self.comm.communicator, - input_buffer=tensor.data_ptr(), - output_buffer=tensor.data_ptr(), - input_size=tensor.nbytes, - output_size=tensor.nbytes, - dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype), - op=op, - stream=s, - nblocks=nb, - nthreads_per_block=nt, - symmetric_memory=sym, - accum_dtype=accum_dtype, - ) - if ret != 0: - print(f"Rank {self.rank}: {algo.name} failed ({ret})") - return ret - - def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None): - if sym is None: - sym = self.symmetric_memory - s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream - ret = algo.execute( - comm=self.comm.communicator, - input_buffer=inp.data_ptr(), - output_buffer=out.data_ptr(), - input_size=inp.nbytes, - output_size=out.nbytes, - dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(inp.dtype), - op=mscclpp.ReduceOp.NOP, - stream=s, - nblocks=nb, - nthreads_per_block=nt, - symmetric_memory=sym, - ) - if ret != 0: - print(f"Rank {self.rank}: AG {algo.name} failed ({ret})") - return ret - - def _barrier_internal(self): - a, nb, nt = self._default_ar_config() - self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True) - - # -- lazy tuning -- - - def _ensure_tune_bufs(self): - if self._tune_buf is None: - self._tune_buf = _make_tensor(1 << 27, torch.float16) - self._tune_buf.normal_() - self._time_buf = _make_tensor(4096, torch.float32) - return self._tune_buf - - def _ar_candidates(self, size: int): - out = [] - if size <= 4 << 20: - a = self._algo("allreduce", "default_allreduce_nvls_packet") - if self._nvls and a: - out.append(a) - a = self._algo("allreduce", "default_allreduce_packet") - if a: - out.append(a) - a = self._algo("allreduce", "default_allreduce_allpair_packet") - if a: - out.append(a) - if size >= 512 << 10: - a = self._algo("allreduce", "default_allreduce_nvls_zero_copy") - if self._nvls and self.symmetric_memory and a: - out.append(a) - a = self._algo("allreduce", "default_allreduce_rsag_zero_copy") - if a: - out.append(a) - if torch.version.hip is not None: - a = self._algo("allreduce", "default_allreduce_fullmesh") - if a: - out.append(a) - return out - - def _ag_candidates(self): - a = self._algo("allgather", "default_allgather_fullmesh2") - return [a] if a else [] - - def _run_tune(self, collective, algo, buf, size, nb, nt): - """Single tune invocation for either collective.""" - if collective == "allreduce": - return algo.execute( - comm=self.comm.communicator, - input_buffer=buf.data_ptr(), - output_buffer=buf.data_ptr(), - input_size=size, - output_size=size, - dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype), - op=mscclpp.ReduceOp.SUM, - stream=torch.cuda.current_stream().cuda_stream, - nblocks=nb, - nthreads_per_block=nt, - symmetric_memory=True, - ) - else: - total = size * self.world_size - out_ptr = buf.data_ptr() - return algo.execute( - comm=self.comm.communicator, - input_buffer=out_ptr + self.rank * size, - output_buffer=out_ptr, - input_size=size, - output_size=total, - dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype), - op=mscclpp.ReduceOp.NOP, - stream=torch.cuda.current_stream().cuda_stream, - nblocks=nb, - nthreads_per_block=nt, - symmetric_memory=False, - ) - - def _tune_size(self, collective: str, target_size: int): - """Auto-tune one (collective, target_size) pair and cache result.""" - buf = self._ensure_tune_bufs() - cands = self._ar_candidates(target_size) if collective == "allreduce" else self._ag_candidates() - - best_time, best_cfg = float("inf"), None - used = set() - run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt) - - for algo in cands: - nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128) - for nb in self._CANDIDATE_NBLOCKS: - if nb > nb_limit: - continue - for nt in self._CANDIDATE_NTHREADS: - # Feasibility — sync result across ranks so all agree - ret = run(algo, nb, nt) - torch.cuda.synchronize() - self._time_buf[0] = float(ret) - self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=True) - if self._time_buf[0].item() != 0: - continue - used.add(algo) - - # Warmup - for _ in range(self._TUNE_N_WARMUP): - run(algo, nb, nt) - - # CUDA-graph timed benchmark - cs = torch.cuda.Stream() - cs.wait_stream(torch.cuda.current_stream()) - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g, stream=cs): - for _ in range(self._TUNE_N_OPS_PER_GRAPH): - run(algo, nb, nt) - - start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) - start.record(cs) - with torch.cuda.stream(cs): - for _ in range(self._TUNE_N_GRAPH_LAUNCHES): - g.replay() - end.record(cs) - end.synchronize() - elapsed = start.elapsed_time(end) - - # Cross-rank timing sync - self._time_buf.fill_(elapsed) - torch.cuda.current_stream().wait_stream(cs) - self._exec_ar(self._time_buf, *self._default_ar_config(), sym=True) - avg = self._time_buf[self.rank].item() / self.world_size - - if avg < best_time: - best_time, best_cfg = avg, (algo, nb, nt) - - if best_cfg: - self._tune_cache[collective][target_size] = best_cfg - if self.rank == 0: - n = self._TUNE_N_GRAPH_LAUNCHES * self._TUNE_N_OPS_PER_GRAPH - print( - f"[tune] {collective} size={target_size}: {best_cfg[0].name} " - f"nb={best_cfg[1]} nt={best_cfg[2]} time={best_time / n * 1000:.2f}us", - flush=True, - ) - else: - fb = ( - self._default_ar_config() - if collective == "allreduce" - else ((self._ag_candidates()[0], 32, 512) if self._ag_candidates() else None) - ) - self._tune_cache[collective][target_size] = fb - - torch.cuda.synchronize() - self._barrier_internal() - for a in used: - a.reset() - - # -- public API -- - - def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, accum_dtype=None): - sz = _round_pow2(tensor.nbytes) - if sz not in self._tune_cache["allreduce"]: - self._tune_size("allreduce", sz) - a, nb, nt = self._tune_cache["allreduce"][sz] - self._exec_ar( - tensor, a, nb, nt, op=_to_mscclpp_op(op), stream=stream, accum_dtype=accum_dtype, sym=self.symmetric_memory - ) - - def all_gather(self, output_tensor, input_tensor, stream=None): - sz = _round_pow2(input_tensor.nbytes) - if sz not in self._tune_cache["allgather"]: - self._tune_size("allgather", sz) - a, nb, nt = self._tune_cache["allgather"][sz] - self._exec_ag(input_tensor, output_tensor, a, nb, nt, stream=stream, sym=self.symmetric_memory) - - def barrier(self): - self._barrier_internal() - - def destroy(self): - self._algos.clear() - self._tune_cache = {"allreduce": {}, "allgather": {}} - self._tune_buf = self._time_buf = self._barrier_tensor = self._scratch = self.comm = None - - -# -- Benchmarks (standalone) -------------------------------------------------- - - -def _bench_sizes(low=5 * 1024, high=80 << 20): - sizes, c = [], low - while c <= high: - sizes.append(c) - c *= 2 - return sizes - - -def benchmark_allreduce( - comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=10, n_graph_launches=10, n_iter=100 -): - sizes = _bench_sizes() - if comm.rank == 0: - print(f"\n{'='*60}\nAllreduce Benchmark\n{'='*60}") - print(f"{'Nelements':<18} {'Size(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}") - - cs = torch.cuda.Stream() - buf = _make_tensor(1 << 27, dtype) - buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0) - - for size in sizes: - nelems = size // buf.element_size() - t = buf[: size // buf.element_size()] - comm.all_reduce(t, accum_dtype=accum_dtype) - torch.cuda.synchronize() - - cs.wait_stream(torch.cuda.current_stream()) - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g, stream=cs): - for _ in range(n_iter): - comm.all_reduce(t, accum_dtype=accum_dtype) - with torch.cuda.stream(cs): - for _ in range(n_warmup): - g.replay() - comm.barrier() - cs.synchronize() - - s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) - s.record(cs) - with torch.cuda.stream(cs): - for _ in range(n_graph_launches): - g.replay() - e.record(cs) - e.synchronize() - - ms = s.elapsed_time(e) / (n_graph_launches * n_iter) - if comm.rank == 0: - print(f"{nelems:<18} {size:<18} {ms*1000:<18.2f} {size/(ms*1e-3)/1e9:<18.2f}") - - -def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, n_graph_launches=10, n_iter=100): - sizes = _bench_sizes() - if comm.rank == 0: - print(f"\n{'='*60}\nAllgather Benchmark\n{'='*60}") - print(f"{'PerRank(B)':<18} {'Total(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}") - - cs = torch.cuda.Stream() - buf = _make_tensor(1 << 27, dtype) - buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0) - - for prs in sizes: - total = prs * comm.world_size - if total > buf.nbytes: - break - nt = total // buf.element_size() - npr = prs // buf.element_size() - out = buf[:nt] - inp = out[comm.rank * npr : (comm.rank + 1) * npr] - - comm.all_gather(out, inp) - torch.cuda.synchronize() - - cs.wait_stream(torch.cuda.current_stream()) - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g, stream=cs): - for _ in range(n_iter): - comm.all_gather(out, inp) - with torch.cuda.stream(cs): - for _ in range(n_warmup): - g.replay() - comm.barrier() - cs.synchronize() - - s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) - s.record(cs) - with torch.cuda.stream(cs): - for _ in range(n_graph_launches): - g.replay() - e.record(cs) - e.synchronize() - - ms = s.elapsed_time(e) / (n_graph_launches * n_iter) - if comm.rank == 0: - print(f"{prs:<18} {total:<18} {ms*1000:<18.2f} {total/(ms*1e-3)/1e9:<18.2f}") - - -# -- Bootstrap & main --------------------------------------------------------- - - -def init_dist() -> mscclpp.CommGroup: - addr = os.environ.get("MSCCLPP_MASTER_ADDR") - if addr: - rank, world = int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"]) - port = os.environ["MSCCLPP_MASTER_PORT"] - iface = _interfaces_for_ip(addr) - if not iface: - raise ValueError(f"No interface for {addr}") - return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world) - import torch.distributed as dist - - dist.init_process_group(backend="gloo") - return mscclpp.CommGroup(torch_group=dist.group.WORLD) - - -def main(): - local = int(os.environ["LOCAL_RANK"]) - torch.cuda.set_device(local) - - dtype_str = os.environ.get("DTYPE", "float16") - dtype = getattr(torch, dtype_str, torch.float16) - accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16} - accum_str = os.environ.get("ACCUM_DTYPE") - accum_dtype = accum_map.get(accum_str) if accum_str else None - - comm_group = init_dist() - cc = CustomizedComm(comm_group) - - print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...") - benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype) - cc.barrier() - torch.cuda.synchronize() - - benchmark_allgather(cc, dtype=dtype) - cc.barrier() - torch.cuda.synchronize() - - cc.destroy() - print(f"rank {local} completed successfully.") - - -if __name__ == "__main__": - main() diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 45b56bcc..4c14f1ee 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -46,6 +46,10 @@ class Bootstrap { /// @return The total number of ranks per node. virtual int getNranksPerNode() const = 0; + /// Return the number of ranks in this rank's GPU IPC domain. + /// @return The number of ranks in the GPU IPC domain. + virtual int getNranksPerIpcDomain() const; + /// Send arbitrary data to another process. /// /// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size, @@ -144,6 +148,9 @@ class TcpBootstrap : public Bootstrap { /// Return the total number of ranks per node. int getNranksPerNode() const override; + /// Return the number of ranks in this rank's GPU IPC domain. + int getNranksPerIpcDomain() const override; + /// Send arbitrary data to another process. /// /// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size, diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index a6dd306b..7415119f 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -37,7 +37,7 @@ class Env { const std::string logLevel; /// Env name: `MSCCLPP_LOG_SUBSYS`. The log subsystem, a comma-separated list of subsystems to enable - /// logging for. Possible values are ENV, GPU, NET, CONN, EXEC, NCCL, ALL (default). + /// logging for. Possible values are ENV, GPU, NET, CONN, EXEC, NCCL, ALGO, ALL (default). /// If the first character is '^', it inverts the mask, i.e., enables all subsystems except those specified. /// For example, "^NET,CONN" enables all subsystems except NET and CONN. const std::string logSubsys; diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index b8d096e2..b289bd4d 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -31,6 +31,7 @@ using CUmemorytype = hipMemoryType; constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled; constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed; constexpr auto cudaErrorInvalidDevice = hipErrorInvalidDevice; +constexpr auto cudaErrorInvalidValue = hipErrorInvalidValue; constexpr auto cudaSuccess = hipSuccess; constexpr auto cudaErrorNotSupported = hipErrorNotSupported; constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking; diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 82fa3ec0..926bd6ca 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -1,10 +1,11 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. #ifndef MSCCLPP_GPU_UTILS_HPP_ #define MSCCLPP_GPU_UTILS_HPP_ #include +#include #include #include "env.hpp" @@ -403,6 +404,96 @@ class GpuBuffer { std::shared_ptr memory_; }; +namespace detail { + +class GpuBufferPoolStorage; + +} // namespace detail + +/// A deterministic sub-allocation pool for GPU communication buffers. +/// +/// The pool allocates one `GpuBuffer` slab and returns reference-counted buffers for sub-ranges in that slab. +/// If all ranks create pools with the same size and make the same allocation calls, returned buffers have identical +/// offsets from the slab base pointer. Python bindings use this property to build symmetric memory buffers while +/// relying on normal Python reference counting to return buffers to the local pool. +/// +class GpuBufferPool { + public: + /// Represents one buffer returned by `GpuBufferPool`. + /// + /// This object owns a sub-range of a pool allocation. Destroying the last reference to this object returns the + /// sub-range to the pool. Destruction only updates local pool bookkeeping and does not perform any + /// cross-rank synchronization. + class Buffer { + public: + Buffer(const Buffer&) = delete; + Buffer& operator=(const Buffer&) = delete; + Buffer(Buffer&&) = delete; + Buffer& operator=(Buffer&&) = delete; + + /// Destructor. Returns this buffer's sub-range to the owning pool when the last reference is destroyed. + ~Buffer(); + + /// Returns the number of bytes requested for this buffer. + /// @return Number of bytes in this buffer. + size_t bytes() const; + + /// Returns this buffer's byte offset from the pool base pointer. + /// @return Byte offset from the pool base pointer. + size_t offset() const; + + /// Returns the device pointer to this buffer. + /// @return Device pointer to this buffer. + char* data() const; + + /// Returns the device id of the underlying pool allocation. + /// @return Device id of the underlying pool allocation. + int deviceId() const; + + private: + friend class detail::GpuBufferPoolStorage; + Buffer(std::shared_ptr storage, size_t offset, size_t bytes); + + std::shared_ptr storage_; + size_t offset_; + size_t bytes_; + }; + + /// Constructs a pool backed by a single `GpuBuffer`. + /// @param bytes Number of bytes to reserve in the pool. + /// @param granularity Granularity used to size the underlying `GpuBuffer`. + explicit GpuBufferPool(size_t bytes, GpuBufferGranularity granularity = GpuBufferGranularity::MultiCastMinimum); + + /// Allocates a sub-range from the pool. + /// @param bytes Number of bytes to allocate. + /// @param alignment Alignment in bytes for the returned offset. + /// @return A reference-counted pooled buffer. + std::shared_ptr allocate(size_t bytes, size_t alignment = 256); + + /// Returns the number of bytes in the underlying pool allocation. + /// @return Number of bytes in the pool. + size_t bytes() const; + + /// Returns the number of bytes that are available for new buffers. + /// @return Number of free bytes. + size_t freeBytes() const; + + /// Returns the number of bytes currently held by active pooled buffers. + /// @return Number of active bytes. + size_t activeBytes() const; + + /// Returns the device pointer to the pool base. + /// @return Device pointer to the pool base. + char* data(); + + /// Returns the device id of the underlying pool allocation. + /// @return Device id of the underlying pool allocation. + int deviceId() const; + + private: + std::shared_ptr storage_; +}; + } // namespace mscclpp #endif // MSCCLPP_GPU_UTILS_HPP_ diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp index b52b6572..fcdd7fdd 100644 --- a/include/mscclpp/switch_channel_device.hpp +++ b/include/mscclpp/switch_channel_device.hpp @@ -37,7 +37,10 @@ struct SwitchChannelDeviceHandle { SwitchChannelDeviceHandle::multimemStore(val, reinterpret_cast(mcPtr) + index); } - template + /// Vectorized multimem load+reduce. The optional `AccumT` template parameter selects the + /// accumulator: when `AccumT == __half` and `VectorType` is an FP8 vector type, the + /// `.acc::f16` variant of the instruction is used. For all other types `AccumT` is ignored. + template MSCCLPP_DEVICE_INLINE static VectorType multimemLoadReduce(VectorType* ptr) { VectorType val; if constexpr (std::is_same_v) { @@ -80,32 +83,78 @@ struct SwitchChannelDeviceHandle { : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); + } +#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) && (__CUDA_ARCH__ >= 1000) + else if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.e4m3x4 %0, [%1];" + : "=r"(val.words[0]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); + } } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];" - : "=r"(val.words[0]), "=r"(val.words[1]) - : "l"(ptr) - : "memory"); + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v2.e4m3x4 {%0,%1}, [%2];" + : "=r"(val.words[0]), "=r"(val.words[1]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];" + : "=r"(val.words[0]), "=r"(val.words[1]) + : "l"(ptr) + : "memory"); + } } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];" - : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) - : "l"(ptr) - : "memory"); + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v4.e4m3x4 {%0,%1,%2,%3}, [%4];" + : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];" + : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) + : "l"(ptr) + : "memory"); + } } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.e5m2x4 %0, [%1];" + : "=r"(val.words[0]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); + } } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];" - : "=r"(val.words[0]), "=r"(val.words[1]) - : "l"(ptr) - : "memory"); + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v2.e5m2x4 {%0,%1}, [%2];" + : "=r"(val.words[0]), "=r"(val.words[1]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];" + : "=r"(val.words[0]), "=r"(val.words[1]) + : "l"(ptr) + : "memory"); + } } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];" - : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) - : "l"(ptr) - : "memory"); - } else { - static_assert(dependentFalse, "Not supported type"); + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v4.e5m2x4 {%0,%1,%2,%3}, [%4];" + : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];" + : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) + : "l"(ptr) + : "memory"); + } + } +#endif + else { + static_assert(dependentFalse, "Unsupported vector type for multimemLoadReduce"); } return val; }; @@ -148,7 +197,9 @@ struct SwitchChannelDeviceHandle { asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else if constexpr (std::is_same_v) { + } +#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) && (__CUDA_ARCH__ >= 1000) + else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.e4m3x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory"); } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v2.e4m3x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]), @@ -168,8 +219,10 @@ struct SwitchChannelDeviceHandle { asm volatile("multimem.st.relaxed.sys.global.v4.e5m2x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else { - static_assert(dependentFalse, "Not supported type"); + } +#endif + else { + static_assert(dependentFalse, "Unsupported vector type for multimemStore"); } }; @@ -194,7 +247,7 @@ struct SwitchChannelDeviceHandle { } else if constexpr (std::is_same_v && std::is_same_v) { asm volatile("multimem.red.relaxed.sys.global.add.f16x2 [%0], {%1};" ::"l"(ptr), "r"(val.x) : "memory"); } else { - static_assert(dependentFalse, "Not supported type"); + static_assert(dependentFalse, "Unsupported vector type for multimemStoreReduce"); } }; #endif // defined(MSCCLPP_DEVICE_CUDA) diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index a94f9863..7e9af6c1 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -56,6 +56,7 @@ void register_core(nb::module_& m) { .def("get_rank", &Bootstrap::getRank) .def("get_n_ranks", &Bootstrap::getNranks) .def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode) + .def("get_n_ranks_per_ipc_domain", &Bootstrap::getNranksPerIpcDomain) .def( "send", [](Bootstrap* self, uintptr_t ptr, size_t size, int peer, int tag) { diff --git a/python/csrc/gpu_utils_py.cpp b/python/csrc/gpu_utils_py.cpp index d6527502..ba7ca9e9 100644 --- a/python/csrc/gpu_utils_py.cpp +++ b/python/csrc/gpu_utils_py.cpp @@ -1,12 +1,14 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. #include #include +#include #include #include #include +#include #include #include @@ -15,6 +17,13 @@ using namespace mscclpp; constexpr int BYTE_BITS = 8; +struct DlpackContext { + DLManagedTensor managedTensor{}; + std::vector shape; + std::vector strides; + std::shared_ptr owner; +}; + static DLDeviceType getDeviceType() { #if defined(MSCCLPP_USE_ROCM) return kDLROCM; @@ -52,65 +61,77 @@ static DLDataType getDlType(std::string type) { } } -static nb::capsule toDlpack(GpuBuffer buffer, std::string dataType, std::vector& shape, - std::vector& strides) { +static void dlpackDeleter(DLManagedTensor* self) { delete static_cast(self->manager_ctx); } + +static void dlpackCapsuleDestructor(PyObject* capsule) { + if (PyCapsule_IsValid(capsule, "used_dltensor")) { + return; + } + if (!PyCapsule_IsValid(capsule, "dltensor")) { + return; + } + DLManagedTensor* managedTensor = static_cast(PyCapsule_GetPointer(capsule, "dltensor")); + if (managedTensor == nullptr) { + return; + } + if (managedTensor->deleter) { + managedTensor->deleter(managedTensor); + } +} + +static nb::capsule makeDlpack(void* data, size_t bytes, int deviceId, std::shared_ptr owner, std::string dataType, + std::vector shape, std::vector strides) { DLDataType dtype = getDlType(dataType); - int64_t* tensorShape = shape.size() > 0 ? new int64_t[shape.size()] : new int64_t[1]; - int64_t* tensorStrides = strides.size() > 0 ? new int64_t[strides.size()] : nullptr; - if (shape.size() == 0) { - tensorShape[0] = (int64_t)(buffer.nelems() / ((dtype.bits * dtype.lanes + 7) / BYTE_BITS)); - } else { - for (size_t i = 0; i < shape.size(); ++i) { - tensorShape[i] = shape[i]; + auto ctx = std::make_unique(); + size_t elementBytes = (dtype.bits * dtype.lanes + 7) / BYTE_BITS; + if (shape.empty()) { + if (bytes % elementBytes != 0) { + throw Error("DLPack buffer size must be divisible by the element size.", ErrorCode::InvalidUsage); } + ctx->shape.push_back((int64_t)(bytes / elementBytes)); + } else { + ctx->shape = std::move(shape); } - for (size_t i = 0; i < strides.size(); ++i) { - tensorStrides[i] = strides[i]; + ctx->strides = std::move(strides); + if (!ctx->strides.empty() && ctx->strides.size() != ctx->shape.size()) { + throw Error("DLPack strides must have the same length as shape.", ErrorCode::InvalidUsage); } + ctx->owner = std::move(owner); - DLManagedTensor* dlManagedTensor = new DLManagedTensor(); - dlManagedTensor->dl_tensor.data = buffer.data(); + DLManagedTensor* dlManagedTensor = &ctx->managedTensor; + dlManagedTensor->dl_tensor.data = data; dlManagedTensor->dl_tensor.device.device_type = getDeviceType(); - dlManagedTensor->dl_tensor.device.device_id = buffer.deviceId(); - dlManagedTensor->dl_tensor.ndim = shape.size() == 0 ? 1 : shape.size(); - dlManagedTensor->dl_tensor.strides = tensorStrides; - dlManagedTensor->dl_tensor.shape = tensorShape; + dlManagedTensor->dl_tensor.device.device_id = deviceId; + dlManagedTensor->dl_tensor.ndim = static_cast(ctx->shape.size()); + dlManagedTensor->dl_tensor.strides = ctx->strides.empty() ? nullptr : ctx->strides.data(); + dlManagedTensor->dl_tensor.shape = ctx->shape.data(); dlManagedTensor->dl_tensor.byte_offset = 0; dlManagedTensor->dl_tensor.dtype = dtype; - dlManagedTensor->manager_ctx = new GpuBuffer(buffer); - dlManagedTensor->deleter = [](DLManagedTensor* self) { - delete static_cast*>(self->manager_ctx); - self->manager_ctx = nullptr; - self->dl_tensor.data = nullptr; - if (self->dl_tensor.shape != nullptr) { - delete[] self->dl_tensor.shape; - self->dl_tensor.shape = nullptr; - if (self->dl_tensor.strides) { - delete[] self->dl_tensor.strides; - self->dl_tensor.strides = nullptr; - } - } - delete self; - }; + dlManagedTensor->manager_ctx = ctx.get(); + dlManagedTensor->deleter = dlpackDeleter; - PyObject* dlCapsule = PyCapsule_New(static_cast(dlManagedTensor), "dltensor", [](PyObject* capsule) { - if (PyCapsule_IsValid(capsule, "used_dltensor")) { - return; - } - if (!PyCapsule_IsValid(capsule, "dltensor")) { - return; - } - DLManagedTensor* managedTensor = static_cast(PyCapsule_GetPointer(capsule, "dltensor")); - if (managedTensor == nullptr) { - return; - } - if (managedTensor->deleter) { - managedTensor->deleter(managedTensor); - } - }); + PyObject* dlCapsule = PyCapsule_New(static_cast(dlManagedTensor), "dltensor", dlpackCapsuleDestructor); + if (dlCapsule == nullptr) { + throw Error("Failed to create DLPack capsule.", ErrorCode::InvalidUsage); + } + ctx.release(); return nb::steal(dlCapsule); } +static nb::capsule toDlpack(GpuBuffer buffer, std::string dataType, std::vector& shape, + std::vector& strides) { + auto owner = std::make_shared>(buffer); + return makeDlpack(buffer.data(), buffer.nelems(), buffer.deviceId(), std::move(owner), dataType, shape, strides); +} + +static nb::capsule toDlpack(std::shared_ptr buffer, std::string dataType, + std::vector& shape, std::vector& strides) { + void* data = buffer->data(); + size_t bytes = buffer->bytes(); + int deviceId = buffer->deviceId(); + return makeDlpack(data, bytes, deviceId, std::move(buffer), dataType, shape, strides); +} + void register_gpu_utils(nb::module_& m) { m.def("is_nvls_supported", &isNvlsSupported); @@ -131,4 +152,25 @@ void register_gpu_utils(nb::module_& m) { return toDlpack(self, dataType, shape, strides); }, nb::arg("data_type"), nb::arg("shape") = std::vector(), nb::arg("strides") = std::vector()); + + nb::class_(m, "CppRawGpuBufferPoolBuffer") + .def("bytes", &GpuBufferPool::Buffer::bytes) + .def("offset", &GpuBufferPool::Buffer::offset) + .def("data", [](GpuBufferPool::Buffer& self) { return reinterpret_cast(self.data()); }) + .def("device_id", &GpuBufferPool::Buffer::deviceId) + .def( + "to_dlpack", + [](std::shared_ptr self, std::string dataType, std::vector shape, + std::vector strides) { return toDlpack(std::move(self), dataType, shape, strides); }, + nb::arg("data_type"), nb::arg("shape") = std::vector(), nb::arg("strides") = std::vector()); + + nb::class_(m, "CppRawGpuBufferPool") + .def(nb::init(), nb::arg("bytes"), + nb::arg("granularity") = GpuBufferGranularity::MultiCastMinimum) + .def("bytes", &GpuBufferPool::bytes) + .def("free_bytes", &GpuBufferPool::freeBytes) + .def("active_bytes", &GpuBufferPool::activeBytes) + .def("data", [](GpuBufferPool& self) { return reinterpret_cast(self.data()); }) + .def("device_id", &GpuBufferPool::deviceId) + .def("allocate", &GpuBufferPool::allocate, nb::arg("bytes"), nb::arg("alignment") = 256); } diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 09408171..0a414937 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -50,6 +50,8 @@ CppExecutionPlan as ExecutionPlan, CppPacketType as PacketType, CppRawGpuBuffer as RawGpuBuffer, + CppRawGpuBufferPool as RawGpuBufferPool, + CppRawGpuBufferPoolBuffer as RawGpuBufferPoolBuffer, CppReduceOp as ReduceOp, env, is_nvls_supported, @@ -84,6 +86,8 @@ "ExecutionPlan", "PacketType", "RawGpuBuffer", + "RawGpuBufferPool", + "RawGpuBufferPoolBuffer", "ReduceOp", "env", "version", @@ -100,6 +104,7 @@ "AlgorithmCollection", "CommGroup", "GpuBuffer", + "GpuBufferPool", "GpuBufferGranularity", ] diff --git a/python/mscclpp/_core/buffer.py b/python/mscclpp/_core/buffer.py index e07424f5..33d630b7 100644 --- a/python/mscclpp/_core/buffer.py +++ b/python/mscclpp/_core/buffer.py @@ -6,9 +6,14 @@ import cupy as cp import numpy as np -from mscclpp._mscclpp import CppRawGpuBuffer, CppGpuBufferGranularity +from mscclpp._mscclpp import ( + CppRawGpuBuffer, + CppRawGpuBufferPool, + CppRawGpuBufferPoolBuffer, + CppGpuBufferGranularity, +) -__all__ = ["GpuBuffer", "GpuBufferGranularity"] +__all__ = ["GpuBuffer", "GpuBufferPool", "GpuBufferGranularity"] GpuBufferGranularity = CppGpuBufferGranularity @@ -35,3 +40,62 @@ def __new__( buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize, granularity) memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0) return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr) + + +class GpuBufferPool: + """A GPU buffer pool that returns raw buffers backed by one communication-friendly allocation. + + All ranks should create the same-sized pool and call :meth:`allocate` in the same order to get matching offsets. + """ + + def __init__( + self, + nbytes: int, + granularity: CppGpuBufferGranularity = CppGpuBufferGranularity.MultiCastMinimum, + ): + if nbytes <= 0: + raise ValueError("Pool size must be positive.") + self._pool = CppRawGpuBufferPool(int(nbytes), granularity) + + @property + def bytes(self) -> int: + """Number of bytes in the underlying pool allocation.""" + return self._pool.bytes() + + @property + def free_bytes(self) -> int: + """Number of bytes available for new buffers.""" + return self._pool.free_bytes() + + @property + def active_bytes(self) -> int: + """Number of bytes currently held by live raw buffers.""" + return self._pool.active_bytes() + + @property + def data(self) -> int: + """Device pointer to the pool base allocation.""" + return self._pool.data() + + @property + def device_id(self) -> int: + """CUDA/HIP device id of the pool allocation.""" + return self._pool.device_id() + + def allocate( + self, + nbytes: int, + alignment: int = 256, + ) -> CppRawGpuBufferPoolBuffer: + """Allocate a raw buffer from the pool. + + Args: + nbytes: Number of bytes to allocate. + alignment: Required byte alignment of the allocation offset from the pool base. + """ + if nbytes <= 0: + raise ValueError("Buffer size must be positive.") + if alignment <= 0: + raise ValueError("Alignment must be positive.") + + return self._pool.allocate(int(nbytes), int(alignment)) diff --git a/python/mscclpp/_core/comm.py b/python/mscclpp/_core/comm.py index d42349dd..875e07f1 100644 --- a/python/mscclpp/_core/comm.py +++ b/python/mscclpp/_core/comm.py @@ -73,6 +73,7 @@ def __init__( self.my_rank = self.bootstrap.get_rank() self.nranks = self.bootstrap.get_n_ranks() self.nranks_per_node = self.bootstrap.get_n_ranks_per_node() + self.ipc_domain_n_ranks = self.bootstrap.get_n_ranks_per_ipc_domain() def barrier(self): self.bootstrap.barrier() diff --git a/python/mscclpp_benchmark/bench_collective.py b/python/mscclpp_benchmark/bench_collective.py index c526438d..21b61152 100644 --- a/python/mscclpp_benchmark/bench_collective.py +++ b/python/mscclpp_benchmark/bench_collective.py @@ -199,7 +199,6 @@ def _candidate_specs(collective: str, *, symmetric_memory: bool = False) -> tupl "default_allreduce_nvls_packet", max_message_size=512 * 1024, max_nblocks=16, - supported_skus=("H100", "GB300"), requires_nvls=True, ), CandidateSpec( @@ -216,6 +215,10 @@ def _candidate_specs(collective: str, *, symmetric_memory: bool = False) -> tupl "default_allreduce_rsag_zero_copy", min_message_size=512 * 1024 + 1, ), + CandidateSpec( + "default_allreduce_rsag", + min_message_size=512 * 1024 + 1, + ), CandidateSpec( "default_allreduce_fullmesh", min_message_size=512 * 1024 + 1, @@ -228,7 +231,6 @@ def _candidate_specs(collective: str, *, symmetric_memory: bool = False) -> tupl CandidateSpec( "default_allreduce_nvls_zero_copy", max_nblocks=32, - supported_skus=("H100", "GB300"), requires_nvls=True, requires_symmetric_memory=True, ), diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 6b3119cb..ecb52d9d 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -1,5 +1,5 @@ # Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. +# Licensed under the MIT License. from concurrent.futures import ThreadPoolExecutor import os @@ -30,7 +30,7 @@ Device, DeviceType, ) -from mscclpp import CommGroup, GpuBuffer +from mscclpp import CommGroup, GpuBuffer, GpuBufferPool from mscclpp.utils import KernelBuilder, pack from ._cpp import _ext from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group @@ -60,6 +60,61 @@ def all_ranks_on_the_same_node(mpi_group: MpiGroup): return last_rank_ip == root_ip +@parametrize_mpi_groups(1) +def test_gpu_buffer_pool(mpi_group: MpiGroup): + pool = GpuBufferPool(4096) + base_ptr = pool.data + buf = pool.allocate(16 * np.dtype(np.int32).itemsize, alignment=512) + offset = buf.data() - base_ptr + assert offset % 512 == 0 + assert pool.active_bytes == buf.bytes() + del buf + assert pool.active_bytes == 0 + assert pool.free_bytes == pool.bytes + + padding_pool = GpuBufferPool(1024) + padding_first = padding_pool.allocate(100, alignment=1) + padding_second = padding_pool.allocate(100, alignment=256) + padding_third = padding_pool.allocate(1, alignment=1) + assert padding_first.offset() == 0 + assert padding_second.offset() == 256 + assert padding_third.offset() == 356 + + +@parametrize_mpi_groups(1) +def test_gpu_buffer_pool_to_torch_dlpack(mpi_group: MpiGroup): + torch = pytest.importorskip("torch") + + pool = GpuBufferPool(4096) + buf = pool.allocate(16 * torch.empty((), dtype=torch.float32).element_size(), alignment=512) + tensor = torch.utils.dlpack.from_dlpack(buf.to_dlpack(data_type=str(torch.float32), shape=[16])) + assert tensor.data_ptr() == buf.data() + assert tensor.numel() == 16 + assert tensor.dtype == torch.float32 + del buf + assert pool.active_bytes == tensor.numel() * tensor.element_size() + del tensor + assert pool.active_bytes == 0 + + +@parametrize_mpi_groups(2, 4, 8) +def test_gpu_buffer_pool_symmetric_offsets(mpi_group: MpiGroup): + pool = GpuBufferPool(8192) + offsets = [] + + first = pool.allocate(64, alignment=256) + offsets.append(first.offset()) + second = pool.allocate(128, alignment=512) + offsets.append(second.offset()) + del first + + reused = pool.allocate(32, alignment=128) + offsets.append(reused.offset()) + + gathered_offsets = mpi_group.comm.allgather(offsets) + assert all(rank_offsets == gathered_offsets[0] for rank_offsets in gathered_offsets) + + @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("ifIpPortTrio", [f"{ethernet_interface_name}:localhost:50000", ethernet_interface_name, ""]) def test_group_with_ip(mpi_group: MpiGroup, ifIpPortTrio: str): diff --git a/src/core/algorithm.cc b/src/core/algorithm.cc index c0713daa..c0d34188 100644 --- a/src/core/algorithm.cc +++ b/src/core/algorithm.cc @@ -52,8 +52,16 @@ CommResult NativeAlgorithm::execute(std::shared_ptr comm, const vo AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype, symmetricMemory); auto it = contexts_.find(ctxKey); if (it == contexts_.end()) { + INFO(ALGO, name_, " context cache MISS (creating new context, this triggers collective setup): rank=", + comm->bootstrap()->getRank(), ", baseSendBuff=", ctxKey.baseSendBuff, ", baseRecvBuff=", ctxKey.baseRecvBuff, + ", baseSendSize=", ctxKey.baseSendSize, ", baseRecvSize=", ctxKey.baseRecvSize, + ", numContexts(before)=", contexts_.size()); auto ctx = contextInitFunc_(comm, input, output, inputSize, outputSize, dtype); contexts_[ctxKey] = ctx; + } else { + INFO(ALGO, name_, " context cache HIT (reusing context, no collective setup): rank=", comm->bootstrap()->getRank(), + ", baseSendBuff=", ctxKey.baseSendBuff, ", baseRecvBuff=", ctxKey.baseRecvBuff, + ", numContexts=", contexts_.size()); } return kernelLaunchFunc_(contexts_[ctxKey], input, output, inputSize, outputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, extras, accumDtype); diff --git a/src/core/bootstrap/bootstrap.cc b/src/core/bootstrap/bootstrap.cc index b3032e50..ffdd9c1c 100644 --- a/src/core/bootstrap/bootstrap.cc +++ b/src/core/bootstrap/bootstrap.cc @@ -50,6 +50,8 @@ MSCCLPP_API_CPP void Bootstrap::groupBarrier(const std::vector& ranks) { } } +MSCCLPP_API_CPP int Bootstrap::getNranksPerIpcDomain() const { return getNranksPerNode(); } + MSCCLPP_API_CPP void Bootstrap::send(const std::vector& data, int peer, int tag) { size_t size = data.size(); send((void*)&size, sizeof(size_t), peer, tag); @@ -83,6 +85,7 @@ class TcpBootstrap::Impl { int getRank(); int getNranks(); int getNranksPerNode(); + int getNranksPerIpcDomain(); void allGather(void* allData, int size); void broadcast(void* data, int size, int root); void send(void* data, int size, int peer, int tag); @@ -95,6 +98,7 @@ class TcpBootstrap::Impl { int rank_; int nRanks_; int nRanksPerNode_; + int nRanksPerIpcDomain_; bool netInitialized; std::unique_ptr listenSockRoot_; std::unique_ptr listenSock_; @@ -148,6 +152,7 @@ TcpBootstrap::Impl::Impl(int rank, int nRanks) : rank_(rank), nRanks_(nRanks), nRanksPerNode_(0), + nRanksPerIpcDomain_(0), netInitialized(false), peerCommAddresses_(nRanks, SocketAddress()), barrierArr_(nRanks, 0), @@ -451,6 +456,24 @@ int TcpBootstrap::Impl::getNranksPerNode() { return nRanksPerNode_; } +int TcpBootstrap::Impl::getNranksPerIpcDomain() { + if (nRanksPerIpcDomain_ > 0) return nRanksPerIpcDomain_; + std::vector ipcDomainHashes(nRanks_); + ipcDomainHashes[rank_] = getIpcDomainHash(); + allGather(ipcDomainHashes.data(), sizeof(uint64_t)); + + int nRanksPerIpcDomain = 0; + for (int i = 0; i < nRanks_; ++i) { + if (ipcDomainHashes[i] == ipcDomainHashes[rank_]) { + ++nRanksPerIpcDomain; + } + } + INFO(MSCCLPP_INIT, "rank %d IPC domain fabric hash 0x%016llx nRanksPerIpcDomain %d", rank_, + static_cast(ipcDomainHashes[rank_]), nRanksPerIpcDomain); + nRanksPerIpcDomain_ = nRanksPerIpcDomain; + return nRanksPerIpcDomain_; +} + void TcpBootstrap::Impl::allGather(void* allData, int size) { char* data = static_cast(allData); int rank = rank_; @@ -592,6 +615,8 @@ MSCCLPP_API_CPP int TcpBootstrap::getNranks() const { return pimpl_->getNranks() MSCCLPP_API_CPP int TcpBootstrap::getNranksPerNode() const { return pimpl_->getNranksPerNode(); } +MSCCLPP_API_CPP int TcpBootstrap::getNranksPerIpcDomain() const { return pimpl_->getNranksPerIpcDomain(); } + MSCCLPP_API_CPP void TcpBootstrap::send(void* data, int size, int peer, int tag) { pimpl_->send(data, size, peer, tag); } diff --git a/src/core/executor/executor.cc b/src/core/executor/executor.cc index fcecc4dd..15c6af4e 100644 --- a/src/core/executor/executor.cc +++ b/src/core/executor/executor.cc @@ -389,6 +389,7 @@ struct Executor::Impl { nvlsConnection->bindAllocatedMemory((CUdeviceptr)bufferInfo.first, bufferInfo.second); context.nvlsChannels.push_back(switchChannel); } + this->comm->bootstrap()->barrier(); } void setupSemaphores(ExecutionContext& context, const ExecutionPlan& plan) { diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc index 1ce61322..f599325b 100644 --- a/src/core/gpu_utils.cc +++ b/src/core/gpu_utils.cc @@ -2,8 +2,13 @@ // Licensed under the MIT License. #include +#include +#include #include #include +#include +#include +#include #include "gpu_utils_internal.hpp" @@ -274,8 +279,171 @@ void gpuMemset(void* ptr, int value, size_t bytes) { MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); } +class GpuBufferPoolStorage : public std::enable_shared_from_this { + public: + GpuBufferPoolStorage(size_t bytes, GpuBufferGranularity granularity); + std::shared_ptr allocate(size_t bytes, size_t alignment); + void release(size_t offset) noexcept; + size_t bytes() const; + size_t freeBytes() const; + size_t activeBytes() const; + char* data(); + int deviceId() const; + + private: + struct Block { + size_t offset; + size_t bytes; + size_t reservedOffset; + size_t reservedBytes; + }; + + static size_t alignUp(size_t offset, size_t alignment); + Block reserveBlock(size_t bytes, size_t alignment); + void releaseBlock(size_t offset, size_t bytes) noexcept; + + mutable std::mutex mutex_; + GpuBuffer buffer_; + std::map freeBlocks_; + std::unordered_map activeBlocks_; +}; + +GpuBufferPoolStorage::GpuBufferPoolStorage(size_t bytes, GpuBufferGranularity granularity) + : buffer_(bytes, granularity) { + if (bytes == 0) { + throw Error("GpuBufferPool size must be positive.", ErrorCode::InvalidUsage); + } + freeBlocks_[0] = buffer_.bytes(); +} + +size_t GpuBufferPoolStorage::alignUp(size_t offset, size_t alignment) { + if (alignment == 0) { + throw Error("GpuBufferPool allocation alignment must be positive.", ErrorCode::InvalidUsage); + } + size_t remainder = offset % alignment; + if (remainder == 0) { + return offset; + } + return offset + alignment - remainder; +} + +GpuBufferPoolStorage::Block GpuBufferPoolStorage::reserveBlock(size_t bytes, size_t alignment) { + if (bytes == 0) { + throw Error("GpuBufferPool allocation size must be positive.", ErrorCode::InvalidUsage); + } + for (auto it = freeBlocks_.begin(); it != freeBlocks_.end(); ++it) { + size_t blockOffset = it->first; + size_t blockBytes = it->second; + size_t alignedOffset = alignUp(blockOffset, alignment); + size_t prefixBytes = alignedOffset - blockOffset; + if (prefixBytes > blockBytes || bytes > blockBytes - prefixBytes) { + continue; + } + + size_t suffixOffset = alignedOffset + bytes; + size_t reservedBytes = prefixBytes + bytes; + size_t suffixBytes = blockBytes - reservedBytes; + freeBlocks_.erase(it); + if (suffixBytes > 0) { + freeBlocks_[suffixOffset] = suffixBytes; + } + Block block{alignedOffset, bytes, blockOffset, reservedBytes}; + activeBlocks_[alignedOffset] = block; + return block; + } + throw Error("GpuBufferPool does not have enough free memory for the requested allocation.", ErrorCode::InvalidUsage); +} + +void GpuBufferPoolStorage::releaseBlock(size_t offset, size_t bytes) noexcept { + auto next = freeBlocks_.lower_bound(offset); + if (next != freeBlocks_.begin()) { + auto prev = std::prev(next); + if (prev->first + prev->second == offset) { + offset = prev->first; + bytes += prev->second; + next = freeBlocks_.erase(prev); + } + } + if (next != freeBlocks_.end() && offset + bytes == next->first) { + bytes += next->second; + freeBlocks_.erase(next); + } + freeBlocks_[offset] = bytes; +} + +std::shared_ptr GpuBufferPoolStorage::allocate(size_t bytes, size_t alignment) { + std::lock_guard lock(mutex_); + Block block = reserveBlock(bytes, alignment); + return std::shared_ptr(new GpuBufferPool::Buffer(shared_from_this(), block.offset, bytes)); +} + +void GpuBufferPoolStorage::release(size_t offset) noexcept { + std::lock_guard lock(mutex_); + auto active = activeBlocks_.find(offset); + if (active == activeBlocks_.end()) { + return; + } + Block block = active->second; + activeBlocks_.erase(active); + releaseBlock(block.reservedOffset, block.reservedBytes); +} + +size_t GpuBufferPoolStorage::bytes() const { return buffer_.bytes(); } + +size_t GpuBufferPoolStorage::freeBytes() const { + std::lock_guard lock(mutex_); + size_t freeBytes = 0; + for (auto const& block : freeBlocks_) { + freeBytes += block.second; + } + return freeBytes; +} + +size_t GpuBufferPoolStorage::activeBytes() const { + std::lock_guard lock(mutex_); + size_t activeBytes = 0; + for (auto const& block : activeBlocks_) { + activeBytes += block.second.bytes; + } + return activeBytes; +} + +char* GpuBufferPoolStorage::data() { return buffer_.data(); } + +int GpuBufferPoolStorage::deviceId() const { return buffer_.deviceId(); } + } // namespace detail +GpuBufferPool::Buffer::Buffer(std::shared_ptr storage, size_t offset, size_t bytes) + : storage_(std::move(storage)), offset_(offset), bytes_(bytes) {} + +GpuBufferPool::Buffer::~Buffer() { storage_->release(offset_); } + +size_t GpuBufferPool::Buffer::bytes() const { return bytes_; } + +size_t GpuBufferPool::Buffer::offset() const { return offset_; } + +char* GpuBufferPool::Buffer::data() const { return storage_->data() + offset_; } + +int GpuBufferPool::Buffer::deviceId() const { return storage_->deviceId(); } + +GpuBufferPool::GpuBufferPool(size_t bytes, GpuBufferGranularity granularity) + : storage_(std::make_shared(bytes, granularity)) {} + +std::shared_ptr GpuBufferPool::allocate(size_t bytes, size_t alignment) { + return storage_->allocate(bytes, alignment); +} + +size_t GpuBufferPool::bytes() const { return storage_->bytes(); } + +size_t GpuBufferPool::freeBytes() const { return storage_->freeBytes(); } + +size_t GpuBufferPool::activeBytes() const { return storage_->activeBytes(); } + +char* GpuBufferPool::data() { return storage_->data(); } + +int GpuBufferPool::deviceId() const { return storage_->deviceId(); } + bool isNvlsSupported() { if (env()->forceDisableNvls) { return false; diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp index cb808bc8..e9095ada 100644 --- a/src/core/include/execution_kernel.hpp +++ b/src/core/include/execution_kernel.hpp @@ -525,7 +525,15 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(const Operation& op, uint3 if constexpr (std::is_same_v) { assert(false && "MULTI_LOAD_REDUCE_STORE is not supported for uint8_t data type"); return; - } else { + } +#if defined(__FP8_TYPES_EXIST__) && \ + (!(defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000)) + else if constexpr (std::is_same_v || std::is_same_v) { + assert(false && "FP8 MULTI_LOAD_REDUCE_STORE requires sm_100a or newer"); + return; + } +#endif + else { static_assert(sizeof(T) <= 8, "Only support type with size <= 8 bytes"); const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); if (size <= 0) { diff --git a/src/core/include/utils_internal.hpp b/src/core/include/utils_internal.hpp index c5c67e26..c6934194 100644 --- a/src/core/include/utils_internal.hpp +++ b/src/core/include/utils_internal.hpp @@ -37,6 +37,7 @@ int64_t busIdToInt64(const std::string busId); uint64_t getHash(const char* string, int n); uint64_t getHostHash(); uint64_t getPidHash(); +uint64_t getIpcDomainHash(); void getRandomData(void* buffer, size_t bytes); struct netIf { diff --git a/src/core/logger.cc b/src/core/logger.cc index e13ba1b9..f33ea81b 100644 --- a/src/core/logger.cc +++ b/src/core/logger.cc @@ -61,6 +61,8 @@ static LogSubsysSet stringToLogSubsysSet(const std::string& subsysStr) { set.set(static_cast(LogSubsys::EXEC)); } else if (token == "NCCL") { set.set(static_cast(LogSubsys::NCCL)); + } else if (token == "ALGO") { + set.set(static_cast(LogSubsys::ALGO)); } else if (token == "ALL") { set.set(); // all bits } diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc index 8cc55430..adbf8e5b 100644 --- a/src/core/utils_internal.cc +++ b/src/core/utils_internal.cc @@ -6,6 +6,10 @@ #include #include +#if defined(MSCCLPP_USE_CUDA) +#include +#endif + #include #include #include @@ -175,6 +179,67 @@ uint64_t getPidHash(void) { return *pidHash; } +#if defined(MSCCLPP_USE_CUDA) && defined(NVML_GPU_FABRIC_UUID_LEN) +namespace { + +class NvmlState { + public: + NvmlState() : initialized_(nvmlInit_v2() == NVML_SUCCESS) {} + + ~NvmlState() { + if (initialized_) { + (void)nvmlShutdown(); + } + } + + bool isInitialized() const { return initialized_; } + + private: + bool initialized_ = false; +}; + +uint64_t getFabricHash(const nvmlGpuFabricInfo_t& fabricInfo) { + char hashData[NVML_GPU_FABRIC_UUID_LEN + sizeof(fabricInfo.cliqueId)]; + std::memcpy(hashData, fabricInfo.clusterUuid, NVML_GPU_FABRIC_UUID_LEN); + std::memcpy(hashData + NVML_GPU_FABRIC_UUID_LEN, &fabricInfo.cliqueId, sizeof(fabricInfo.cliqueId)); + return getHash(hashData, sizeof(hashData)); +} + +bool tryGetNvmlIpcDomainHash(uint64_t& ipcDomainHash) { + // Use the current CUDA device; callers must set the rank's device before querying. + int deviceId; + char pciBusId[] = "00000000:00:00.0"; + if (cudaGetDevice(&deviceId) != cudaSuccess || + cudaDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), deviceId) != cudaSuccess) { + return false; + } + + static NvmlState nvml; + nvmlDevice_t nvmlDevice; + nvmlGpuFabricInfo_t fabricInfo = {}; + if (!nvml.isInitialized() || nvmlDeviceGetHandleByPciBusId_v2(pciBusId, &nvmlDevice) != NVML_SUCCESS || + nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfo) != NVML_SUCCESS || + fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfo.status != NVML_SUCCESS) { + return false; + } + + ipcDomainHash = getFabricHash(fabricInfo); + return true; +} + +} // namespace +#endif + +uint64_t getIpcDomainHash(void) { +#if defined(MSCCLPP_USE_CUDA) && defined(NVML_GPU_FABRIC_UUID_LEN) + uint64_t ipcDomainHash; + if (tryGetNvmlIpcDomainHash(ipcDomainHash)) { + return ipcDomainHash; + } +#endif + return getHostHash(); +} + int parseStringList(const char* string, netIf* ifList, int maxList) { if (!string) return 0; diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index d1b4e731..49688f47 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -16,8 +16,8 @@ constexpr int kMaxThreadsPerBlock = 1024; template __global__ void __launch_bounds__(1024, 1) allgatherFullmesh(void* buff, void* scratch, void* resultBuff, DeviceHandle* memoryChannels, - int rank, int nRanksPerNode, [[maybe_unused]] int worldSize, size_t nelems) { - const int nPeer = nRanksPerNode - 1; + int rank, int nRanksPerIpcDomain, [[maybe_unused]] int worldSize, size_t nelems) { + const int nPeer = nRanksPerIpcDomain - 1; const size_t chanOffset = nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by 16 const size_t nInt4 = nelems * sizeof(int) / sizeof(int4); @@ -33,10 +33,11 @@ __global__ void __launch_bounds__(1024, 1) const size_t restNInt4 = nInt4 % nInt4PerChunk; const size_t scratchChunkRankOffset = nInt4PerChunk * rank; - __shared__ DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; + __shared__ DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = threadIdx.x % WARP_SIZE; - if (lid < nPeer) { - channels[lid] = memoryChans[lid]; + // Peer count may exceed WARP_SIZE on MNNVL. + for (int i = lid; i < nPeer; i += WARP_SIZE) { + channels[i] = memoryChans[i]; } __syncwarp(); const int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -138,11 +139,11 @@ CommResult AllgatherFullmesh::allgatherKernelFunc(const std::shared_ptr ct if ((char*)input == (char*)output + rank * inputSize) { allgatherFullmesh<<>>( (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank, - ctx->nRanksPerNode, ctx->workSize, nElem); + ctx->nRanksPerIpcDomain, ctx->worldSize, nElem); } else { allgatherFullmesh<<>>( (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank, - ctx->nRanksPerNode, ctx->workSize, nElem); + ctx->nRanksPerIpcDomain, ctx->worldSize, nElem); } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -156,8 +157,8 @@ std::shared_ptr AllgatherFullmesh::initAllgatherContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // setup semaphores ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, kMaxBlocks); diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu index 3500c0c4..11f211ca 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu @@ -11,12 +11,12 @@ namespace collective { template __global__ void __launch_bounds__(1024, 1) allgatherFullmesh2(void* sendbuff, mscclpp::DeviceHandle* memoryChannels, - size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, - size_t nelemsPerGPU) { + size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, + size_t nRanksPerIpcDomain, size_t nelemsPerGPU) { const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; const size_t lid = tid % WARP_SIZE; const size_t wid = tid / WARP_SIZE; - const size_t nPeer = nRanksPerNode - 1; + const size_t nPeer = nRanksPerIpcDomain - 1; // Round down to multiple of peer count. const size_t nThread = (blockDim.x * gridDim.x) / WARP_SIZE / nPeer * nPeer * WARP_SIZE; @@ -162,12 +162,12 @@ CommResult AllgatherFullmesh2::allgatherKernelFunc(const std::shared_ptr c size_t channelOutOffset = *static_cast(ctx->extras["channel_out_offset"].get()); if ((char*)input == (char*)output + rank * inputSize) { allgatherFullmesh2<<>>( - (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize, - ctx->nRanksPerNode, nElem); + (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->worldSize, + ctx->nRanksPerIpcDomain, nElem); } else { allgatherFullmesh2<<>>( - (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize, - ctx->nRanksPerNode, nElem); + (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->worldSize, + ctx->nRanksPerIpcDomain, nElem); } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -181,8 +181,8 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // setup semaphores ctx->memorySemaphores = this->memorySemaphores_; diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index 49058f59..0e34be71 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -14,14 +14,11 @@ namespace collective { template __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, - size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, + size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags, uint32_t flagSize) { - // This version of allreduce only works for single nodes - if (worldSize != nRanksPerNode) return; - if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); - const int nPeers = nRanksPerNode - 1; + const int nPeers = nRanksPerIpcDomain - 1; uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t scratchBaseOffset = (flag % numScratchBuff) ? (scratchBufferSize / numScratchBuff) : 0; @@ -71,25 +68,25 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand } } -inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int worldSize) { - if (inputSize < worldSize * sizeof(int)) { - return {worldSize - 1, (worldSize - 1) * WARP_SIZE}; +inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int nRanksPerIpcDomain) { + if (inputSize < nRanksPerIpcDomain * sizeof(int)) { + return {nRanksPerIpcDomain - 1, (nRanksPerIpcDomain - 1) * WARP_SIZE}; } - return {(worldSize - 1) * 4, 512}; + return {(nRanksPerIpcDomain - 1) * 4, 512}; } template struct AllpairAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, - size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, + size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); allreduceAllPairs<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize); + nRanksPerIpcDomain, worldSize, nelems, numScratchBuff, flags, flagSize); return cudaGetLastError(); } }; @@ -108,16 +105,22 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr&, DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); + if (algoCtx->worldSize != algoCtx->nRanksPerIpcDomain) { + WARN(ALGO, + "AllreduceAllpairPacket requires worldSize to match nRanksPerIpcDomain, got worldSize=", algoCtx->worldSize, + ", nRanksPerIpcDomain=", algoCtx->nRanksPerIpcDomain); + return CommResult::CommInvalidArgument; + } std::pair blockAndThreadNum{nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize); + blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->nRanksPerIpcDomain); } if (blockAndThreadNum.first > maxBlockNum_) { WARN(ALGO, "Requested block number ", blockAndThreadNum.first, " exceeds the maximum supported block number ", maxBlockNum_, "."); return CommResult::CommInvalidArgument; } - const int nPeers = algoCtx->nRanksPerNode - 1; + const int nPeers = algoCtx->nRanksPerIpcDomain - 1; // The kernel maps peer sends by warpId, so every peer needs a full warp. if (blockAndThreadNum.second % WARP_SIZE != 0 || blockAndThreadNum.second / WARP_SIZE < nPeers) { WARN(ALGO, @@ -138,8 +141,8 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptrscratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr, - nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerNode, - algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, + nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerIpcDomain, + algoCtx->worldSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN(ALGO, "AllreducePacket failed with error: ", cudaGetErrorString(error)); @@ -153,8 +156,8 @@ std::shared_ptr AllreduceAllpairPacket::initAllreduceContext(std::shared_p auto ctx = std::make_shared(); const int nChannelsPerConnection = maxBlockNum_; ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index 24d2a31c..eb872624 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -9,12 +9,23 @@ namespace mscclpp { namespace collective { +namespace { +// Per-context cache of input-side MemoryChannels keyed by input pointer. +// Lifetime is tied to AlgorithmCtx, so entries are released when the ctx is +// evicted from the framework's context cache (avoids unbounded growth across +// allreduce calls that pass different input buffers). +using InputChannelsCache = + std::unordered_map, std::shared_ptr>>>; +constexpr const char* kInputChannelsExtraKey = "inputChannels"; +} // namespace + template __global__ void __launch_bounds__(512, 1) allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* memoryOutChannels, size_t channelOutDataOffset, int rank, - int nRanksPerNode, int worldSize, size_t nelems) { - const int nPeer = nRanksPerNode - 1; + int nRanksPerIpcDomain, int worldSize, size_t nelems) { + const int nPeer = nRanksPerIpcDomain - 1; const size_t chanOffset = nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) const size_t nInt4 = nelems * sizeof(T) / sizeof(int4); @@ -49,12 +60,13 @@ __global__ void __launch_bounds__(512, 1) const size_t blockOffset = nInt4PerChunk * blockIdx.x; const size_t scratchChunkRankOffset = chunkSizePerRank * rank; - __shared__ DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; - __shared__ DeviceHandle outChannels[MAX_NRANKS_PER_NODE - 1]; + __shared__ DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; + __shared__ DeviceHandle outChannels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = threadIdx.x % WARP_SIZE; - if (lid < nPeer) { - channels[lid] = memoryChans[lid]; - outChannels[lid] = memoryOutChans[lid]; + // Peer count may exceed WARP_SIZE on MNNVL. + for (int i = lid; i < nPeer; i += WARP_SIZE) { + channels[i] = memoryChans[i]; + outChannels[i] = memoryOutChans[i]; } __syncwarp(); @@ -156,7 +168,7 @@ template struct AllreduceAllconnectAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels, DeviceHandle*, DeviceHandle*, size_t, - size_t channelOutDataOffset, size_t, int rank, int nRanksPerNode, int worldSize, + size_t channelOutDataOffset, size_t, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; @@ -165,7 +177,7 @@ struct AllreduceAllconnectAdapter { if (nThreadsPerBlock == 0) nThreadsPerBlock = 512; allreduceFullmesh<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels, - channelOutDataOffset, rank, nRanksPerNode, worldSize, nelems); + channelOutDataOffset, rank, nRanksPerIpcDomain, worldSize, nelems); return cudaGetLastError(); } }; @@ -194,17 +206,17 @@ CommResult AllreduceFullmesh::allreduceKernelFunc( MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); channelOutOffset = (char*)output - (char*)recvBasePtr; } - std::shared_ptr> inputChannelHandles; - if (this->memoryChannelsMap_.find(input) != this->memoryChannelsMap_.end()) { - inputChannelHandles = this->memoryChannelsMap_[input].second; - } else { + auto& inputChannelsCache = *static_cast(ctx->extras.at(kInputChannelsExtraKey).get()); + auto it = inputChannelsCache.find(input); + if (it == inputChannelsCache.end()) { RegisteredMemory localMemory = comm_->registerMemory(const_cast(input), inputSize, Transport::CudaIpc); std::vector channels = setupMemoryChannels(this->conns_, this->inputScratchSemaphores_, this->remoteScratchMemories_, localMemory, nChannelsPerConnection_); - this->memoryChannelsMap_[input] = std::make_pair(channels, setupMemoryChannelDeviceHandles(channels)); + auto handles = setupMemoryChannelDeviceHandles(channels); + it = inputChannelsCache.emplace(input, std::make_pair(std::move(channels), std::move(handles))).first; } - inputChannelHandles = this->memoryChannelsMap_[input].second; + std::shared_ptr> inputChannelHandles = it->second.second; AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { @@ -222,7 +234,7 @@ CommResult AllreduceFullmesh::allreduceKernelFunc( } cudaError_t error = allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(), - nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, + nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerIpcDomain, ctx->worldSize, inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error)); @@ -248,8 +260,8 @@ std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // setup semaphores ctx->memorySemaphores = this->outputSemaphores_; @@ -266,6 +278,7 @@ std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptrmemoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory, nChannelsPerConnection_); ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels); + ctx->extras.insert({kInputChannelsExtraKey, std::make_shared()}); return ctx; } diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index 2d71cd63..ea34ce69 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -20,15 +20,15 @@ __global__ void __launch_bounds__(1024, 1) [[maybe_unused]] DeviceHandle* memoryChannels, [[maybe_unused]] DeviceHandle* switchChannels, [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize, - [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) { + [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerIpcDomain) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 constexpr int alignment = 16; - int nPeers = nRanksPerNode - 1; - int nBlocksForCopy = nRanksPerNode * 2; - int nBlocksForReduce = nRanksPerNode; + int nPeers = nRanksPerIpcDomain - 1; + int nBlocksForCopy = nRanksPerIpcDomain * 2; + int nBlocksForReduce = nRanksPerIpcDomain; int copyReduceRatio = nBlocksForCopy / nBlocksForReduce; - size_t scratchSizePerRank = scratchBufferSize / nRanksPerNode; - size_t sizePerRank = size / nRanksPerNode; + size_t scratchSizePerRank = scratchBufferSize / nRanksPerIpcDomain; + size_t sizePerRank = size / nRanksPerIpcDomain; assert(sizePerRank % alignment == 0); uint32_t sizePerBlock = ((sizePerRank + (nBlocksForCopy - 1)) / nBlocksForCopy + alignment - 1) / alignment * alignment; @@ -68,7 +68,7 @@ __global__ void __launch_bounds__(1024, 1) deviceSemaphore[bid + 2 * nBlocksForCopy].acquire(); } __syncthreads(); - for (int i = 0; i < nRanksPerNode; i++) { + for (int i = 0; i < nRanksPerIpcDomain; i++) { size_t blockOffset = it * unitSize + bid * sizePerBlock + i * sizePerRank; uint32_t scratchOffset = scratchIt * unitSize + bid * scratchSizePerBlock + i * scratchSizePerRank; char* srcData = (char*)src + blockOffset; @@ -125,7 +125,7 @@ __global__ void __launch_bounds__(1024, 1) channels->wait(); } __syncthreads(); - for (int i = 0; i < nRanksPerNode; i++) { + for (int i = 0; i < nRanksPerIpcDomain; i++) { size_t blockOffset = it * unitSize + (bid - nBlocksForCopy - nBlocksForReduce) * sizePerBlock + i * sizePerRank; uint32_t scratchOffset = scratchIt * unitSize + (bid - nBlocksForCopy - nBlocksForReduce) * scratchSizePerBlock + @@ -150,7 +150,7 @@ template struct NvlsBlockPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, - size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, + size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { @@ -166,9 +166,9 @@ struct NvlsBlockPipelineAdapter { #endif { using ChannelType = DeviceHandle; - allreduceNvlsBlockPipeline - <<>>(input, scratch, output, (ChannelType*)memoryChannels, - nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode); + allreduceNvlsBlockPipeline<<>>( + input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank, + nRanksPerIpcDomain); return cudaGetLastError(); } } @@ -176,7 +176,10 @@ struct NvlsBlockPipelineAdapter { void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; - int nBaseChannels = 64; + fp8NvlsSupported_ = isFp8NvlsSupported(); + int nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); + // Per-peer channel allocation must hold up to 4 * nRanksPerIpcDomain entries (see kernel). + int nBaseChannels = std::max(64, 4 * nRanksPerIpcDomain); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = @@ -187,12 +190,15 @@ void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } -CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, - void* output, size_t inputSize, DataType dtype, ReduceOp op, - cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras, - DataType accumDtype) { +CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc( + const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, + ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); + if (isNativeFp8DataType(dtype) && !fp8NvlsSupported_) { + WARN("FP8 NVLS allreduce requires device support for FP8 multimem reduction."); + return CommResult::CommInvalidArgument; + } AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); @@ -200,11 +206,11 @@ CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr } std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = {ctx->nRanksPerNode * 5, 1024}; + blockAndThreadNum = {ctx->nRanksPerIpcDomain * 5, 1024}; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0, + ctx->rank, ctx->nRanksPerIpcDomain, ctx->worldSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreduceNvlsBlockPipeline failed with error: %s", cudaGetErrorString(error)); @@ -221,12 +227,12 @@ std::shared_ptr AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar void*, size_t, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // setup channels ctx->switchChannels = - setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); + setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index a616485e..98d9e1a3 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -82,7 +82,7 @@ void AllreduceNvlsPacket::initialize(std::shared_ptr comm) { int nSwitchChannels = 1; this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels); this->switchChannels_ = - setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); + setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); } AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { @@ -93,8 +93,8 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< size_t, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // setup channels ctx->switchChannels = this->switchChannels_; @@ -123,7 +123,7 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr } cudaError_t error = allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, - 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, + 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerIpcDomain, ctx->worldSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN(ALGO, "AllreduceNvlsPacket failed with error: ", cudaGetErrorString(error)); @@ -154,4 +154,4 @@ std::shared_ptr AllreduceNvlsPacket::build() { }); } } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index 3bb054da..b11fef8e 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -18,15 +18,15 @@ __global__ void __launch_bounds__(1024, 1) [[maybe_unused]] DeviceHandle* memoryChannels, [[maybe_unused]] DeviceHandle* multicast, [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank, - [[maybe_unused]] int nRanksPerNode) { + [[maybe_unused]] int nRanksPerIpcDomain) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 constexpr int alignment = 16; - int nPeers = nRanksPerNode - 1; + int nPeers = nRanksPerIpcDomain - 1; int nBlocks = gridDim.x; int nBlocksPerNvlsConn = nBlocks / NUM_NVLS_CONNECTION; int bid = blockIdx.x; - size_t sizePerRank = size / nRanksPerNode; - size_t scratchSizePerRank = scratchBufferSize / nRanksPerNode; + size_t sizePerRank = size / nRanksPerIpcDomain; + size_t scratchSizePerRank = scratchBufferSize / nRanksPerIpcDomain; const size_t maxSizePerBlock = ((sizePerRank + nBlocks - 1) / nBlocks + alignment - 1) / alignment * alignment; size_t start = bid * maxSizePerBlock; size_t end = min(start + maxSizePerBlock, sizePerRank); @@ -53,19 +53,20 @@ __global__ void __launch_bounds__(1024, 1) lastIterSize = sizePerBlock % copyPerIter; } - const size_t chanOffset = (nRanksPerNode - 1) * blockIdx.x * 2; + const size_t chanOffset = (nRanksPerIpcDomain - 1) * blockIdx.x * 2; auto memoryChans = memoryChannels + chanOffset; - __shared__ DeviceHandle channels[(MAX_NRANKS_PER_NODE - 1) * 2]; + __shared__ DeviceHandle channels[(MAX_IPC_DOMAIN_NRANKS - 1) * 2]; const int lid = threadIdx.x % WARP_SIZE; - if (lid < nPeers * 2) { - channels[lid] = memoryChans[lid]; + // Peer count may exceed WARP_SIZE on MNNVL. + for (int i = lid; i < nPeers * 2; i += WARP_SIZE) { + channels[i] = memoryChans[i]; } __syncwarp(); for (int it = 0; it < nIter; it++) { const size_t iterSize = (it == nIter - 1) ? lastIterSize : copyPerIter; if (warpId < endCopyWid) { int tidInCopy = threadIdx.x; - for (int i = 0; i < nRanksPerNode; i++) { + for (int i = 0; i < nRanksPerIpcDomain; i++) { size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter; size_t offsetScratch = i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock; @@ -96,7 +97,7 @@ __global__ void __launch_bounds__(1024, 1) channels[tidInRecvCopy + nPeers].wait(); } asm volatile("bar.sync %0, %1;" ::"r"(3), "r"((NRECV_COPY_WARPS)*WARP_SIZE) : "memory"); - for (int i = 0; i < nRanksPerNode; i++) { + for (int i = 0; i < nRanksPerIpcDomain; i++) { size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter; size_t offsetScratch = i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock; @@ -113,7 +114,7 @@ template struct NvlsWarpPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, - size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, + size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { @@ -129,17 +130,20 @@ struct NvlsWarpPipelineAdapter { #endif { using ChannelType = DeviceHandle; - allreduceNvlsWarpPipeline - <<>>(input, scratch, output, (ChannelType*)memoryChannels, - nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode); + allreduceNvlsWarpPipeline<<>>( + input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank, + nRanksPerIpcDomain); return cudaGetLastError(); } } }; void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { - nSwitchChannels_ = 8; - int nBaseChannels = 64; + nSwitchChannels_ = NUM_NVLS_CONNECTION; + fp8NvlsSupported_ = isFp8NvlsSupported(); + int nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); + // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * nRanksPerIpcDomain. + int nBaseChannels = std::max(64, 8 * nRanksPerIpcDomain); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = @@ -155,6 +159,10 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc( ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); + if (isNativeFp8DataType(dtype) && !fp8NvlsSupported_) { + WARN("FP8 NVLS allreduce requires device support for FP8 multimem reduction."); + return CommResult::CommInvalidArgument; + } AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); @@ -162,11 +170,11 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc( } std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = {ctx->nRanksPerNode * 4, 1024}; + blockAndThreadNum = {ctx->nRanksPerIpcDomain * 4, 1024}; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0, + ctx->rank, ctx->nRanksPerIpcDomain, ctx->worldSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreduceNvlsWarpPipeline failed with error: %s", cudaGetErrorString(error)); @@ -183,12 +191,12 @@ std::shared_ptr AllreduceNvlsWarpPipeline::initAllreduceContext(std::share void*, size_t, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // setup channels ctx->switchChannels = - setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); + setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index e7f2028f..57858c17 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -6,25 +6,25 @@ #include "allreduce/allreduce_nvls_zero_copy.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" -#include "debug.h" +#include "logger.hpp" namespace mscclpp { namespace collective { constexpr int MAX_NBLOCKS = 32; -template +template __global__ void __launch_bounds__(1024, 1) allreduceNvls([[maybe_unused]] mscclpp::DeviceHandle* memoryChannels, [[maybe_unused]] mscclpp::DeviceHandle* multicast, [[maybe_unused]] mscclpp::DeviceHandle* multicastOut, [[maybe_unused]] size_t channelInOffset, [[maybe_unused]] size_t channelOutOffset, - [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) { + [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerIpcDomain) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 - int nPeers = nRanksPerNode - 1; + int nPeers = nRanksPerIpcDomain - 1; int nBlocks = gridDim.x; int bid = blockIdx.x; - size_t sizePerRank = size / nRanksPerNode; + size_t sizePerRank = size / nRanksPerIpcDomain; const size_t minAlign = 16; // Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks; @@ -40,12 +40,13 @@ __global__ void __launch_bounds__(1024, 1) mscclpp::DeviceHandle* multicastPtr = multicast + bid; mscclpp::DeviceHandle* multicastOutPtr = multicastOut + bid; - const size_t chanOffset = (nRanksPerNode - 1) * blockIdx.x; + const size_t chanOffset = (nRanksPerIpcDomain - 1) * blockIdx.x; auto memoryChans = memoryChannels + chanOffset; - __shared__ mscclpp::DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = threadIdx.x % WARP_SIZE; - if (lid < nRanksPerNode - 1) { - channels[lid] = memoryChans[lid]; + // Peer count may exceed WARP_SIZE on MNNVL. + for (int i = lid; i < nRanksPerIpcDomain - 1; i += WARP_SIZE) { + channels[i] = memoryChans[i]; } __syncwarp(); if (threadIdx.x < nPeers) { @@ -56,8 +57,8 @@ __global__ void __launch_bounds__(1024, 1) T* src = (T*)multicastPtr->mcPtr; T* dst = (T*)multicastOutPtr->mcPtr; if (curBlockSize > 0) { - handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, curBlockSize, - threadIdx.x, blockDim.x); + handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, + curBlockSize, threadIdx.x, blockDim.x); } __syncthreads(); if (threadIdx.x < nPeers) { @@ -72,7 +73,7 @@ struct NvlsAdapter { static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*, mscclpp::DeviceHandle* nvlsChannels, mscclpp::DeviceHandle* nvlsOutChannels, size_t channelInOffset, - size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t inputSize, + size_t channelOutOffset, size_t, int rank, int nRanksPerIpcDomain, int, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { @@ -80,17 +81,11 @@ struct NvlsAdapter { } else if constexpr (std::is_same_v) { // fp8_e4m3b15 is a software-only type with no hardware NVLS support. return cudaErrorNotSupported; - } else -#if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000) - if constexpr (std::is_same_v || std::is_same_v) { - return cudaErrorNotSupported; - } else -#endif - { + } else { using ChannelType = DeviceHandle; - allreduceNvls<<>>((ChannelType*)memoryChannels, nvlsChannels, - nvlsOutChannels, channelInOffset, channelOutOffset, - inputSize, rank, nRanksPerNode); + allreduceNvls<<>>( + (ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, inputSize, + rank, nRanksPerIpcDomain); return cudaGetLastError(); } } @@ -102,6 +97,7 @@ void AllreduceNvls::initialize(std::shared_ptr comm) { cudaDeviceProp deviceProp; MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device)); computeCapabilityMajor_ = deviceProp.major; + fp8NvlsSupported_ = isFp8NvlsSupported(); nSwitchChannels_ = 32; this->conns_ = setupConnections(comm); // setup semaphores @@ -120,13 +116,17 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo [[maybe_unused]] const std::unordered_map& extras, mscclpp::DataType accumDtype) { if (!symmetricMemory_) { - WARN("AllreduceNvls requires symmetric memory for now."); + WARN(ALGO, "AllreduceNvls requires symmetric memory."); return CommResult::CommInvalidArgument; } auto ctx = std::static_pointer_cast(ctx_void); + if (isNativeFp8DataType(dtype) && !fp8NvlsSupported_) { + WARN(ALGO, "FP8 NVLS allreduce requires device support for FP8 multimem reduction."); + return CommResult::CommInvalidArgument; + } AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { - WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); + WARN(ALGO, "Unsupported operation or data type for allreduce, dtype=", static_cast(dtype)); return CommResult::CommInvalidArgument; } size_t sendBytes, recvBytes; @@ -142,24 +142,28 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) { - numBlocksAndThreads = {::min(ctx->nRanksPerNode, MAX_NBLOCKS), 1024}; + numBlocksAndThreads = {::min(ctx->nRanksPerIpcDomain, MAX_NBLOCKS), 1024}; // For GB200 devices with MNNVLS (Multi-Node NVLink Sharp), scale the number of blocks inversely with // the number of GPUs. Empirically, 32 blocks works well for 4 GPUs and 16 for 8 GPUs, which // follows the formula 128 / nGPUs, clamped to [1, MAX_NBLOCKS]. if (computeCapabilityMajor_ == 10) { - numBlocksAndThreads.first = ::max(1, ::min(128 / ctx->workSize, MAX_NBLOCKS)); + numBlocksAndThreads.first = ::max(1, ::min(128 / ctx->worldSize, MAX_NBLOCKS)); } } if (numBlocksAndThreads.first > MAX_NBLOCKS) { - WARN("Number of blocks exceeds maximum supported value of %d", MAX_NBLOCKS); + WARN(ALGO, "Number of blocks exceeds maximum supported value of ", MAX_NBLOCKS); return CommResult::CommInvalidArgument; } - cudaError_t error = - allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels, - nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, - inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + cudaError_t error = allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, + nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, + ctx->nRanksPerIpcDomain, ctx->worldSize, inputSize, stream, nullptr, 0, 0, + numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { - WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error)); + if (error == cudaErrorNotSupported) { + WARN(ALGO, "AllreduceNvls does not support the requested data type."); + return CommResult::CommInvalidArgument; + } + WARN(ALGO, "AllreduceNvls failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; @@ -179,20 +183,24 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); + INFO(ALGO, "AllreduceNvls init context: rank=", ctx->rank, ", sendBasePtr=", (void*)sendBasePtr, + ", recvBasePtr=", (void*)recvBasePtr, ", sendBytes=", sendBytes, ", recvBytes=", recvBytes, + ", inputOffset=", (char*)input - (char*)sendBasePtr, ", outputOffset=", (char*)output - (char*)recvBasePtr); // setup channels - ctx->switchChannels = setupNvlsChannels(this->nvlsConnections_, (void*)sendBasePtr, sendBytes, nSwitchChannels_); + ctx->switchChannels = + setupNvlsChannels(comm, this->nvlsConnections_, (void*)sendBasePtr, sendBytes, nSwitchChannels_); if (input != output) { auto nvlsOutConnections = this->nvlsOutConnections_; std::vector outChannels = - setupNvlsChannels(this->nvlsOutConnections_, (void*)recvBasePtr, recvBytes, nSwitchChannels_); + setupNvlsChannels(comm, this->nvlsOutConnections_, (void*)recvBasePtr, recvBytes, nSwitchChannels_); ctx->switchChannels.insert(ctx->switchChannels.end(), outChannels.begin(), outChannels.end()); } diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index 414c2b1f..696bdc3d 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -15,7 +15,7 @@ namespace collective { template __global__ void __launch_bounds__(1024, 1) allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, - size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, + size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff #if defined(ENABLE_NPKIT) , @@ -23,9 +23,6 @@ __global__ void __launch_bounds__(1024, 1) #else ) { #endif - // This version of allreduce only works for single nodes - if (worldSize != nRanksPerNode) return; - #if defined(ENABLE_NPKIT) extern __shared__ int4 NpkitSharedMem[]; NpKitEvent* event_buffer = (NpKitEvent*)((char*)NpkitSharedMem); @@ -56,7 +53,7 @@ __global__ void __launch_bounds__(1024, 1) else nelems = nelems / (sizeof(int) / sizeof(T)); - const int nPeers = nRanksPerNode - 1; + const int nPeers = nRanksPerIpcDomain - 1; const size_t nPkts = nelems / 2; uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; @@ -81,10 +78,11 @@ __global__ void __launch_bounds__(1024, 1) uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // Put channels into shared memory, read channel info from global memory is unexpectable slow. - __shared__ mscclpp::DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = tid % WARP_SIZE; - if (lid < nPeers) { - channels[lid] = memoryChannels[lid]; + // Peer count may exceed WARP_SIZE on MNNVL. + for (int i = lid; i < nPeers; i += WARP_SIZE) { + channels[i] = memoryChannels[i]; } __syncwarp(); // step 1: write to scratch buffer @@ -156,31 +154,32 @@ template struct PacketAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, - size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, + size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff, int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); - // Optimize the number of blocks to be multiple of (worldSize - 1) - nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1); + // Optimize the number of blocks to be multiple of the IPC-domain peer count. + const int nPeers = nRanksPerIpcDomain - 1; + nBlocks = nBlocks / nPeers * nPeers; #if defined(ENABLE_NPKIT) size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS; allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(), - NpKit::GetCpuTimestamp()); + nRanksPerIpcDomain, worldSize, nelems, flags, flagBufferSize, numScratchBuff, + NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff); + nRanksPerIpcDomain, worldSize, nelems, flags, flagBufferSize, numScratchBuff); #endif return cudaGetLastError(); } }; -inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int nRanksPerNode, int worldSize, +inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int nRanksPerIpcDomain, int worldSize, [[maybe_unused]] DataType dtype) { - int nBlocks = (nRanksPerNode - 1) * 4; + int nBlocks = (nRanksPerIpcDomain - 1) * 4; int nThreadsPerBlock = 1024; if (inputSize >= 32768) { nBlocks = (worldSize - 1) * 8; @@ -198,12 +197,7 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int // FP8-specific tuning for 32KB-256KB range { - bool isFp8 = dtype == DataType::FLOAT8_E4M3B15; -#if defined(__FP8_TYPES_EXIST__) - isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ || - dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ; -#endif - if (isFp8) { + if (isFp8DataType(dtype)) { if (inputSize < (64 << 10)) { nThreadsPerBlock = 64; } else if (inputSize >= (64 << 10) && inputSize <= (128 << 10)) { @@ -231,9 +225,19 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ const std::unordered_map&, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); + if (ctx->worldSize != ctx->nRanksPerIpcDomain) { + WARN(ALGO, "AllreducePacket requires worldSize to match nRanksPerIpcDomain, got worldSize=", ctx->worldSize, + ", nRanksPerIpcDomain=", ctx->nRanksPerIpcDomain); + return CommResult::CommInvalidArgument; + } std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->workSize, ctx->nRanksPerNode, dtype); + blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->nRanksPerIpcDomain, ctx->worldSize, dtype); + } else { + const int nPeers = ctx->nRanksPerIpcDomain - 1; + if (blockAndThreadNum.first < nPeers) { + return CommResult::CommInvalidArgument; + } } if (blockAndThreadNum.first > maxBlockNum_) { WARN(ALGO, "Requested block number ", blockAndThreadNum.first, " exceeds the maximum supported block number ", @@ -261,8 +265,8 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ } cudaError_t error = allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr, - channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, - stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, + channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerIpcDomain, ctx->worldSize, + inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN(ALGO, "AllreducePacket failed with error: ", cudaGetErrorString(error)); @@ -276,8 +280,8 @@ std::shared_ptr AllreducePacket::initAllreduceContext(std::shared_ptr(); const int nChannelsPerConnection = maxBlockNum_; ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index db471b93..2d6d4dec 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +#include + #include "allreduce/allreduce_rsag.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" @@ -28,21 +30,21 @@ namespace collective { // // Data is processed in int4-sized (16-byte) units for coalesced memory access, // with special handling for any remainder elements at the tail. -template +template __global__ void __launch_bounds__(1024, 1) allreduceRsAg(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, - DeviceHandle* switchChannels, void* remoteMemories, int rank, int nRanksPerNode, + DeviceHandle* switchChannels, void* remoteMemories, int rank, int nRanksPerIpcDomain, int worldSize, size_t nelems) { int blockId = blockIdx.x; - uint32_t nPeers = nRanksPerNode - 1; + uint32_t nPeers = nRanksPerIpcDomain - 1; assert((uintptr_t)buff % sizeof(int4) == 0); assert((uintptr_t)resultBuff % sizeof(int4) == 0); constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); - uint32_t alignedNelems = ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * - nelemsPerInt4 * nRanksPerNode; - uint32_t nelemsPerRank = alignedNelems / nRanksPerNode; + uint32_t alignedNelems = ((nelems + nRanksPerIpcDomain - 1) / nRanksPerIpcDomain + nelemsPerInt4 - 1) / + nelemsPerInt4 * nelemsPerInt4 * nRanksPerIpcDomain; + uint32_t nelemsPerRank = alignedNelems / nRanksPerIpcDomain; uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; uint32_t lastInt4Index = nelems / nelemsPerInt4; uint32_t remainder = nelems % nelemsPerInt4; @@ -51,6 +53,7 @@ __global__ void __launch_bounds__(1024, 1) int4* resultBuff4 = reinterpret_cast((char*)resultBuff); int4* buff4 = reinterpret_cast((char*)buff); DeviceHandle* memoryChannelsLocal = memoryChannels + blockId * nPeers; + using AccumVec = std::conditional_t, int4, mscclpp::VectorType>; uint32_t nInt4PerBlock = nInt4PerRank / gridDim.x; uint32_t remainderForBlock = nInt4PerRank % gridDim.x; @@ -59,7 +62,7 @@ __global__ void __launch_bounds__(1024, 1) nInt4PerBlock += remainderForBlock; } if (nInt4PerBlock == 0) return; - uint32_t nInt4ForCopy = nInt4PerBlock * nRanksPerNode; + uint32_t nInt4ForCopy = nInt4PerBlock * nRanksPerIpcDomain; for (uint32_t idx = threadIdx.x; idx < nInt4ForCopy; idx += blockDim.x) { int rankIdx = idx / nInt4PerBlock; @@ -82,15 +85,16 @@ __global__ void __launch_bounds__(1024, 1) for (uint32_t idx = threadIdx.x; idx < nInt4PerBlock; idx += blockDim.x) { uint32_t offset = idx + offset4 + rank * nInt4PerRank; if (offset > lastInt4Index) continue; - int4 tmp = scratch4[offset]; + AccumVec acc = mscclpp::upcastVector(scratch4[offset]); for (uint32_t i = 0; i < nPeers; i++) { - int rankIdx = (rank + i + 1) % nRanksPerNode; + int rankIdx = (rank + i + 1) % nRanksPerIpcDomain; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; int4 data = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); - tmp = calVector(data, tmp); + acc = mscclpp::calVectorAccum(acc, data); } + int4 tmp = mscclpp::downcastVector(acc); for (uint32_t i = 0; i < nPeers; i++) { - int rankIdx = (rank + i + 1) % nRanksPerNode; + int rankIdx = (rank + i + 1) % nRanksPerIpcDomain; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; mscclpp::write(((void**)remoteMemories)[peerIdx], offset, tmp); } @@ -127,17 +131,17 @@ template struct AllreduceRsAgAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, - size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream, - void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + size_t, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); if (nBlocks == 0 || nThreadsPerBlock == 0) { nThreadsPerBlock = 1024; nBlocks = 64; } - allreduceRsAg<<>>( + allreduceRsAg<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, - nRanksPerNode, worldSize, nelems); + nRanksPerIpcDomain, worldSize, nelems); return cudaGetLastError(); } }; @@ -179,9 +183,13 @@ CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr ctx, c return CommResult::CommInvalidArgument; } std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + if (numBlocksAndThreads.first > nChannelsPerConnection_) { + WARN(ALGO, "Block number ", numBlocksAndThreads.first, " exceeds the maximum limit ", nChannelsPerConnection_); + return CommResult::CommInvalidArgument; + } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(), this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, 0, algoCtx->rank, - algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0, 0, + algoCtx->nRanksPerIpcDomain, algoCtx->worldSize, inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); @@ -198,8 +206,8 @@ std::shared_ptr AllreduceRsAg::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu index eabe3dc5..4b243444 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu @@ -86,15 +86,15 @@ template __global__ void __launch_bounds__(1024, 1) allreduceRsAgPipeline(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* switchChannels, void* remoteMemories, int rank, - int nRanksPerNode, int worldSize, size_t nelems, size_t scratchSize, uint32_t nblocksForPut, - uint32_t nblocksForReduce, uint32_t nblocksForRecv) { + int nRanksPerIpcDomain, int worldSize, size_t nelems, size_t scratchSize, + uint32_t nblocksForPut, uint32_t nblocksForReduce, uint32_t nblocksForRecv) { uint32_t bid = blockIdx.x; constexpr uint32_t nStepsPerIter = 4; uint32_t nInt4 = (nelems * sizeof(T) + sizeof(int4) - 1) / sizeof(int4); uint32_t nInt4PerIter = nblocksForReduce * blockDim.x * nStepsPerIter; const uint32_t chunkSize = nInt4PerIter * worldSize; uint32_t nIters = (nInt4 + chunkSize - 1) / chunkSize; - uint32_t nPeers = nRanksPerNode - 1; + uint32_t nPeers = nRanksPerIpcDomain - 1; int4* scratch4 = reinterpret_cast((char*)scratch); const uint32_t scratchIterStride = 2 * chunkSize; // one for AS, one for AG const uint32_t pipelineDepth = scratchSize / sizeof(int4) / scratchIterStride; @@ -111,7 +111,7 @@ __global__ void __launch_bounds__(1024, 1) __syncthreads(); uint32_t threadIdInPut = bid * blockDim.x + threadIdx.x; for (uint32_t peer = 0; peer < nPeers; peer++) { - int remoteRankId = (rank + peer + 1) % nRanksPerNode; + int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain; int peerId = remoteRankId < rank ? remoteRankId : remoteRankId - 1; // Read chunk[remoteRankId] from local buff, write to peer's scratch[rank] (sender's slot) uint32_t srcOffset = iter * chunkSize + remoteRankId * nInt4PerIter; @@ -164,7 +164,7 @@ __global__ void __launch_bounds__(1024, 1) int4 tmp = loadVec(buff, myChunkOffset, nelems); // Add data from each peer's slot in scratch (peer sent their chunk[rank] to our scratch[peer]) for (uint32_t peer = 0; peer < nPeers; peer++) { - int remoteRankId = (rank + peer + 1) % nRanksPerNode; + int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain; uint32_t peerSlotOffset = baseOffset + remoteRankId * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; int4 data = scratch4[peerSlotOffset]; @@ -175,7 +175,7 @@ __global__ void __launch_bounds__(1024, 1) uint32_t dstOffset = baseOffset + chunkSize + rank * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; for (uint32_t i = 0; i < nPeers; i++) { - int peerIdx = (rank + i + 1) % nRanksPerNode; + int peerIdx = (rank + i + 1) % nRanksPerIpcDomain; int index = peerIdx < rank ? peerIdx : peerIdx - 1; mscclpp::write(((void**)remoteMemories)[index], dstOffset, tmp); } @@ -203,7 +203,7 @@ __global__ void __launch_bounds__(1024, 1) __syncthreads(); // Copy other ranks' reduced chunks from scratch to result for (uint32_t peer = 0; peer < nPeers; peer++) { - int remoteRankId = (rank + peer + 1) % nRanksPerNode; + int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain; for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) { uint32_t offset = baseOffset + chunkSize + remoteRankId * nInt4PerIter + threadIdInRecv + step * blockDim.x * nblocksForRecv; @@ -224,7 +224,7 @@ template struct AllreduceRsAgPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, - size_t scratchSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, + size_t scratchSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); @@ -248,7 +248,7 @@ struct AllreduceRsAgPipelineAdapter { } allreduceRsAgPipeline<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, - nRanksPerNode, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv); + nRanksPerIpcDomain, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv); return cudaGetLastError(); } }; @@ -288,8 +288,8 @@ CommResult AllreduceRsAgPipeline::allreduceKernelFunc( std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(), this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, this->scratchBufferSize_, - algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0, - 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + algoCtx->rank, algoCtx->nRanksPerIpcDomain, algoCtx->worldSize, inputSize, stream, + nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -305,8 +305,8 @@ std::shared_ptr AllreduceRsAgPipeline::initAllreduceContext(std::shared_pt void*, size_t, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index f95ba7e3..e7ed0cab 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -35,10 +35,10 @@ __device__ mscclpp::DeviceSyncer globalSyncer; // // This approach requires registering both input and output buffers as remote // memories (2 * nPeers handles), but avoids scratch buffer allocation and -// the extra copy steps of the standard RSAG. The NRanksPerNode template +// the extra copy steps of the standard RSAG. The NRanks template // parameter enables compile-time unrolling of peer loops (supports 4 or 8). -template +template __global__ void __launch_bounds__(1024, 1) allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* switchChannels, void* remoteMemories, int rank, int worldSize, @@ -48,12 +48,12 @@ __global__ void __launch_bounds__(1024, 1) assert((uintptr_t)buff % sizeof(int4) == 0); assert((uintptr_t)resultBuff % sizeof(int4) == 0); - constexpr int NPeers = NRanksPerNode - 1; + constexpr int NPeers = NRanks - 1; constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); - const uint32_t outputRemoteBufferOffset = NRanksPerNode - 1; - uint32_t alignedNelems = ((nelems + NRanksPerNode - 1) / NRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * - nelemsPerInt4 * NRanksPerNode; - uint32_t nelemsPerRank = alignedNelems / NRanksPerNode; + constexpr uint32_t outputRemoteBufferOffset = NPeers; + uint32_t alignedNelems = + ((nelems + NRanks - 1) / NRanks + nelemsPerInt4 - 1) / nelemsPerInt4 * nelemsPerInt4 * NRanks; + uint32_t nelemsPerRank = alignedNelems / NRanks; uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4; @@ -69,7 +69,7 @@ __global__ void __launch_bounds__(1024, 1) } if (nInt4PerBlock == 0) return; - if (threadIdx.x < NPeers) { + if ((int)threadIdx.x < NPeers) { memoryChannelsLocal[threadIdx.x].relaxedSignal(); memoryChannelsLocal[threadIdx.x].relaxedWait(); } @@ -86,18 +86,19 @@ __global__ void __launch_bounds__(1024, 1) int4 tmp_raw = buff4[offset]; #pragma unroll for (int i = 0; i < NPeers; i++) { - int rankIdx = (rank + i + 1) % NRanksPerNode; + int rankIdx = (rank + i + 1) % NRanks; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; data[i] = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); } AccumVec acc = mscclpp::upcastVector(tmp_raw); +#pragma unroll for (int i = 0; i < NPeers; i++) { acc = mscclpp::calVectorAccum(acc, data[i]); } int4 tmp = mscclpp::downcastVector(acc); #pragma unroll for (int i = 0; i < NPeers; i++) { - int rankIdx = (rank + i + 1) % NRanksPerNode; + int rankIdx = (rank + i + 1) % NRanks; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; mscclpp::write(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp); } @@ -105,7 +106,7 @@ __global__ void __launch_bounds__(1024, 1) } // Use device barrier gives better performance here. globalSyncer.sync(gridDim.x); - if (blockIdx.x == 0 && threadIdx.x < NPeers) { + if (blockIdx.x == 0 && (int)threadIdx.x < NPeers) { memoryChannelsLocal[threadIdx.x].signal(); memoryChannelsLocal[threadIdx.x].wait(); } @@ -115,8 +116,8 @@ template struct AllreduceRsAgZeroCopyAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, - size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream, - void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + size_t, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); if (nBlocks == 0 || nThreadsPerBlock == 0) { @@ -126,16 +127,17 @@ struct AllreduceRsAgZeroCopyAdapter { nBlocks = 128; } } - if (nRanksPerNode == 4) { + if (nRanksPerIpcDomain == 4) { allreduceRsAgZeroCopy<4, OpType, T, AccumT> <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, worldSize, nelems); - } else if (nRanksPerNode == 8) { + } else if (nRanksPerIpcDomain == 8) { allreduceRsAgZeroCopy<8, OpType, T, AccumT> <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, worldSize, nelems); } else { - THROW(ALGO, Error, ErrorCode::InvalidUsage, "Unsupported number of ranks per node: ", nRanksPerNode); + WARN(ALGO, "AllreduceRsAgZeroCopy only supports nRanksPerIpcDomain of 4 or 8, got: ", nRanksPerIpcDomain); + return cudaErrorInvalidValue; } return cudaGetLastError(); } @@ -164,11 +166,19 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + if (numBlocksAndThreads.first > nChannelsPerConnection_) { + WARN(ALGO, "Block number ", numBlocksAndThreads.first, " exceeds the maximum limit ", nChannelsPerConnection_); + return CommResult::CommInvalidArgument; + } cudaError_t error = allreduce(input, nullptr, output, this->baseMemoryChannelHandles_.get(), algoCtx->remoteMemoryHandles.get(), - nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, - nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerIpcDomain, algoCtx->worldSize, inputSize, + stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { + if (error == cudaErrorInvalidValue) { + WARN(ALGO, "AllreduceRsAgZeroCopy received invalid launch arguments: ", cudaGetErrorString(error)); + return CommResult::CommInvalidArgument; + } WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } @@ -193,16 +203,14 @@ std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt void* output, size_t size, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); - ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->worldSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->semaphores_; // register input and output memories RegisteredMemory inputMemory = comm->registerMemory((void*)input, size, Transport::CudaIpc); RegisteredMemory outputMemory = comm->registerMemory(output, size, Transport::CudaIpc); - this->inputMemories_.push_back(inputMemory); - this->outputMemories_.push_back(outputMemory); auto remoteInputMemories = setupRemoteMemories(comm, ctx->rank, inputMemory); auto remoteOutputMemories = setupRemoteMemories(comm, ctx->rank, outputMemory); diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cu similarity index 77% rename from src/ext/collectives/collective_utils.cc rename to src/ext/collectives/collective_utils.cu index 016c4a5c..6d4b005a 100644 --- a/src/ext/collectives/collective_utils.cc +++ b/src/ext/collectives/collective_utils.cu @@ -1,16 +1,93 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -#include "collective_utils.hpp" - #include #include #include +#include #include #include +#include "collective_utils.hpp" + namespace mscclpp { namespace collective { + +namespace { + +#if !defined(MSCCLPP_DEVICE_HIP) +__global__ void fp8NvlsSupportProbeKernel(int* supported) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \ + (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) + *supported = 1; +#else + *supported = 0; +#endif +} + +bool detectFp8NvlsSupport() { + AvoidCudaGraphCaptureGuard cgcGuard; + auto supportedDevice = mscclpp::detail::gpuCallocUnique(); + int supportedHost = 0; + auto stream = gpuStreamPool()->getStream(); + + fp8NvlsSupportProbeKernel<<<1, 1, 0, stream>>>(supportedDevice.get()); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + return false; + } + + MSCCLPP_CUDATHROW( + cudaMemcpyAsync(&supportedHost, supportedDevice.get(), sizeof(supportedHost), cudaMemcpyDeviceToHost, stream)); + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) { + (void)cudaGetLastError(); + return false; + } + return supportedHost != 0; +} +#endif + +} // namespace + +bool isFp8DataType(DataType dtype) { + return dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ || dtype == DataType::FLOAT8_E5M2 || + dtype == DataType::FLOAT8_E5M2FNUZ || dtype == DataType::FLOAT8_E4M3B15; +} + +bool isNativeFp8DataType(DataType dtype) { +#if defined(__FP8_TYPES_EXIST__) +#if defined(__FP8_E4M3_IS_FNUZ__) + if (dtype == DataType::FLOAT8_E4M3FNUZ) { + return true; + } +#else + if (dtype == DataType::FLOAT8_E4M3FN) { + return true; + } +#endif +#if defined(__FP8_E5M2_IS_FNUZ__) + if (dtype == DataType::FLOAT8_E5M2FNUZ) { + return true; + } +#else + if (dtype == DataType::FLOAT8_E5M2) { + return true; + } +#endif +#endif + return false; +} + +bool isFp8NvlsSupported() { +#if defined(MSCCLPP_DEVICE_HIP) + return false; +#else + static const bool supported = detectFp8NvlsSupport(); + return supported; +#endif +} + std::vector setupRemoteMemories(std::shared_ptr comm, int rank, mscclpp::RegisteredMemory localMemory) { std::vector remoteMemories; @@ -98,7 +175,8 @@ std::vector> setupNvlsConnections(std:: return nvlsConnections; } -std::vector setupNvlsChannels(std::vector> conns, +std::vector setupNvlsChannels(std::shared_ptr comm, + std::vector> conns, void* buffer, size_t bufferSize, int nSwitchChannels) { std::vector channels; @@ -107,6 +185,8 @@ std::vector setupNvlsChannels(std::vectorbindAllocatedMemory((CUdeviceptr)buffer, bufferSize); channels.push_back(switchChannel); } + // Synchronize to make sure all ranks have their NVLS channels set up before any rank starts using them. + comm->bootstrap()->barrier(); return channels; } @@ -153,4 +233,4 @@ std::shared_ptr> setupBaseMemo } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp index 64f5ec54..bba82ee5 100644 --- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp @@ -4,6 +4,7 @@ #include #include "allreduce/common.hpp" +#include "collective_utils.hpp" namespace mscclpp { namespace collective { diff --git a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp index a54352b3..e0c63a3d 100644 --- a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp @@ -30,8 +30,6 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { std::vector> inputScratchSemaphores_; std::vector remoteScratchMemories_; RegisteredMemory localScratchMemory_; - std::unordered_map, std::shared_ptr>>> - memoryChannelsMap_; bool symmetricMemory_ = false; }; } // namespace collective diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp index 81b74add..b408c64c 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp @@ -33,6 +33,7 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder { std::vector baseChannels_; std::vector conns_; std::vector> nvlsConnections_; + bool fp8NvlsSupported_{false}; }; } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp index 8f02a873..2ce3a4fb 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp @@ -33,6 +33,7 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder { std::vector baseChannels_; std::vector conns_; std::vector> nvlsConnections_; + bool fp8NvlsSupported_{false}; }; } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp index d53ea180..ec64e967 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp @@ -36,9 +36,10 @@ class AllreduceNvls : public AlgorithmBuilder { std::vector> nvlsConnections_; std::vector> nvlsOutConnections_; int computeCapabilityMajor_{0}; + bool fp8NvlsSupported_{false}; }; } // namespace collective } // namespace mscclpp -#endif // MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_ \ No newline at end of file +#endif // MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_ diff --git a/src/ext/collectives/include/allreduce/allreduce_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_packet.hpp index de7ca471..771126c9 100644 --- a/src/ext/collectives/include/allreduce/allreduce_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_packet.hpp @@ -29,7 +29,7 @@ class AllreducePacket : public AlgorithmBuilder { void* scratchBuffer_; size_t scratchBufferSize_; const int nSegmentsForScratchBuffer_ = 2; - const int maxBlockNum_ = 56; + const int maxBlockNum_ = 112; std::vector conns_; uintptr_t flagBuffer_; size_t flagBufferSize_; @@ -37,4 +37,4 @@ class AllreducePacket : public AlgorithmBuilder { std::vector registeredMemories_; }; } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp index 05bf2ef3..528d9708 100644 --- a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp @@ -27,8 +27,6 @@ class AllreduceRsAgZeroCopy : public mscclpp::AlgorithmBuilder { int nChannelsPerConnection_; std::vector conns_; std::vector> semaphores_; - std::vector inputMemories_; - std::vector outputMemories_; std::vector baseChannels_; std::shared_ptr> baseMemoryChannelHandles_; diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp index 93b18e26..5d593449 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -36,36 +36,46 @@ MSCCLPP_DEVICE_INLINE constexpr std::size_t calcVectorSize() { } } -template +template MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t srcOffset, size_t dstOffset, size_t size, int tid, int nThreads) { - // nvls can only handle 4 bytes alignment - MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned"); - constexpr size_t nElem = calcVectorSize(); - // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations - constexpr size_t vecSize = (std::is_same_v || std::is_same_v || std::is_same_v || - std::is_same_v) - ? 1 - : nElem; - using vectorType = mscclpp::VectorType; - const size_t nVec = size / sizeof(vectorType); - const size_t srcOffset4 = srcOffset / sizeof(vectorType); - const size_t dstOffset4 = dstOffset / sizeof(vectorType); - vectorType* src4 = (vectorType*)src; - vectorType* dst4 = (vectorType*)dst; - for (size_t idx = tid; idx < nVec; idx += nThreads) { - auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce(src4 + srcOffset4 + idx); - mscclpp::SwitchChannelDeviceHandle::multimemStore(val, dst4 + dstOffset4 + idx); - } - // handle rest of data - size_t processed = nVec * sizeof(vectorType); - constexpr size_t nRestElem = 4 / sizeof(T); - using restVectorType = mscclpp::VectorType; - const size_t startIdx = (srcOffset + processed) / sizeof(restVectorType); - const size_t endIdx = (srcOffset + size) / sizeof(restVectorType); - for (size_t idx = tid + startIdx; idx < endIdx; idx += nThreads) { - auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce((restVectorType*)src + idx); - mscclpp::SwitchChannelDeviceHandle::multimemStore(val, (restVectorType*)dst + idx); +#if defined(__FP8_TYPES_EXIST__) && \ + (!(defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000)) + if constexpr (std::is_same_v || std::is_same_v) { + assert(false && "FP8 NVLS multimem requires sm_100a or newer"); + return; + } else +#endif + { + // nvls can only handle 4 bytes alignment + MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned"); + constexpr size_t nElem = calcVectorSize(); + // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations + constexpr size_t vecSize = (std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v) + ? 1 + : nElem; + using vectorType = mscclpp::VectorType; + const size_t nVec = size / sizeof(vectorType); + const size_t srcOffset4 = srcOffset / sizeof(vectorType); + const size_t dstOffset4 = dstOffset / sizeof(vectorType); + vectorType* src4 = (vectorType*)src; + vectorType* dst4 = (vectorType*)dst; + for (size_t idx = tid; idx < nVec; idx += nThreads) { + auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce(src4 + srcOffset4 + idx); + mscclpp::SwitchChannelDeviceHandle::multimemStore(val, dst4 + dstOffset4 + idx); + } + // handle rest of data + size_t processed = nVec * sizeof(vectorType); + constexpr size_t nRestElem = 4 / sizeof(T); + using restVectorType = mscclpp::VectorType; + const size_t startIdx = (srcOffset + processed) / sizeof(restVectorType); + const size_t endIdx = (srcOffset + size) / sizeof(restVectorType); + for (size_t idx = tid + startIdx; idx < endIdx; idx += nThreads) { + auto val = + mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce((restVectorType*)src + idx); + mscclpp::SwitchChannelDeviceHandle::multimemStore(val, (restVectorType*)dst + idx); + } } } #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index f705a9d1..7dd6158a 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -26,11 +26,22 @@ namespace mscclpp { namespace collective { constexpr int NUM_NVLS_CONNECTION = 8; -constexpr int NUM_SEMAPHORES = 64; +// Sized to cover MAX_IPC_DOMAIN_NRANKS-scale allreduce algos whose device-side +// semaphore indices grow as O(nRanksPerIpcDomain) (e.g. nvls_block_pipeline uses +// up to ~5 * nRanksPerIpcDomain entries). +constexpr int NUM_SEMAPHORES = 512; -constexpr int MAX_NRANKS_PER_NODE = 8; +// Upper bound on the number of NVLink-reachable ranks that participate in a +// single collective. Sized to cover Multi-Node NVLink (MNNVL) domains up to +// GB200 NVL72 (72 GPUs sharing one NVLink fabric). Drives compile-time sizing +// of shared-memory channel arrays in the allreduce/allgather kernels. +constexpr int MAX_IPC_DOMAIN_NRANKS = 72; -constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB +constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // Two 70 MiB buffers for double-buffered packet scratch space. + +bool isFp8DataType(DataType dtype); +bool isNativeFp8DataType(DataType dtype); +bool isFp8NvlsSupported(); std::vector setupRemoteMemories(std::shared_ptr comm, int rank, RegisteredMemory localMemory); @@ -50,7 +61,8 @@ std::shared_ptr> setupMemoryChannelDeviceHandles( std::vector> setupNvlsConnections(std::shared_ptr comm, size_t size, int numConnections); -std::vector setupNvlsChannels(std::vector> conns, void* buffer, +std::vector setupNvlsChannels(std::shared_ptr comm, + std::vector> conns, void* buffer, size_t bufferSize, int nSwitchChannels); std::shared_ptr> setupNvlsChannelDeviceHandles( @@ -71,8 +83,9 @@ std::shared_ptr> setupBaseMemoryChannelDeviceHan class AlgorithmCtx { public: int rank; - int workSize; + int worldSize; int nRanksPerNode; + int nRanksPerIpcDomain; std::vector registeredMemories; std::vector memoryChannels; @@ -89,4 +102,4 @@ class AlgorithmCtx { } // namespace collective } // namespace mscclpp -#endif // MSCCLPP_EXT_COLLECTIVE_UTILS_HPP_ \ No newline at end of file +#endif // MSCCLPP_EXT_COLLECTIVE_UTILS_HPP_ diff --git a/src/ext/nccl/CMakeLists.txt b/src/ext/nccl/CMakeLists.txt index 9767e66f..463b7550 100644 --- a/src/ext/nccl/CMakeLists.txt +++ b/src/ext/nccl/CMakeLists.txt @@ -13,6 +13,7 @@ target_include_directories(mscclpp_nccl PRIVATE include ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/src/core/include + ${PROJECT_SOURCE_DIR}/src/ext/collectives/include ${GPU_INCLUDE_DIRS} ) target_link_libraries(mscclpp_nccl PUBLIC mscclpp mscclpp_collectives) diff --git a/src/ext/nccl/algorithm_selector.cc b/src/ext/nccl/algorithm_selector.cc index c94aab34..1ccac65d 100644 --- a/src/ext/nccl/algorithm_selector.cc +++ b/src/ext/nccl/algorithm_selector.cc @@ -6,6 +6,7 @@ #include #include +#include "collective_utils.hpp" #include "debug.h" namespace mscclpp { @@ -20,24 +21,15 @@ static bool isNvlsSupportedForDataType(const AlgorithmSelectorConfig& config, Da return false; } - const bool isFp8 = dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ || - dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ; - - if (!isFp8) { + if (!collective::isFp8DataType(dtype)) { return nvlsSupported; } - // FP8 handling #if !defined(__HIP_PLATFORM_AMD__) - // NVLS does not support FP8 on devices with compute capability < 10 - if (config.computeCapability.first < 10) { + if (!collective::isNativeFp8DataType(dtype)) { return false; } -#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) - return true; -#else - return false; -#endif + return nvlsSupported && config.fp8NvlsSupported; #else return nvlsSupported; #endif diff --git a/src/ext/nccl/algorithm_selector.hpp b/src/ext/nccl/algorithm_selector.hpp index c8705f8b..2048ea05 100644 --- a/src/ext/nccl/algorithm_selector.hpp +++ b/src/ext/nccl/algorithm_selector.hpp @@ -16,6 +16,7 @@ namespace nccl { struct AlgorithmSelectorConfig { bool symmetricMemory; bool nvlsSupported; + bool fp8NvlsSupported; bool isCuMemMapAllocated; bool inCaptureMode; std::pair computeCapability; diff --git a/src/ext/nccl/nccl.cc b/src/ext/nccl/nccl.cc index 2d6c5f9d..8fcc1bb1 100644 --- a/src/ext/nccl/nccl.cc +++ b/src/ext/nccl/nccl.cc @@ -20,6 +20,7 @@ #include #include "algorithm_selector.hpp" +#include "collective_utils.hpp" #include "datatype_conversion.hpp" static constexpr auto MSCCLPP_NCCL = mscclpp::LogSubsys::NCCL; @@ -239,6 +240,8 @@ static std::shared_ptr algoSelector( static const bool isNvlsSupported = mscclpp::isNvlsSupported(); static const std::pair deviceComputeCapability = getDeviceComputeCapability(); static const bool ncclSymmetricMemory = mscclpp::env()->ncclSymmetricMemory; + const bool fp8NvlsSupported = + mscclpp::collective::isNativeFp8DataType(request.dtype) ? mscclpp::collective::isFp8NvlsSupported() : false; const bool isCuMemMapAllocated = mscclpp::isCuMemMapAllocated(const_cast(request.inputBuffer)) && mscclpp::isCuMemMapAllocated(request.outputBuffer); @@ -249,6 +252,7 @@ static std::shared_ptr algoSelector( mscclpp::nccl::AlgorithmSelectorConfig config{.symmetricMemory = ncclSymmetricMemory, .nvlsSupported = isNvlsSupported, + .fp8NvlsSupported = fp8NvlsSupported, .isCuMemMapAllocated = isCuMemMapAllocated, .inCaptureMode = inCaptureMode, .computeCapability = deviceComputeCapability, diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc index c28087a4..eb6985a8 100644 --- a/test/mp_unit/bootstrap_tests.cc +++ b/test/mp_unit/bootstrap_tests.cc @@ -127,6 +127,7 @@ class MPIBootstrap : public mscclpp::Bootstrap { MPI_Comm_size(shmcomm, &shmrank); return shmrank; } + int getNranksPerIpcDomain() const override { return getNranksPerNode(); } void allGather(void* sendbuf, int size) override { MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, sendbuf, size, MPI_BYTE, MPI_COMM_WORLD); } diff --git a/test/unit/gpu_utils_tests.cc b/test/unit/gpu_utils_tests.cc index 977314e9..0b5de4cb 100644 --- a/test/unit/gpu_utils_tests.cc +++ b/test/unit/gpu_utils_tests.cc @@ -60,3 +60,79 @@ TEST(GpuUtilsTest, Memcpy) { EXPECT_EQ(hostBuff[i], hostBuffTmp[i]); } } + +TEST(GpuUtilsTest, BufferPoolBasic) { + mscclpp::GpuBufferPool pool(4096); + + auto first = pool.allocate(64, 256); + EXPECT_EQ(first->bytes(), size_t(64)); + EXPECT_EQ(first->offset() % 256, size_t(0)); + EXPECT_EQ(first->data(), pool.data() + first->offset()); + EXPECT_EQ(first->deviceId(), pool.deviceId()); + EXPECT_EQ(pool.activeBytes(), size_t(64)); + + auto second = pool.allocate(128, 512); + EXPECT_EQ(second->bytes(), size_t(128)); + EXPECT_EQ(second->offset() % 512, size_t(0)); + EXPECT_EQ(second->data(), pool.data() + second->offset()); + EXPECT_EQ(pool.activeBytes(), size_t(64 + 128)); + + first.reset(); + EXPECT_EQ(pool.activeBytes(), size_t(128)); + second.reset(); + EXPECT_EQ(pool.activeBytes(), size_t(0)); + EXPECT_EQ(pool.freeBytes(), pool.bytes()); +} + +TEST(GpuUtilsTest, BufferPoolReservesAlignmentPadding) { + mscclpp::GpuBufferPool pool(1024); + + auto first = pool.allocate(100, 1); + auto second = pool.allocate(100, 256); + auto third = pool.allocate(1, 1); + + EXPECT_EQ(first->offset(), size_t(0)); + EXPECT_EQ(second->offset(), size_t(256)); + EXPECT_EQ(third->offset(), size_t(356)); +} + +TEST(GpuUtilsTest, BufferPoolReuseAfterRelease) { + mscclpp::GpuBufferPool pool(1024); + + auto first = pool.allocate(128, 1); + auto firstOffset = first->offset(); + first.reset(); + + auto second = pool.allocate(128, 1); + EXPECT_EQ(second->offset(), firstOffset); + second.reset(); + EXPECT_EQ(pool.freeBytes(), pool.bytes()); +} + +TEST(GpuUtilsTest, BufferPoolThrowsOnInvalidAllocation) { + mscclpp::GpuBufferPool pool(1024); + + bool zeroSizeThrows = false; + try { + (void)pool.allocate(0); + } catch (const mscclpp::Error&) { + zeroSizeThrows = true; + } + EXPECT_TRUE(zeroSizeThrows); + + bool zeroAlignmentThrows = false; + try { + (void)pool.allocate(1, 0); + } catch (const mscclpp::Error&) { + zeroAlignmentThrows = true; + } + EXPECT_TRUE(zeroAlignmentThrows); + + bool outOfMemoryThrows = false; + try { + (void)pool.allocate(pool.bytes() + 1); + } catch (const mscclpp::Error&) { + outOfMemoryThrows = true; + } + EXPECT_TRUE(outOfMemoryThrows); +}