Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
dd8b301
Scale native allreduce/allgather algos for MNNVL/MNNVLS
Binyang2014 Apr 27, 2026
893a08e
Enable MNNVL allreduce tuning
Binyang2014 Apr 28, 2026
dded5e0
Improve MNNVL allreduce tuning performance
Binyang2014 Apr 28, 2026
865c2bc
Optimize MNNVL allreduce without symmetric memory
Binyang2014 Apr 28, 2026
3bc00cb
Enable NVLS zero-copy without symmetric memory flag
Binyang2014 Apr 28, 2026
533f329
Tune no-sym MNNVL with RSAG zero-copy
Binyang2014 Apr 28, 2026
45a651b
Decouple IPC-domain hint from bootstrap nRanksPerNode
Binyang2014 May 1, 2026
2a2fca8
Rename collective ctx/kernel param nRanksPerNode to ipcDomainNranks
Binyang2014 May 1, 2026
2efda4d
Restore compile-time templated NRanksPerNode for rsag_zero_copy
Binyang2014 May 1, 2026
1c29817
Revert AllreduceRsAgZeroCopy non-symmetric ctx key tag back to ++tag
Binyang2014 May 1, 2026
7bc5e04
Reset GPU tokens before reuse
Binyang2014 May 2, 2026
9a36884
Rename gpuMemset wrapper and zero TokenPool slots in deleter
Binyang2014 May 2, 2026
987f800
Merge remote-tracking branch 'origin/main' into binyli/mnnvl
Binyang2014 May 4, 2026
6296803
Make NVLS non-zero-copy allreduce algorithms MNNVL-ready
Binyang2014 May 5, 2026
9aeeaf0
Simplify torch-integration tuning example for MPI-only multi-node tes…
Binyang2014 May 6, 2026
905b23d
Drop non-MNNVL multi_node regime from torch-integration example
Binyang2014 May 6, 2026
4a0d5b2
Simplify torch-integration tuning example
Binyang2014 May 6, 2026
307a471
Shorten verbose comments and use THROW in validateIpcDomainSpansWorld
Binyang2014 May 6, 2026
f0c6ac0
Fold validateIpcDomainSpansWorld into getIpcDomainNranks
Binyang2014 May 6, 2026
bde23ce
Revert verbose RSAG zero-copy comment; rename NRanksPerNode template …
Binyang2014 May 6, 2026
095cfff
Revert RSAG nBlocks default to 64
Binyang2014 May 6, 2026
639b80d
Tie AllreduceAllpairPacket maxBlockNum_ to MAX_IPC_DOMAIN_NRANKS - 1
Binyang2014 May 6, 2026
e8caab7
Strip preflight validation blocks from NVLS pipeline allreduce kernels
Binyang2014 May 6, 2026
7d80a33
Default torch example SYMMETRIC_MEMORY env to 1
Binyang2014 May 6, 2026
d1b04a3
NVLS zero-copy allreduce: support FP16 accumulator for FP8 inputs
Binyang2014 May 7, 2026
113d859
fix
Binyang2014 May 8, 2026
9ff7e1c
update
Binyang2014 May 8, 2026
654bcfa
update
Binyang2014 May 8, 2026
5516bdb
fix
Binyang2014 May 8, 2026
e208cc3
WIP
Binyang2014 May 8, 2026
825fc12
address hang issue
Binyang2014 May 9, 2026
224b3de
Clean up completed communicator receives
Binyang2014 May 13, 2026
0c09239
Merge branch 'main' into binyli/mnnvl
Binyang2014 May 13, 2026
7724e49
Fix lint and ROCm error alias
Copilot May 13, 2026
24850ef
Merge branch 'main' into binyli/mnnvl
Binyang2014 May 15, 2026
ee82cc4
Merge branch 'main' into binyli/mnnvl
Binyang2014 May 15, 2026
dbebde2
Configure IPC domain per communicator
Binyang2014 May 15, 2026
93b4354
temp solution
Binyang2014 May 15, 2026
0744e80
detect ipc domain automaticlly
Binyang2014 May 16, 2026
94af88d
Fix tuning example hang
Binyang2014 May 16, 2026
f32cfb1
update
Binyang2014 May 16, 2026
594dc79
Address NVLS review feedback
seagater May 16, 2026
18d3737
Tighten NVML IPC domain hash lookup
seagater May 16, 2026
4db71b9
Move barrier into setupNvlsChannels and clean up NVLS pipeline state
Binyang2014 May 18, 2026
35331cf
Fix collective topology sizing
Binyang2014 May 20, 2026
ac44e98
update
Binyang2014 May 20, 2026
7308c32
merge main
Binyang2014 May 22, 2026
42ece40
Fix memory leak
Binyang2014 May 24, 2026
641420d
increase nvls memory size to 64 GB
Binyang2014 May 26, 2026
ea73a1e
WIP
Binyang2014 May 26, 2026
ba0b3e3
merge main
Binyang2014 Jun 4, 2026
99f20cd
merge main
Binyang2014 Jun 4, 2026
ecc8c5c
move tuner to benchmark
Binyang2014 Jun 4, 2026
000cd5b
update algo
Binyang2014 Jun 5, 2026
cd9d504
add check for nvls fp8 support
Binyang2014 Jun 5, 2026
d754c35
lint and log
Binyang2014 Jun 6, 2026
2954d81
log
Binyang2014 Jun 6, 2026
d401f97
WIP
Binyang2014 Jun 6, 2026
ee3606e
WIP
Binyang2014 Jun 6, 2026
e1c2679
add more logs
Binyang2014 Jun 8, 2026
0a26c30
for buffer pool
Binyang2014 Jun 12, 2026
ddf8b14
update for buffer pool
Binyang2014 Jun 13, 2026
9c26eb4
add test
Binyang2014 Jun 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ if(MSCCLPP_USE_CUDA)
else()
set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver)
endif()
list(APPEND GPU_LIBRARIES CUDA::nvml)
else()
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")
Expand Down
19 changes: 15 additions & 4 deletions docs/guide/mscclpp-torch-integration.md
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,7 @@ All examples are in [`examples/torch-integration/`](../../examples/torch-integra

The default algorithms use a fixed heuristic to select algorithms based on message size. For production workloads, you can achieve significantly better performance by **auto-tuning** — benchmarking every candidate algorithm, block count, and thread count for each message size at startup, then using the fastest configuration at runtime.

**Full example:** [customized_comm_with_tuning.py](../../examples/torch-integration/customized_comm_with_tuning.py)
**Reference implementation:** MSCCL++ ships a ready-to-use autotuner in [`python/mscclpp_benchmark/bench_collective.py`](../../python/mscclpp_benchmark/bench_collective.py). It benchmarks every candidate algorithm, block count, and thread count per message size, writes the winning configuration to a JSON file, and can replay that file at runtime. The sections below explain the underlying mechanism; see that benchmark for the complete, maintained implementation.

### How It Works

Expand Down Expand Up @@ -656,9 +656,20 @@ def benchmark(self, n_warmup=10, n_graph_launches=10, n_iter_per_graph=100):
self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
```

### Running the Tuning Example
### Running the Autotuner

MSCCL++'s built-in autotuner benchmarks every candidate configuration and saves the best one to JSON. Run it across the ranks of your job, then reuse the generated config:

```bash
MSCCLPP_MASTER_ADDR=<ip> MSCCLPP_MASTER_PORT=<port> \
torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_tuning.py
# Autotune and save the tuned config
mpirun -np 8 --allow-run-as-root \
python3 -m mscclpp_benchmark.bench_collective \
--collective allreduce --dtype float16 --autotune \
--write-config /tmp/mscclpp_tuned_configs.json

# Replay the tuned config in a benchmark
mpirun -np 8 --allow-run-as-root \
python3 -m mscclpp_benchmark.bench_collective \
--collective allreduce --dtype float16 \
--config-path /tmp/mscclpp_tuned_configs.json
```
10 changes: 5 additions & 5 deletions examples/customized-collective-algorithm/customized_allgather.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ __global__ void __launch_bounds__(1024)

struct Context {
int rank;
int workSize;
int worldSize;
int nRanksPerNode;

std::vector<mscclpp::RegisteredMemory> registeredMemories;
Expand Down Expand Up @@ -140,7 +140,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
size_t inputSize, cudaStream_t stream) {
auto algoCtx = std::static_pointer_cast<Context>(ctx);
int rank = algoCtx->rank;
int worldSize = algoCtx->workSize;
int worldSize = algoCtx->worldSize;

int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE;
allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputSize);
Expand All @@ -154,16 +154,16 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
void* output, size_t inputSize, mscclpp::DataType dtype) {
auto ctx = std::make_shared<Context>();
ctx->rank = comm->bootstrap()->getRank();
ctx->workSize = comm->bootstrap()->getNranks();
ctx->worldSize = comm->bootstrap()->getNranks();
ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();

// register memories
mscclpp::RegisteredMemory inputBufRegMem =
comm->registerMemory((void*)input, inputSize, mscclpp::Transport::CudaIpc);
mscclpp::RegisteredMemory outputBufRegMem =
comm->registerMemory(output, inputSize * ctx->workSize, mscclpp::Transport::CudaIpc);
comm->registerMemory(output, inputSize * ctx->worldSize, mscclpp::Transport::CudaIpc);
std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteRegMemories;
for (int i = 0; i < ctx->workSize; i++) {
for (int i = 0; i < ctx->worldSize; i++) {
if (i == ctx->rank) continue;
comm->sendMemory(outputBufRegMem, i, 0);
remoteRegMemories.push_back(comm->recvMemory(i, 0));
Expand Down
10 changes: 5 additions & 5 deletions examples/torch-integration/customized_allgather.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ __global__ void __launch_bounds__(1024)

struct Context {
int rank;
int workSize;
int worldSize;
int nRanksPerNode;

std::vector<mscclpp::RegisteredMemory> registeredMemories;
Expand Down Expand Up @@ -108,7 +108,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
cudaStream_t stream) {
auto algoCtx = std::static_pointer_cast<Context>(ctx);
int rank = algoCtx->rank;
int worldSize = algoCtx->workSize;
int worldSize = algoCtx->worldSize;

int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE;
allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputBytes);
Expand All @@ -122,16 +122,16 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
void* output, size_t inputBytes, mscclpp::DataType dtype) {
auto ctx = std::make_shared<Context>();
ctx->rank = comm->bootstrap()->getRank();
ctx->workSize = comm->bootstrap()->getNranks();
ctx->worldSize = comm->bootstrap()->getNranks();
ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();

// register memories
mscclpp::RegisteredMemory inputBufRegMem =
comm->registerMemory((void*)input, inputBytes, mscclpp::Transport::CudaIpc);
mscclpp::RegisteredMemory outputBufRegMem =
comm->registerMemory(output, inputBytes * ctx->workSize, mscclpp::Transport::CudaIpc);
comm->registerMemory(output, inputBytes * ctx->worldSize, mscclpp::Transport::CudaIpc);
std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteRegMemories;
for (int i = 0; i < ctx->workSize; i++) {
for (int i = 0; i < ctx->worldSize; i++) {
if (i == ctx->rank) continue;
comm->sendMemory(outputBufRegMem, i, 0);
remoteRegMemories.push_back(comm->recvMemory(i, 0));
Expand Down
Loading
Loading