microsoft · Binyang2014 · Apr 27, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -206,6 +206,7 @@ if(MSCCLPP_USE_CUDA)
     else()
         set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver)
     endif()
+    list(APPEND GPU_LIBRARIES CUDA::nvml)
 else()
     set(CMAKE_HIP_STANDARD 17)
     set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")

diff --git a/docs/guide/mscclpp-torch-integration.md b/docs/guide/mscclpp-torch-integration.md
@@ -475,7 +475,7 @@ All examples are in [`examples/torch-integration/`](../../examples/torch-integra
 
 The default algorithms use a fixed heuristic to select algorithms based on message size. For production workloads, you can achieve significantly better performance by **auto-tuning** — benchmarking every candidate algorithm, block count, and thread count for each message size at startup, then using the fastest configuration at runtime.
 
-**Full example:** [customized_comm_with_tuning.py](../../examples/torch-integration/customized_comm_with_tuning.py)
+**Reference implementation:** MSCCL++ ships a ready-to-use autotuner in [`python/mscclpp_benchmark/bench_collective.py`](../../python/mscclpp_benchmark/bench_collective.py). For each message size it benchmarks every candidate algorithm, block count, and thread count, writes the winning configuration to a JSON file (`--write-config`), and can later load that file to replay the tuned configuration at runtime (`--config-path`). The sections below explain the underlying mechanism; see that benchmark for the complete, maintained implementation.
 
 ### How It Works
 
@@ -656,9 +656,20 @@ def benchmark(self, n_warmup=10, n_graph_launches=10, n_iter_per_graph=100):
                 self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
 ```
 
-### Running the Tuning Example
+### Running the Autotuner
+
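+MSCCL++'s built-in autotuner benchmarks every candidate configuration and saves the best one to JSON. Launch it with one process per rank (the commands below use `mpirun` with 8 ranks), then pass the generated file back via `--config-path` to reuse the tuned configuration: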
 
 ```bash
-MSCCLPP_MASTER_ADDR=<ip> MSCCLPP_MASTER_PORT=<port> \
-  torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_tuning.py
+# Autotune and save the tuned config
+mpirun -np 8 --allow-run-as-root \
+  python3 -m mscclpp_benchmark.bench_collective \
+  --collective allreduce --dtype float16 --autotune \
+  --write-config /tmp/mscclpp_tuned_configs.json
+
+# Replay the tuned config in a benchmark
+mpirun -np 8 --allow-run-as-root \
+  python3 -m mscclpp_benchmark.bench_collective \
+  --collective allreduce --dtype float16 \
+  --config-path /tmp/mscclpp_tuned_configs.json
 ```
diff --git a/examples/customized-collective-algorithm/customized_allgather.cu b/examples/customized-collective-algorithm/customized_allgather.cu
@@ -79,7 +79,7 @@ __global__ void __launch_bounds__(1024)
 
 struct Context {
   int rank;
-  int workSize;
+  int worldSize;
   int nRanksPerNode;
 
   std::vector<mscclpp::RegisteredMemory> registeredMemories;
@@ -140,7 +140,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
                                           size_t inputSize, cudaStream_t stream) {
     auto algoCtx = std::static_pointer_cast<Context>(ctx);
     int rank = algoCtx->rank;
-    int worldSize = algoCtx->workSize;
+    int worldSize = algoCtx->worldSize;
 
     int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE;
     allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputSize);
@@ -154,16 +154,16 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
                                              void* output, size_t inputSize, mscclpp::DataType dtype) {
     auto ctx = std::make_shared<Context>();
     ctx->rank = comm->bootstrap()->getRank();
-    ctx->workSize = comm->bootstrap()->getNranks();
+    ctx->worldSize = comm->bootstrap()->getNranks();
     ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
 
     // register memories
     mscclpp::RegisteredMemory inputBufRegMem =
         comm->registerMemory((void*)input, inputSize, mscclpp::Transport::CudaIpc);
     mscclpp::RegisteredMemory outputBufRegMem =
-        comm->registerMemory(output, inputSize * ctx->workSize, mscclpp::Transport::CudaIpc);
+        comm->registerMemory(output, inputSize * ctx->worldSize, mscclpp::Transport::CudaIpc);
     std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteRegMemories;
-    for (int i = 0; i < ctx->workSize; i++) {
+    for (int i = 0; i < ctx->worldSize; i++) {
       if (i == ctx->rank) continue;
       comm->sendMemory(outputBufRegMem, i, 0);
       remoteRegMemories.push_back(comm->recvMemory(i, 0));

diff --git a/examples/torch-integration/customized_allgather.cu b/examples/torch-integration/customized_allgather.cu
@@ -47,7 +47,7 @@ __global__ void __launch_bounds__(1024)
 
 struct Context {
   int rank;
-  int workSize;
+  int worldSize;
   int nRanksPerNode;
 
   std::vector<mscclpp::RegisteredMemory> registeredMemories;
@@ -108,7 +108,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
                                           cudaStream_t stream) {
     auto algoCtx = std::static_pointer_cast<Context>(ctx);
     int rank = algoCtx->rank;
-    int worldSize = algoCtx->workSize;
+    int worldSize = algoCtx->worldSize;
 
     int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE;
     allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputBytes);
@@ -122,16 +122,16 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
                                              void* output, size_t inputBytes, mscclpp::DataType dtype) {
     auto ctx = std::make_shared<Context>();
     ctx->rank = comm->bootstrap()->getRank();
-    ctx->workSize = comm->bootstrap()->getNranks();
+    ctx->worldSize = comm->bootstrap()->getNranks();
     ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
 
     // register memories
     mscclpp::RegisteredMemory inputBufRegMem =
         comm->registerMemory((void*)input, inputBytes, mscclpp::Transport::CudaIpc);
     mscclpp::RegisteredMemory outputBufRegMem =
-        comm->registerMemory(output, inputBytes * ctx->workSize, mscclpp::Transport::CudaIpc);
+        comm->registerMemory(output, inputBytes * ctx->worldSize, mscclpp::Transport::CudaIpc);
     std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteRegMemories;
-    for (int i = 0; i < ctx->workSize; i++) {
+    for (int i = 0; i < ctx->worldSize; i++) {
       if (i == ctx->rank) continue;
       comm->sendMemory(outputBufRegMem, i, 0);
       remoteRegMemories.push_back(comm->recvMemory(i, 0));