Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .azure-pipelines/codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

container:
image: $(containerImage)
Expand All @@ -59,6 +61,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

container:
image: $(containerImage)
Expand Down
6 changes: 4 additions & 2 deletions .azure-pipelines/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ jobs:
displayName: Integration test A100
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

pool:
name: msccl-ci
Expand All @@ -53,6 +53,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

pool:
name: msccl-ci-h100
Expand Down
2 changes: 2 additions & 0 deletions .azure-pipelines/multi-nodes-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0
pool:
name: mscclpp-multi-node
container:
Expand Down
4 changes: 4 additions & 0 deletions .azure-pipelines/nccl-api-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

container:
image: $(containerImage)
Expand All @@ -56,6 +58,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

container:
image: $(containerImage)
Expand Down
2 changes: 2 additions & 0 deletions .azure-pipelines/sglang-multi-node-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0
pool:
name: mscclpp-multi-node
container:
Expand Down
2 changes: 2 additions & 0 deletions .azure-pipelines/sglang-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0
pool:
name: msccl-ci-h100
container:
Expand Down
16 changes: 12 additions & 4 deletions .azure-pipelines/ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ jobs:
name: msccl-ci
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

container:
image: $(containerImage)
Expand All @@ -55,10 +55,10 @@ jobs:
name: msccl-ci
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

container:
image: $(containerImage)
Expand All @@ -78,6 +78,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

container:
image: $(containerImage)
Expand All @@ -97,6 +99,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

container:
image: $(containerImage)
Expand All @@ -118,6 +122,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

container:
image: $(containerImage)
Expand Down Expand Up @@ -161,6 +167,8 @@ jobs:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
cuda13:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0

container:
image: $(containerImage)
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
"/usr/include"
],
"C_Cpp.default.cStandard": "c17",
"C_Cpp.default.cppStandard": "c++17"
"C_Cpp.default.cppStandard": "c++20"
}
}
},
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer_amd.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
"/usr/include"
],
"C_Cpp.default.cStandard": "c17",
Comment thread
Binyang2014 marked this conversation as resolved.
"C_Cpp.default.cppStandard": "c++17"
"C_Cpp.default.cppStandard": "c++20"
}
}
},
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
language: [ 'cpp', 'python' ]
version: [ 'cuda11.8', 'cuda12.9' ]
version: [ 'cuda12.9', 'cuda13.0' ]

steps:
- name: Checkout repository
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/mscclpp-lang.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
version: [ 'cuda11.8', 'cuda12.9' ]
version: [ 'cuda12.9', 'cuda13.0' ]

steps:
- uses: actions/checkout@v4
Expand Down
19 changes: 14 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ elseif(MSCCLPP_USE_CUDA)
if(NVIDIA_FOUND)
set(MSCCLPP_GPU_ARCHS "native")
else()
if(CUDAToolkit_VERSION VERSION_LESS "11.8")
message(FATAL_ERROR "CUDA 11.8 or higher required, found ${CUDAToolkit_VERSION}")
if(CUDAToolkit_VERSION VERSION_LESS "12.0")
message(FATAL_ERROR "CUDA 12.0 or higher required (C++20 build), found ${CUDAToolkit_VERSION}")
endif()
set(MSCCLPP_GPU_ARCHS 80)
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.0")
Expand Down Expand Up @@ -199,10 +199,11 @@ if(MSCCLPP_USE_ROCM)
endif()

# Declare project
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
if(MSCCLPP_USE_CUDA)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD 20)
set(CMAKE_CUDA_EXTENSIONS OFF)
Comment thread
Binyang2014 marked this conversation as resolved.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra")
enable_language(CUDA)

Expand All @@ -217,7 +218,7 @@ if(MSCCLPP_USE_CUDA)
set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver)
endif()
else()
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_STANDARD 20)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")

set(CMAKE_HIP_ARCHITECTURES ${MSCCLPP_GPU_ARCHS})
Expand All @@ -226,6 +227,14 @@ else()
set(GPU_INCLUDE_DIRS ${hip_INCLUDE_DIRS})
endif()

message(STATUS "C++ standard: C++${CMAKE_CXX_STANDARD}")
if(MSCCLPP_USE_CUDA)
message(STATUS "CUDA toolkit version: ${CUDAToolkit_VERSION}")
message(STATUS "CUDA language standard: C++${CMAKE_CUDA_STANDARD}")
else()
message(STATUS "HIP language standard: C++${CMAKE_HIP_STANDARD}")
endif()

if(CMAKE_BUILD_TYPE STREQUAL "Debug")
add_compile_definitions(DEBUG_BUILD)
endif()
Expand Down
4 changes: 1 addition & 3 deletions docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ set -e

declare -A baseImageTable
baseImageTable=(
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04"
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04"
Expand All @@ -21,7 +20,6 @@ extraLdPathTable=(

declare -A ofedVersionTable
ofedVersionTable=(
["cuda11.8"]="23.07-0.5.1.2"
["cuda12.4"]="23.07-0.5.1.2"
["cuda12.8"]="24.10-1.1.4.0"
["cuda12.9"]="24.10-1.1.4.0"
Expand All @@ -34,7 +32,7 @@ TARGET=${1}
OS_ARCH=$(uname -m)

print_usage() {
echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2|rocm7.2]"
echo "Usage: $0 [cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2|rocm7.2]"
}

if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
Expand Down
13 changes: 9 additions & 4 deletions docs/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,15 @@
* [NDm_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series)
* [ND_H100_v5](https://learn.microsoft.com/en-us/azure/virtual-machines/nd-h100-v5-series)
* Non-Azure Systems
* NVIDIA A100 GPUs + CUDA >= 11.8
* NVIDIA A100 GPUs + CUDA >= 12.0
* NVIDIA H100 GPUs + CUDA >= 12.0
* AMD MI250X GPUs + ROCm >= 5.7
* AMD MI300X GPUs + ROCm >= 6.0
* Toolchain
* MSCCL++ is built as **C++20** (both host and device code), so a C++20-capable toolchain is required.
* [CMake](https://cmake.org/) >= 3.25
* A C++20-capable host compiler, e.g., GCC >= 11 or Clang >= 14
* On NVIDIA platforms, **CUDA Toolkit >= 12.0** is required. `nvcc` first added `-std=c++20` support in CUDA 12.0, so earlier toolkits (11.x and below) cannot build the project.
* OS
* Tested on Ubuntu 20.04 and later
* Libraries
Expand Down Expand Up @@ -109,12 +114,12 @@ $ python -m pip install ".[cuda12]"
$ CXX=/opt/rocm/bin/hipcc python -m pip install ".[rocm7]"
```

> **Note:** A platform extra (`cuda11`, `cuda12`, `cuda13`, `rocm6`, or `rocm7`) is required to install CuPy.
> **Note:** A platform extra (`cuda12`, `cuda13`, `rocm6`, or `rocm7`) is required to install CuPy.
> The CUDA extras install pre-built CuPy wheels and CUDA Python bindings. The ROCm extras install CuPy from source
> and HIP Python for the matching ROCm major version, which require ROCm and may take longer. Running `pip install .` without an extra will not install CuPy.

Optional extras can be installed by specifying them in brackets. Available extras:
- **`cuda11`**, **`cuda12`**, **`cuda13`**: Install a pre-built CuPy package and CUDA Python bindings for your CUDA version.
- **`cuda12`**, **`cuda13`**: Install a pre-built CuPy package and CUDA Python bindings for your CUDA version.
- **`rocm6`**, **`rocm7`**: Install CuPy from source and HIP Python for AMD ROCm platforms.
- **`benchmark`**: Install benchmark dependencies (mpi4py, prettytable, netifaces, matplotlib).
- **`test`**: Install test dependencies (pytest, mpi4py, netifaces).
Expand Down Expand Up @@ -215,7 +220,7 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0

```bash
# Install with benchmark dependencies and the appropriate CUDA/ROCm extras.
# Replace `cuda12` with your platform: cuda11, cuda12, cuda13, rocm6, or rocm7.
# Replace `cuda12` with your platform: cuda12, cuda13, rocm6, or rocm7.
$ python3 -m pip install ".[cuda12,benchmark,test]"

```
Expand Down
4 changes: 0 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,6 @@ dependencies = [
]

[project.optional-dependencies]
cuda11 = [
"cupy-cuda11x",
"cuda-bindings>=11.8,<12",
]
cuda12 = [
"cupy-cuda12x",
"cuda-bindings>=12,<13",
Expand Down
2 changes: 1 addition & 1 deletion python/mscclpp/_core/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def _do_init(self):
self._is_hip = cp.cuda.runtime.is_hip
self._device_arch = get_device_arch()
self._compiler = self._get_compiler()
self._default_options = ["-std=c++17", "-O3", "--shared"]
self._default_options = ["-std=c++20", "-O3", "--shared"]
python_include = sysconfig.get_path("include")
pybind11_include = pybind11.get_include()
self._default_options += [f"-I{python_include}", f"-I{pybind11_include}"]
Expand Down
2 changes: 1 addition & 1 deletion python/mscclpp/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def __init__(self, file: str, kernel_name: str, file_dir: str = None, macro_dict
self._kernel = Kernel(cubin, kernel_name)
self.kernel_map[kernel_key] = self._kernel

def _compile_cuda(self, source_file, output_file, std_version="c++17"):
def _compile_cuda(self, source_file, output_file, std_version="c++20"):
mscclpp_home = os.environ.get("MSCCLPP_HOME", "/usr/local/mscclpp")
include_dir = os.path.join(mscclpp_home, "include")
if not cp.cuda.runtime.is_hip:
Expand Down
11 changes: 0 additions & 11 deletions python/requirements_cuda11.txt

This file was deleted.

2 changes: 1 addition & 1 deletion src/core/core.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ MSCCLPP_API_CPP bool TransportFlags::operator==(TransportFlags other) const {
}

MSCCLPP_API_CPP bool TransportFlags::operator!=(TransportFlags other) const {
return detail::TransportFlagsBase::operator!=(other);
return !detail::TransportFlagsBase::operator==(other);
}

MSCCLPP_API_CPP detail::TransportFlagsBase TransportFlags::toBitset() const { return *this; }
Expand Down
4 changes: 2 additions & 2 deletions src/core/executor/execution_plan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize, size_t outputSize,
size_t constDstOffset) {
std::ifstream file(this->planPath);
json obj = json::parse(file);
if (this->name != obj["name"]) {
if (this->name != obj["name"].get<std::string>()) {
throw Error("Plan name does not match", ErrorCode::ExecutorError);
}
this->collective = obj["collective"];
Expand Down Expand Up @@ -268,7 +268,7 @@ void ExecutionPlan::Impl::lightLoadExecutionPlan(size_t inputSize, size_t output
size_t constDstOffset) {
std::ifstream file(this->planPath);
json obj = json::parse(file);
if (this->name != obj["name"]) {
if (this->name != obj["name"].get<std::string>()) {
throw Error("Plan name does not match", ErrorCode::ExecutorError);
}
std::string protocol = obj["protocol"];
Expand Down
4 changes: 1 addition & 3 deletions test/deploy/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,7 @@ if [ -f "${PIP_CMAKE_ARGS_FILE}" ]; then
fi

cd /root/mscclpp
if [[ "${CUDA_VERSION}" == *"11."* ]]; then
pip3 install ".[cuda11,benchmark,test]"
elif [[ "${CUDA_VERSION}" == *"12."* ]]; then
if [[ "${CUDA_VERSION}" == *"12."* ]]; then
pip3 install ".[cuda12,benchmark,test]"
elif [[ "${CUDA_VERSION}" == *"13."* ]]; then
pip3 install ".[cuda13,benchmark,test]"
Expand Down
Loading