Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .azure-pipelines/templates/ut-npkit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,29 @@ steps:
cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
deployArgs: 'single-node-test'

# Diagnostic step listed ahead of the MpUnitTests step: prints the kernel release, the NVIDIA
# driver version, the GPU list, the loaded gdrdrv module version (or "none"), and the first
# lines of the InfiniBand device list.
# NOTE(review): '--no-docker' presumably makes run-remote-task.yml run the script on the host
# rather than inside the test container; continueOnError presumably keeps a failure here from
# failing the job — confirm both against run-remote-task.yml.
- template: run-remote-task.yml
parameters:
name: HostDiagnostics
displayName: Dump host GPU / GDRCopy state
continueOnError: true
runRemoteArgs: '--no-docker'
remoteScript: |
uname -r
cat /proc/driver/nvidia/version | head -2 || true
nvidia-smi -L || true
echo "gdrdrv loaded: $(cat /sys/module/gdrdrv/version 2>/dev/null || echo none)"
ibv_devinfo -l 2>&1 | head -10 || true

# Diagnostic step listed ahead of the MpUnitTests step: prints the nvcc version, the name and
# version of any installed package matching "libgdrapi", and the gdr/cuda/mscclpp/ibverbs/mlx5
# shared libraries that build/bin/mp_unit_tests resolves to.
# NOTE(review): no '--no-docker' is passed, so this presumably runs inside the test container
# (as the displayName suggests) — confirm against run-remote-task.yml.
- template: run-remote-task.yml
parameters:
name: ContainerDiagnostics
displayName: Dump container CUDA / libgdrapi state
continueOnError: true
remoteScript: |
nvcc --version | tail -4 || true
dpkg -l 2>/dev/null | awk '/libgdrapi/ {print $2,$3}' || true
ldd build/bin/mp_unit_tests 2>/dev/null | grep -iE 'gdr|cuda|mscclpp|ibverbs|mlx5' || true

- template: run-remote-task.yml
parameters:
name: MpUnitTests
Expand Down
23 changes: 23 additions & 0 deletions .azure-pipelines/templates/ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,29 @@ steps:
deployArgs: 'single-node-test true ${{ parameters.platform }}'


# Diagnostic step listed ahead of the UnitTests step: prints the kernel release, the NVIDIA
# driver version, the GPU list, the loaded gdrdrv module version (or "none"), and the first
# lines of the InfiniBand device list.
# NOTE(review): '--no-docker' presumably makes run-remote-task.yml run the script on the host
# rather than inside the test container; continueOnError presumably keeps a failure here from
# failing the job — confirm both against run-remote-task.yml.
- template: run-remote-task.yml
parameters:
name: HostDiagnostics
displayName: Dump host GPU / GDRCopy state
continueOnError: true
runRemoteArgs: '--no-docker'
remoteScript: |
uname -r
cat /proc/driver/nvidia/version | head -2 || true
nvidia-smi -L || true
echo "gdrdrv loaded: $(cat /sys/module/gdrdrv/version 2>/dev/null || echo none)"
ibv_devinfo -l 2>&1 | head -10 || true

# Diagnostic step listed ahead of the UnitTests step: prints the nvcc version, the name and
# version of any installed package matching "libgdrapi", and the gdr/cuda/mscclpp/ibverbs/mlx5
# shared libraries that build/bin/mp_unit_tests resolves to.
# NOTE(review): no '--no-docker' is passed, so this presumably runs inside the test container
# (as the displayName suggests) — confirm against run-remote-task.yml.
- template: run-remote-task.yml
parameters:
name: ContainerDiagnostics
displayName: Dump container CUDA / libgdrapi state
continueOnError: true
remoteScript: |
nvcc --version | tail -4 || true
dpkg -l 2>/dev/null | awk '/libgdrapi/ {print $2,$3}' || true
ldd build/bin/mp_unit_tests 2>/dev/null | grep -iE 'gdr|cuda|mscclpp|ibverbs|mlx5' || true

- template: run-remote-task.yml
parameters:
name: UnitTests
Expand Down
49 changes: 45 additions & 4 deletions src/core/gdr.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@
#define GPU_PAGE_MASK (~(GPU_PAGE_SIZE - 1))
#endif

// mscclpp's GDRCopy path uses gdr_pin_buffer_v2, which was added to the gdrdrv kernel module
// in 2.5. Older modules return ENOTTY for the v2 ioctl, surfacing as a confusing "ret=25"
// failure deep inside GdrMap. Refuse early with a clear status when the loaded module is older.
//
// These are typed constants rather than preprocessor macros: they are only ever used as
// integer values (version comparison, std::to_string, log arguments), so a constexpr int gives
// the same result with type checking and normal scoping. At namespace scope a constexpr
// variable has internal linkage, so it stays private to this translation unit.
constexpr int MSCCLPP_GDRDRV_MIN_MAJOR = 2;  // Minimum supported gdrdrv major version.
constexpr int MSCCLPP_GDRDRV_MIN_MINOR = 5;  // Minimum supported gdrdrv minor version.

namespace mscclpp {

// GdrContext
Expand All @@ -33,10 +39,14 @@ class GdrContext {

GdrStatus status() const { return status_; }
gdr_t handle() const { return handle_; }
int driverMajor() const { return driverMajor_; }
int driverMinor() const { return driverMinor_; }

private:
GdrStatus status_;
gdr_t handle_;
int driverMajor_;
int driverMinor_;
};

static std::shared_ptr<GdrContext> gdrContext() {
Expand All @@ -49,7 +59,8 @@ GdrStatus gdrStatus() { return gdrContext()->status(); }
bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }

std::string gdrStatusMessage() {
switch (gdrStatus()) {
auto ctx = gdrContext();
switch (ctx->status()) {
case GdrStatus::Ok:
return "GDRCopy initialized successfully";
case GdrStatus::NotBuilt:
Expand All @@ -60,12 +71,16 @@ std::string gdrStatusMessage() {
return "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
case GdrStatus::OpenFailed:
return "gdr_open() failed; GDRCopy driver may be misconfigured";
case GdrStatus::KernelTooOld:
return "gdrdrv kernel module " + std::to_string(ctx->driverMajor()) + "." + std::to_string(ctx->driverMinor()) +
" is older than the required minimum (" + std::to_string(MSCCLPP_GDRDRV_MIN_MAJOR) + "." +
std::to_string(MSCCLPP_GDRDRV_MIN_MINOR) + "); reinstall gdrcopy (e.g. v2.5.2) on the host";
default:
return "unknown GDRCopy status";
}
}

GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr), driverMajor_(0), driverMinor_(0) {
if (env()->forceDisableGdr) {
INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
status_ = GdrStatus::Disabled;
Expand All @@ -86,8 +101,29 @@ GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
return;
}

// Reject kernel modules older than the minimum required for gdr_pin_buffer_v2.
// Without this, GdrMap would later fail deep inside the v2 ioctl with ENOTTY (ret=25).
if (gdr_driver_get_version(handle_, &driverMajor_, &driverMinor_) != 0) {
INFO(GPU, "gdr_driver_get_version() failed; cannot verify kernel module version, disabling GDRCopy");
gdr_close(handle_);
handle_ = nullptr;
status_ = GdrStatus::KernelTooOld;
return;
}
if (driverMajor_ < MSCCLPP_GDRDRV_MIN_MAJOR ||
(driverMajor_ == MSCCLPP_GDRDRV_MIN_MAJOR && driverMinor_ < MSCCLPP_GDRDRV_MIN_MINOR)) {
WARN(GPU, "gdrdrv kernel module ", driverMajor_, ".", driverMinor_, " predates the v2 pin-buffer ioctl (need ",
MSCCLPP_GDRDRV_MIN_MAJOR, ".", MSCCLPP_GDRDRV_MIN_MINOR, "+); disabling GDRCopy");
gdr_close(handle_);
handle_ = nullptr;
status_ = GdrStatus::KernelTooOld;
return;
}

int libMajor = 0, libMinor = 0;
gdr_runtime_get_version(&libMajor, &libMinor);
status_ = GdrStatus::Ok;
INFO(GPU, "GDRCopy initialized successfully");
INFO(GPU, "GDRCopy initialized: libgdrapi ", libMajor, ".", libMinor, ", gdrdrv ", driverMajor_, ".", driverMinor_);
}

GdrContext::~GdrContext() {
Expand Down Expand Up @@ -134,8 +170,13 @@ GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId) : pimpl_(std::make_un
if (ret != 0) {
ret = gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, 0, &pimpl_->mh);
if (ret != 0) {
// ENOTTY (25) here means the loaded gdrdrv kernel module doesn't recognise the v2 ioctl
// — GdrContext's version gate normally catches that earlier, so reaching here implies
// a real allocator or driver problem.
THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
"; gdrdrv ", pimpl_->ctx->driverMajor(), ".", pimpl_->ctx->driverMinor(),
". If ret==25 (ENOTTY), the kernel module is too old; otherwise ensure the GPU memory is "
"allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
}
}

Expand Down
1 change: 1 addition & 0 deletions src/core/include/gdr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ enum class GdrStatus {
Disabled, // Disabled via MSCCLPP_FORCE_DISABLE_GDR
DriverMissing, // /dev/gdrdrv not found
OpenFailed, // gdr_open() failed
KernelTooOld, // gdrdrv kernel module is older than the required minimum
};

/// Return the detailed status of the global GDRCopy context.
Expand Down
12 changes: 8 additions & 4 deletions test/deploy/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,18 @@ if [ "${PLATFORM}" == "rocm" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
fi

# Install GDRCopy kernel module on host VMs (CUDA only)
# Install GDRCopy kernel module on host VMs (CUDA only). Reinstall when the loaded
# version doesn't match GDRDRV_VERSION — mscclpp's GDRCopy path uses gdr_pin_buffer_v2,
# which requires gdrdrv 2.5.
GDRCOPY_VERSION="2.5.2"
GDRDRV_VERSION="2.5"
if [ "${PLATFORM}" == "cuda" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"if lsmod | grep -q gdrdrv; then
echo 'gdrdrv module already loaded'
else
"LOADED=\$(cat /sys/module/gdrdrv/version 2>/dev/null || true)
echo \"gdrdrv loaded: \${LOADED:-none} (need ${GDRDRV_VERSION})\"
if [ \"\${LOADED}\" != \"${GDRDRV_VERSION}\" ]; then
set -e
sudo rmmod gdrdrv 2>/dev/null || true
sudo apt-get update -y && sudo apt-get install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms
cd /tmp && wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -O gdrcopy.tar.gz
tar xzf gdrcopy.tar.gz && cd gdrcopy-${GDRCOPY_VERSION}/packages
Expand Down
2 changes: 1 addition & 1 deletion test/unit/gdr_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ TEST(GdrStatusTest, StatusIsValid) {
auto status = mscclpp::gdrStatus();
ASSERT_TRUE(status == mscclpp::GdrStatus::Ok || status == mscclpp::GdrStatus::NotBuilt ||
status == mscclpp::GdrStatus::Disabled || status == mscclpp::GdrStatus::DriverMissing ||
status == mscclpp::GdrStatus::OpenFailed);
status == mscclpp::GdrStatus::OpenFailed || status == mscclpp::GdrStatus::KernelTooOld);
}

TEST(GdrStatusTest, EnabledConsistentWithStatus) {
Expand Down
Loading