diff --git a/.azure-pipelines/templates/ut-npkit.yml b/.azure-pipelines/templates/ut-npkit.yml
index 1bd89caf4..8a9ed2c5e 100644
--- a/.azure-pipelines/templates/ut-npkit.yml
+++ b/.azure-pipelines/templates/ut-npkit.yml
@@ -16,6 +16,29 @@ steps:
     cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
     deployArgs: 'single-node-test'
 
+- template: run-remote-task.yml
+  parameters:
+    name: HostDiagnostics
+    displayName: Dump host GPU / GDRCopy state
+    continueOnError: true
+    runRemoteArgs: '--no-docker'
+    remoteScript: |
+      uname -r
+      cat /proc/driver/nvidia/version | head -2 || true
+      nvidia-smi -L || true
+      echo "gdrdrv loaded: $(cat /sys/module/gdrdrv/version 2>/dev/null || echo none)"
+      ibv_devinfo -l 2>&1 | head -10 || true
+
+- template: run-remote-task.yml
+  parameters:
+    name: ContainerDiagnostics
+    displayName: Dump container CUDA / libgdrapi state
+    continueOnError: true
+    remoteScript: |
+      nvcc --version | tail -4 || true
+      dpkg -l 2>/dev/null | awk '/libgdrapi/ {print $2,$3}' || true
+      ldd build/bin/mp_unit_tests 2>/dev/null | grep -iE 'gdr|cuda|mscclpp|ibverbs|mlx5' || true
+
 - template: run-remote-task.yml
   parameters:
     name: MpUnitTests
diff --git a/.azure-pipelines/templates/ut.yml b/.azure-pipelines/templates/ut.yml
index 743c66e6e..40ee3bd01 100644
--- a/.azure-pipelines/templates/ut.yml
+++ b/.azure-pipelines/templates/ut.yml
@@ -19,6 +19,29 @@ steps:
     deployArgs: 'single-node-test true ${{ parameters.platform }}'
 
 
+- template: run-remote-task.yml
+  parameters:
+    name: HostDiagnostics
+    displayName: Dump host GPU / GDRCopy state
+    continueOnError: true
+    runRemoteArgs: '--no-docker'
+    remoteScript: |
+      uname -r
+      cat /proc/driver/nvidia/version | head -2 || true
+      nvidia-smi -L || true
+      echo "gdrdrv loaded: $(cat /sys/module/gdrdrv/version 2>/dev/null || echo none)"
+      ibv_devinfo -l 2>&1 | head -10 || true
+
+- template: run-remote-task.yml
+  parameters:
+    name: ContainerDiagnostics
+    displayName: Dump container CUDA / libgdrapi state
+    continueOnError: true
+    remoteScript: |
+      nvcc --version | tail -4 || true
+      dpkg -l 2>/dev/null | awk '/libgdrapi/ {print $2,$3}' || true
+      ldd build/bin/mp_unit_tests 2>/dev/null | grep -iE 'gdr|cuda|mscclpp|ibverbs|mlx5' || true
+
 - template: run-remote-task.yml
   parameters:
     name: UnitTests
diff --git a/src/core/gdr.cc b/src/core/gdr.cc
index f361a3aa0..e34a63a7a 100644
--- a/src/core/gdr.cc
+++ b/src/core/gdr.cc
@@ -19,6 +19,12 @@
 #define GPU_PAGE_MASK (~(GPU_PAGE_SIZE - 1))
 #endif
 
+// mscclpp's GDRCopy path uses gdr_pin_buffer_v2, which was added to the gdrdrv kernel module
+// in 2.5. Older modules return ENOTTY for the v2 ioctl, surfacing as a confusing "ret=25"
+// failure deep inside GdrMap. Refuse early with a clear status when the loaded module is older.
+#define MSCCLPP_GDRDRV_MIN_MAJOR 2
+#define MSCCLPP_GDRDRV_MIN_MINOR 5
+
 namespace mscclpp {
 
 // GdrContext
@@ -33,10 +39,14 @@ class GdrContext {
 
   GdrStatus status() const { return status_; }
   gdr_t handle() const { return handle_; }
+  int driverMajor() const { return driverMajor_; }
+  int driverMinor() const { return driverMinor_; }
 
  private:
   GdrStatus status_;
   gdr_t handle_;
+  int driverMajor_;
+  int driverMinor_;
 };
 
 static std::shared_ptr<GdrContext> gdrContext() {
@@ -49,7 +59,8 @@ GdrStatus gdrStatus() { return gdrContext()->status(); }
 bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }
 
 std::string gdrStatusMessage() {
-  switch (gdrStatus()) {
+  auto ctx = gdrContext();
+  switch (ctx->status()) {
     case GdrStatus::Ok:
       return "GDRCopy initialized successfully";
     case GdrStatus::NotBuilt:
@@ -60,12 +71,23 @@ std::string gdrStatusMessage() {
       return "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
     case GdrStatus::OpenFailed:
       return "gdr_open() failed; GDRCopy driver may be misconfigured";
+    case GdrStatus::KernelTooOld: {
+      const std::string required =
+          std::to_string(MSCCLPP_GDRDRV_MIN_MAJOR) + "." + std::to_string(MSCCLPP_GDRDRV_MIN_MINOR);
+      // 0.0 is what GdrContext stores when gdr_driver_get_version() itself failed; do not report it as a version.
+      if (ctx->driverMajor() == 0 && ctx->driverMinor() == 0) {
+        return "gdrdrv kernel module version could not be queried (need " + required +
+               "+); reinstall gdrcopy (e.g. v2.5.2) on the host";
+      }
+      return "gdrdrv kernel module " + std::to_string(ctx->driverMajor()) + "." + std::to_string(ctx->driverMinor()) +
+             " is older than the required minimum (" + required + "); reinstall gdrcopy (e.g. v2.5.2) on the host";
+    }
     default:
       return "unknown GDRCopy status";
   }
 }
 
-GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
+GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr), driverMajor_(0), driverMinor_(0) {
   if (env()->forceDisableGdr) {
     INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
     status_ = GdrStatus::Disabled;
@@ -86,8 +108,31 @@ GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
     return;
   }
 
+  // Reject kernel modules older than the minimum required for gdr_pin_buffer_v2.
+  // Without this, GdrMap would later fail deep inside the v2 ioctl with ENOTTY (ret=25).
+  if (gdr_driver_get_version(handle_, &driverMajor_, &driverMinor_) != 0) {
+    WARN(GPU, "gdr_driver_get_version() failed; cannot verify kernel module version, disabling GDRCopy");
+    gdr_close(handle_);
+    handle_ = nullptr;
+    // Keep the "unknown version" sentinel (0.0) well-defined regardless of what the failed call wrote.
+    driverMajor_ = driverMinor_ = 0;
+    status_ = GdrStatus::KernelTooOld;
+    return;
+  }
+  if (driverMajor_ < MSCCLPP_GDRDRV_MIN_MAJOR ||
+      (driverMajor_ == MSCCLPP_GDRDRV_MIN_MAJOR && driverMinor_ < MSCCLPP_GDRDRV_MIN_MINOR)) {
+    WARN(GPU, "gdrdrv kernel module ", driverMajor_, ".", driverMinor_, " predates the v2 pin-buffer ioctl (need ",
+         MSCCLPP_GDRDRV_MIN_MAJOR, ".", MSCCLPP_GDRDRV_MIN_MINOR, "+); disabling GDRCopy");
+    gdr_close(handle_);
+    handle_ = nullptr;
+    status_ = GdrStatus::KernelTooOld;
+    return;
+  }
+
+  int libMajor = 0, libMinor = 0;
+  gdr_runtime_get_version(&libMajor, &libMinor);
   status_ = GdrStatus::Ok;
-  INFO(GPU, "GDRCopy initialized successfully");
+  INFO(GPU, "GDRCopy initialized: libgdrapi ", libMajor, ".", libMinor, ", gdrdrv ", driverMajor_, ".", driverMinor_);
 }
 
 GdrContext::~GdrContext() {
@@ -134,8 +179,13 @@ GdrMap::GdrMap(std::shared_ptr gpuMem, int deviceId) : pimpl_(std::make_un
   if (ret != 0) {
     ret = gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, 0, &pimpl_->mh);
     if (ret != 0) {
+      // ENOTTY (25) here means the loaded gdrdrv kernel module doesn't recognise the v2 ioctl
+      // — GdrContext's version gate normally catches that earlier, so reaching here implies
+      // a real allocator or driver problem.
       THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
-            ". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
+            "; gdrdrv ", pimpl_->ctx->driverMajor(), ".", pimpl_->ctx->driverMinor(),
+            ". If ret==25 (ENOTTY), the kernel module is too old; otherwise ensure the GPU memory is "
+            "allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
     }
   }
 
diff --git a/src/core/include/gdr.hpp b/src/core/include/gdr.hpp
index c13783341..34717745b 100644
--- a/src/core/include/gdr.hpp
+++ b/src/core/include/gdr.hpp
@@ -17,6 +17,7 @@ enum class GdrStatus {
   Disabled,       // Disabled via MSCCLPP_FORCE_DISABLE_GDR
   DriverMissing,  // /dev/gdrdrv not found
   OpenFailed,     // gdr_open() failed
+  KernelTooOld,   // gdrdrv kernel module is older than the required minimum
 };
 
 /// Return the detailed status of the global GDRCopy context.
diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
index 6358787bf..2deb0a2a1 100644
--- a/test/deploy/deploy.sh
+++ b/test/deploy/deploy.sh
@@ -39,14 +39,18 @@ if [ "${PLATFORM}" == "rocm" ]; then
   parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
 fi
 
-# Install GDRCopy kernel module on host VMs (CUDA only)
+# Install GDRCopy kernel module on host VMs (CUDA only). Reinstall when the loaded
+# version doesn't match GDRDRV_VERSION — mscclpp's GDRCopy path uses gdr_pin_buffer_v2,
+# which requires gdrdrv 2.5.
 GDRCOPY_VERSION="2.5.2"
+GDRDRV_VERSION="2.5"
 if [ "${PLATFORM}" == "cuda" ]; then
   parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
-    "if lsmod | grep -q gdrdrv; then
-       echo 'gdrdrv module already loaded'
-     else
+    "LOADED=\$(cat /sys/module/gdrdrv/version 2>/dev/null || true)
+     echo \"gdrdrv loaded: \${LOADED:-none} (need ${GDRDRV_VERSION})\"
+     if [ \"\${LOADED}\" != \"${GDRDRV_VERSION}\" ]; then
        set -e
+       sudo rmmod gdrdrv 2>/dev/null || true
        sudo apt-get update -y && sudo apt-get install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms
        cd /tmp && wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -O gdrcopy.tar.gz
        tar xzf gdrcopy.tar.gz && cd gdrcopy-${GDRCOPY_VERSION}/packages
diff --git a/test/unit/gdr_tests.cu b/test/unit/gdr_tests.cu
index 78bb2e1ad..a418365d0 100644
--- a/test/unit/gdr_tests.cu
+++ b/test/unit/gdr_tests.cu
@@ -17,7 +17,7 @@ TEST(GdrStatusTest, StatusIsValid) {
   auto status = mscclpp::gdrStatus();
   ASSERT_TRUE(status == mscclpp::GdrStatus::Ok || status == mscclpp::GdrStatus::NotBuilt ||
               status == mscclpp::GdrStatus::Disabled || status == mscclpp::GdrStatus::DriverMissing ||
-              status == mscclpp::GdrStatus::OpenFailed);
+              status == mscclpp::GdrStatus::OpenFailed || status == mscclpp::GdrStatus::KernelTooOld);
 }
 
 TEST(GdrStatusTest, EnabledConsistentWithStatus) {