diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index 77f4ada363ae2..8967d8ef2dff1 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -212,19 +212,33 @@ bool device::has(aspect Aspect) const { return impl->has(Aspect); } void device::ext_oneapi_enable_peer_access(const device &peer) { ur_device_handle_t Device = impl->getHandleRef(); ur_device_handle_t Peer = peer.impl->getHandleRef(); - if (Device != Peer) { - detail::adapter_impl &Adapter = impl->getAdapter(); - Adapter.call(Device, Peer); + + if (Device == Peer) + return; + + if (peer.get_platform() != get_platform()) { + throw exception(errc::invalid, + "Cannot enable peer access between different platforms"); } + + impl->getAdapter().call( + Device, Peer); } void device::ext_oneapi_disable_peer_access(const device &peer) { ur_device_handle_t Device = impl->getHandleRef(); ur_device_handle_t Peer = peer.impl->getHandleRef(); - if (Device != Peer) { - detail::adapter_impl &Adapter = impl->getAdapter(); - Adapter.call(Device, Peer); + + if (Device == Peer) + return; + + if (peer.get_platform() != get_platform()) { + throw exception(errc::invalid, + "Cannot disable peer access between different platforms"); } + + impl->getAdapter().call( + Device, Peer); } bool device::ext_oneapi_can_access_peer(const device &peer, diff --git a/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp b/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp new file mode 100644 index 0000000000000..982ae0a410999 --- /dev/null +++ b/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp @@ -0,0 +1,120 @@ +// Verify that the Level Zero v2 adapter correctly makes USM device memory +// resident on peer devices when P2P access is enabled. +// +// Phase 1: Allocates memory on dev0, fills it with a known pattern, enables +// P2P access from dev1 to dev0, then uses dev1's queue to copy the data to +// the host and verifies all values match the fill pattern. +// +// Phase 2 (opposite direction): Allocates memory on dev1, fills it with a +// different pattern, enables P2P access from dev0 to dev1, then uses dev0's +// queue to copy the data to the host and verifies correctness. +// +// REQUIRES: level_zero && two-or-more-gpu-devices +// UNSUPPORTED: level_zero_v1_adapter +// UNSUPPORTED-INTENDED: Test is specific to the Level Zero v2 adapter. +// +// RUN: %{build} -o %t.out +// RUN: env UR_LOADER_USE_LEVEL_ZERO_V2=1 %{run} %t.out + +#include +#include + +#include +#include +#include + +using namespace sycl; + +// Allocate N ints on srcQueue's device, fill with fillVal, enable P2P so that +// dstDev can access srcDev's allocations, copy to host via dstQueue, verify +// all values, then clean up. Returns false on failure. +static bool testP2PRead(context &ctx, queue &srcQueue, device &srcDev, + queue &dstQueue, device &dstDev, size_t N, int fillVal, + const char *label) { + int *src = sycl::malloc_device(N, srcQueue); + if (!src) { + std::cout << label << ": device alloc failed. Skipping.\n"; + return true; // not a test failure + } + srcQueue.fill(src, fillVal, N).wait(); + + // Enable P2P: dstDev may now access allocations on srcDev. Under the + // Level Zero v2 adapter this also makes the srcDev allocation resident + // on dstDev. 
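+  // (In the v2 adapter this works through the context's USM pools: the
+  // adapter updates the UMF resident-device lists of pools that already
+  // exist and includes the enabled peer when providers for later pools are
+  // created; see changeResidentDevice and makeProvider in v2/usm.cpp.)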
+ std::cout << "Enabling P2P: dstDev may now access allocations on srcDev.\n"; + dstDev.ext_oneapi_enable_peer_access(srcDev); + + std::vector result(N, 0); + dstQueue.memcpy(result.data(), src, N * sizeof(int)).wait(); + + sycl::free(src, ctx); + std::cout + << "Disabling P2P: dstDev may no longer access allocations on srcDev.\n"; + dstDev.ext_oneapi_disable_peer_access(srcDev); + + for (size_t i = 0; i < N; ++i) { + if (result[i] != fillVal) { + std::cout << label << ": FAIL at index " << i << ": got " << result[i] + << ", expected " << fillVal << "\n"; + return false; + } + } + std::cout << label << ": OK\n"; + return true; +} + +int main() { + // Find a platform with at least two GPU devices. + std::vector gpus; + for (auto &plat : platform::get_platforms()) { + gpus = plat.get_devices(info::device_type::gpu); + if (gpus.size() >= 2) + break; + } + + if (gpus.size() < 2) { + std::cout << "Test requires at least two GPU devices on the same platform. " + "Skipping.\n"; + return 0; + } + + device &dev0 = gpus[0]; + device &dev1 = gpus[1]; + + std::cout << "Device 0: " << dev0.get_info() << "\n"; + std::cout << "Device 1: " << dev1.get_info() << "\n"; + + // Both devices share a single context for cross-device USM. + context ctx({dev0, dev1}); + queue q0(ctx, dev0); + queue q1(ctx, dev1); + + constexpr size_t N = 1024; + + // Phase 1: dev1 reads dev0's memory (P2P: dev1 -> dev0). + std::cout << "Phase 1: dev1 reads dev0's memory (P2P: dev1 -> dev0).\n"; + if (!dev1.ext_oneapi_can_access_peer( + dev0, ext::oneapi::peer_access::access_supported)) { + std::cout << "No hardware P2P support (dev1->dev0). Skipping.\n"; + return 0; + } + if (!testP2PRead(ctx, q0, dev0, q1, dev1, N, 0x42, + "Phase 1 (dev1 reads dev0)")) + return 1; + + // Phase 2 (opposite): dev0 reads dev1's memory (P2P: dev0 -> dev1). + std::cout + << "Phase 2 (opposite): dev0 reads dev1's memory (P2P: dev0 -> dev1).\n"; + if (!dev0.ext_oneapi_can_access_peer( + dev1, ext::oneapi::peer_access::access_supported)) { + std::cout << "No hardware P2P support (dev0->dev1). Skipping phase 2.\n"; + std::cout << "PASS\n"; + return 0; + } + if (!testP2PRead(ctx, q1, dev1, q0, dev0, N, 0x55, + "Phase 2 (dev0 reads dev1)")) + return 1; + + std::cout << "PASS\n"; + return 0; +} diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index 7a679cd5812c1..ce1e4d8f1b02d 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -686,6 +686,13 @@ def open_check_file(file_name): if "opencl:cpu" in sycl_ls_output: config.available_features.add("opencl-cpu-rt") + # Count physical GPU devices: each physical GPU produces one output line + # that contains ":gpu]". Add a feature when at least two are present so + # tests requiring multi-GPU hardware can be skipped on single-GPU machines. 
+ gpu_device_lines = [l for l in sycl_ls_output.splitlines() if ":gpu]" in l] + if len(gpu_device_lines) >= 2: + config.available_features.add("two-or-more-gpu-devices") + if len(config.sycl_devices) == 1 and config.sycl_devices[0] == "all": devices = set() for line in sycl_ls_output.splitlines(): diff --git a/unified-runtime/source/adapters/level_zero/CMakeLists.txt b/unified-runtime/source/adapters/level_zero/CMakeLists.txt index 73c003ad90f88..42e63e952f392 100644 --- a/unified-runtime/source/adapters/level_zero/CMakeLists.txt +++ b/unified-runtime/source/adapters/level_zero/CMakeLists.txt @@ -150,7 +150,6 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/helpers/mutable_helpers.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp @@ -194,6 +193,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm_p2p.cpp ) install_ur_library(ur_adapter_level_zero_v2) diff --git a/unified-runtime/source/adapters/level_zero/context.cpp b/unified-runtime/source/adapters/level_zero/context.cpp index b45020efd8ee9..cc58068623020 100644 --- a/unified-runtime/source/adapters/level_zero/context.cpp +++ b/unified-runtime/source/adapters/level_zero/context.cpp @@ -41,6 +41,8 @@ ur_result_t urContextCreate( Context->initialize(); *RetContext = reinterpret_cast(Context); + // TODO: delete below 'if' when memory isolation in the context is + // implemented in the driver if (IndirectAccessTrackingEnabled) { std::scoped_lock Lock(Platform->ContextsMutex); Platform->Contexts.push_back(*RetContext); diff --git a/unified-runtime/source/adapters/level_zero/device.cpp b/unified-runtime/source/adapters/level_zero/device.cpp index 95db895791bc1..6ff90505097e4 100644 --- a/unified-runtime/source/adapters/level_zero/device.cpp +++ b/unified-runtime/source/adapters/level_zero/device.cpp @@ -2353,3 +2353,24 @@ void ZeUSMImportExtension::doZeUSMRelease(ze_driver_handle_t DriverHandle, void *HostPtr) { ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (DriverHandle, HostPtr)); } + +std::ostream &operator<<(std::ostream &os, + ur_device_handle_t_ const &device_handle) { + if (device_handle.Id.has_value()) { + return os << device_handle.Id.value(); + } + return os << "NONE"; +} + +std::ostream &operator<<(std::ostream &os, + ur_device_handle_t_::PeerStatus peer_status) { + switch (peer_status) { + case ur_device_handle_t_::PeerStatus::DISABLED: + return os << "DISABLED"; + case ur_device_handle_t_::PeerStatus::ENABLED: + return os << "ENABLED"; + case ur_device_handle_t_::PeerStatus::NO_CONNECTION: + return os << "NO_CONNECTION"; + } + return os << "UNKNOWN"; +} diff --git a/unified-runtime/source/adapters/level_zero/device.hpp b/unified-runtime/source/adapters/level_zero/device.hpp index e3e8b9c8125ae..0e184c0c5f01d 100644 --- a/unified-runtime/source/adapters/level_zero/device.hpp +++ b/unified-runtime/source/adapters/level_zero/device.hpp @@ -261,17 +261,29 @@ struct ur_device_handle_t_ : ur_object { std::unordered_map ZeOffsetToImageHandleMap; - // unique ephemeral identifer of the device in the adapter + // Devices which user enabled p2p access by + // urUsmP2P(Enable|Disable)PeerAccessExp. 
Devices are indexed by device id. + enum class PeerStatus : char { ENABLED, DISABLED, NO_CONNECTION }; + std::vector + peers; // info if our device can access given peer device allocations + + // unique ephemeral identifier of the device in the adapter std::optional Id; ur::RefCount RefCount; }; +std::ostream &operator<<(std::ostream &os, + ur_device_handle_t_ const &device_handle); +std::ostream &operator<<(std::ostream &os, + ur_device_handle_t_::PeerStatus peer_status); + // Collects a flat vector of unique devices for USM memory pool creation. // Traverses the input devices and their sub-devices, ensuring each Level Zero // device handle appears only once in the result. inline std::vector CollectDevicesForUsmPoolCreation( const std::vector &Devices) { + std::vector DevicesAndSubDevices; std::unordered_set Seen; diff --git a/unified-runtime/source/adapters/level_zero/platform.cpp b/unified-runtime/source/adapters/level_zero/platform.cpp index a805dbd73d149..5e8716b41a59f 100644 --- a/unified-runtime/source/adapters/level_zero/platform.cpp +++ b/unified-runtime/source/adapters/level_zero/platform.cpp @@ -746,9 +746,9 @@ ur_platform_handle_t_::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) { std::shared_lock Lock(URDevicesCacheMutex); auto it = std::find_if(URDevicesCache.begin(), URDevicesCache.end(), [&](std::unique_ptr &D) { - return D.get()->ZeDevice == ZeDevice && - (D.get()->RootDevice == nullptr || - D.get()->RootDevice->RootDevice == nullptr); + return D->ZeDevice == ZeDevice && + (D->RootDevice == nullptr || + D->RootDevice->RootDevice == nullptr); }); if (it != URDevicesCache.end()) { return (*it).get(); @@ -914,6 +914,43 @@ ur_result_t ur_platform_handle_t_::populateDeviceCacheIfNeeded() { ZeDeviceSynchronizeSupported = Supported; } + for (auto &dev : URDevicesCache) { + dev->peers = std::vector( + URDevicesCache.size(), ur_device_handle_t_::PeerStatus::NO_CONNECTION); + + for (size_t peerId = 0; peerId < URDevicesCache.size(); ++peerId) { + if (peerId == dev->Id.value()) + continue; + + ZeStruct p2pProperties; + ZE2UR_CALL( + zeDeviceGetP2PProperties, + (dev->ZeDevice, URDevicesCache[peerId]->ZeDevice, &p2pProperties)); + if (!(p2pProperties.flags & ZE_DEVICE_P2P_PROPERTY_FLAG_ACCESS)) { + UR_LOG(INFO, + "p2p access to memory of dev:{} from dev:{} not possible due to " + "lack of p2p property", + peerId, dev->Id.value()); + continue; + } + + ze_bool_t p2p; + ZE2UR_CALL(zeDeviceCanAccessPeer, + (dev->ZeDevice, URDevicesCache[peerId]->ZeDevice, &p2p)); + if (!p2p) { + UR_LOG(INFO, + "p2p access to memory of dev:{} from dev:{} not possible due to " + "no connection", + peerId, dev->Id.value()); + continue; + } + + UR_LOG(INFO, "p2p access to memory of dev:{} from dev:{} can be enabled", + peerId, dev->Id.value()); + dev->peers[peerId] = ur_device_handle_t_::PeerStatus::DISABLED; + } + } + return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/platform.hpp b/unified-runtime/source/adapters/level_zero/platform.hpp index 82396632e7ea8..766160807a198 100644 --- a/unified-runtime/source/adapters/level_zero/platform.hpp +++ b/unified-runtime/source/adapters/level_zero/platform.hpp @@ -97,11 +97,10 @@ struct ur_platform_handle_t_ : ur::handle_base, uint32_t VersionMinor, uint32_t VersionBuild); - // Keep track of all contexts in the platform. This is needed to manage - // a lifetime of memory allocations in each context when there are kernels - // with indirect access. 
- // TODO: should be deleted when memory isolation in the context is implemented - // in the driver. + // Keep track of all contexts in the platform. In v1 L0 this is needed to + // manage a lifetime of memory allocations in each context when there are + // kernels with indirect access. In v2 it is used during + // ext_oneapi_enable_peer_access and ext_oneapi_disable_peer_access calls. std::list Contexts; ur_shared_mutex ContextsMutex; diff --git a/unified-runtime/source/adapters/level_zero/usm_p2p.cpp b/unified-runtime/source/adapters/level_zero/usm_p2p.cpp index 67e44fd06fbfe..7b094d42c1f81 100644 --- a/unified-runtime/source/adapters/level_zero/usm_p2p.cpp +++ b/unified-runtime/source/adapters/level_zero/usm_p2p.cpp @@ -12,17 +12,23 @@ namespace ur::level_zero { -ur_result_t urUsmP2PEnablePeerAccessExp(ur_device_handle_t /*commandDevice*/, - ur_device_handle_t /*peerDevice*/) { +ur_result_t urUsmP2PEnablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { - // L0 has peer devices enabled by default + UR_LOG(INFO, + "ignored enabling peer access from {} to memory of {}, because P2P is " + "always enabled in Level Zero V1 adapter", + (void *)commandDevice, (void *)peerDevice); return UR_RESULT_SUCCESS; } -ur_result_t urUsmP2PDisablePeerAccessExp(ur_device_handle_t /*commandDevice*/, - ur_device_handle_t /*peerDevice*/) { +ur_result_t urUsmP2PDisablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { - // L0 has peer devices enabled by default + UR_LOG(INFO, + "ignored disabling peer access from {} to memory of {}, because P2P " + "is always enabled in Level Zero V1 adapter", + (void *)commandDevice, (void *)peerDevice); return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/v2/context.cpp b/unified-runtime/source/adapters/level_zero/v2/context.cpp index eed549a58ca95..d869c7c0e6247 100644 --- a/unified-runtime/source/adapters/level_zero/v2/context.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/context.cpp @@ -13,53 +13,6 @@ #include "event_provider_counter.hpp" #include "event_provider_normal.hpp" -static std::vector -filterP2PDevices(ur_device_handle_t hSourceDevice, - const std::vector &devices) { - std::vector p2pDevices; - for (auto &device : devices) { - if (device == hSourceDevice) { - continue; - } - - ze_bool_t p2p; - ZE2UR_CALL_THROWS(zeDeviceCanAccessPeer, - (device->ZeDevice, hSourceDevice->ZeDevice, &p2p)); - - if (p2p) { - p2pDevices.push_back(device); - } - } - return p2pDevices; -} - -static std::vector> -populateP2PDevices(const std::vector &devices) { - std::vector allDevices; - std::function collectDeviceAndSubdevices = - [&allDevices, &collectDeviceAndSubdevices](ur_device_handle_t device) { - allDevices.push_back(device); - for (auto &subDevice : device->SubDevices) { - collectDeviceAndSubdevices(subDevice); - } - }; - - for (auto &device : devices) { - collectDeviceAndSubdevices(device); - } - - uint64_t maxDeviceId = 0; - for (auto &device : allDevices) { - maxDeviceId = std::max(maxDeviceId, device->Id.value()); - } - - std::vector> p2pDevices(maxDeviceId + 1); - for (auto &device : allDevices) { - p2pDevices[device->Id.value()] = filterP2PDevices(device, allDevices); - } - return p2pDevices; -} - static std::vector uniqueDevices(uint32_t numDevices, const ur_device_handle_t *phDevices) { std::vector devices(phDevices, phDevices + numDevices); @@ -104,22 +57,15 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext, nativeEventsPool(this, 
std::make_unique( this, v2::QUEUE_IMMEDIATE, v2::EVENT_FLAGS_PROFILING_ENABLED)), - p2pAccessDevices(populateP2PDevices(this->hDevices)), - defaultUSMPool(this, nullptr), asyncPool(this, nullptr) {} + defaultUSMPool(this, nullptr), asyncPool(this, nullptr) { + UR_LOG(INFO, "UR context created with {} devices", numDevices); +} ur_result_t ur_context_handle_t_::retain() { RefCount.retain(); return UR_RESULT_SUCCESS; } -ur_result_t ur_context_handle_t_::release() { - if (!RefCount.release()) - return UR_RESULT_SUCCESS; - - delete this; - return UR_RESULT_SUCCESS; -} - ur_platform_handle_t ur_context_handle_t_::getPlatform() const { return hDevices[0]->Platform; } @@ -145,37 +91,130 @@ ur_usm_pool_handle_t ur_context_handle_t_::getDefaultUSMPool() { ur_usm_pool_handle_t ur_context_handle_t_::getAsyncPool() { return &asyncPool; } void ur_context_handle_t_::addUsmPool(ur_usm_pool_handle_t hPool) { + UR_LOG(INFO, "Adding USM pool {} to context:{}", hPool, this); std::scoped_lock lock(Mutex); usmPoolHandles.push_back(hPool); } void ur_context_handle_t_::removeUsmPool(ur_usm_pool_handle_t hPool) { + UR_LOG(INFO, "Removing USM pool {} from context:{}", hPool, this) std::scoped_lock lock(Mutex); usmPoolHandles.remove(hPool); } -const std::vector & -ur_context_handle_t_::getP2PDevices(ur_device_handle_t hDevice) const { - return p2pAccessDevices[hDevice->Id.value()]; +void ur_context_handle_t_::changeResidentDevice(ur_device_handle_t hDevice, + ur_device_handle_t peerDevice, + bool isAdding) { + if (!isValidDevice(hDevice)) { + UR_LOG(INFO, + "skipped changing peer device in context:{} because " + "commandDevice ptr:{} is invalid in this context", + (void *)this, (void *)hDevice); + return; + } + + if (!isValidDevice(peerDevice)) { + UR_LOG(INFO, + "skipped changing peer device in context:{} because " + "peerDevice ptr:{} is invalid in this context", + (void *)this, (void *)peerDevice); + return; + } + + std::shared_lock lock(Mutex); + UR_LOG(INFO, "{} peerDevice:{} in the default pool and {} usmPools", + isAdding ? 
"adding" : "removing", peerDevice->Id.value(), + usmPoolHandles.size()) + defaultUSMPool.changeResidentDevice(hDevice, peerDevice, isAdding); + for (const auto &hPool : usmPoolHandles) { + hPool->changeResidentDevice(hDevice, peerDevice, isAdding); + } +} + +std::vector +ur_context_handle_t_::getDevicesWhoseAllocationsCanBeAccessedFrom( + ur_device_handle_t hDevice) { + UR_FASSERT(hDevice != nullptr && hDevice->Id.has_value(), + "invalid device handle"); + + std::vector peers; + { + std::shared_lock lock(hDevice->Mutex); + peers = hDevice->peers; + } + + std::vector retVal; + std::copy_if( + std::begin(hDevices), std::end(hDevices), std::back_inserter(retVal), + [&](ur_device_handle_t peerCandidateDevice) { + const auto candidateId = peerCandidateDevice->Id.value(); + UR_FASSERT(candidateId < peers.size(), + "there is no device:" + << candidateId << " in peers table, number of devices:" + << peers.size()); + return peers[candidateId] == ur_device_handle_t_::PeerStatus::ENABLED; + }); + + return retVal; +} + +std::vector +ur_context_handle_t_::getDevicesWhichCanAccessAllocationsPresentOn( + ur_device_handle_t hDevice) { + UR_FASSERT(hDevice != nullptr && hDevice->Id.has_value(), + "invalid device handle"); + + const auto hDeviceId = hDevice->Id.value(); + std::vector retVal; + std::copy_if( + std::begin(hDevices), std::end(hDevices), std::back_inserter(retVal), + [&](ur_device_handle_t peerCandidateDevice) { + const auto candidateId = peerCandidateDevice->Id.value(); + std::shared_lock lock(peerCandidateDevice->Mutex); + UR_FASSERT( + hDeviceId < peerCandidateDevice->peers.size(), + "there is no device:" + << hDeviceId << " in peers table of device:" << candidateId + << ", number of devices:" << peerCandidateDevice->peers.size()); + return peerCandidateDevice->peers[hDeviceId] == + ur_device_handle_t_::PeerStatus::ENABLED; + }); + + return retVal; } namespace ur::level_zero { ur_result_t urContextCreate(uint32_t deviceCount, const ur_device_handle_t *phDevices, const ur_context_properties_t * /*pProperties*/, - ur_context_handle_t *phContext) try { - - ur_platform_handle_t hPlatform = phDevices[0]->Platform; - ZeStruct contextDesc{}; - - ze_context_handle_t zeContext{}; - ZE2UR_CALL(zeContextCreate, (hPlatform->ZeDriver, &contextDesc, &zeContext)); - - *phContext = - new ur_context_handle_t_(zeContext, deviceCount, phDevices, true); - return UR_RESULT_SUCCESS; -} catch (...) { - return exceptionToResult(std::current_exception()); + ur_context_handle_t *phContext) { + *phContext = nullptr; + try { + + ur_platform_handle_t hPlatform = phDevices[0]->Platform; + ZeStruct contextDesc{}; + + ze_context_handle_t rawZeContext{}; + ZE2UR_CALL(zeContextCreate, + (hPlatform->ZeDriver, &contextDesc, &rawZeContext)); + UR_LOG(INFO, "ZE context created with {} devices", deviceCount); + + // Wrap immediately so any exception thrown by the ur_context_handle_t_ + // constructor (after hContext member is initialised) does not double-free + // the Level Zero context handle. + *phContext = + new ur_context_handle_t_(rawZeContext, deviceCount, phDevices, true); + { + std::scoped_lock Lock(hPlatform->ContextsMutex); + hPlatform->Contexts.push_back(*phContext); + } + return UR_RESULT_SUCCESS; + } catch (...) 
{ + UR_LOG(ERR, "creating context failed"); + delete *phContext; + *phContext = nullptr; + return exceptionToResult(std::current_exception()); + } } ur_result_t urContextGetNativeHandle(ur_context_handle_t hContext, @@ -191,16 +230,26 @@ ur_result_t urContextCreateWithNativeHandle( ur_native_handle_t hNativeContext, ur_adapter_handle_t, uint32_t numDevices, const ur_device_handle_t *phDevices, const ur_context_native_properties_t *pProperties, - ur_context_handle_t *phContext) try { - auto zeContext = reinterpret_cast(hNativeContext); - - auto ownZeHandle = pProperties ? pProperties->isNativeHandleOwned : false; - - *phContext = - new ur_context_handle_t_(zeContext, numDevices, phDevices, ownZeHandle); - return UR_RESULT_SUCCESS; -} catch (...) { - return exceptionToResult(std::current_exception()); + ur_context_handle_t *phContext) { + *phContext = nullptr; + try { + auto zeContext = reinterpret_cast(hNativeContext); + + auto ownZeHandle = pProperties ? pProperties->isNativeHandleOwned : false; + + *phContext = + new ur_context_handle_t_(zeContext, numDevices, phDevices, ownZeHandle); + { + auto hPlatform = phDevices[0]->Platform; + std::scoped_lock Lock(hPlatform->ContextsMutex); + hPlatform->Contexts.push_back(*phContext); + } + return UR_RESULT_SUCCESS; + } catch (...) { + delete *phContext; + *phContext = nullptr; + return exceptionToResult(std::current_exception()); + } } ur_result_t urContextRetain(ur_context_handle_t hContext) try { @@ -210,7 +259,20 @@ ur_result_t urContextRetain(ur_context_handle_t hContext) try { } ur_result_t urContextRelease(ur_context_handle_t hContext) try { - return hContext->release(); + if (!hContext->RefCount.release()) + return UR_RESULT_SUCCESS; + + auto Platform = hContext->getPlatform(); + { + std::scoped_lock Lock(Platform->ContextsMutex); + auto &Contexts = Platform->Contexts; + auto It = std::find(Contexts.begin(), Contexts.end(), hContext); + if (It != Contexts.end()) { + Contexts.erase(It); + } + } + delete hContext; + return UR_RESULT_SUCCESS; } catch (...) 
{ return exceptionToResult(std::current_exception()); } diff --git a/unified-runtime/source/adapters/level_zero/v2/context.hpp b/unified-runtime/source/adapters/level_zero/v2/context.hpp index c9cb3abec81f6..3092ba193a521 100644 --- a/unified-runtime/source/adapters/level_zero/v2/context.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/context.hpp @@ -15,6 +15,7 @@ #include "common.hpp" #include "common/ur_ref_count.hpp" #include "event_pool_cache.hpp" +#include "logger/ur_logger.hpp" #include "usm.hpp" enum class PoolCacheType { Immediate, Regular }; @@ -24,7 +25,6 @@ struct ur_context_handle_t_ : ur_object { const ur_device_handle_t *phDevices, bool ownZeContext); ur_result_t retain(); - ur_result_t release(); inline ze_context_handle_t getZeHandle() const { return hContext.get(); } ur_platform_handle_t getPlatform() const; @@ -35,6 +35,8 @@ struct ur_context_handle_t_ : ur_object { void addUsmPool(ur_usm_pool_handle_t hPool); void removeUsmPool(ur_usm_pool_handle_t hPool); + void changeResidentDevice(ur_device_handle_t hDevice, + ur_device_handle_t peerDevice, bool isAdding); template void forEachUsmPool(Func func) { std::shared_lock lock(Mutex); @@ -44,8 +46,11 @@ struct ur_context_handle_t_ : ur_object { } } - const std::vector & - getP2PDevices(ur_device_handle_t hDevice) const; + std::vector + getDevicesWhoseAllocationsCanBeAccessedFrom(ur_device_handle_t hDevice); + + std::vector + getDevicesWhichCanAccessAllocationsPresentOn(ur_device_handle_t hDevice); v2::event_pool &getNativeEventsPool() { return nativeEventsPool; } v2::command_list_cache_t &getCommandListCache() { return commandListCache; } @@ -81,7 +86,10 @@ struct ur_context_handle_t_ : ur_object { private: const v2::raii::ze_context_handle_t hContext; - const std::vector hDevices; + const std::vector + hDevices; // possibly without subdevices, only what was passed to ctor, + // context may have user-defined, limited subset of available + // devices v2::command_list_cache_t commandListCache; v2::event_pool_cache eventPoolCacheImmediate; v2::event_pool_cache eventPoolCacheRegular; @@ -90,9 +98,6 @@ struct ur_context_handle_t_ : ur_object { // (uses non-counter based events to allow for signaling from host) v2::event_pool nativeEventsPool; - // P2P devices for each device in the context, indexed by device id. 
- const std::vector> p2pAccessDevices; - ur_usm_pool_handle_t_ defaultUSMPool; ur_usm_pool_handle_t_ asyncPool; std::list usmPoolHandles; diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index e44ee872161cf..b0601ba956af2 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -360,12 +360,17 @@ void *ur_discrete_buffer_handle_t::getDevicePtr( return getActiveDeviceAlloc(offset); } - auto &p2pDevices = hContext->getP2PDevices(hDevice); + auto p2pDevices = + hContext->getDevicesWhoseAllocationsCanBeAccessedFrom(hDevice); auto p2pAccessible = std::find(p2pDevices.begin(), p2pDevices.end(), activeAllocationDevice) != p2pDevices.end(); if (!p2pAccessible) { // TODO: migrate buffer through the host + UR_LOG(WARN, + "p2p is not accessible: requesting device ptr:{} cannot access " + "allocation on device ptr:{}", + (void *)hDevice, (void *)activeAllocationDevice); throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/unified-runtime/source/adapters/level_zero/v2/usm.cpp b/unified-runtime/source/adapters/level_zero/v2/usm.cpp index 85f51e215e094..3bc216421e2d6 100644 --- a/unified-runtime/source/adapters/level_zero/v2/usm.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/usm.cpp @@ -111,19 +111,33 @@ makeProvider(usm::pool_descriptor poolDescriptor) { UMF_CALL_THROWS(umfLevelZeroMemoryProviderParamsSetMemoryType( hParams, urToUmfMemoryType(poolDescriptor.type))); - std::vector residentZeHandles; + // zeDeviceHandles has to be in the scope of hParams because hParams keeps + // reference of it inside. + std::vector zeDeviceHandles; - if (poolDescriptor.type == UR_USM_TYPE_DEVICE) { + if (poolDescriptor.supportsResidentDevices()) { assert(level_zero_device_handle); - auto residentHandles = - poolDescriptor.hContext->getP2PDevices(poolDescriptor.hDevice); - residentZeHandles.push_back(level_zero_device_handle); - for (auto &device : residentHandles) { - residentZeHandles.push_back(device->ZeDevice); + + // Make memory resident on the source device itself plus all peer devices + // that have explicitly enabled peer access to it. 
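+    // (Peer access toggled after this provider exists is propagated to it by
+    // ur_usm_pool_handle_t_::changeResidentDevice rather than by recreating
+    // the provider.)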
+ zeDeviceHandles.push_back(level_zero_device_handle); + for (auto dev : + poolDescriptor.hContext->getDevicesWhichCanAccessAllocationsPresentOn( + poolDescriptor.hDevice)) { + zeDeviceHandles.push_back(dev->ZeDevice); } UMF_CALL_THROWS(umfLevelZeroMemoryProviderParamsSetResidentDevices( - hParams, residentZeHandles.data(), residentZeHandles.size())); + hParams, zeDeviceHandles.data(), zeDeviceHandles.size())); + + UR_LOG(INFO, + "memory provider will be created with {} resident device(s), " + "desc:{}", + zeDeviceHandles.size(), + logger::makeStringFromStreamable(poolDescriptor)); + } else { + UR_LOG(INFO, "memory provider does not support resident devices, desc:{}", + logger::makeStringFromStreamable(poolDescriptor)); } UMF_CALL_THROWS(umfLevelZeroMemoryProviderParamsSetFreePolicy( @@ -431,6 +445,50 @@ size_t ur_usm_pool_handle_t_::getTotalUsedSize() { size_t ur_usm_pool_handle_t_::getPeakUsedSize() { return allocStats.getPeak(); } +void ur_usm_pool_handle_t_::changeResidentDevice(ur_device_handle_t hDevice, + ur_device_handle_t peerDevice, + bool isAdding) { + poolManager.forEachPoolWithDesc([=](const auto &desc, auto pool) { + if (desc.supportsResidentDevices() && desc.hDevice && + desc.hDevice->ZeDevice == hDevice->ZeDevice) { + UR_LOG(INFO, "found {} of srcDevice:{} valid to {} peerDevice:{}", + logger::makeStringFromStreamable(desc), desc.hDevice->Id.value(), + isAdding ? "add" : "remove", peerDevice->Id.value()); + umf_memory_provider_handle_t hProvider; + umf_result_t getProviderResult = + umfPoolGetMemoryProvider(pool->umfPool.get(), &hProvider); + if (getProviderResult != UMF_RESULT_SUCCESS) { + UR_LOG(ERR, "getting memory provider failed with:{}", + getProviderResult); + return true; + } + umf_result_t changeResult = + umfLevelZeroMemoryProviderResidentDeviceChange( + hProvider, peerDevice->ZeDevice, isAdding); + if (changeResult != UMF_RESULT_SUCCESS) { + // UMF updates its internal resident-device list before calling + // zeContextMakeMemoryResident / zeContextEvictMemory, so both the + // UR peer-status table and the UMF provider state are already + // consistent when this error is observed. The failure originates + // from zeContextEvictMemory returning a non-SUCCESS result for + // device USM memory: zeContextMakeMemoryResident is called at + // allocation time (inside the UMF provider) and returns SUCCESS for + // device memory, but zeContextEvictMemory subsequently fails because + // the Level Zero driver treats the make-resident call as a no-op for + // device USM (the memory is already on the source device; no + // explicit hardware pinning on the peer is required). Aborting here + // would be wrong: the system is in a fully consistent state and + // future allocations will correctly omit the now-disabled peer from + // their resident-device set. 
+ UR_LOG(WARN, + "changing resident devices in UMF failed with:{}, continuing", + changeResult); + } + } + return true; + }); +} + namespace ur::level_zero { ur_result_t urUSMPoolCreate( /// [in] handle of the context object diff --git a/unified-runtime/source/adapters/level_zero/v2/usm.hpp b/unified-runtime/source/adapters/level_zero/v2/usm.hpp index fb0b83ce68f3d..5ea29d7e3220b 100644 --- a/unified-runtime/source/adapters/level_zero/v2/usm.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/usm.hpp @@ -78,6 +78,8 @@ struct ur_usm_pool_handle_t_ : ur_object { size_t getPeakReservedSize(); size_t getTotalUsedSize(); size_t getPeakUsedSize(); + void changeResidentDevice(ur_device_handle_t hDevice, + ur_device_handle_t peerDevice, bool isAdding); UsmPool *getPool(const usm::pool_descriptor &desc); diff --git a/unified-runtime/source/adapters/level_zero/v2/usm_p2p.cpp b/unified-runtime/source/adapters/level_zero/v2/usm_p2p.cpp new file mode 100644 index 0000000000000..28973a567e0e1 --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/usm_p2p.cpp @@ -0,0 +1,125 @@ +//===----------- usm_p2p.cpp - L0 Adapter ---------------------------------===// +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../device.hpp" +#include "context.hpp" +#include "logger/ur_logger.hpp" + +namespace ur::level_zero { + +// Validates that two devices are compatible for P2P operations: both must have +// an assigned Id, must belong to the same platform (i.e. share the same device +// cache), and peerDevice's id must be a valid index into commandDevice->peers. +static ur_result_t validateP2PDevicePair(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { + if (!commandDevice->Id.has_value() || !peerDevice->Id.has_value()) { + UR_LOG(ERR, "P2P operation requires devices with assigned ids"); + return UR_RESULT_ERROR_INVALID_DEVICE; + } + if (commandDevice->Platform != peerDevice->Platform) { + UR_LOG(ERR, "P2P operation requires devices from the same platform"); + return UR_RESULT_ERROR_INVALID_DEVICE; + } + if (peerDevice->Id.value() >= commandDevice->peers.size()) { + UR_LOG(ERR, + "peerDevice id:{} is out of range for commandDevice peers table " + "(size:{})", + peerDevice->Id.value(), commandDevice->peers.size()); + return UR_RESULT_ERROR_INVALID_DEVICE; + } + return UR_RESULT_SUCCESS; +} + +static ur_result_t urUsmP2PChangePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice, + bool isAdding) { + UR_CALL(validateP2PDevicePair(commandDevice, peerDevice)); + + UR_LOG(INFO, "user tries to {} peer access to memory of {} from {}", + (isAdding ? "enable" : "disable"), *peerDevice, *commandDevice); + + { + const auto expectedPeerStatus = + isAdding ? ur_device_handle_t_::PeerStatus::DISABLED + : ur_device_handle_t_::PeerStatus::ENABLED; + std::scoped_lock Lock(commandDevice->Mutex); + const auto existingPeerStatus = + commandDevice->peers[peerDevice->Id.value()]; + if (existingPeerStatus != expectedPeerStatus) { + UR_LOG(ERR, + "existing peer status:{} does not match expected peer status:{}", + existingPeerStatus, expectedPeerStatus); + return UR_RESULT_ERROR_INVALID_OPERATION; + } + commandDevice->peers[peerDevice->Id.value()] = + (isAdding ? 
ur_device_handle_t_::PeerStatus::ENABLED + : ur_device_handle_t_::PeerStatus::DISABLED); + } + + auto Platform = commandDevice->Platform; + // Copy the context list under the mutex and iterate outside the critical + // section to avoid holding ContextsMutex during potentially heavy + // changeResidentDevice calls and to reduce deadlock risk. + std::list Contexts; + { + std::scoped_lock Lock(Platform->ContextsMutex); + Contexts = Platform->Contexts; + } + UR_LOG(INFO, "changing peers in {} contexts", Contexts.size()); + for (auto Context : Contexts) { + Context->changeResidentDevice(commandDevice, peerDevice, isAdding); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t urUsmP2PEnablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { + return urUsmP2PChangePeerAccessExp(commandDevice, peerDevice, true); +} + +ur_result_t urUsmP2PDisablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { + return urUsmP2PChangePeerAccessExp(commandDevice, peerDevice, false); +} + +ur_result_t urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice, + ur_exp_peer_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + UR_CALL(validateP2PDevicePair(commandDevice, peerDevice)); + + int propertyValue = 0; + switch (propName) { + case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORT: { + std::scoped_lock Lock(commandDevice->Mutex); + propertyValue = commandDevice->peers[peerDevice->Id.value()] != + ur_device_handle_t_::PeerStatus::NO_CONNECTION; + break; + } + case UR_EXP_PEER_INFO_UR_PEER_ATOMICS_SUPPORT: { + ZeStruct p2pProperties; + ZE2UR_CALL(zeDeviceGetP2PProperties, + (commandDevice->ZeDevice, peerDevice->ZeDevice, &p2pProperties)); + propertyValue = + (p2pProperties.flags & ZE_DEVICE_P2P_PROPERTY_FLAG_ATOMICS) != 0; + break; + } + default: { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + return ReturnValue(propertyValue); +} +} // namespace ur::level_zero diff --git a/unified-runtime/source/common/backtrace.hpp b/unified-runtime/source/common/backtrace.hpp index f94f8e8a88fb0..d3b67bed2d72e 100644 --- a/unified-runtime/source/common/backtrace.hpp +++ b/unified-runtime/source/common/backtrace.hpp @@ -8,10 +8,10 @@ #include #include -#define MAX_BACKTRACE_FRAMES 64 - namespace ur { +static constexpr int MaxBacktraceFrames = 64; + using BacktraceLine = std::string; std::vector getCurrentBacktrace(); diff --git a/unified-runtime/source/common/backtrace_lin.cpp b/unified-runtime/source/common/backtrace_lin.cpp index 86a924a3be56d..89993f3f2ca6d 100644 --- a/unified-runtime/source/common/backtrace_lin.cpp +++ b/unified-runtime/source/common/backtrace_lin.cpp @@ -13,8 +13,8 @@ namespace ur { std::vector getCurrentBacktrace() { - void *backtraceFrames[MAX_BACKTRACE_FRAMES]; - int frameCount = ::backtrace(backtraceFrames, MAX_BACKTRACE_FRAMES); + void *backtraceFrames[MaxBacktraceFrames]; + int frameCount = ::backtrace(backtraceFrames, MaxBacktraceFrames); char **backtraceStr = ::backtrace_symbols(backtraceFrames, frameCount); // TODO: implement getting demangled symbols using abi::__cxa_demangle if (backtraceStr == nullptr) { diff --git a/unified-runtime/source/common/backtrace_win.cpp b/unified-runtime/source/common/backtrace_win.cpp index fa76b1d9a772c..09dcca186d9cc 100644 --- a/unified-runtime/source/common/backtrace_win.cpp +++ b/unified-runtime/source/common/backtrace_win.cpp @@ -20,9 +20,8 @@ std::vector 
getCurrentBacktrace() { HANDLE process = GetCurrentProcess(); SymInitialize(process, nullptr, true); - PVOID frames[MAX_BACKTRACE_FRAMES]; - WORD frameCount = - CaptureStackBackTrace(0, MAX_BACKTRACE_FRAMES, frames, NULL); + PVOID frames[MaxBacktraceFrames]; + WORD frameCount = CaptureStackBackTrace(0, MaxBacktraceFrames, frames, NULL); if (frameCount == 0) { SymCleanup(process); diff --git a/unified-runtime/source/common/ur_pool_manager.hpp b/unified-runtime/source/common/ur_pool_manager.hpp index 198ee4e005f18..a08e4f91cfd25 100644 --- a/unified-runtime/source/common/ur_pool_manager.hpp +++ b/unified-runtime/source/common/ur_pool_manager.hpp @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -64,12 +65,13 @@ struct pool_descriptor { createFromDevices(ur_usm_pool_handle_t poolHandle, ur_context_handle_t hContext, const std::vector &devices); -}; -static inline bool -isSharedAllocationReadOnlyOnDevice(const pool_descriptor &desc) { - return desc.type == UR_USM_TYPE_SHARED && desc.deviceReadOnly; -} + bool isSharedAllocationReadOnlyOnDevice() const { + return type == UR_USM_TYPE_SHARED && deviceReadOnly; + } + + bool supportsResidentDevices() const { return type == UR_USM_TYPE_DEVICE; } +}; inline bool pool_descriptor::operator==(const pool_descriptor &other) const { static usm::detail::ddiTables ddi; @@ -99,8 +101,8 @@ inline bool pool_descriptor::operator==(const pool_descriptor &other) const { } return lhsNative == rhsNative && lhs.type == rhs.type && - (isSharedAllocationReadOnlyOnDevice(lhs) == - isSharedAllocationReadOnlyOnDevice(rhs)) && + (lhs.isSharedAllocationReadOnlyOnDevice() == + rhs.isSharedAllocationReadOnlyOnDevice()) && lhs.poolHandle == rhs.poolHandle; } @@ -152,6 +154,7 @@ inline std::vector pool_descriptor::createFromDevices( template struct pool_manager { private: + static_assert(std::is_same_v); using pool_handle_t = H *; using unique_pool_handle_t = std::unique_ptr>; using desc_to_pool_map_t = std::unordered_map; @@ -174,6 +177,8 @@ template struct pool_manager { } ur_result_t addPool(const D &desc, unique_pool_handle_t &&hPool) { + UR_LOG(INFO, "Adding USM pool {} ptr:{} into pool_manager, size:{}", desc, + hPool.get(), descToPoolMap.size()); if (!descToPoolMap.try_emplace(desc, std::move(hPool)).second) { UR_LOG(ERR, "Pool for pool descriptor: {}, already exists", desc); return UR_RESULT_ERROR_INVALID_ARGUMENT; @@ -191,12 +196,20 @@ template struct pool_manager { return it->second.get(); } + template void forEachPool(Func func) { for (const auto &[desc, pool] : descToPoolMap) { if (!func(pool.get())) break; } } + + template void forEachPoolWithDesc(Func func) { + for (const auto &[desc, pool] : descToPoolMap) { + if (!func(desc, pool.get())) + break; + } + } }; inline umf::pool_unique_handle_t @@ -238,7 +251,7 @@ template <> struct hash { } return combine_hashes(0, desc.type, native, - isSharedAllocationReadOnlyOnDevice(desc), + desc.isSharedAllocationReadOnlyOnDevice(), desc.poolHandle); } }; diff --git a/unified-runtime/source/loader/layers/validation/ur_leak_check.hpp b/unified-runtime/source/loader/layers/validation/ur_leak_check.hpp index 9ee1e31e85ce6..6de53189a3868 100644 --- a/unified-runtime/source/loader/layers/validation/ur_leak_check.hpp +++ b/unified-runtime/source/loader/layers/validation/ur_leak_check.hpp @@ -13,8 +13,6 @@ #include #include -#define MAX_BACKTRACE_FRAMES 64 - namespace ur_validation_layer { struct RefCountContext { diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp 
b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp index 79e4964a85e12..993a23b9c071e 100644 --- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp @@ -9,6 +9,9 @@ #include "uur/fixtures.h" #include "uur/utils.h" +#include +#include + using urMemoryResidencyTest = uur::urMultiDeviceContextTestTemplate<1>; UUR_INSTANTIATE_PLATFORM_TEST_SUITE(urMemoryResidencyTest); @@ -19,9 +22,9 @@ TEST_P(urMemoryResidencyTest, allocatingDeviceMemoryWillResultInOOM) { GTEST_SKIP() << "Test requires a PVC device"; } - size_t initialMemFree = 0; + uint64_t initialMemFree = 0; ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, - sizeof(size_t), &initialMemFree, nullptr)); + sizeof(uint64_t), &initialMemFree, nullptr)); if (initialMemFree < allocSize) { GTEST_SKIP() << "Not enough device memory available"; @@ -31,9 +34,9 @@ TEST_P(urMemoryResidencyTest, allocatingDeviceMemoryWillResultInOOM) { ASSERT_SUCCESS( urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); - size_t currentMemFree = 0; + uint64_t currentMemFree = 0; ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, - sizeof(size_t), ¤tMemFree, nullptr)); + sizeof(uint64_t), ¤tMemFree, nullptr)); // amount of free memory should decrease after making a memory allocation // resident @@ -41,3 +44,360 @@ TEST_P(urMemoryResidencyTest, allocatingDeviceMemoryWillResultInOOM) { ASSERT_SUCCESS(urUSMFree(context, ptr)); } + +struct urMemoryMultiResidencyTest : uur::urMultiDeviceContextTestTemplate<2> { + + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::urMultiDeviceContextTestTemplate<2>::SetUp()); + + for (std::size_t i = 0; i < 2; i++) { + ur_bool_t usm_p2p_support = false; + ASSERT_SUCCESS( + urDeviceGetInfo(devices[i], UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP, + sizeof(usm_p2p_support), &usm_p2p_support, nullptr)); + if (!usm_p2p_support) { + GTEST_SKIP() << "EXP usm p2p feature is not supported."; + } + } + + if (!uur::isPVC(devices[0]) || !uur::isPVC(devices[1])) { + GTEST_SKIP() << "Test requires PVC devices"; + } + + if (!hasHardwareP2PSupport()) { + GTEST_SKIP() << "No hardware P2P connection between devices"; + } + } + + void TearDown() override { + // Disable peer access if a test enabled it but did not clean up (e.g. due + // to an assertion failure), so subsequent tests start from a clean state. + if (peerAccessEnabled) { + EXPECT_SUCCESS(urUsmP2PDisablePeerAccessExp(devices[1], devices[0])); + } + UUR_RETURN_ON_FATAL_FAILURE( + uur::urMultiDeviceContextTestTemplate<2>::TearDown()); + } + + // Returns true when hardware P2P connectivity exists in the direction used + // by the tests: devices[1] accessing allocations on devices[0]. + bool hasHardwareP2PSupport() { + int supported = 0; + if (urUsmP2PPeerAccessGetInfoExp( + devices[1], devices[0], UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORT, + sizeof(int), &supported, nullptr) != UR_RESULT_SUCCESS) { + return false; + } + return supported != 0; + } + + // Whether peer access from devices[1] to devices[0] has been enabled by + // this test and must be disabled in TearDown. + bool peerAccessEnabled = false; +}; + +UUR_INSTANTIATE_PLATFORM_TEST_SUITE(urMemoryMultiResidencyTest); + +// Verify that allocating USM memory on devices[0] does NOT make it resident on +// devices[1] when peer access has not been enabled. Only the peer device's +// free memory is checked: it must not decrease by allocSize. 
The source +// device free memory is intentionally not checked because deferred frees from +// earlier tests complete asynchronously and make the source baseline +// unreliable; that property is already covered by +// allocatingDeviceMemoryWillResultInOOM. +TEST_P(urMemoryMultiResidencyTest, allocationInitiallyAbsentOnPeer) { + static constexpr size_t allocSize = 1024 * 1024; + + uint64_t initialMemFreePeer = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreePeer, + nullptr)); + if (initialMemFreePeer < allocSize) { + GTEST_SKIP() + << "Not enough peer device memory available for reliable check"; + } + + // Allocate on devices[0] WITHOUT enabling P2P. + void *ptr = nullptr; + ASSERT_SUCCESS( + urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); + + uint64_t currentMemFreePeer = 0; + ur_result_t res = + urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), ¤tMemFreePeer, nullptr); + + ASSERT_SUCCESS(urUSMFree(context, ptr)); + + ASSERT_SUCCESS(res); + // Without P2P, the allocation must not be resident on the peer: + // free memory on devices[1] must not have decreased by a full allocSize. + ASSERT_GT(currentMemFreePeer, initialMemFreePeer - allocSize); +} + +// Verify that enabling peer access succeeds and that a second enable attempt +// returns UR_RESULT_ERROR_INVALID_OPERATION (access already enabled). Confirms +// that source-device free memory decreases by at least allocSize, showing the +// allocation succeeded on devices[0] with P2P enabled. Also verifies end-to-end +// P2P data transfer: the allocation on devices[0] is filled with a known +// pattern and then read by devices[1]'s command engine; the result is checked +// for correctness to confirm the feature works in the correct direction. +// Note: peer-device free memory is not checked because +// UR_DEVICE_INFO_GLOBAL_MEM_FREE does not reliably reflect +// zeContextMakeMemoryResident behaviour for device USM allocations. +TEST_P(urMemoryMultiResidencyTest, + enablePeerAccessStateMachineAndSourceAllocation) { + // Enable devices[1] to access allocations on devices[0], so that new + // allocations on devices[0] are made resident on devices[1] too. + ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = true; + + // A second enable must be rejected because access is already enabled. + ASSERT_EQ(urUsmP2PEnablePeerAccessExp(devices[1], devices[0]), + UR_RESULT_ERROR_INVALID_OPERATION); + + static constexpr size_t allocSize = 1024 * 1024; + uint64_t initialMemFreeSource = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreeSource, + nullptr)); + if (initialMemFreeSource < allocSize) { + GTEST_SKIP() << "Not enough source device memory available"; + } + + void *ptr = nullptr; + ASSERT_SUCCESS( + urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); + + // Fill ptr on devices[0] with a known pattern using devices[0]'s queue. + static constexpr uint8_t fillPattern = 0xAB; + ur_queue_handle_t srcQueue = nullptr; + ur_result_t fillRes1 = urQueueCreate(context, devices[0], nullptr, &srcQueue); + ur_result_t fillRes2 = + (fillRes1 == UR_RESULT_SUCCESS) + ? urEnqueueUSMFill(srcQueue, ptr, sizeof(fillPattern), &fillPattern, + allocSize, 0, nullptr, nullptr) + : fillRes1; + ur_result_t fillRes3 = + (fillRes2 == UR_RESULT_SUCCESS) ? 
urQueueFinish(srcQueue) : fillRes2; + if (srcQueue) { + urQueueRelease(srcQueue); + } + + // Verify end-to-end P2P access: copy ptr (on devices[0]) to dstPtr (on + // devices[1]) using devices[1]'s queue, then read back for data verification. + void *dstPtr = nullptr; + ur_queue_handle_t peerQueue = nullptr; + std::vector hostData(allocSize); + ur_result_t p2pRes1 = urUSMDeviceAlloc(context, devices[1], nullptr, nullptr, + allocSize, &dstPtr); + ur_result_t p2pRes2 = + (p2pRes1 == UR_RESULT_SUCCESS) + ? urQueueCreate(context, devices[1], nullptr, &peerQueue) + : p2pRes1; + // devices[1]'s engine reads ptr from devices[0] via P2P. + ur_result_t p2pRes3 = (p2pRes2 == UR_RESULT_SUCCESS) + ? urEnqueueUSMMemcpy(peerQueue, true, dstPtr, ptr, + allocSize, 0, nullptr, nullptr) + : p2pRes2; + // Read result back to host for verification. + ur_result_t p2pRes4 = + (p2pRes3 == UR_RESULT_SUCCESS) + ? urEnqueueUSMMemcpy(peerQueue, true, hostData.data(), dstPtr, + allocSize, 0, nullptr, nullptr) + : p2pRes3; + + // Save return code so ptr is freed before any ASSERT terminates the test. + uint64_t currentMemFreeSource = 0; + ur_result_t res = + urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), ¤tMemFreeSource, nullptr); + + if (peerQueue) { + urQueueRelease(peerQueue); + } + if (dstPtr) { + urUSMFree(context, dstPtr); + } + ASSERT_SUCCESS(urUSMFree(context, ptr)); + ASSERT_SUCCESS(urUsmP2PDisablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = false; + + ASSERT_SUCCESS(res); + // Allocation is physically on devices[0]: its free memory must decrease. + ASSERT_LE(currentMemFreeSource, initialMemFreeSource - allocSize); + + ASSERT_SUCCESS(fillRes1); + ASSERT_SUCCESS(fillRes2); + ASSERT_SUCCESS(fillRes3); + + ASSERT_SUCCESS(p2pRes1); + ASSERT_SUCCESS(p2pRes2); + ASSERT_SUCCESS(p2pRes3); + ASSERT_SUCCESS(p2pRes4); + // All bytes transferred via P2P must match the fill pattern. + EXPECT_TRUE(std::all_of(hostData.begin(), hostData.end(), + [](uint8_t b) { return b == fillPattern; })); +} + +// Verify that disabling peer access succeeds and that a second disable attempt +// returns UR_RESULT_ERROR_INVALID_OPERATION (access already disabled). Confirms +// that source-device free memory still shows the allocation after peer access is +// disabled, proving the allocation on devices[0] remains valid. +// Note: peer-device eviction is not verified via free memory because +// UR_DEVICE_INFO_GLOBAL_MEM_FREE does not reliably reflect +// zeContextEvictMemory behaviour for device USM allocations. +TEST_P(urMemoryMultiResidencyTest, + disablePeerAccessStateMachineAndSourceAllocationPersists) { + // Enable devices[1] to access allocations on devices[0], so that new + // allocations on devices[0] are made resident on devices[1] too. + ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = true; + + static constexpr size_t allocSize = 1024 * 1024; + uint64_t initialMemFreeSource = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreeSource, + nullptr)); + if (initialMemFreeSource < allocSize) { + GTEST_SKIP() << "Not enough source device memory available"; + } + + void *ptr = nullptr; + ASSERT_SUCCESS( + urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); + + // Disable P2P; the runtime evicts the allocation from devices[1] (not + // verified here, see the function-level comment above). Save return codes so + // ptr is freed before any ASSERT terminates the test. 
+ ur_result_t res1 = urUsmP2PDisablePeerAccessExp(devices[1], devices[0]); + if (res1 == UR_RESULT_SUCCESS) { + peerAccessEnabled = false; + } + + // A second disable must be rejected because access is already disabled. + ur_result_t res2 = urUsmP2PDisablePeerAccessExp(devices[1], devices[0]); + + uint64_t currentMemFreeSource = 0; + ur_result_t res3 = + urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), ¤tMemFreeSource, nullptr); + + ASSERT_SUCCESS(urUSMFree(context, ptr)); + + ASSERT_SUCCESS(res1); + ASSERT_EQ(res2, UR_RESULT_ERROR_INVALID_OPERATION); + ASSERT_SUCCESS(res3); + // Allocation is physically on devices[0]: its free memory must decrease. + ASSERT_LE(currentMemFreeSource, initialMemFreeSource - allocSize); +} + +// Verify that USM memory allocated on devices[0] and filled with a known +// pattern can be correctly read by devices[1] when P2P access is enabled. +// This confirms end-to-end P2P data transfer works in the correct direction. +TEST_P(urMemoryMultiResidencyTest, p2pReadSucceedsWithPeerAccessEnabled) { + static constexpr size_t allocSize = 1024 * 1024; + static constexpr uint8_t fillPattern = 0xAB; + + // Allocate on devices[0] and fill with a known pattern. + void *srcPtr = nullptr; + ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, + allocSize, &srcPtr)); + ur_queue_handle_t srcQueue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, devices[0], nullptr, &srcQueue)); + ASSERT_SUCCESS(urEnqueueUSMFill(srcQueue, srcPtr, sizeof(fillPattern), + &fillPattern, allocSize, 0, nullptr, + nullptr)); + ASSERT_SUCCESS(urQueueFinish(srcQueue)); + urQueueRelease(srcQueue); + + // Enable P2P: devices[1] can now access allocations on devices[0]. + ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = true; + + // Copy srcPtr (on devices[0]) to dstPtr (on devices[1]) using devices[1]'s + // queue (P2P read), then copy dstPtr back to the host for data verification. + void *dstPtr = nullptr; + ur_queue_handle_t peerQueue = nullptr; + std::vector hostData(allocSize, 0); + ur_result_t res1 = urUSMDeviceAlloc(context, devices[1], nullptr, nullptr, + allocSize, &dstPtr); + ur_result_t res2 = + (res1 == UR_RESULT_SUCCESS) + ? urQueueCreate(context, devices[1], nullptr, &peerQueue) + : res1; + ur_result_t res3 = (res2 == UR_RESULT_SUCCESS) + ? urEnqueueUSMMemcpy(peerQueue, true, dstPtr, srcPtr, + allocSize, 0, nullptr, nullptr) + : res2; + ur_result_t res4 = + (res3 == UR_RESULT_SUCCESS) + ? urEnqueueUSMMemcpy(peerQueue, true, hostData.data(), dstPtr, + allocSize, 0, nullptr, nullptr) + : res3; + + if (peerQueue) { + urQueueRelease(peerQueue); + } + if (dstPtr) { + urUSMFree(context, dstPtr); + } + ASSERT_SUCCESS(urUSMFree(context, srcPtr)); + ASSERT_SUCCESS(urUsmP2PDisablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = false; + + ASSERT_SUCCESS(res1); + ASSERT_SUCCESS(res2); + ASSERT_SUCCESS(res3); + ASSERT_SUCCESS(res4); + // All bytes transferred via P2P must match the fill pattern. + EXPECT_TRUE(std::all_of(hostData.begin(), hostData.end(), + [](uint8_t b) { return b == fillPattern; })); +} + +// Verify that a USM allocation on devices[0] is NOT made resident on +// devices[1] when P2P access has not been enabled. The feature under test +// restricts residency, not hardware access: Level Zero hardware can still +// transfer data cross-device via the interconnect regardless of residency +// state, so the copy result is not checked here. 
The observable guarantee
+// is that devices[1] free memory must not decrease by a full allocSize,
+// proving the allocation was never pinned on the peer device.
+TEST_P(urMemoryMultiResidencyTest, allocationNotResidentOnPeerWithoutP2P) {
+  static constexpr size_t allocSize = 1024 * 1024;
+  static constexpr uint8_t fillPattern = 0xAB;
+
+  uint64_t initialMemFreePeer = 0;
+  ASSERT_SUCCESS(urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE,
+                                 sizeof(uint64_t), &initialMemFreePeer,
+                                 nullptr));
+  if (initialMemFreePeer < allocSize) {
+    GTEST_SKIP()
+        << "Not enough peer device memory available for reliable check";
+  }
+
+  // Allocate on devices[0] WITHOUT enabling P2P; this must not consume
+  // devices[1] memory.
+  void *srcPtr = nullptr;
+  ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr,
+                                  allocSize, &srcPtr));
+
+  ur_queue_handle_t srcQueue = nullptr;
+  ASSERT_SUCCESS(urQueueCreate(context, devices[0], nullptr, &srcQueue));
+  ASSERT_SUCCESS(urEnqueueUSMFill(srcQueue, srcPtr, sizeof(fillPattern),
+                                  &fillPattern, allocSize, 0, nullptr,
+                                  nullptr));
+  ASSERT_SUCCESS(urQueueFinish(srcQueue));
+  urQueueRelease(srcQueue);
+
+  uint64_t currentMemFreePeer = 0;
+  ur_result_t memRes =
+      urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE,
+                      sizeof(uint64_t), &currentMemFreePeer, nullptr);
+
+  ASSERT_SUCCESS(urUSMFree(context, srcPtr));
+  ASSERT_SUCCESS(memRes);
+  // Without P2P the allocation must not be resident on devices[1]: free
+  // memory on devices[1] must not have decreased by a full allocSize.
+  ASSERT_GT(currentMemFreePeer, initialMemFreePeer - allocSize);
+}
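
For reference, a minimal sketch (not part of the patch) of how a Unified Runtime client could drive the peer-access entry points the v2 adapter now implements. The handles hDev1 and hDev0 are hypothetical ur_device_handle_t values for two GPUs on the same platform, and the work between enable and disable is left as a placeholder; only calls and enums that appear in this change are used.

#include <ur_api.h>

// Enable access from hDev1 to memory allocated on hDev0 for the duration of
// some work, but only when the platform reports a hardware P2P connection.
static ur_result_t withPeerAccess(ur_device_handle_t hDev1,
                                  ur_device_handle_t hDev0) {
  int accessSupported = 0;
  ur_result_t res = urUsmP2PPeerAccessGetInfoExp(
      hDev1, hDev0, UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORT,
      sizeof(accessSupported), &accessSupported, nullptr);
  if (res != UR_RESULT_SUCCESS)
    return res;
  if (!accessSupported)
    return UR_RESULT_SUCCESS; // NO_CONNECTION: nothing to enable

  // The adapter tracks ENABLED/DISABLED per device pair, so a second enable
  // (or disable) in a row returns UR_RESULT_ERROR_INVALID_OPERATION; keep the
  // two calls strictly paired.
  res = urUsmP2PEnablePeerAccessExp(hDev1, hDev0);
  if (res != UR_RESULT_SUCCESS)
    return res;

  // ... allocate device USM on hDev0 and read it from a queue on hDev1 ...

  return urUsmP2PDisablePeerAccessExp(hDev1, hDev0);
}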