Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions sycl/source/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,19 +212,33 @@ bool device::has(aspect Aspect) const { return impl->has(Aspect); }
void device::ext_oneapi_enable_peer_access(const device &peer) {
ur_device_handle_t Device = impl->getHandleRef();
ur_device_handle_t Peer = peer.impl->getHandleRef();
if (Device != Peer) {
detail::adapter_impl &Adapter = impl->getAdapter();
Adapter.call<detail::UrApiKind::urUsmP2PEnablePeerAccessExp>(Device, Peer);

if (Device == Peer)
return;

if (peer.get_platform() != get_platform()) {
throw exception(errc::invalid,
"Cannot enable peer access between different platforms");
}

impl->getAdapter().call<detail::UrApiKind::urUsmP2PEnablePeerAccessExp>(
Device, Peer);
}

void device::ext_oneapi_disable_peer_access(const device &peer) {
ur_device_handle_t Device = impl->getHandleRef();
ur_device_handle_t Peer = peer.impl->getHandleRef();
if (Device != Peer) {
detail::adapter_impl &Adapter = impl->getAdapter();
Adapter.call<detail::UrApiKind::urUsmP2PDisablePeerAccessExp>(Device, Peer);

if (Device == Peer)
return;

if (peer.get_platform() != get_platform()) {
throw exception(errc::invalid,
"Cannot disable peer access between different platforms");
}

impl->getAdapter().call<detail::UrApiKind::urUsmP2PDisablePeerAccessExp>(
Device, Peer);
}

bool device::ext_oneapi_can_access_peer(const device &peer,
Expand Down
120 changes: 120 additions & 0 deletions sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// Verify that the Level Zero v2 adapter correctly makes USM device memory
// resident on peer devices when P2P access is enabled.
//
// Phase 1: Allocates memory on dev0, fills it with a known pattern, enables
// P2P access from dev1 to dev0, then uses dev1's queue to copy the data to
// the host and verifies all values match the fill pattern.
//
// Phase 2 (opposite direction): Allocates memory on dev1, fills it with a
// different pattern, enables P2P access from dev0 to dev1, then uses dev0's
// queue to copy the data to the host and verifies correctness.
//
// REQUIRES: level_zero && two-or-more-gpu-devices
// UNSUPPORTED: level_zero_v1_adapter
// UNSUPPORTED-INTENDED: Test is specific to the Level Zero v2 adapter.
//
// RUN: %{build} -o %t.out
// RUN: env UR_LOADER_USE_LEVEL_ZERO_V2=1 %{run} %t.out

#include <iostream>
#include <vector>

#include <sycl/detail/core.hpp>
#include <sycl/platform.hpp>
#include <sycl/usm.hpp>

using namespace sycl;

// Allocate N ints on srcQueue's device, fill with fillVal, enable P2P so that
// dstDev can access srcDev's allocations, copy to host via dstQueue, verify
// all values, then clean up. Returns false on failure.
static bool testP2PRead(context &ctx, queue &srcQueue, device &srcDev,
queue &dstQueue, device &dstDev, size_t N, int fillVal,
const char *label) {
int *src = sycl::malloc_device<int>(N, srcQueue);
if (!src) {
std::cout << label << ": device alloc failed. Skipping.\n";
return true; // not a test failure
}
srcQueue.fill(src, fillVal, N).wait();

// Enable P2P: dstDev may now access allocations on srcDev. Under the
// Level Zero v2 adapter this also makes the srcDev allocation resident
// on dstDev.
std::cout << "Enabling P2P: dstDev may now access allocations on srcDev.\n";
dstDev.ext_oneapi_enable_peer_access(srcDev);

std::vector<int> result(N, 0);
dstQueue.memcpy(result.data(), src, N * sizeof(int)).wait();

sycl::free(src, ctx);
std::cout
<< "Disabling P2P: dstDev may no longer access allocations on srcDev.\n";
dstDev.ext_oneapi_disable_peer_access(srcDev);

for (size_t i = 0; i < N; ++i) {
if (result[i] != fillVal) {
std::cout << label << ": FAIL at index " << i << ": got " << result[i]
<< ", expected " << fillVal << "\n";
return false;
}
}
std::cout << label << ": OK\n";
return true;
}

int main() {
// Find a platform with at least two GPU devices.
std::vector<device> gpus;
for (auto &plat : platform::get_platforms()) {
gpus = plat.get_devices(info::device_type::gpu);
if (gpus.size() >= 2)
break;
}

if (gpus.size() < 2) {
std::cout << "Test requires at least two GPU devices on the same platform. "
"Skipping.\n";
return 0;
}

device &dev0 = gpus[0];
device &dev1 = gpus[1];

std::cout << "Device 0: " << dev0.get_info<info::device::name>() << "\n";
std::cout << "Device 1: " << dev1.get_info<info::device::name>() << "\n";

// Both devices share a single context for cross-device USM.
context ctx({dev0, dev1});
queue q0(ctx, dev0);
queue q1(ctx, dev1);

constexpr size_t N = 1024;

// Phase 1: dev1 reads dev0's memory (P2P: dev1 -> dev0).
std::cout << "Phase 1: dev1 reads dev0's memory (P2P: dev1 -> dev0).\n";
if (!dev1.ext_oneapi_can_access_peer(
dev0, ext::oneapi::peer_access::access_supported)) {
std::cout << "No hardware P2P support (dev1->dev0). Skipping.\n";
return 0;
}
if (!testP2PRead(ctx, q0, dev0, q1, dev1, N, 0x42,
"Phase 1 (dev1 reads dev0)"))
return 1;

// Phase 2 (opposite): dev0 reads dev1's memory (P2P: dev0 -> dev1).
std::cout
<< "Phase 2 (opposite): dev0 reads dev1's memory (P2P: dev0 -> dev1).\n";
if (!dev0.ext_oneapi_can_access_peer(
dev1, ext::oneapi::peer_access::access_supported)) {
std::cout << "No hardware P2P support (dev0->dev1). Skipping phase 2.\n";
std::cout << "PASS\n";
return 0;
}
if (!testP2PRead(ctx, q1, dev1, q0, dev0, N, 0x55,
"Phase 2 (dev0 reads dev1)"))
return 1;

std::cout << "PASS\n";
return 0;
}
7 changes: 7 additions & 0 deletions sycl/test-e2e/lit.cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,13 @@ def open_check_file(file_name):
if "opencl:cpu" in sycl_ls_output:
config.available_features.add("opencl-cpu-rt")

# Count physical GPU devices: each physical GPU produces one output line
# that contains ":gpu]". Add a feature when at least two are present so
# tests requiring multi-GPU hardware can be skipped on single-GPU machines.
gpu_device_lines = [l for l in sycl_ls_output.splitlines() if ":gpu]" in l]
if len(gpu_device_lines) >= 2:
config.available_features.add("two-or-more-gpu-devices")

if len(config.sycl_devices) == 1 and config.sycl_devices[0] == "all":
devices = set()
for line in sycl_ls_output.splitlines():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/mutable_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp
Expand Down Expand Up @@ -194,6 +193,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm_p2p.cpp
)
install_ur_library(ur_adapter_level_zero_v2)

Expand Down
2 changes: 2 additions & 0 deletions unified-runtime/source/adapters/level_zero/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ ur_result_t urContextCreate(

Context->initialize();
*RetContext = reinterpret_cast<ur_context_handle_t>(Context);
// TODO: delete below 'if' when memory isolation in the context is
// implemented in the driver
if (IndirectAccessTrackingEnabled) {
std::scoped_lock<ur_shared_mutex> Lock(Platform->ContextsMutex);
Platform->Contexts.push_back(*RetContext);
Expand Down
21 changes: 21 additions & 0 deletions unified-runtime/source/adapters/level_zero/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2353,3 +2353,24 @@ void ZeUSMImportExtension::doZeUSMRelease(ze_driver_handle_t DriverHandle,
void *HostPtr) {
ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (DriverHandle, HostPtr));
}

std::ostream &operator<<(std::ostream &os,
ur_device_handle_t_ const &device_handle) {
if (device_handle.Id.has_value()) {
return os << device_handle.Id.value();
}
return os << "NONE";
}

std::ostream &operator<<(std::ostream &os,
ur_device_handle_t_::PeerStatus peer_status) {
switch (peer_status) {
case ur_device_handle_t_::PeerStatus::DISABLED:
return os << "DISABLED";
case ur_device_handle_t_::PeerStatus::ENABLED:
return os << "ENABLED";
case ur_device_handle_t_::PeerStatus::NO_CONNECTION:
return os << "NO_CONNECTION";
}
return os << "UNKNOWN";
}
14 changes: 13 additions & 1 deletion unified-runtime/source/adapters/level_zero/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,17 +261,29 @@ struct ur_device_handle_t_ : ur_object {
std::unordered_map<ur_exp_image_native_handle_t, ze_image_handle_t>
ZeOffsetToImageHandleMap;

// unique ephemeral identifer of the device in the adapter
// Devices which user enabled p2p access by
// urUsmP2P(Enable|Disable)PeerAccessExp. Devices are indexed by device id.
enum class PeerStatus : char { ENABLED, DISABLED, NO_CONNECTION };
std::vector<PeerStatus>
peers; // info if our device can access given peer device allocations

// unique ephemeral identifier of the device in the adapter
std::optional<DeviceId> Id;

ur::RefCount RefCount;
};

std::ostream &operator<<(std::ostream &os,
ur_device_handle_t_ const &device_handle);
std::ostream &operator<<(std::ostream &os,
ur_device_handle_t_::PeerStatus peer_status);

// Collects a flat vector of unique devices for USM memory pool creation.
// Traverses the input devices and their sub-devices, ensuring each Level Zero
// device handle appears only once in the result.
inline std::vector<ur_device_handle_t> CollectDevicesForUsmPoolCreation(
const std::vector<ur_device_handle_t> &Devices) {

std::vector<ur_device_handle_t> DevicesAndSubDevices;
std::unordered_set<ze_device_handle_t> Seen;

Expand Down
43 changes: 40 additions & 3 deletions unified-runtime/source/adapters/level_zero/platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -746,9 +746,9 @@ ur_platform_handle_t_::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) {
std::shared_lock<ur_shared_mutex> Lock(URDevicesCacheMutex);
auto it = std::find_if(URDevicesCache.begin(), URDevicesCache.end(),
[&](std::unique_ptr<ur_device_handle_t_> &D) {
return D.get()->ZeDevice == ZeDevice &&
(D.get()->RootDevice == nullptr ||
D.get()->RootDevice->RootDevice == nullptr);
return D->ZeDevice == ZeDevice &&
(D->RootDevice == nullptr ||
D->RootDevice->RootDevice == nullptr);
});
if (it != URDevicesCache.end()) {
return (*it).get();
Expand Down Expand Up @@ -914,6 +914,43 @@ ur_result_t ur_platform_handle_t_::populateDeviceCacheIfNeeded() {
ZeDeviceSynchronizeSupported = Supported;
}

for (auto &dev : URDevicesCache) {
dev->peers = std::vector<ur_device_handle_t_::PeerStatus>(
URDevicesCache.size(), ur_device_handle_t_::PeerStatus::NO_CONNECTION);

for (size_t peerId = 0; peerId < URDevicesCache.size(); ++peerId) {
if (peerId == dev->Id.value())
continue;

ZeStruct<ze_device_p2p_properties_t> p2pProperties;
ZE2UR_CALL(
zeDeviceGetP2PProperties,
(dev->ZeDevice, URDevicesCache[peerId]->ZeDevice, &p2pProperties));
if (!(p2pProperties.flags & ZE_DEVICE_P2P_PROPERTY_FLAG_ACCESS)) {
UR_LOG(INFO,
"p2p access to memory of dev:{} from dev:{} not possible due to "
"lack of p2p property",
peerId, dev->Id.value());
continue;
}

ze_bool_t p2p;
ZE2UR_CALL(zeDeviceCanAccessPeer,
(dev->ZeDevice, URDevicesCache[peerId]->ZeDevice, &p2p));
if (!p2p) {
UR_LOG(INFO,
"p2p access to memory of dev:{} from dev:{} not possible due to "
"no connection",
peerId, dev->Id.value());
continue;
}

UR_LOG(INFO, "p2p access to memory of dev:{} from dev:{} can be enabled",
peerId, dev->Id.value());
dev->peers[peerId] = ur_device_handle_t_::PeerStatus::DISABLED;
}
}

return UR_RESULT_SUCCESS;
}

Expand Down
9 changes: 4 additions & 5 deletions unified-runtime/source/adapters/level_zero/platform.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,10 @@ struct ur_platform_handle_t_ : ur::handle_base<ur::level_zero::ddi_getter>,
uint32_t VersionMinor,
uint32_t VersionBuild);

// Keep track of all contexts in the platform. This is needed to manage
// a lifetime of memory allocations in each context when there are kernels
// with indirect access.
// TODO: should be deleted when memory isolation in the context is implemented
// in the driver.
// Keep track of all contexts in the platform. In v1 L0 this is needed to
// manage a lifetime of memory allocations in each context when there are
// kernels with indirect access. In v2 it is used during
// ext_oneapi_enable_peer_access and ext_oneapi_disable_peer_access calls.
std::list<ur_context_handle_t> Contexts;
ur_shared_mutex ContextsMutex;

Expand Down
18 changes: 12 additions & 6 deletions unified-runtime/source/adapters/level_zero/usm_p2p.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,23 @@

namespace ur::level_zero {

ur_result_t urUsmP2PEnablePeerAccessExp(ur_device_handle_t /*commandDevice*/,
ur_device_handle_t /*peerDevice*/) {
ur_result_t urUsmP2PEnablePeerAccessExp(ur_device_handle_t commandDevice,
ur_device_handle_t peerDevice) {

// L0 has peer devices enabled by default
UR_LOG(INFO,
"ignored enabling peer access from {} to memory of {}, because P2P is "
"always enabled in Level Zero V1 adapter",
(void *)commandDevice, (void *)peerDevice);
return UR_RESULT_SUCCESS;
}

ur_result_t urUsmP2PDisablePeerAccessExp(ur_device_handle_t /*commandDevice*/,
ur_device_handle_t /*peerDevice*/) {
ur_result_t urUsmP2PDisablePeerAccessExp(ur_device_handle_t commandDevice,
ur_device_handle_t peerDevice) {

// L0 has peer devices enabled by default
UR_LOG(INFO,
"ignored disabling peer access from {} to memory of {}, because P2P "
"is always enabled in Level Zero V1 adapter",
(void *)commandDevice, (void *)peerDevice);
return UR_RESULT_SUCCESS;
}

Expand Down
Loading
Loading