From 787c6355c1aaa492066a681d7d2664709f70b6fb Mon Sep 17 00:00:00 2001 From: mudler <2420543+mudler@users.noreply.github.com> Date: Sat, 27 Jun 2026 22:32:53 +0000 Subject: [PATCH 1/2] :arrow_up: Update TheTom/llama-cpp-turboquant Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- backend/cpp/turboquant/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/cpp/turboquant/Makefile b/backend/cpp/turboquant/Makefile index a32adf0b62ae..8b593f8d9d35 100644 --- a/backend/cpp/turboquant/Makefile +++ b/backend/cpp/turboquant/Makefile @@ -1,7 +1,7 @@ # Pinned to the HEAD of feature/turboquant-kv-cache on https://github.com/TheTom/llama-cpp-turboquant. # Auto-bumped nightly by .github/workflows/bump_deps.yaml. -TURBOQUANT_VERSION?=7d9715f1f071fa07c7b2ad3dbfd320b314139e65 +TURBOQUANT_VERSION?=a33ef00b13476e9c609caecc3c1c015b8615011d LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant CMAKE_ARGS?= From bd8a735ff05b32a07cdc63acf88507f981faed60 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 27 Jun 2026 22:23:36 +0000 Subject: [PATCH 2/2] fix(turboquant): re-anchor HIP event patch for fork a33ef00b (#10235) The fork a33ef00b HIP-ported ggml_cuda_copy2d_across_devices() itself (it now guards the cudaMemcpy3DPeer fast path with #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) and falls back to a plain 2D device-to-device copy), so the former copy2d hunk's anchors went stale. Retire that hunk and re-anchor the still-needed event hunk: ggml_backend_cuda_device_event_new() still uses plain cudaEventCreate, which ggml's HIP shim does not alias, so keep the cudaEventCreateWithFlags (cudaEventDisableTiming) rewrite. Validated with apply-patches.sh against a fresh clone at the pinned commit. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] --- .../0001-hip-guard-copy2d-peer-fastpath.patch | 58 ++++++------------- 1 file changed, 18 insertions(+), 40 deletions(-) diff --git a/backend/cpp/turboquant/patches/0001-hip-guard-copy2d-peer-fastpath.patch b/backend/cpp/turboquant/patches/0001-hip-guard-copy2d-peer-fastpath.patch index 71e55f621afb..1ac2786c232b 100644 --- a/backend/cpp/turboquant/patches/0001-hip-guard-copy2d-peer-fastpath.patch +++ b/backend/cpp/turboquant/patches/0001-hip-guard-copy2d-peer-fastpath.patch @@ -1,55 +1,33 @@ hip: port the turboquant CUDA additions that ggml's HIP shim doesn't cover -The turboquant fork adds/modifies a few ggml-cuda.cu spots with CUDA APIs -that ggml's HIP (and MUSA) compatibility layer does not provide, breaking -the -gpu-rocm-hipblas-turboquant build: +The turboquant fork adds/modifies ggml-cuda.cu spots with CUDA APIs that +ggml's HIP (and MUSA) compatibility layer does not provide, breaking the +-gpu-rocm-hipblas-turboquant build: - 1. ggml_cuda_copy2d_across_devices() (host-staged cross-device copy for - split mul_mat output) uses the CUDA 3D-peer copy APIs - cudaMemcpy3DPeerParms / make_cudaPitchedPtr / make_cudaExtent / - cudaMemcpy3DPeerAsync. HIP genuinely does not support these (see the - fork's own comment "HIP does not support cudaMemcpy3DPeerAsync"), so - guard the peer fast path with #if !defined(GGML_USE_HIP) && - !defined(GGML_USE_MUSA) -- matching how the fork already guards the - same API for the sibling 2D copy -- and fall through to the existing - cudaMemcpyAsync staging fallback below (functionally identical, - slightly slower on multi-GPU ROCm). + ggml_backend_cuda_device_event_new() creates its event with plain + cudaEventCreate, which ggml's HIP shim does not alias (it only aliases + cudaEventCreateWithFlags). Use cudaEventCreateWithFlags(..., + cudaEventDisableTiming) -- exactly what the rest of this file already + does (cf. ggml_cuda_set_main_device extras, copy_event) and HIP-safe. - 2. ggml_backend_cuda_device_event_new() creates its event with plain - cudaEventCreate, which ggml's HIP shim does not alias (it only aliases - cudaEventCreateWithFlags). Use cudaEventCreateWithFlags(..., - cudaEventDisableTiming) -- exactly what the rest of this file already - does (cf. lines ~1034, ~3461) and HIP-safe. +CUDA builds are unaffected. Drop this patch once the fork HIP-ports the +event create; apply-patches.sh fails fast if the anchor goes stale. -CUDA builds are unaffected. Drop the relevant hunk once the fork HIP-ports -these; apply-patches.sh fails fast if an anchor goes stale. +Note: the fork has since HIP-ported ggml_cuda_copy2d_across_devices() +itself (it now guards the cudaMemcpy3DPeer fast path with +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) and falls back to a +plain 2D device-to-device copy), so the former copy2d hunk was retired. diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 0427e6b..6352e6a 100644 +index 08cfd37f..b97f0012 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -1933,6 +1933,7 @@ static cudaError_t ggml_cuda_copy2d_across_devices( - size_t width, size_t height, cudaStream_t dst_stream, cudaStream_t src_stream) { - - const auto & info = ggml_cuda_info(); -+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) // 3D-peer copy types unmapped by ggml's HIP/MUSA shim; use staging fallback below - if (info.peer_access[src_device][dst_device]) { - cudaMemcpy3DPeerParms p = {}; - p.dstDevice = dst_device; -@@ -1942,6 +1943,7 @@ static cudaError_t ggml_cuda_copy2d_across_devices( - p.extent = make_cudaExtent(width, height, 1); - return cudaMemcpy3DPeerAsync(&p, dst_stream); - } -+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - - // Fallback: stage all rows through a single contiguous pinned buffer - int prev_device = ggml_cuda_get_device(); -@@ -5714,7 +5716,7 @@ static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_ +@@ -5794,7 +5794,7 @@ static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_ ggml_cuda_set_device(dev_ctx->device); - + cudaEvent_t event; - CUDA_CHECK(cudaEventCreate(&event)); + CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - + return new ggml_backend_event { /* .device = */ dev,