cuda: skip large batches

CUDA panics on matrix multiplications whose batch size is larger than 1024, so report those as unsupported and fall back to the CPU.

cuda: skip large batches
CUDA panics on matrix multiplications whose batch size is larger than 1024, so report those as unsupported and fall back to the CPU.
0796d79d · Michael Yang · Michael Yang · 92981ae3 · 0796d79d · 0796d79d
Commit 0796d79d authored Nov 18, 2025 by Michael Yang Committed by Michael Yang Nov 18, 2025
9 changed files
--- a/llama/patches/0028-Add-memory-detection-using-DXGI-PDH.patch
+++ b/llama/patches/0028-Add-memory-detection-using-DXGI-PDH.patch
@@ -38,7 +38,7 @@ index 44ae76d66..639d551a2 100644
 #ifdef __cplusplus
 }
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index d2c278a35..221e29509 100644
+index ca02ea079..c12b069e5 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();

--- a/llama/patches/0029-vulkan-Call-ggml_vk_buffer_write_2d-from-ggml_vk_buf.patch
+++ b/llama/patches/0029-vulkan-Call-ggml_vk_buffer_write_2d-from-ggml_vk_buf.patch
@@ -11,7 +11,7 @@ vidmem optimization.
 1 file changed, 1 insertion(+), 4 deletions(-)
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 221e29509..18b7cbccf 100644
+index c12b069e5..76c78c2ea 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -5654,14 +5654,11 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr

--- a/llama/patches/0030-Vulkan-MMQ-Integer-Dot-Refactor-and-K-Quant-support-.patch
+++ b/llama/patches/0030-Vulkan-MMQ-Integer-Dot-Refactor-and-K-Quant-support-.patch
@@ -50,7 +50,7 @@ Subject: [PATCH] Vulkan MMQ Integer Dot Refactor and K-Quant support (#16536)
 create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 18b7cbccf..53b57c179 100644
+index 76c78c2ea..7669ed206 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -488,6 +488,7 @@ struct vk_device_struct {

--- a/llama/patches/0031-vulkan-Update-topk_moe-fusion-to-handle-gpt-s-late-s.patch
+++ b/llama/patches/0031-vulkan-Update-topk_moe-fusion-to-handle-gpt-s-late-s.patch
@@ -58,7 +58,7 @@ index 639d551a2..e5c446d1d 100644
 GGML_API size_t gguf_type_size(enum gguf_type type);
 GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 53b57c179..b2855b078 100644
+index 7669ed206..63a762ec2 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -387,12 +387,76 @@ static constexpr uint32_t num_argsort_pipelines = 11;

--- a/llama/patches/0032-vulkan-Fuse-rope-set_rows-16769.patch
+++ b/llama/patches/0032-vulkan-Fuse-rope-set_rows-16769.patch
@@ -31,7 +31,7 @@ Add new backend tests.
 6 files changed, 371 insertions(+), 117 deletions(-)
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index b2855b078..aaf4334b5 100644
+index 63a762ec2..db92a7901 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -458,6 +458,11 @@ static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) {

--- a/llama/patches/0033-vulkan-Handle-argsort-with-a-large-number-of-rows-16.patch
+++ b/llama/patches/0033-vulkan-Handle-argsort-with-a-large-number-of-rows-16.patch
@@ -9,7 +9,7 @@ Subject: [PATCH] vulkan: Handle argsort with a large number of rows (#16851)
 2 files changed, 16 insertions(+), 4 deletions(-)
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index aaf4334b5..3604ceb04 100644
+index db92a7901..e959674d1 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -1084,6 +1084,7 @@ struct vk_op_soft_max_push_constants {

--- a/llama/patches/0035-vulkan-Fix-crash-when-FP16-mul_mat-accumulation-is-n.patch
+++ b/llama/patches/0035-vulkan-Fix-crash-when-FP16-mul_mat-accumulation-is-n.patch
@@ -20,7 +20,7 @@ Subject: [PATCH] vulkan: Fix crash when FP16 mul_mat accumulation is not
 1 file changed, 13 insertions(+), 7 deletions(-)
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 3604ceb04..80185d9f0 100644
+index e959674d1..903050b0b 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -146,8 +146,13 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);

--- a/llama/patches/0036-ggml-cuda-skip-large-batches.patch
+++ b/llama/patches/0036-ggml-cuda-skip-large-batches.patch
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <git@mxy.ng>
+Date: Tue, 18 Nov 2025 11:13:04 -0800
+Subject: [PATCH] ggml-cuda: skip large batches
+CUDA panics on batches larger than 1024, so mark them as unsupported to
+fall back to the CPU
+---
+ ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++
+ 1 file changed, 3 insertions(+)
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index f1a20e7fe..1a71e07c9 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
+                 if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
+                     return false;
+                 }
++                if (op->op == GGML_OP_MUL_MAT && b->ne[2] * b->ne[3] > 1024) {
++                    return false;
++                }
+ #ifdef GGML_USE_MUSA
+                 const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
+                 if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
                    return false;
                }
+                if (op->op == GGML_OP_MUL_MAT && b->ne[2] * b->ne[3] > 1024) {
+                    return false;
+                }
 #ifdef GGML_USE_MUSA
                const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
                if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {