ggml: Backport scale kernel fixes

The GGML scale kernel uses signed 32-bit ints to represent the number of elements in the tensor. For large images, mistral-small3.2 overflows this, triggering CUDA errors due to negative arguments. Currently, this can happen when the user passes a large image to mistral-small3.2. However, with upcoming changes to reserve CUDA memory, it happens every time mistral-small is loaded as we reserve using a worst case batch. This patch is part of an upstream GGML commit and should be removed after GGML is updated past 0a1b398 "ggml: add ops for WAN video model (cuda && cpu) (#15669)". Fixes #10388

ggml: Backport scale kernel fixes
The GGML scale kernel uses signed 32-bit ints to represent the number of elements in the tensor. For large images, mistral-small3.2 overflows this, triggering CUDA errors due to negative arguments. Currently, this can happen when the user passes a large image to mistral-small3.2. However, with upcoming changes to reserve CUDA memory, it happens every time mistral-small is loaded as we reserve using a worst case batch. This patch is part of an upstream GGML commit and should be removed after GGML is updated past 0a1b398 "ggml: add ops for WAN video model (cuda && cpu) (#15669)". Fixes #10388
efaee8c2 · Jesse Gross · Jesse Gross · 734b57da · efaee8c2 · efaee8c2
Commit efaee8c2 authored Sep 23, 2025 by Jesse Gross Committed by Jesse Gross Sep 30, 2025
Showing with 67 additions and 9 deletions

llama/patches/0026-ggml-Backport-scale-kernel-fixes.patch llama/patches/0026-ggml-Backport-scale-kernel-fixes.patch +57 -0

ml/backend/ggml/ggml/src/ggml-cuda/scale.cu ml/backend/ggml/ggml/src/ggml-cuda/scale.cu +10 -9

No files found.
--- a/llama/patches/0026-ggml-Backport-scale-kernel-fixes.patch
+++ b/llama/patches/0026-ggml-Backport-scale-kernel-fixes.patch
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@ollama.com>
+Date: Tue, 23 Sep 2025 15:41:58 -0700
+Subject: [PATCH] ggml: Backport scale kernel fixes
+The GGML scale kernel uses signed 32-bit ints to represent
+the number of elements in the tensor. For large images,
+mistral-small3.2 overflows this, triggering CUDA errors due
+to negative arguments.
+Currently, this can happen when the user passes a large image
+to mistral-small3.2. However, with upcoming changes to reserve
+CUDA memory, it happens every time mistral-small is loaded as
+we reserve using a worst case batch.
+This patch is part of an upstream GGML commit and should be removed
+after GGML is updated past 0a1b398 "ggml: add ops for WAN video model
+(cuda && cpu) (#15669)".
+Fixes #10388
+---
+ ggml/src/ggml-cuda/scale.cu | 19 ++++++++++---------
+ 1 file changed, 10 insertions(+), 9 deletions(-)
+diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
+index 2ee9e5889..0ddeff6a1 100644
+--- a/ggml/src/ggml-cuda/scale.cu
+++ b/ggml/src/ggml-cuda/scale.cu
+@@ -1,18 +1,19 @@
+ #include "scale.cuh"
+-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
+-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+#define MAX_GRIDDIM_X 0x7FFFFFFF
+-    if (i >= k) {
+-        return;
+-    }
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
+    int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
+    int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;
+-    dst[i] = scale * x[i] + bias;
+    for (int64_t i = tid; i < nelements; i += stride) {
+        dst[i] = scale * x[i] + bias;
+    }
+ }
+-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
+-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
+    const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
+ }
+ void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
--- a/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu
 #include "scale.cuh"
-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
+#define MAX_GRIDDIM_X 0x7FFFFFFF
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-    if (i >= k) {
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
-        return;
+    int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
-    }
+    int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;
+    for (int64_t i = tid; i < nelements; i += stride) {
        dst[i] = scale * x[i] + bias;
+    }
 }
-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+    scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
 }
 void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {