ollamarunner: Re-enable worst case graph preallocation.

Worst case graph preallocation was disabled by a27462b7 "ollamarunner: Temporarily disable worst case graph preallocation" since it caused crashes with large batches when not using the GPU. This backports upstream llama.cpp commit f057808 "ggml: Don't assert fail when tensor data changes (#13222)", which fixes the underlying bug and allows reverting the previous workaround.

ollamarunner: Re-enable worst case graph preallocation.
Worst case graph preallocation was disabled by a27462b7 "ollamarunner: Temporarily disable worst case graph preallocation" since it caused crashes with large batches when not using the GPU. This backports upstream llama.cpp commit f057808 "ggml: Don't assert fail when tensor data changes (#13222)", which fixes the underlying bug and allows reverting the previous workaround.
c2f5d666 · Jesse Gross · Jesse Gross · 57fb759f · c2f5d666 · c2f5d666
Commit c2f5d666 authored May 02, 2025 by Jesse Gross; committed by Jesse Gross May 02, 2025
3 changed files
--- a/llama/patches/0018-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch
+++ b/llama/patches/0018-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@kernel.org>
+Date: Thu, 1 May 2025 13:46:10 -0700
+Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)
+
+The following scenario will cause an assertion failure in the graph
+allocator:
+ - Build and allocate a graph containing a tensor with a non-NULL data
+   pointer
+ - Build and allocate a new graph where that data is NULL
+
+Result:
+ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed
+
+This happens during revalidation because we think that memory should
+have been previously allocated based on the current graph but in
+reality the previous graph was different. In this situation, we
+should do a full reallocation pass.
+---
+ ggml/src/ggml-alloc.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
+index a3d3f690..5fd379f6 100644
+--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
+@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
+ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+     size_t node_size = 0;
+     if (!node->data && !node->view_src) {
+-        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
++        // If we previously had data but don't now then reallocate
++        if (talloc->buffer_id < 0) {
++            return false;
++        }
+         node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
+     }
+     return talloc->size_max >= node_size;
--- a/ml/backend/ggml/ggml/src/ggml-alloc.c
+++ b/ml/backend/ggml/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
    size_t node_size = 0;
    if (!node->data && !node->view_src) {
-        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+        // If we previously had data but don't now then reallocate
+        if (talloc->buffer_id < 0) {
+            return false;
+        }
        node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
    }
    return talloc->size_max >= node_size;

--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -715,9 +715,7 @@ func (m *multiLPath) String() string {
 	return strings.Join(*m, ", ")
 }

-// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
-// to the GPU
-/*func (s *Server) reserveWorstCaseGraph() error {
+func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()

@@ -760,7 +758,7 @@ func (m *multiLPath) String() string {
 	}

 	return nil
-}*/
+}

 func (s *Server) loadModel(
 	ctx context.Context,
@@ -797,10 +795,10 @@ func (s *Server) loadModel(
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))

-	/*err = s.reserveWorstCaseGraph()
+	err = s.reserveWorstCaseGraph()
 	if err != nil {
 		panic(err)
-	}*/
+	}

 	s.status = llm.ServerStatusReady
 	s.ready.Done()