ollamarunner: Temporarily disable worst case graph preallocation

When we later have a large batch running purely on a CPU, this results in the error: GGML_ASSERT(talloc->buffer_id >= 0). Disabling this means that we will incrementally reallocate memory as the graph grows. Fixes #10410

ollamarunner: Temporarily disable worst case graph preallocation
When we later have a large batch running purely on a CPU, this results in the error: GGML_ASSERT(talloc->buffer_id >= 0). Disabling this means that we will incrementally reallocate memory as the graph grows. Fixes #10410
a27462b7 · Jesse Gross · Jesse Gross · 6bf0b819 · a27462b7
Commit a27462b7 authored Apr 29, 2025 by Jesse Gross Committed by Jesse Gross Apr 29, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 4 deletions

runner/ollamarunner/runner.go runner/ollamarunner/runner.go +6 -4

No files found.
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -723,7 +723,9 @@ func (m *multiLPath) String() string {
 	return strings.Join(*m, ", ")
 }

-func (s *Server) reserveWorstCaseGraph() error {
+// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
+// to the GPU
+/*func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()

@@ -766,7 +768,7 @@ func (s *Server) reserveWorstCaseGraph() error {
 	}

 	return nil
-}
+}*/

 func (s *Server) loadModel(
 	ctx context.Context,
@@ -803,10 +805,10 @@ func (s *Server) loadModel(
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))

-	err = s.reserveWorstCaseGraph()
+	/*err = s.reserveWorstCaseGraph()
 	if err != nil {
 		panic(err)
-	}
+	}*/

 	s.status = llm.ServerStatusReady
 	s.ready.Done()