Commit a27462b7 authored by Jesse Gross's avatar Jesse Gross Committed by Jesse Gross
Browse files

ollamarunner: Temporarily disable worst case graph preallocation

When we later have a large batch running purely on a CPU, this
results in the error:
GGML_ASSERT(talloc->buffer_id >= 0)

Disabling this means that we will incrementally reallocate memory
as the graph grows.

Fixes #10410
parent 6bf0b819
...@@ -723,7 +723,9 @@ func (m *multiLPath) String() string { ...@@ -723,7 +723,9 @@ func (m *multiLPath) String() string {
return strings.Join(*m, ", ") return strings.Join(*m, ", ")
} }
func (s *Server) reserveWorstCaseGraph() error { // TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
// to the GPU
/*func (s *Server) reserveWorstCaseGraph() error {
ctx := s.model.Backend().NewContext() ctx := s.model.Backend().NewContext()
defer ctx.Close() defer ctx.Close()
...@@ -766,7 +768,7 @@ func (s *Server) reserveWorstCaseGraph() error { ...@@ -766,7 +768,7 @@ func (s *Server) reserveWorstCaseGraph() error {
} }
return nil return nil
} }*/
func (s *Server) loadModel( func (s *Server) loadModel(
ctx context.Context, ctx context.Context,
...@@ -803,10 +805,10 @@ func (s *Server) loadModel( ...@@ -803,10 +805,10 @@ func (s *Server) loadModel(
s.seqs = make([]*Sequence, s.parallel) s.seqs = make([]*Sequence, s.parallel)
s.seqsSem = semaphore.NewWeighted(int64(s.parallel)) s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
err = s.reserveWorstCaseGraph() /*err = s.reserveWorstCaseGraph()
if err != nil { if err != nil {
panic(err) panic(err)
} }*/
s.status = llm.ServerStatusReady s.status = llm.ServerStatusReady
s.ready.Done() s.ready.Done()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment