Commit e119783e authored by Jesse Gross, committed by Jesse Gross

llm: Clamp batch size to context size

The context must always be able to store the current batch, so
if the user requests a small context then we should also shrink
the batch to match. This also fixes the TestLongInputContext
test on the new engine. (The old engine already has this behavior.)
parent 1a558f98
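For illustration, a minimal sketch (not part of this commit; the values are hypothetical) of the clamping behavior described above: if the user requests a context smaller than the configured batch size, the batch is shrunk so that a single batch can always fit in the context window.

```go
package main

import "fmt"

func main() {
	// Hypothetical values for illustration: a small user-requested context
	// combined with a larger configured batch size.
	numCtx := 256   // user-requested context size
	numBatch := 512 // configured batch size

	// Clamp the batch to the context so one batch never exceeds what the
	// context window can hold (min for ints is a builtin as of Go 1.21).
	numBatch = min(numBatch, numCtx)

	fmt.Println(numBatch) // 256
}
```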
@@ -36,7 +36,7 @@ func TestLongInputContext(t *testing.T) {
 	if err := PullIfMissing(ctx, client, req.Model); err != nil {
 		t.Fatalf("PullIfMissing failed: %v", err)
 	}
-	DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
+	DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
 }

 func TestContextExhaustion(t *testing.T) {
...
@@ -173,6 +173,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		opts.NumCtx = int(trainCtx)
 	}

+	opts.NumBatch = min(opts.NumBatch, opts.NumCtx)
+
 	loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()}

 	defaultThreads := discover.GetSystemInfo().GetOptimalThreadCount()
...
@@ -34,8 +34,8 @@ type InputCache struct {
 func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, batchSize int, multiUserCache bool) (*InputCache, error) {
 	numCtx := kvSize / int32(numSlots)

-	if numCtx < 1 {
-		return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
+	if int(numCtx) < batchSize {
+		return nil, fmt.Errorf("kv size must be at least as large as batch size * parallel (kv: %v batch: %v parallel: %v)", kvSize, batchSize, numSlots)
 	}

 	slots := make([]InputCacheSlot, numSlots)
...
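A rough restatement of the runner-side validation above, with made-up numbers (not from the commit): the KV cache is divided evenly across parallel slots, and each slot's share must be large enough to hold at least one full batch, otherwise the cache is rejected at load time.

```go
package main

import "fmt"

func main() {
	// Hypothetical sizes for illustration only.
	kvSize := int32(8192) // total KV cache entries
	numSlots := 4         // parallel sequences
	batchSize := 2048     // tokens per batch

	numCtx := kvSize / int32(numSlots) // per-slot context: 2048

	// Mirrors the new check in NewInputCache: each slot's share of the KV
	// cache must be at least one batch.
	if int(numCtx) < batchSize {
		fmt.Printf("kv size must be at least as large as batch size * parallel (kv: %v batch: %v parallel: %v)\n",
			kvSize, batchSize, numSlots)
		return
	}
	fmt.Println("ok: per-slot context", numCtx, ">= batch", batchSize)
}
```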