Please refer to the [GPU docs](./gpu.md).
## How can I specify the context window size?
By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with 4 GB of VRAM or less, in which case it defaults to 2048 tokens.
This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
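```shell
# Start the Ollama server with an 8K (8192-token) default context window
OLLAMA_CONTEXT_LENGTH=8192 ollama serve
```

Note that this changes the server-wide default; individual requests can still ask for a different context size.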