gpt-oss: disable quantized kv cache (#11929)

df335aac · Michael Yang · GitHub · 026bc292 · df335aac
Unverified Commit df335aac authored Aug 15, 2025 by Michael Yang Committed by GitHub Aug 15, 2025
Show whitespace changes
Inline Side-by-side

Showing with 5 additions and 0 deletions

fs/ggml/ggml.go fs/ggml/ggml.go +5 -0

No files found.
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -752,6 +752,11 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 // SupportsKVCacheType checks if the requested cache type is supported
 func (f GGML) SupportsKVCacheType(cacheType string) bool {
+	if arch := f.KV().Architecture(); cacheType != "f16" && slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
+		// gpt-oss uses attention with sinks, which does not support quantized cache types; f16 falls through below without a warning
+		slog.Warn("model only supports non-quantized cache types", "model", arch)
+		return false
+	}
 	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
 }