Unverified Commit df335aac authored by Michael Yang's avatar Michael Yang Committed by GitHub
Browse files

gpt-oss: disable quantized kv cache (#11929)

parent 026bc292
...@@ -752,6 +752,11 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { ...@@ -752,6 +752,11 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
// SupportsKVCacheType checks if the requested cache type is supported // SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool { func (f GGML) SupportsKVCacheType(cacheType string) bool {
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
// gpt-oss uses attention with sinks which does not support quantized cache types
slog.Warn("model only supports non-quantized cache types ", "mode", arch)
return cacheType == "f16"
}
return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType) return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment