"graphbolt/src/cuda/gpu_cache.cu" did not exist on "528b041c51aae91afb7b40c031010f24cfcd3cf8"
Commit 19e6796e authored by Jesse Gross's avatar Jesse Gross Committed by Jesse Gross
Browse files

llm: Support KV cache quantization with gpt-oss

With the new version of GGML in #12245, KV cache quantization
no longer causes a fallback to CPU.
parent 33801c15
......@@ -870,11 +870,6 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
return true
}
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
// gpt-oss uses attention with sinks which does not support quantized cache types
slog.Warn("model only supports non-quantized cache types", "model", arch)
return false
}
return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment