ml: Add support for quantized KV cache
As with the llama engine, quantizing the KV cache requires flash attention to be enabled through the Ollama server.
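For reference, a minimal sketch of starting the server with both settings, using the `OLLAMA_FLASH_ATTENTION` and `OLLAMA_KV_CACHE_TYPE` environment variables documented for the Ollama server (the `q8_0` value shown is one of the supported quantized cache types; this is an illustration, not part of this change):

```go
// Launch `ollama serve` with flash attention enabled and a quantized KV cache.
package main

import (
	"log"
	"os"
	"os/exec"
)

func main() {
	cmd := exec.Command("ollama", "serve")
	cmd.Env = append(os.Environ(),
		"OLLAMA_FLASH_ATTENTION=1",  // flash attention must be on before the cache can be quantized
		"OLLAMA_KV_CACHE_TYPE=q8_0", // e.g. q8_0 or q4_0; the default is f16
	)
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		log.Fatal(err)
	}
}
```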