Unverified commit 42481045, authored by Daniel Hiltgen and committed by GitHub

Move quantization to new backend (#10363)

* Move quantization logic to GGML via new backend

This moves the model-aware logic to Go code and calls GGML's quantization code for model creation.

* Remove "add model quantizations"

This is no longer needed now that quantization is implemented in Go+GGML code directly.
parent 95e744be
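
At a glance, the new flow works as follows (a hedged sketch, not part of the diff; the driver function, its name, and the paths are illustrative): the server decodes the GGUF metadata in Go, derives a per-tensor target type, and streams each tensor through GGML's quantize kernels via the new quantize helper in server/quantization.go.

package server

import (
	"os"

	fsggml "github.com/ollama/ollama/fs/ggml"
)

// quantizeBlobExample is a hypothetical driver for the path this commit adds:
// rewrite the GGUF at srcPath into dstPath at the requested quantization.
func quantizeBlobExample(srcPath, dstPath, target string) error {
	in, err := os.Open(srcPath)
	if err != nil {
		return err
	}
	defer in.Close()

	// Decode GGUF metadata in Go; -1 means no cap on metadata array sizes.
	meta, _, err := fsggml.Decode(in, -1)
	if err != nil {
		return err
	}

	ftype, err := fsggml.ParseFileType(target) // e.g. "q4_K_M"
	if err != nil {
		return err
	}

	out, err := os.Create(dstPath)
	if err != nil {
		return err
	}
	defer out.Close()

	// Per-tensor type selection and GGML quantization happen inside quantize,
	// which reports progress in bytes through the callback.
	return quantize(in, out, meta, ftype, func(n uint64) { /* report progress */ })
}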
@@ -74,7 +74,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -1607,22 +1606,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
{
LLM_ARCH_MISTRAL3,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
}
},
{
LLM_ARCH_UNKNOWN,
{
......
@@ -76,7 +76,6 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_MISTRAL3,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
......
@@ -1437,7 +1437,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MISTRAL3: break;
default: throw std::runtime_error("unsupported model architecture");
}
 
@@ -13752,7 +13751,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
case LLM_ARCH_MISTRAL3:
return LLAMA_ROPE_TYPE_NORM;
 
// the pairs of head values are offset by n_rot/2
......
@@ -744,10 +744,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
// don't quantize vision stuff
quantize &= name.find("v.") == std::string::npos;
quantize &= name.find("mm.") == std::string::npos;
// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);
......
@@ -460,24 +460,6 @@ func (m *Model) NEmbd() int {
return int(C.llama_model_n_embd(m.c))
}
func Quantize(infile, outfile string, ftype uint32) error {
cinfile := C.CString(infile)
defer C.free(unsafe.Pointer(cinfile))
coutfile := C.CString(outfile)
defer C.free(unsafe.Pointer(coutfile))
params := C.llama_model_quantize_default_params()
params.nthread = -1
params.ftype = ftype
if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
return fmt.Errorf("llama_model_quantize: %d", rc)
}
return nil
}
// vision processing
type ClipContext struct {
c *C.struct_clip_ctx
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:39:32 -0700
Subject: [PATCH] add model quantizations
a temporary patch to add model quantization for
models not supported in llama.cpp
---
src/llama-arch.cpp | 17 +++++++++++++++++
src/llama-arch.h | 1 +
src/llama-model.cpp | 2 ++
src/llama-quant.cpp | 4 ++++
4 files changed, 24 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index eb7b5325..df42d1a5 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -74,6 +74,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
+ { LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -1606,6 +1607,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
+ {
+ LLM_ARCH_MISTRAL3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ }
+ },
{
LLM_ARCH_UNKNOWN,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index bc8a4f0b..bda9d071 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -76,6 +76,7 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
+ LLM_ARCH_MISTRAL3,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9d099f11..ef70486d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1437,6 +1437,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_MISTRAL3: break;
default: throw std::runtime_error("unsupported model architecture");
}
@@ -13751,6 +13752,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
+ case LLM_ARCH_MISTRAL3:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 223e1f3f..8ae6dde8 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
+ // don't quantize vision stuff
+ quantize &= name.find("v.") == std::string::npos;
+ quantize &= name.find("mm.") == std::string::npos;
+
// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);
@@ -25,7 +25,7 @@ func TestEstimateGPULayers(t *testing.T) {
defer f.Close()
inputLayerCount := 5
tensors := []ggml.Tensor{
tensors := []*ggml.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
......
@@ -312,6 +312,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range meta.Tensors().Items() {
t := t
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
for i := range tts {
......
package ggml
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/src
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
// #include "ggml-quants.h"
import "C"
import (
"unsafe"
fsggml "github.com/ollama/ollama/fs/ggml"
)
// ConvertToF32 converts (dequantizes) the raw data to F32 so we can then quantize it
func ConvertToF32(data []byte, dtype uint32, nelements uint64) []float32 {
f32s := make([]float32, nelements)
elems := C.int64_t(nelements)
switch dtype {
case C.GGML_TYPE_F16:
C.ggml_fp16_to_fp32_row((*C.uint16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q4_0:
C.dequantize_row_q4_0((*C.block_q4_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q4_1:
C.dequantize_row_q4_1((*C.block_q4_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q5_0:
C.dequantize_row_q5_0((*C.block_q5_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q5_1:
C.dequantize_row_q5_1((*C.block_q5_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q8_0:
C.dequantize_row_q8_0((*C.block_q8_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q2_K:
C.dequantize_row_q2_K((*C.block_q2_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q3_K:
C.dequantize_row_q3_K((*C.block_q3_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q4_K:
C.dequantize_row_q4_K((*C.block_q4_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q5_K:
C.dequantize_row_q5_K((*C.block_q5_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q6_K:
C.dequantize_row_q6_K((*C.block_q6_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_BF16:
C.ggml_bf16_to_fp32_row((*C.ggml_bf16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
default:
panic("unsupported quantization format")
}
return f32s
}
func Quantize(newType fsggml.TensorType, f32s []float32, shape []uint64) []byte {
buf := make([]byte, len(f32s)*4) // upper bound on size
nPerRow := C.int64_t(shape[0])
nrows := C.int64_t(1)
if len(shape) > 1 {
nrows = C.int64_t(shape[1])
}
shape2 := C.int64_t(1)
if len(shape) > 2 {
shape2 = C.int64_t(shape[2])
}
nelements_matrix := nPerRow * nrows
newSize := C.size_t(0)
for i03 := C.int64_t(0); i03 < shape2; i03++ {
f32s_03 := i03 * nelements_matrix
buf_03 := C.int64_t(C.ggml_row_size(uint32(newType), nPerRow)) * i03 * nrows
newSize += C.ggml_quantize_chunk(
uint32(newType),
(*C.float)(&f32s[f32s_03]),
unsafe.Pointer((uintptr)(unsafe.Pointer(&buf[0]))+uintptr(buf_03)),
0,
nrows,
nPerRow,
nil)
}
return buf[:newSize]
}
func QuantizationVersion() uint32 {
return uint32(C.GGML_QNT_VERSION)
}
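
A hedged usage sketch for the two helpers above (the tensor bytes and shape are made up; in this commit the real caller is the quantizer type in server/quantization.go):

// requantizeExample dequantizes a tensor assumed to be Q8_0 into F32, then
// requantizes it to Q4_K. raw would hold the bytes read from the source GGUF.
func requantizeExample(raw []byte, shape []uint64) []byte {
	nelements := uint64(1)
	for _, d := range shape {
		nelements *= d
	}

	// Dequantize to float32 first; GGML's quantize kernels take F32 input.
	f32s := ConvertToF32(raw, uint32(fsggml.TensorTypeQ8_0), nelements)

	// Requantize row-by-row into the target type.
	return Quantize(fsggml.TensorTypeQ4_K, f32s, shape)
}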
@@ -765,7 +765,7 @@ func getSHA256Digest(t *testing.T, r io.Reader) (string, int64) {
return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
}
func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) {
t.Helper()
f, err := os.CreateTemp(t.TempDir(), "testbin.*.gguf")
......
@@ -15,6 +15,7 @@ import (
"path/filepath"
"slices"
"strings"
"sync/atomic"
"github.com/gin-gonic/gin"
@@ -23,7 +24,6 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model"
@@ -425,9 +425,14 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,
func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.ProgressResponse)) (*layerGGML, error) {
ft := layer.GGML.KV().FileType()
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType)})
want, err := ggml.ParseFileType(quantizeType)
var doneBytes atomic.Uint64
totalBytes := uint64(layer.Size) - layer.GGML.Tensors().Offset
fnWrap := func(n uint64) {
done := doneBytes.Add(n)
progress := float32(done) / float32(totalBytes)
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
}
ftype, err := ggml.ParseFileType(quantizeType)
if err != nil {
return nil, err
}
@@ -436,6 +441,11 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
if err != nil {
return nil, err
}
fp, err := os.Open(blob)
if err != nil {
return nil, err
}
defer fp.Close()
temp, err := os.CreateTemp(filepath.Dir(blob), quantizeType)
if err != nil {
@@ -444,15 +454,15 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
defer temp.Close()
defer os.Remove(temp.Name())
if err := llama.Quantize(blob, temp.Name(), uint32(want)); err != nil {
if err := quantize(fp, temp, layer.GGML, ftype, fnWrap); err != nil {
return nil, err
}
temp.Seek(0, io.SeekStart)
fn(api.ProgressResponse{Status: "verifying conversion"})
newLayer, err := NewLayer(temp, layer.MediaType)
if err != nil {
return nil, err
}
if _, err := temp.Seek(0, io.SeekStart); err != nil {
return nil, err
}
@@ -462,7 +472,6 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
return nil, err
}
return &layerGGML{newLayer, f}, nil
}
......
@@ -64,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
}
defer blob.Close()
f, _, err := ggml.Decode(blob, 1024)
f, _, err := ggml.Decode(blob, -1)
if err != nil {
return nil, err
}
......
package server
import (
"fmt"
"io"
"log/slog"
"maps"
"os"
"strings"
"unsafe"
fsggml "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml/backend/ggml"
)
type quantizer struct {
*os.File
offset uint64
from, to *fsggml.Tensor
progressFn func(n uint64)
}
func (q quantizer) WriteTo(w io.Writer) (int64, error) {
quantize := q.from.Kind != q.to.Kind
sr := io.NewSectionReader(q, int64(q.offset), int64(q.from.Size()))
if !quantize {
n, err := io.Copy(w, sr)
q.progressFn(q.from.Size())
return n, err
}
data, err := io.ReadAll(sr)
if err != nil {
slog.Warn("file read error", "tensor", q.from.Name, "file", q.Name(), "error", err)
return 0, fmt.Errorf("unable to read tensor %s from %s: %s", q.from.Name, q.Name(), err)
}
var f32s []float32
newType := fsggml.TensorType(q.to.Kind)
if fsggml.TensorType(q.from.Kind) == fsggml.TensorTypeF32 {
f32s = unsafe.Slice((*float32)(unsafe.Pointer(&data[0])), q.from.Elements())
} else {
f32s = ggml.ConvertToF32(data, q.from.Kind, q.from.Elements())
}
data = ggml.Quantize(newType, f32s, q.from.Shape)
n, err := w.Write(data)
q.progressFn(q.from.Size())
return int64(n), err
}
type quantizeState struct {
nAttnV int // Number of attn_*v* weight tensors
nFfnDown int // Number of ffn_down tensors
iAttnV int // Running counter of number of attn_v tensors that have been processed
iFfnDown int // Running counter of number of ffn_down tensors that have been processed
hasOutput bool // used to figure out if a model shares tok_embd with the output weight
}
// useMoreBits marks roughly the first and last eighth of the layers, plus
// every third layer in between, for a higher-precision quantization type.
func useMoreBits(iLayer, nLayers int) bool {
return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}
func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType {
// Ported from llama_tensor_get_type, removed unsupported quantization types
nExperts := max(1, kv.Uint("expert_count", 0))
if name == "output.weight" || name == "output_norm.weight" || (!qs.hasOutput && name == "token_embd.weight") {
nx := shape[0]
qk_k := newType.BlockSize()
if nx%qk_k != 0 {
newType = fsggml.TensorTypeQ8_0
} else if newType != fsggml.TensorTypeQ8_0 {
newType = fsggml.TensorTypeQ6_K
}
} else if strings.Contains(name, "attn_v.weight") {
if ftype == fsggml.FileTypeQ2_K {
if kv.GQA() >= 4 {
newType = fsggml.TensorTypeQ4_K
} else {
newType = fsggml.TensorTypeQ3_K
}
} else if ftype == fsggml.FileTypeQ2_K_S && kv.GQA() >= 4 {
newType = fsggml.TensorTypeQ4_K
} else if ftype == fsggml.FileTypeQ3_K_M {
if qs.iAttnV < 2 {
newType = fsggml.TensorTypeQ5_K
} else {
newType = fsggml.TensorTypeQ4_K
}
} else if ftype == fsggml.FileTypeQ3_K_L {
newType = fsggml.TensorTypeQ5_K
} else if (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ5_K_M) &&
useMoreBits(qs.iAttnV, qs.nAttnV) {
newType = fsggml.TensorTypeQ6_K
} else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 {
newType = fsggml.TensorTypeQ5_K
}
// TODO
// if (qs.model.type == LLM_TYPE_70B) {
// // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// // nearly negligible increase in model size by quantizing this tensor with more bits:
// if (newType == GGML_TYPE_Q3_K || newType == GGML_TYPE_Q4_K) newType = GGML_TYPE_Q5_K;
// }
if nExperts == 8 {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
newType = fsggml.TensorTypeQ8_0
}
qs.iAttnV++
} else if strings.Contains(name, "attn_k.weight") {
if nExperts == 8 {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
newType = fsggml.TensorTypeQ8_0
}
} else if strings.Contains(name, "ffn_down") {
iLayer := qs.iFfnDown
n_layer := qs.nFfnDown
if ftype == fsggml.FileTypeQ2_K {
newType = fsggml.TensorTypeQ3_K
} else if ftype == fsggml.FileTypeQ2_K_S {
if iLayer < n_layer/8 {
newType = fsggml.TensorTypeQ4_K
}
} else if ftype == fsggml.FileTypeQ3_K_M {
if iLayer < n_layer/16 {
newType = fsggml.TensorTypeQ5_K
} else if useMoreBits(iLayer, n_layer) {
newType = fsggml.TensorTypeQ4_K
} else {
newType = fsggml.TensorTypeQ3_K
}
} else if ftype == fsggml.FileTypeQ3_K_L {
newType = fsggml.TensorTypeQ5_K
} else if ftype == fsggml.FileTypeQ4_K_M {
if useMoreBits(iLayer, n_layer) {
newType = fsggml.TensorTypeQ6_K
}
} else if ftype == fsggml.FileTypeQ5_K_M && useMoreBits(iLayer, n_layer) {
newType = fsggml.TensorTypeQ6_K
} else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
newType = fsggml.TensorTypeQ5_K
}
qs.iFfnDown++
} else if strings.Contains(name, "attn_output.weight") {
if nExperts == 8 {
if ftype == fsggml.FileTypeQ2_K || ftype == fsggml.FileTypeQ3_K_S || ftype == fsggml.FileTypeQ3_K_M ||
ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
newType = fsggml.TensorTypeQ5_K
}
} else {
if ftype == fsggml.FileTypeQ2_K {
newType = fsggml.TensorTypeQ3_K
} else if ftype == fsggml.FileTypeQ3_K_M {
newType = fsggml.TensorTypeQ4_K
} else if ftype == fsggml.FileTypeQ3_K_L {
newType = fsggml.TensorTypeQ5_K
}
}
} else if strings.Contains(name, "attn_qkv.weight") {
if ftype == fsggml.FileTypeQ3_K_M || ftype == fsggml.FileTypeQ3_K_L {
newType = fsggml.TensorTypeQ4_K
} else if ftype == fsggml.FileTypeQ4_K_M {
newType = fsggml.TensorTypeQ5_K
} else if ftype == fsggml.FileTypeQ5_K_M {
newType = fsggml.TensorTypeQ6_K
}
}
if newType.IsQuantized() {
nx := shape[0]
ny := uint64(1)
if len(shape) > 1 {
ny = shape[1]
}
qk_k := newType.BlockSize()
if nx%qk_k != 0 {
slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s. Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String()))
newType = fsggml.TensorTypeF16
}
}
return newType
}
func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error {
kv := maps.Clone(orig.KV())
kv["general.file_type"] = newFileType
// kv["general.quantization_version"] = ggml.QuantizationVersion()
qs := &quantizeState{}
// Build up the quantize state so newType can adjust types
layerCount := 0
for k, l := range orig.Tensors().GroupLayers() {
if strings.HasPrefix(k, "blk.") {
layerCount++
}
for _, tensor := range l {
if strings.Contains(tensor.Name, "attn_v.weight") ||
strings.Contains(tensor.Name, "attn_qkv.weight") ||
strings.Contains(tensor.Name, "attn_kv_b.weight") {
qs.nAttnV++
} else if tensor.Name == "output.weight" {
qs.hasOutput = true
}
}
}
qs.nFfnDown = layerCount
origTensors := orig.Tensors().Items()
outputTensors := make([]*fsggml.Tensor, len(origTensors))
for i, tensor := range origTensors {
tensor := tensor
newType := newType(tensor, kv, qs, newFileType)
newTensor := &fsggml.Tensor{
Name: tensor.Name,
Shape: tensor.Shape,
Kind: uint32(newType),
}
outputTensors[i] = newTensor
outputTensors[i].WriterTo = quantizer{
File: in,
offset: orig.Tensors().Offset + tensor.Offset,
from: tensor,
to: newTensor,
progressFn: progressFn,
}
}
return fsggml.WriteGGUF(out, kv, outputTensors)
}
func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.FileType) fsggml.TensorType {
defaultType := ftype.ToTensorType()
name := t.Name
quantize := strings.HasSuffix(name, "weight")
// don't quantize vision stuff
quantize = quantize && (!strings.Contains(name, "v.") || strings.Contains(name, "_v."))
quantize = quantize && !strings.Contains(name, "mm.")
// quantize only 2D and 3D tensors (experts)
quantize = quantize && (len(t.Shape) >= 2)
// do not quantize norm tensors
quantize = quantize && !strings.Contains(name, "_norm.weight")
// do not quantize expert gating tensors
quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight")
// do not quantize positional embeddings and token types (BERT)
quantize = quantize && (name != "position_embd.weight")
quantize = quantize && (name != "token_types.weight")
// do not quantize Mamba's small yet 2D weights
// NOTE: can't use LLM_TN here because the layer number is not known
quantize = quantize && !strings.Contains(name, "ssm_conv1d.weight")
// do not quantize RWKV's time_mix_first tensors
quantize = quantize && !strings.Contains(name, "time_mix_first.weight")
quantize = quantize && !strings.Contains(name, "time_mix_w1.weight")
quantize = quantize && !strings.Contains(name, "time_mix_w2.weight")
quantize = quantize && !strings.Contains(name, "time_mix_decay_w1.weight")
quantize = quantize && !strings.Contains(name, "time_mix_decay_w2.weight")
quantize = quantize && !strings.Contains(name, "time_mix_lerp_fused.weight")
// do not quantize relative position bias (T5)
quantize = quantize && !strings.Contains(name, "attn_rel_b.weight")
newType := fsggml.TensorType(t.Kind)
if quantize {
// get more optimal quantization type based on the tensor shape, layer, etc.
newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype)
if newType != defaultType {
slog.Debug("tensor quantization adjusted for better quality", "name", t.Name, "requested", defaultType, "quantization", newType)
}
}
return newType
}
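
For intuition, a sketch of how the selection logic above plays out (the layer counts and shapes are assumed, not taken from a real model): at Q4_K_M, an early-layer attn_v.weight is bumped from the default Q4_K to Q6_K by useMoreBits, while a 1-D norm weight fails the len(t.Shape) >= 2 check and keeps its original type.

// newTypeExample is a hypothetical walk-through of newType in this package.
func newTypeExample() {
	qs := &quantizeState{nAttnV: 32, nFfnDown: 32}
	kv := fsggml.KV{} // no expert_count key, so nExperts resolves to 1

	for _, t := range []*fsggml.Tensor{
		{Name: "blk.0.attn_v.weight", Kind: uint32(fsggml.TensorTypeF16), Shape: []uint64{4096, 4096}},
		{Name: "blk.0.attn_norm.weight", Kind: uint32(fsggml.TensorTypeF32), Shape: []uint64{4096}},
	} {
		nt := newType(t, kv, qs, fsggml.FileTypeQ4_K_M)
		// Expected: attn_v -> Q6_K (useMoreBits(0, 32) is true),
		// attn_norm -> F32 (1-D tensors are never quantized).
		fmt.Println(t.Name, "->", nt)
	}
}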
@@ -24,7 +24,7 @@ import (
var stream bool = false
func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) {
t.Helper()
t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir()))
......
@@ -99,7 +99,7 @@ func TestGenerateChat(t *testing.T) {
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []ggml.Tensor{
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -158,7 +158,7 @@ func TestGenerateChat(t *testing.T) {
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "bert",
"bert.pooling_type": uint32(0),
}, []ggml.Tensor{})
}, []*ggml.Tensor{})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",
Files: map[string]string{"bert.gguf": digest},
@@ -643,7 +643,7 @@ func TestGenerate(t *testing.T) {
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []ggml.Tensor{
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -698,7 +698,7 @@ func TestGenerate(t *testing.T) {
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "bert",
"bert.pooling_type": uint32(0),
}, []ggml.Tensor{})
}, []*ggml.Tensor{})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",
......
@@ -126,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
"tokenizer.ggml.tokens": []string{" "},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []ggml.Tensor{
}, []*ggml.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
}))
......