package server

import (
	"fmt"
	"io"
	"log/slog"
	"maps"
	"os"
	"strings"
	"unsafe"

	fsggml "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml/backend/ggml"
)

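// quantizer re-encodes a single tensor as it is written out: it reads the
// original tensor bytes from the source file at offset and converts them to
// the type described by `to`, reporting progress through progressFn.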
type quantizer struct {
	*os.File
	offset     uint64
	from, to   *fsggml.Tensor
	progressFn func(n uint64)
}

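// WriteTo implements io.WriterTo for the output GGUF writer. When the source
// and destination kinds already match, the tensor bytes are streamed through
// unchanged; otherwise the data is decoded to float32 and re-quantized to the
// target type before being written.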
func (q quantizer) WriteTo(w io.Writer) (int64, error) {
	quantize := q.from.Kind != q.to.Kind
	sr := io.NewSectionReader(q, int64(q.offset), int64(q.from.Size()))
	if !quantize {
		n, err := io.Copy(w, sr)
		q.progressFn(q.from.Size())
		return n, err
	}
	data, err := io.ReadAll(sr)
	if err != nil {
		slog.Warn("file read error", "tensor", q.from.Name, "file", q.Name(), "error", err)
		return 0, fmt.Errorf("unable to read tensor %s from %s: %s", q.from.Name, q.Name(), err)
	}
	var f32s []float32
	newType := fsggml.TensorType(q.to.Kind)
	if fsggml.TensorType(q.from.Kind) == fsggml.TensorTypeF32 {
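		// F32 source data can be reinterpreted in place without copying.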
		f32s = unsafe.Slice((*float32)(unsafe.Pointer(&data[0])), q.from.Elements())
	} else {
		f32s = ggml.ConvertToF32(data, q.from.Kind, q.from.Elements())
	}
	data = ggml.Quantize(newType, f32s, q.from.Shape)
	n, err := w.Write(data)
	q.progressFn(q.from.Size())
	return int64(n), err
}

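// quantizeState carries per-model counters gathered before quantization so
// getTensorNewType can decide which tensors deserve a higher-precision type.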
type quantizeState struct {
	nAttnV    int  // Number of attn_*v* weight tensors
	nFfnDown  int  // Number of ffn_down tensors
	iAttnV    int  // Running counter of number of attn_v tensors that have been processed
	iFfnDown  int  // Running counter of number of ffn_down tensors that have been processed
	hasOutput bool // used to figure out if a model shares tok_embd with the output weight
}

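// useMoreBits reports whether layer iLayer should get extra precision: the
// first eighth of layers, the last eighth, and every third layer in between.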
func useMoreBits(iLayer, nLayers int) bool {
	return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}

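// getTensorNewType picks the quantization type for a single tensor. It starts
// from the type requested by the target file type, promotes tensors that are
// most sensitive to quantization error (output, attn_v, attn_k, ffn_down,
// attn_output, attn_qkv), and finally falls back to a compatible type when the
// tensor's first dimension is not a multiple of the chosen block size.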
func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType {
	// Ported from llama_tensor_get_type, removed unsupported quantization types
	nExperts := max(1, kv.Uint("expert_count", 0))
	if name == "output.weight" || name == "output_norm.weight" || (!qs.hasOutput && name == "token_embd.weight") {
		nx := shape[0]
		qk_k := newType.BlockSize()
		if nx%qk_k != 0 {
			newType = fsggml.TensorTypeQ8_0
		} else if newType != fsggml.TensorTypeQ8_0 {
			newType = fsggml.TensorTypeQ6_K
		}
	} else if strings.Contains(name, "attn_v.weight") {
		if (ftype == fsggml.FileTypeQ4_K_M) &&
			useMoreBits(qs.iAttnV, qs.nAttnV) {
			newType = fsggml.TensorTypeQ6_K
		} else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 {
			newType = fsggml.TensorTypeQ5_K
		}

		// TODO
		// if (qs.model.type == LLM_TYPE_70B) {
		// 	// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
		// 	// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
		// 	// nearly negligible increase in model size by quantizing this tensor with more bits:
		// 	if (newType == GGML_TYPE_Q3_K || newType == GGML_TYPE_Q4_K) newType = GGML_TYPE_Q5_K;
		// }

		if nExperts == 8 {
			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
			newType = fsggml.TensorTypeQ8_0
		}
		qs.iAttnV++
	} else if strings.Contains(name, "attn_k.weight") {
		if nExperts == 8 {
			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
			newType = fsggml.TensorTypeQ8_0
		}
	} else if strings.Contains(name, "ffn_down") {
		iLayer := qs.iFfnDown
		n_layer := qs.nFfnDown
		if ftype == fsggml.FileTypeQ4_K_M {
			if useMoreBits(iLayer, n_layer) {
				newType = fsggml.TensorTypeQ6_K
			}
		} else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
			newType = fsggml.TensorTypeQ5_K
		}
		qs.iFfnDown++
	} else if strings.Contains(name, "attn_output.weight") {
		if nExperts == 8 {
			if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
				newType = fsggml.TensorTypeQ5_K
			}
		}
	} else if strings.Contains(name, "attn_qkv.weight") {
		if ftype == fsggml.FileTypeQ4_K_M {
			newType = fsggml.TensorTypeQ5_K
		}
	}

	if newType.IsQuantized() {
		nx := shape[0]
		qk_k := newType.BlockSize()

		// Check if first dimension is divisible by block size
		if nx%qk_k != 0 {
			// Store the original type for logging
			originalType := newType

			// Select appropriate fallback based on original type
			switch newType {
			case fsggml.TensorTypeQ4_K:
				newType = fsggml.TensorTypeQ5_0
			case fsggml.TensorTypeQ5_K:
				newType = fsggml.TensorTypeQ5_1
			case fsggml.TensorTypeQ6_K:
				newType = fsggml.TensorTypeQ8_0
			}

			// Final check - if still incompatible, fall back to F16
			if nx%newType.BlockSize() != 0 {
				newType = fsggml.TensorTypeF16
			}

			slog.Warn(fmt.Sprintf("tensor cols %d are not divisible by %d, required for %s - using fallback quantization %s",
				nx, qk_k, originalType.String(), newType.String()))
		}
	}
	return newType
}

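// quantize rewrites the model described by orig from in to out, re-encoding
// eligible tensors to newFileType and copying everything else through
// unchanged. progressFn is called with each tensor's original size as it is
// processed. A rough call site (hypothetical paths, error handling elided,
// with orig decoded from the input file elsewhere) might look like:
//
//	fin, _ := os.Open("model-f16.gguf")
//	fout, _ := os.Create("model-q4_k_m.gguf")
//	err := quantize(fin, fout, orig, fsggml.FileTypeQ4_K_M, func(n uint64) {})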
func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error {
	kv := maps.Clone(orig.KV())
	kv["general.file_type"] = newFileType
	// kv["general.quantization_version"] = ggml.QuantizationVersion()
	qs := &quantizeState{}
	// Build up the quantize state so newType can adjust types
	layerCount := 0
	for k, l := range orig.Tensors().GroupLayers() {
		if strings.HasPrefix(k, "blk.") {
			layerCount++
		}
		for _, tensor := range l {
			if strings.Contains(tensor.Name, "attn_v.weight") ||
				strings.Contains(tensor.Name, "attn_qkv.weight") ||
				strings.Contains(tensor.Name, "attn_kv_b.weight") {
				qs.nAttnV++
			} else if tensor.Name == "output.weight" {
				qs.hasOutput = true
			}
		}
	}
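	// Assume one ffn_down tensor per transformer block.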
	qs.nFfnDown = layerCount

	origTensors := orig.Tensors().Items()
	outputTensors := make([]*fsggml.Tensor, len(origTensors))
	for i, tensor := range origTensors {
		tensor := tensor
		newType := newType(tensor, kv, qs, newFileType)
		newTensor := &fsggml.Tensor{
			Name:  tensor.Name,
			Shape: tensor.Shape,
			Kind:  uint32(newType),
		}
		outputTensors[i] = newTensor
		outputTensors[i].WriterTo = quantizer{
			File:       in,
			offset:     orig.Tensors().Offset + tensor.Offset,
			from:       tensor,
			to:         newTensor,
			progressFn: progressFn,
		}
	}
	return fsggml.WriteGGUF(out, kv, outputTensors)
}

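// newType decides whether a tensor should be quantized at all, skipping
// norms, embeddings, vision and multimodal projection weights, and other
// small or sensitive tensors, and delegates the exact type choice for the
// remaining tensors to getTensorNewType.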
func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.FileType) fsggml.TensorType {
	defaultType := ftype.ToTensorType()
	name := t.Name
	quantize := strings.HasSuffix(name, "weight")

	// don't quantize vision stuff
	quantize = quantize && (!strings.Contains(name, "v.") || strings.Contains(name, "_v."))
	quantize = quantize && !strings.Contains(name, "mm.")

	// quantize only 2D and 3D tensors (experts)
	quantize = quantize && (len(t.Shape) >= 2)

	// do not quantize norm tensors
	quantize = quantize && !strings.Contains(name, "_norm.weight")

	// do not quantize expert gating tensors
	quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight")

	// do not quantize positional embeddings and token types (BERT)
	quantize = quantize && (name != "position_embd.weight")
	quantize = quantize && (name != "token_types.weight")

	// do not quantize Mamba's small yet 2D weights
	// NOTE: can't use LLM_TN here because the layer number is not known
	quantize = quantize && !strings.Contains(name, "ssm_conv1d.weight")

	// do not quantize RWKV's time_mix_first tensors
	quantize = quantize && !strings.Contains(name, "time_mix_first.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_w1.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_w2.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_decay_w1.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_decay_w2.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_lerp_fused.weight")

	// do not quantize relative position bias (T5)
	quantize = quantize && !strings.Contains(name, "attn_rel_b.weight")

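	// do not quantize per-layer token embeddings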
	quantize = quantize && !strings.Contains(name, "per_layer_token_embd.weight")

	newType := fsggml.TensorType(t.Kind)
	if quantize {
		// get more optimal quantization type based on the tensor shape, layer, etc.
		newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype)
		if newType != defaultType {
			slog.Debug("tensor quantization adjusted for better quality", "name", t.Name, "requested", defaultType, "quantization", newType)
		}
	}
	return newType
}