ggml.go 20.3 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3

import (
Michael Yang's avatar
Michael Yang committed
4
	"cmp"
5
6
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
7
	"fmt"
8
	"io"
Michael Yang's avatar
Michael Yang committed
9
	"log/slog"
10
	"slices"
Michael Yang's avatar
Michael Yang committed
11
	"strings"
12

Michael Yang's avatar
Michael Yang committed
13
	"github.com/ollama/ollama/fs/util/bufioutil"
14
15
)

Michael Yang's avatar
Michael Yang committed
16
17
18
type GGML struct {
	container
	model
19
	Length int64
Michael Yang's avatar
Michael Yang committed
20
}
21

Michael Yang's avatar
Michael Yang committed
22
type model interface {
Michael Yang's avatar
Michael Yang committed
23
	KV() KV
Michael Yang's avatar
Michael Yang committed
24
	Tensors() Tensors
25
26
}

27
28
// KV holds model metadata, mapping each key to an untyped value.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
29
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
30
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
31
32
}

33
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
34
	return kv.String("general.type", "unknown")
35
36
}

Michael Yang's avatar
Michael Yang committed
37
func (kv KV) ParameterCount() uint64 {
38
39
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
Michael Yang's avatar
Michael Yang committed
40
41
}

42
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
43
	if t := kv.Uint("general.file_type"); t > 0 {
44
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
45
46
	}

47
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
48
49
50
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
51
52
53
54
55
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
56
57
}

58
59
60
61
62
func (kv KV) HeadCountMax() uint64 {
	// TODO(drifkin): using the max value can cause an overestimation. In the
	// future if array values become more popular, we can adapt the more invasive
	// <https://github.com/ollama/ollama/pull/10225>
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
63
64
}

65
66
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
67
68
}

69
70
71
72
73
74
75
76
77
78
// HeadCountKVMax returns "attention.head_count_kv". When the value is an
// array the largest element is returned; when it is absent the result is 1.
func (kv KV) HeadCountKVMax() uint64 {
	largest := kv.UintOrMaxArrayValue("attention.head_count_kv", 1)
	return uint64(largest)
}

// HeadCountKVMin returns "attention.head_count_kv". When the value is an
// array the smallest element is returned; when it is absent the result is 1.
func (kv KV) HeadCountKVMin() uint64 {
	smallest := kv.UintOrMinArrayValue("attention.head_count_kv", 1)
	return uint64(smallest)
}

func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
79
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
80
81
82
83
84
85
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
86
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
87
88
89
}

func (kv KV) EmbeddingHeadCountV() uint64 {
90
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
91
92
93
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
94
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
95
96
}

Michael Yang's avatar
Michael Yang committed
97
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
98
99
100
101
	return kv.String("tokenizer.chat_template")
}

func (kv KV) String(key string, defaultValue ...string) string {
102
103
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
Michael Yang's avatar
Michael Yang committed
104
105
106
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
107
108
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
109
110
111
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
112
113
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
114
115
}

116
// Bool returns the bool stored under key. When the key is missing or is not a
// bool, the first defaultValue is returned, or false if none was given.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, false)
	b, _ := keyValue(kv, key, fallbacks...)
	return b
}

// UintOrMaxArrayValue returns the value under key when it is a scalar, the
// largest element when it is an array, and defaultValue otherwise.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, largest := kv.UintOrArrayValue(key, defaultValue)
	return largest
}

// UintOrMinArrayValue returns the value under key when it is a scalar, the
// smallest element when it is an array, and defaultValue otherwise.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	smallest, _ := kv.UintOrArrayValue(key, defaultValue)
	return smallest
}

// UintOrArrayValue returns the (min, max) of the value stored under key.
//
// A scalar uint32 is returned as both min and max. A uint32 or int32 array
// yields its smallest and largest elements; negative int32 elements are
// logged and then converted to uint32 as-is. When the key is missing, has
// another type, or is an array with no collected values, defaultValue is
// returned for both results.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return u32, u32
	}

	// slices.Min and slices.Max panic on an empty slice. An array can be
	// present without values (for example when it was too large to be
	// collected during decoding), so empty arrays fall through to the default.
	if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok && u32s != nil && len(u32s.values) > 0 {
		return slices.Min(u32s.values), slices.Max(u32s.values)
	}

	if i32s, ok := keyValue(kv, key, &array[int32]{}); ok && i32s != nil && len(i32s.values) > 0 {
		lo, hi := slices.Min(i32s.values), slices.Max(i32s.values)
		if lo < 0 || hi < 0 {
			slog.Warn("array values are unexpectedly negative", "key", key, "min", lo, "max", hi)
		}
		return uint32(lo), uint32(hi)
	}

	return defaultValue, defaultValue
}

Michael Yang's avatar
Michael Yang committed
150
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
151
152
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
153
154
}

Michael Yang's avatar
Michael Yang committed
155
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
156
157
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
158
159
}

Michael Yang's avatar
Michael Yang committed
160
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
161
162
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
163
164
}

Patrick Devine's avatar
Patrick Devine committed
165
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
166
167
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
Patrick Devine's avatar
Patrick Devine committed
168
169
}

Michael Yang's avatar
Michael Yang committed
170
171
172
173
174
// Bools returns the values of the bool array stored under key. When the key
// is missing or has another type, the first defaultValue is returned, or nil
// if none was given.
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	fallback := append(defaultValue, []bool(nil))[0]
	arr, _ := keyValue(kv, key, &array[bool]{values: fallback})
	return arr.values
}

175
func (kv KV) OllamaEngineRequired() bool {
176
177
	return slices.Contains([]string{
		"gemma3",
178
		"gemma3n",
179
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
180
		"llama4",
181
		"mllama",
182
		"qwen25vl",
183
		"gptoss", "gpt-oss",
184
	}, kv.Architecture())
185
186
}

Michael Yang's avatar
Michael Yang committed
187
// valueTypes is the set of scalar types keyValue can extract from a KV.
type valueTypes interface {
	uint8 | int8 |
		uint16 | int16 |
		uint32 | int32 |
		uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
Michael Yang's avatar
Michael Yang committed
197
198
}

199
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
200
201
202
203
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

204
205
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
206
207
	}

208
	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
209
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
210
211
}

212
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
213
	items  []*Tensor
214
	Offset uint64
Michael Yang's avatar
Michael Yang committed
215
}
Michael Yang's avatar
Michael Yang committed
216

Michael Yang's avatar
Michael Yang committed
217
218
219
220
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
221

Michael Yang's avatar
Michael Yang committed
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
242
			}
Michael Yang's avatar
Michael Yang committed
243
		}
244

Michael Yang's avatar
Michael Yang committed
245
246
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
247
248
		}

Michael Yang's avatar
Michael Yang committed
249
250
251
252
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
253
254
255
256
}

// Layer is a group of tensors keyed by their name within the group.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
257
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
258
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
259
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
260
261
262
263
264
	}

	return size
}

265
// Tensor describes a single tensor of a model.
type Tensor struct {
	// Name is the tensor's dot-separated name.
	Name string `json:"name"`

	// Kind is the raw tensor type id; it is interpreted as a TensorType.
	Kind uint32 `json:"kind"`

	// Offset is excluded from JSON output.
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

276
277
278
279
280
281
282
283
// block returns the block index parsed from a name of the form "blk.<n>.…",
// or -1 when the name does not match that form.
func (t Tensor) block() (n int) {
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &n)
	if err != nil {
		return -1
	}

	return n
}

284
func (t Tensor) blockSize() uint64 {
Michael Yang's avatar
Michael Yang committed
285
	return TensorType(t.Kind).BlockSize()
286
287
288
289
}

func (t TensorType) BlockSize() uint64 {
	switch t {
Michael Yang's avatar
Michael Yang committed
290
291
292
293
294
295
296
297
298
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
299
		return 1
Michael Yang's avatar
Michael Yang committed
300
301
302
	case
		2,  // Q4_0
		3,  // Q4_1
Michael Yang's avatar
Michael Yang committed
303
		4,  // MXFP4
Michael Yang's avatar
Michael Yang committed
304
305
306
307
308
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
309
		return 32
Michael Yang's avatar
Michael Yang committed
310
	default:
311
312
313
314
315
		return 256
	}
}

// typeSize returns the type size of the tensor's type.
func (t Tensor) typeSize() uint64 {
	kind := TensorType(t.Kind)
	return kind.TypeSize()
}

func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()
321

322
323
	switch t {
	case TensorTypeF32:
324
		return 4
325
	case TensorTypeF16:
326
		return 2
327
	case TensorTypeQ4_0:
328
		return 2 + blockSize/2
329
	case TensorTypeQ4_1:
330
		return 2 + 2 + blockSize/2
331
	case TensorTypeMXFP4, 39:
Michael Yang's avatar
Michael Yang committed
332
		return 1 + blockSize/2
333
	case TensorTypeQ5_0:
334
		return 2 + 4 + blockSize/2
335
	case TensorTypeQ5_1:
336
		return 2 + 2 + 4 + blockSize/2
337
	case TensorTypeQ8_0:
338
		return 2 + blockSize
339
	case TensorTypeQ8_1:
Michael Yang's avatar
Michael Yang committed
340
		return 2 + 2 + blockSize
341
	case TensorTypeQ2_K:
342
		return blockSize/16 + blockSize/4 + 2 + 2
343
	case TensorTypeQ3_K:
344
		return blockSize/8 + blockSize/4 + 12 + 2
345
	case TensorTypeQ4_K:
346
		return 2 + 2 + 12 + blockSize/2
347
	case TensorTypeQ5_K:
348
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
349
	case TensorTypeQ6_K:
350
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
351
	case TensorTypeQ8_K:
Michael Yang's avatar
Michael Yang committed
352
		return 4 + blockSize + 2*blockSize/16
353
	case tensorTypeIQ2_XXS:
354
		return 2 + 2*blockSize/8
355
	case tensorTypeIQ2_XS:
356
		return 2 + 2*blockSize/8 + blockSize/32
357
	case tensorTypeIQ3_XXS:
358
		return 2 + blockSize/4 + blockSize/8
359
	case tensorTypeIQ1_S:
360
		return 2 + blockSize/8 + blockSize/16
361
	case tensorTypeIQ4_NL:
362
		return 2 + blockSize/2
363
	case tensorTypeIQ3_S:
364
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
365
	case tensorTypeIQ2_S:
366
		return 2 + blockSize/4 + blockSize/16
367
	case tensorTypeIQ4_XS:
368
		return 2 + 2 + blockSize/2 + blockSize/64
369
	case TensorTypeI8:
370
		return 1
371
	case TensorTypeI16:
372
		return 2
373
	case TensorTypeI32:
374
		return 4
375
	case TensorTypeI64:
376
		return 8
377
	case TensorTypeF64:
378
		return 8
379
	case tensorTypeIQ1_M:
380
		return blockSize/8 + blockSize/16 + blockSize/32
381
	case TensorTypeBF16:
Michael Yang's avatar
Michael Yang committed
382
		return 2
383
384
385
386
387
	default:
		return 0
	}
}

388
// Elements returns the product of all dimensions in Shape. A tensor with an
// empty shape has 1 element.
func (t Tensor) Elements() uint64 {
	total := uint64(1)
	for i := range t.Shape {
		total *= t.Shape[i]
	}

	return total
}

Michael Yang's avatar
Michael Yang committed
396
// Size returns the tensor's storage size: its element count multiplied by the
// type size and divided by the block size of its type.
func (t Tensor) Size() uint64 {
	perBlock, blockElems := t.typeSize(), t.blockSize()
	return t.Elements() * perBlock / blockElems
}

400
// Type returns the string form of the tensor's type.
func (t Tensor) Type() string {
	kind := TensorType(t.Kind)
	return kind.String()
}

404
405
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
406
	Decode(io.ReadSeeker) (model, error)
407
408
409
}

// File magic numbers, as obtained by reading the first four bytes of a file
// as a little-endian uint32.
const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c

	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66

	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74

	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61

	// Magic constants for `gguf` files (versioned, gguf). The LE value is
	// seen for little-endian files and the BE value for big-endian files.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
423
424
// ErrUnsupportedFormat indicates a model file in a format that is not
// supported.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
425
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
426
427
428
429
430
431
432
433
434
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
435
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
436
437
438
439
440
441
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
442
// Decode decodes a GGML model from the given reader.
443
444
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
445
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
446
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
447
448
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

449
	var magic uint32
Michael Yang's avatar
Michael Yang committed
450
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
451
		return nil, err
452
453
454
	}

	var c container
455
456
	switch magic {
	case FILE_MAGIC_GGUF_LE:
457
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
458
	case FILE_MAGIC_GGUF_BE:
459
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
460
	default:
461
		return nil, errors.New("invalid file magic")
462
463
	}

Michael Yang's avatar
Michael Yang committed
464
	model, err := c.Decode(rs)
465
	if err != nil {
466
		return nil, err
467
468
	}

Michael Yang's avatar
Michael Yang committed
469
470
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
471
		return nil, err
Michael Yang's avatar
Michael Yang committed
472
473
	}

474
	// final model type
475
476
477
	return &GGML{
		container: c,
		model:     model,
478
479
		Length:    offset,
	}, nil
480
}
Michael Yang's avatar
Michael Yang committed
481

482
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
Jesse Gross's avatar
Jesse Gross committed
483
484
	context *= uint64(numParallel)

Michael Yang's avatar
Michael Yang committed
485
	embedding := f.KV().EmbeddingLength()
486
487
	heads := f.KV().HeadCountMax()
	headsKV := f.KV().HeadCountKVMax()
Michael Yang's avatar
Michael Yang committed
488
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
489

490
	embeddingHeads := f.KV().EmbeddingHeadCountMax()
Michael Yang's avatar
Michael Yang committed
491
492
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
493

Michael Yang's avatar
Michael Yang committed
494
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
495

496
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
Michael Yang's avatar
Michael Yang committed
497
	var kvTotal uint64
498
499
500
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
Michael Yang's avatar
Michael Yang committed
501
		kvTotal += kv[i]
502
	}
Michael Yang's avatar
Michael Yang committed
503

Michael Yang's avatar
Michael Yang committed
504
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
505
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
506
507
508
509
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
510
511
512

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
513
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
514
515
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
516

Michael Yang's avatar
Michael Yang committed
517
518
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
519
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
520
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
521
522
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
523
524
525
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
526
527
528
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
529
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
530
531
532
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
533
534
535
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
536
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
537
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
538
			if slices.Contains(crossAttentionLayers, int32(i)) {
539
540
541
542
543
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
544
545
		}

Michael Yang's avatar
Michael Yang committed
546
547
548
549
550
551
552
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
553
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
554
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
555
				ropeFreqsCount = ropeFreqsWeights.Elements()
Michael Yang's avatar
Michael Yang committed
556
557
558
559
560
561
562
563
564
565
566
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
567
	case "gemma", "gemma2", "gemma3", "gemma3n":
Michael Yang's avatar
Michael Yang committed
568
569
570
571
572
573
574
575
576
577
578
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
579

580
581
582
583
584
		if f.KV().Architecture() == "gemma3n" {
			fullOffload *= 4
			partialOffload *= 4
		}

585
586
587
588
589
590
591
592
593
594
595
596
597
		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
598
599
600
601
602
603
604
605
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
606
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
623

Michael Yang's avatar
Michael Yang committed
624
625
626
627
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
628
629
630
631
632
633
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
634
635
636
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
637
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
638
639
640
641
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
642
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
643
		)
Michael Yang's avatar
Michael Yang committed
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
670
	case "gptoss", "gpt-oss":
Michael Yang's avatar
Michael Yang committed
671
672
673
674
675
676
677
678
679
		kv = make([]uint64, f.KV().BlockCount())
		for i := range kv {
			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
			if i%2 == 0 {
				kv[i] *= (uint64(numParallel)*4096 + batch)
			} else {
				kv[i] *= context
			}
		}
680
		partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
Michael Yang's avatar
Michael Yang committed
681
682
	}

Michael Yang's avatar
Michael Yang committed
683
	return
Michael Yang's avatar
Michael Yang committed
684
}
685

686
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
687
688
689
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
690

Michael Yang's avatar
Michael Yang committed
691
	for name, layer := range llm.Tensors().GroupLayers() {
692
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
693
694
			for _, tensor := range layer {
				weights += tensor.Size()
695
696
			}
		}
Michael Yang's avatar
Michael Yang committed
697
	}
698

Michael Yang's avatar
Michael Yang committed
699
700
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
701
702
703
704
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
705

Michael Yang's avatar
Michael Yang committed
706
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
707

Michael Yang's avatar
Michael Yang committed
708
709
710
711
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
712

Michael Yang's avatar
Michael Yang committed
713
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
714
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
715

Michael Yang's avatar
Michael Yang committed
716
717
	switch llm.KV().Architecture() {
	case "mllama":
718
719
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
720
721
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

722
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
723
			imageSize*imageSize*numChannels*maxNumTiles +
724
725
726
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
727
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
728
729
730
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
731
732
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
733
734
735

		numPatches := maxPixels / (patchSize * patchSize)

736
737
738
		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
739
740
741
			// Patches storage (numPatches * channels * patchSize^2)
			numPatches*numChannels*patchSize*patchSize +
			// Self-attention calculations
742
743
744
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
Michael Yang's avatar
memory  
Michael Yang committed
745
746
747
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
748
	}
Michael Yang's avatar
Michael Yang committed
749

750
751
752
	return weights, graphSize
}

753
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
754
755
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
756
757
758
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
759
760
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
761
762
763
764
765
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
766
767
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
768
769
770
771
772
773
774
775
776
777
778
779
780
781
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		return 1 // 1/2 of fp16
	}

	if cacheType == "q4_0" {
		return 0.5 // 1/4 of fp16
	}

	// f16, and anything unrecognized, uses two bytes per element.
	return 2
}