ggml.go 17.2 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3
4
5

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
Michael Yang's avatar
Michael Yang committed
8
	"log/slog"
9
	"slices"
Michael Yang's avatar
Michael Yang committed
10
	"strings"
11

Michael Yang's avatar
Michael Yang committed
12
	"github.com/ollama/ollama/fs/util/bufioutil"
13
14
)

Michael Yang's avatar
Michael Yang committed
15
16
17
18
// GGML pairs a decoded model with the container it was read from. Both are
// embedded, so the methods of container (Name, Decode) and of model (KV,
// Tensors) are promoted onto GGML.
type GGML struct {
	container
	model
}
19

Michael Yang's avatar
Michael Yang committed
20
type model interface {
Michael Yang's avatar
Michael Yang committed
21
	KV() KV
Michael Yang's avatar
Michael Yang committed
22
	Tensors() Tensors
23
24
}

25
26
// KV holds metadata as key/value pairs. Values are stored as any and are read
// back through the typed accessor methods defined on KV.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
27
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
28
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
29
30
}

31
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
32
	return kv.String("general.type", "unknown")
33
34
}

Michael Yang's avatar
Michael Yang committed
35
func (kv KV) ParameterCount() uint64 {
Michael Yang's avatar
Michael Yang committed
36
	return keyValue(kv, "general.parameter_count", uint64(0))
Michael Yang's avatar
Michael Yang committed
37
38
}

Michael Yang's avatar
Michael Yang committed
39
func (kv KV) FileType() fileType {
Michael Yang's avatar
Michael Yang committed
40
41
	if t := kv.Uint("general.file_type"); t > 0 {
		return fileType(t)
Michael Yang's avatar
Michael Yang committed
42
43
	}

Michael Yang's avatar
Michael Yang committed
44
	return fileTypeUnknown
Michael Yang's avatar
Michael Yang committed
45
46
47
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
48
49
50
51
52
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
53
54
55
}

func (kv KV) HeadCount() uint64 {
Michael Yang's avatar
Michael Yang committed
56
	return uint64(kv.Uint("attention.head_count"))
Michael Yang's avatar
Michael Yang committed
57
58
59
}

func (kv KV) HeadCountKV() uint64 {
Michael Yang's avatar
Michael Yang committed
60
	return uint64(kv.Uint("attention.head_count_kv", 1))
Michael Yang's avatar
Michael Yang committed
61
62
}

Michael Yang's avatar
Michael Yang committed
63
64
func (kv KV) EmbeddingHeadCount() uint64 {
	if heads := kv.HeadCount(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
65
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
66
67
68
69
70
71
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
Michael Yang's avatar
Michael Yang committed
72
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
Michael Yang's avatar
Michael Yang committed
73
74
75
}

func (kv KV) EmbeddingHeadCountV() uint64 {
Michael Yang's avatar
Michael Yang committed
76
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
Michael Yang's avatar
Michael Yang committed
77
78
}

Michael Yang's avatar
Michael Yang committed
79
func (kv KV) GQA() uint64 {
Michael Yang's avatar
Michael Yang committed
80
	return kv.HeadCount() / kv.HeadCountKV()
Michael Yang's avatar
Michael Yang committed
81
82
83
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
84
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
85
86
}

Michael Yang's avatar
Michael Yang committed
87
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
	return kv.String("tokenizer.chat_template")
}

// String looks up key through keyValue and returns it as a string. The first
// defaultValue, if given, is used when the key is missing; otherwise "".
func (kv KV) String(key string, defaultValue ...string) string {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, "")
	return keyValue(kv, key, fallbacks...)
}

// Uint looks up key through keyValue and returns it as a uint32. The first
// defaultValue, if given, is used when the key is missing; otherwise 0.
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, 0)
	return keyValue(kv, key, fallbacks...)
}

// Float looks up key through keyValue and returns it as a float32. The first
// defaultValue, if given, is used when the key is missing; otherwise 0.
func (kv KV) Float(key string, defaultValue ...float32) float32 {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, 0)
	return keyValue(kv, key, fallbacks...)
}

103
104
105
106
// Bool looks up key through keyValue and returns it as a bool. The first
// defaultValue, if given, is used when the key is missing; otherwise false.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, false)
	return keyValue(kv, key, fallbacks...)
}

Michael Yang's avatar
Michael Yang committed
107
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
Michael Yang's avatar
Michael Yang committed
108
	return keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}).values
Michael Yang's avatar
Michael Yang committed
109
110
}

Michael Yang's avatar
Michael Yang committed
111
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
Michael Yang's avatar
Michael Yang committed
112
	return keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}).values
Michael Yang's avatar
Michael Yang committed
113
114
}

Michael Yang's avatar
Michael Yang committed
115
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
Michael Yang's avatar
Michael Yang committed
116
	return keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}).values
Michael Yang's avatar
Michael Yang committed
117
118
}

Patrick Devine's avatar
Patrick Devine committed
119
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
Michael Yang's avatar
Michael Yang committed
120
	return keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}).values
Patrick Devine's avatar
Patrick Devine committed
121
122
}

123
func (kv KV) OllamaEngineRequired() bool {
124
125
126
	return slices.Contains([]string{
		"gemma3",
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
127
		"llama4",
128
	}, kv.Architecture())
129
130
}

Michael Yang's avatar
Michael Yang committed
131
// valueTypes is the set of scalar types keyValue can return.
type valueTypes interface {
	int8 | int16 | int32 | int64 |
		uint8 | uint16 | uint32 | uint64 |
		float32 | float64 |
		string | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
Michael Yang's avatar
Michael Yang committed
141
142
}

Michael Yang's avatar
Michael Yang committed
143
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) T {
Michael Yang's avatar
Michael Yang committed
144
145
146
147
148
149
150
151
152
153
154
155
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

	if val, ok := kv[key]; ok {
		return val.(T)
	}

	slog.Warn("key not found", "key", key, "default", defaultValue[0])
	return defaultValue[0]
}

156
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
157
	items  []*Tensor
158
	Offset uint64
Michael Yang's avatar
Michael Yang committed
159
}
Michael Yang's avatar
Michael Yang committed
160

Michael Yang's avatar
Michael Yang committed
161
162
163
164
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
165

Michael Yang's avatar
Michael Yang committed
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
186
			}
Michael Yang's avatar
Michael Yang committed
187
		}
188

Michael Yang's avatar
Michael Yang committed
189
190
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
191
192
		}

Michael Yang's avatar
Michael Yang committed
193
194
195
196
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
197
198
199
200
}

// Layer maps a tensor's name within a layer (the part of the full name after
// the layer prefix, as produced by GroupLayers) to the tensor.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
201
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
202
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
203
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
204
205
206
207
208
	}

	return size
}

209
// Tensor describes a single tensor.
type Tensor struct {
	// Name is the tensor's full dotted name, e.g. "blk.0.attn_q.weight".
	Name string `json:"name"`

	// Kind is the numeric element type code used by blockSize and typeSize.
	Kind uint32 `json:"kind"`

	// Offset is excluded from JSON output.
	// NOTE(review): presumably the tensor's data offset — confirm against the
	// decoder that populates it.
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	// WriterTo is embedded so a Tensor can write itself out; it is excluded
	// from JSON output.
	io.WriterTo `json:"-"`
}

220
221
222
223
224
225
226
227
// block parses the block number from a name of the form "blk.<n>.…". It
// returns -1 when the name does not match that form.
func (t Tensor) block() (n int) {
	var index int
	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &index); err != nil {
		return -1
	}

	return index
}

228
func (t Tensor) blockSize() uint64 {
229
	switch t.Kind {
Michael Yang's avatar
Michael Yang committed
230
231
232
233
234
235
236
237
238
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
239
		return 1
Michael Yang's avatar
Michael Yang committed
240
241
242
243
244
245
246
247
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
248
		return 32
Michael Yang's avatar
Michael Yang committed
249
	default:
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
		return 256
	}
}

func (t Tensor) typeSize() uint64 {
	blockSize := t.blockSize()

	switch t.Kind {
	case 0: // FP32
		return 4
	case 1: // FP16
		return 2
	case 2: // Q4_0
		return 2 + blockSize/2
	case 3: // Q4_1
		return 2 + 2 + blockSize/2
	case 6: // Q5_0
		return 2 + 4 + blockSize/2
	case 7: // Q5_1
		return 2 + 2 + 4 + blockSize/2
	case 8: // Q8_0
		return 2 + blockSize
	case 9: // Q8_1
Michael Yang's avatar
Michael Yang committed
273
		return 2 + 2 + blockSize
274
275
276
277
278
279
280
281
282
283
284
	case 10: // Q2_K
		return blockSize/16 + blockSize/4 + 2 + 2
	case 11: // Q3_K
		return blockSize/8 + blockSize/4 + 12 + 2
	case 12: // Q4_K
		return 2 + 2 + 12 + blockSize/2
	case 13: // Q5_K
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case 14: // Q6_K
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case 15: // Q8_K
Michael Yang's avatar
Michael Yang committed
285
		return 4 + blockSize + 2*blockSize/16
286
287
288
289
290
	case 16: // IQ2_XXS
		return 2 + 2*blockSize/8
	case 17: // IQ2_XS
		return 2 + 2*blockSize/8 + blockSize/32
	case 18: // IQ3_XXS
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
		return 2 + blockSize/4 + blockSize/8
	case 19: // IQ1_S
		return 2 + blockSize/8 + blockSize/16
	case 20: // IQ4_NL
		return 2 + blockSize/2
	case 21: // IQ3_S
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case 22: // IQ2_S
		return 2 + blockSize/4 + blockSize/16
	case 23: // IQ4_XS
		return 2 + 2 + blockSize/2 + blockSize/64
	case 24: // I8
		return 1
	case 25: // I16
		return 2
	case 26: // I32
		return 4
	case 27: // I64
		return 8
	case 28: // F64
		return 8
	case 29: // IQ1_M
		return blockSize/8 + blockSize/16 + blockSize/32
Michael Yang's avatar
Michael Yang committed
314
315
	case 30: // BF16
		return 2
316
317
318
319
320
321
322
323
324
325
326
327
328
	default:
		return 0
	}
}

// parameters returns the product of all dimensions in Shape. An empty Shape
// yields 1.
func (t Tensor) parameters() uint64 {
	total := uint64(1)
	for i := 0; i < len(t.Shape); i++ {
		total *= t.Shape[i]
	}

	return total
}

Michael Yang's avatar
Michael Yang committed
329
// Size returns the tensor's size in bytes: the element count times the bytes
// per block, divided by the elements per block.
func (t Tensor) Size() uint64 {
	elements := t.parameters()
	bytesPerBlock := t.typeSize()
	elementsPerBlock := t.blockSize()

	return elements * bytesPerBlock / elementsPerBlock
}

333
334
335
336
// Type returns the string form of the tensor's Kind interpreted as a fileType.
func (t Tensor) Type() string {
	kind := fileType(t.Kind)
	return kind.String()
}

337
338
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
339
	Decode(io.ReadSeeker) (model, error)
340
341
342
}

// File magic numbers, compared against the first four bytes of a file read as
// a little-endian uint32.
const (
	// FILE_MAGIC_GGML identifies `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c

	// FILE_MAGIC_GGMF identifies `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66

	// FILE_MAGIC_GGJT identifies `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74

	// FILE_MAGIC_GGLA identifies `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676c61

	// FILE_MAGIC_GGUF_LE and FILE_MAGIC_GGUF_BE identify `gguf` files
	// (versioned, gguf). Decode selects little-endian or big-endian decoding
	// depending on which of the two matches.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
356
357
// ErrUnsupportedFormat is a sentinel error reporting an unsupported model
// format.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
358
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
359
360
361
362
363
364
365
366
367
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
368
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
369
370
371
372
373
374
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
375
// Decode decodes a GGML model from the given reader.
376
377
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
378
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
Michael Yang's avatar
Michael Yang committed
379
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
380
381
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

382
	var magic uint32
Michael Yang's avatar
Michael Yang committed
383
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
Michael Yang's avatar
Michael Yang committed
384
		return nil, 0, err
385
386
387
	}

	var c container
388
389
	switch magic {
	case FILE_MAGIC_GGUF_LE:
390
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
391
	case FILE_MAGIC_GGUF_BE:
392
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
393
	default:
Michael Yang's avatar
Michael Yang committed
394
		return nil, 0, errors.New("invalid file magic")
395
396
	}

Michael Yang's avatar
Michael Yang committed
397
	model, err := c.Decode(rs)
398
	if err != nil {
Michael Yang's avatar
Michael Yang committed
399
		return nil, 0, err
400
401
	}

Michael Yang's avatar
Michael Yang committed
402
403
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
Michael Yang's avatar
Michael Yang committed
404
		return nil, 0, err
Michael Yang's avatar
Michael Yang committed
405
406
	}

407
	// final model type
408
409
410
	return &GGML{
		container: c,
		model:     model,
Michael Yang's avatar
Michael Yang committed
411
	}, offset, nil
412
}
Michael Yang's avatar
Michael Yang committed
413

414
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
Michael Yang's avatar
Michael Yang committed
415
416
417
	embedding := f.KV().EmbeddingLength()
	heads := f.KV().HeadCount()
	headsKV := f.KV().HeadCountKV()
Michael Yang's avatar
Michael Yang committed
418
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
419

Michael Yang's avatar
Michael Yang committed
420
421
422
	embeddingHeads := f.KV().EmbeddingHeadCount()
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
423

Michael Yang's avatar
Michael Yang committed
424
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
425

426
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
427
428
429
430
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	}
Michael Yang's avatar
Michael Yang committed
431

Michael Yang's avatar
Michael Yang committed
432
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
433
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
434
435
436
437
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
438
439
440

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
441
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
442
443
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
444

Michael Yang's avatar
Michael Yang committed
445
446
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
447
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
448
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
449
450
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
451
452
453
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
454
455
456
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
457
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
458
459
460
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
461
462
463
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
464
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
465
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
466
			if slices.Contains(crossAttentionLayers, int32(i)) {
467
468
469
470
471
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
472
473
		}

Michael Yang's avatar
Michael Yang committed
474
475
476
477
478
479
480
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
481
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
482
483
484
485
486
487
488
489
490
491
492
493
494
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
				ropeFreqsCount = ropeFreqsWeights.parameters()
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Patrick Devine's avatar
Patrick Devine committed
495
	case "gemma", "gemma2", "gemma3":
Michael Yang's avatar
Michael Yang committed
496
497
498
499
500
501
502
503
504
505
506
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
507
508
509
510
511
512
513
514
515
516
517
518
519
520

		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
521
522
523
524
525
526
527
528
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
529
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
546

Michael Yang's avatar
Michael Yang committed
547
548
549
550
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
551
552
553
554
555
556
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
557
558
559
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
560
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
561
562
563
564
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
565
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
566
		)
Michael Yang's avatar
Michael Yang committed
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
593
594
	}

Michael Yang's avatar
Michael Yang committed
595
	return
Michael Yang's avatar
Michael Yang committed
596
}
597

598
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
599
600
601
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
602

Michael Yang's avatar
Michael Yang committed
603
	for name, layer := range llm.Tensors().GroupLayers() {
604
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
605
606
			for _, tensor := range layer {
				weights += tensor.Size()
607
608
			}
		}
Michael Yang's avatar
Michael Yang committed
609
	}
610

Michael Yang's avatar
Michael Yang committed
611
612
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
613
614
615
616
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
617

Michael Yang's avatar
Michael Yang committed
618
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
619

Michael Yang's avatar
Michael Yang committed
620
621
622
623
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
624

Michael Yang's avatar
Michael Yang committed
625
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
626
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
627

Michael Yang's avatar
Michael Yang committed
628
629
	switch llm.KV().Architecture() {
	case "mllama":
630
631
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
632
633
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

634
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
635
			imageSize*imageSize*numChannels*maxNumTiles +
636
637
638
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
639
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
640
641
642
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
Michael Yang's avatar
memory  
Michael Yang committed
643
644
645
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
646
	}
Michael Yang's avatar
Michael Yang committed
647

648
649
650
	return weights, graphSize
}

651
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
652
653
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
654
655
656
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
657
658
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
659
660
661
662
663
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
664
665
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
666
667
668
669
670
671
672
673
674
675
676
677
678
679
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement returns the number of bytes per element for a given
// KV cache type. Any type other than "q8_0" and "q4_0" is sized as f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		return 1 // 1/2 of fp16
	}
	if cacheType == "q4_0" {
		return 0.5 // 1/4 of fp16
	}

	return 2 // f16 (default)
}