ggml.go 16.2 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3
4
5

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
Michael Yang's avatar
Michael Yang committed
8
	"log/slog"
9
	"slices"
Michael Yang's avatar
Michael Yang committed
10
	"strings"
11

Michael Yang's avatar
Michael Yang committed
12
	"github.com/ollama/ollama/fs/util/bufioutil"
13
14
)

Michael Yang's avatar
Michael Yang committed
15
16
17
18
// GGML is a decoded model file. It embeds the container that decoded the
// file and the resulting model, so the methods of both (Name, Decode, KV,
// Tensors) are promoted onto GGML.
type GGML struct {
	container
	model
}
19

Michael Yang's avatar
Michael Yang committed
20
type model interface {
Michael Yang's avatar
Michael Yang committed
21
	KV() KV
Michael Yang's avatar
Michael Yang committed
22
	Tensors() Tensors
23
24
}

25
26
// KV is a model's metadata: string keys mapped to dynamically typed values.
// The methods on KV are typed accessors over this map.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
27
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
28
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
29
30
}

31
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
32
	return kv.String("general.type", "unknown")
33
34
}

Michael Yang's avatar
Michael Yang committed
35
func (kv KV) ParameterCount() uint64 {
Michael Yang's avatar
Michael Yang committed
36
	return keyValue[uint64](kv, "general.parameter_count")
Michael Yang's avatar
Michael Yang committed
37
38
}

Michael Yang's avatar
Michael Yang committed
39
func (kv KV) FileType() fileType {
Michael Yang's avatar
Michael Yang committed
40
41
	if t := kv.Uint("general.file_type"); t > 0 {
		return fileType(t)
Michael Yang's avatar
Michael Yang committed
42
43
	}

Michael Yang's avatar
Michael Yang committed
44
	return fileTypeUnknown
Michael Yang's avatar
Michael Yang committed
45
46
47
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
48
49
50
51
52
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
53
54
55
}

func (kv KV) HeadCount() uint64 {
Michael Yang's avatar
Michael Yang committed
56
	return uint64(kv.Uint("attention.head_count"))
Michael Yang's avatar
Michael Yang committed
57
58
59
}

func (kv KV) HeadCountKV() uint64 {
Michael Yang's avatar
Michael Yang committed
60
	return uint64(kv.Uint("attention.head_count_kv", 1))
Michael Yang's avatar
Michael Yang committed
61
62
}

Michael Yang's avatar
Michael Yang committed
63
64
func (kv KV) EmbeddingHeadCount() uint64 {
	if heads := kv.HeadCount(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
65
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
66
67
68
69
70
71
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
Michael Yang's avatar
Michael Yang committed
72
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
Michael Yang's avatar
Michael Yang committed
73
74
75
}

func (kv KV) EmbeddingHeadCountV() uint64 {
Michael Yang's avatar
Michael Yang committed
76
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
Michael Yang's avatar
Michael Yang committed
77
78
}

Michael Yang's avatar
Michael Yang committed
79
func (kv KV) GQA() uint64 {
Michael Yang's avatar
Michael Yang committed
80
	return kv.HeadCount() / kv.HeadCountKV()
Michael Yang's avatar
Michael Yang committed
81
82
83
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
84
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
85
86
}

Michael Yang's avatar
Michael Yang committed
87
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
	return kv.String("tokenizer.chat_template")
}

// String looks up key as a string. The first defaultValue, if any, is
// returned when the key is absent; otherwise the fallback is "".
func (kv KV) String(key string, defaultValue ...string) string {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, "")
	return keyValue(kv, key, fallbacks...)
}

// Uint looks up key as a uint32. The first defaultValue, if any, is returned
// when the key is absent; otherwise the fallback is 0.
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, 0)
	return keyValue(kv, key, fallbacks...)
}

// Float looks up key as a float32. The first defaultValue, if any, is
// returned when the key is absent; otherwise the fallback is 0.
func (kv KV) Float(key string, defaultValue ...float32) float32 {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, 0)
	return keyValue(kv, key, fallbacks...)
}

103
104
105
106
// Bool looks up key as a bool. The first defaultValue, if any, is returned
// when the key is absent; otherwise the fallback is false.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, false)
	return keyValue(kv, key, fallbacks...)
}

Michael Yang's avatar
Michael Yang committed
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
// Strings returns the string array stored under key.
//
// When the key is absent the first defaultValue is returned if one was
// supplied, otherwise an empty slice. Only the elements actually held in the
// array are returned: an array may report a size larger than the number of
// values it carries (Decode only collects values up to its maxArraySize), and
// indexing by the reported size would run past the end of the values.
//
// A stored element that is not a string fails the type assertion.
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
	// A nil sentinel distinguishes "key absent" from "array present but empty".
	r := keyValue(kv, key, (*array)(nil))
	if r == nil {
		if len(defaultValue) > 0 {
			return defaultValue[0]
		}
		return []string{}
	}

	s := make([]string, 0, len(r.values))
	for _, v := range r.values {
		s = append(s, v.(string))
	}

	return s
}

func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
	r := keyValue(kv, key, &array{})
	s := make([]uint32, r.size)
	for i := range r.size {
		s[i] = uint32(r.values[i].(int32))
	}

Michael Yang's avatar
Michael Yang committed
124
125
126
	return s
}

Patrick Devine's avatar
Patrick Devine committed
127
128
129
130
131
132
133
134
135
// Floats returns the float32 array stored under key.
//
// When the key is absent the first defaultValue is returned if one was
// supplied, otherwise an empty slice. Only the elements actually held in the
// array are returned: an array may report a size larger than the number of
// values it carries (Decode only collects values up to its maxArraySize), and
// indexing by the reported size would run past the end of the values.
//
// A stored element that is not a float32 fails the type assertion.
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
	// A nil sentinel distinguishes "key absent" from "array present but empty".
	r := keyValue(kv, key, (*array)(nil))
	if r == nil {
		if len(defaultValue) > 0 {
			return defaultValue[0]
		}
		return []float32{}
	}

	s := make([]float32, 0, len(r.values))
	for _, v := range r.values {
		s = append(s, v.(float32))
	}

	return s
}

136
137
138
139
// OllamaEngineRequired reports whether the architecture is "gemma3".
func (kv KV) OllamaEngineRequired() bool {
	arch := kv.Architecture()
	return arch == "gemma3"
}

140
func keyValue[T string | uint32 | uint64 | float32 | *array | bool](kv KV, key string, defaultValue ...T) T {
Michael Yang's avatar
Michael Yang committed
141
142
143
144
145
146
147
148
149
150
151
152
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

	if val, ok := kv[key]; ok {
		return val.(T)
	}

	slog.Warn("key not found", "key", key, "default", defaultValue[0])
	return defaultValue[0]
}

153
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
154
	items  []*Tensor
155
	Offset uint64
Michael Yang's avatar
Michael Yang committed
156
}
Michael Yang's avatar
Michael Yang committed
157

Michael Yang's avatar
Michael Yang committed
158
159
160
161
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
162

Michael Yang's avatar
Michael Yang committed
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
183
			}
Michael Yang's avatar
Michael Yang committed
184
		}
185

Michael Yang's avatar
Michael Yang committed
186
187
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
188
189
		}

Michael Yang's avatar
Michael Yang committed
190
191
192
193
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
194
195
196
197
}

// Layer maps a tensor's name within a layer (the part of the full tensor name
// after the layer prefix, as produced by GroupLayers) to its descriptor.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
198
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
199
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
200
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
201
202
203
204
205
	}

	return size
}

206
// Tensor describes a single tensor of a model.
type Tensor struct {
	// Name is the full dotted tensor name, e.g. "blk.0.attn_q.weight".
	Name string `json:"name"`
	// Kind is the numeric element/quantization type code; blockSize and
	// typeSize list the codes this package recognizes.
	Kind uint32 `json:"kind"`
	// Offset is excluded from JSON output.
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	// The embedded writer is excluded from JSON output.
	io.WriterTo `json:"-"`
}

217
218
219
220
221
222
223
224
// block returns the block index parsed from a name of the form "blk.<n>.…",
// or -1 when the name does not match that form.
func (t Tensor) block() (n int) {
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &n)
	if err != nil {
		return -1
	}

	return n
}

225
func (t Tensor) blockSize() uint64 {
226
	switch t.Kind {
Michael Yang's avatar
Michael Yang committed
227
228
229
230
231
232
233
234
235
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
236
		return 1
Michael Yang's avatar
Michael Yang committed
237
238
239
240
241
242
243
244
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
245
		return 32
Michael Yang's avatar
Michael Yang committed
246
	default:
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
		return 256
	}
}

func (t Tensor) typeSize() uint64 {
	blockSize := t.blockSize()

	switch t.Kind {
	case 0: // FP32
		return 4
	case 1: // FP16
		return 2
	case 2: // Q4_0
		return 2 + blockSize/2
	case 3: // Q4_1
		return 2 + 2 + blockSize/2
	case 6: // Q5_0
		return 2 + 4 + blockSize/2
	case 7: // Q5_1
		return 2 + 2 + 4 + blockSize/2
	case 8: // Q8_0
		return 2 + blockSize
	case 9: // Q8_1
Michael Yang's avatar
Michael Yang committed
270
		return 2 + 2 + blockSize
271
272
273
274
275
276
277
278
279
280
281
	case 10: // Q2_K
		return blockSize/16 + blockSize/4 + 2 + 2
	case 11: // Q3_K
		return blockSize/8 + blockSize/4 + 12 + 2
	case 12: // Q4_K
		return 2 + 2 + 12 + blockSize/2
	case 13: // Q5_K
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case 14: // Q6_K
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case 15: // Q8_K
Michael Yang's avatar
Michael Yang committed
282
		return 4 + blockSize + 2*blockSize/16
283
284
285
286
287
	case 16: // IQ2_XXS
		return 2 + 2*blockSize/8
	case 17: // IQ2_XS
		return 2 + 2*blockSize/8 + blockSize/32
	case 18: // IQ3_XXS
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
		return 2 + blockSize/4 + blockSize/8
	case 19: // IQ1_S
		return 2 + blockSize/8 + blockSize/16
	case 20: // IQ4_NL
		return 2 + blockSize/2
	case 21: // IQ3_S
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case 22: // IQ2_S
		return 2 + blockSize/4 + blockSize/16
	case 23: // IQ4_XS
		return 2 + 2 + blockSize/2 + blockSize/64
	case 24: // I8
		return 1
	case 25: // I16
		return 2
	case 26: // I32
		return 4
	case 27: // I64
		return 8
	case 28: // F64
		return 8
	case 29: // IQ1_M
		return blockSize/8 + blockSize/16 + blockSize/32
Michael Yang's avatar
Michael Yang committed
311
312
	case 30: // BF16
		return 2
313
314
315
316
317
318
319
320
321
322
323
324
325
	default:
		return 0
	}
}

// parameters returns the product of every dimension in Shape. An empty Shape
// yields 1.
func (t Tensor) parameters() uint64 {
	total := uint64(1)
	for i := range t.Shape {
		total *= t.Shape[i]
	}

	return total
}

Michael Yang's avatar
Michael Yang committed
326
// Size returns the tensor's size in bytes: the element count multiplied by
// the bytes per block, divided by the elements per block.
func (t Tensor) Size() uint64 {
	elements, bytesPerBlock, elementsPerBlock := t.parameters(), t.typeSize(), t.blockSize()
	return elements * bytesPerBlock / elementsPerBlock
}

330
331
332
333
// Type returns the string form of the tensor's Kind, obtained by converting
// it to fileType.
//
// NOTE(review): Kind is compared against tensor type codes elsewhere in this
// file (see blockSize); confirm that fileType uses the same numbering.
func (t Tensor) Type() string {
	kind := fileType(t.Kind)
	return kind.String()
}

334
335
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
336
	Decode(io.ReadSeeker) (model, error)
337
338
339
}

// File magic numbers. Each value is the four ASCII characters of the format
// name packed into a uint32.
const (
	// FILE_MAGIC_GGML is the magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// FILE_MAGIC_GGMF is the magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// FILE_MAGIC_GGJT is the magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// FILE_MAGIC_GGLA is the magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// FILE_MAGIC_GGUF_LE is the magic constant for `gguf` files as it reads
	// when a little-endian file is decoded little-endian.
	FILE_MAGIC_GGUF_LE = 0x46554747
	// FILE_MAGIC_GGUF_BE is the byte-swapped `gguf` magic; Decode treats it as
	// a big-endian file.
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
353
354
// ErrUnsupportedFormat is a sentinel error for model files in a format that
// is not supported; compare against it with errors.Is.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
355
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
356
357
358
359
360
361
362
363
364
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
365
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
366
367
368
369
370
371
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
372
// Decode decodes a GGML model from the given reader.
373
374
375
376
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
Michael Yang's avatar
Michael Yang committed
377
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
378
379
380
381
382
383
	if maxArraySize == 0 {
		maxArraySize = 1024
	}

	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

384
	var magic uint32
Michael Yang's avatar
Michael Yang committed
385
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
Michael Yang's avatar
Michael Yang committed
386
		return nil, 0, err
387
388
389
	}

	var c container
390
391
	switch magic {
	case FILE_MAGIC_GGUF_LE:
392
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
393
	case FILE_MAGIC_GGUF_BE:
394
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
395
	default:
Michael Yang's avatar
Michael Yang committed
396
		return nil, 0, errors.New("invalid file magic")
397
398
	}

Michael Yang's avatar
Michael Yang committed
399
	model, err := c.Decode(rs)
400
	if err != nil {
Michael Yang's avatar
Michael Yang committed
401
		return nil, 0, err
402
403
	}

Michael Yang's avatar
Michael Yang committed
404
405
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
Michael Yang's avatar
Michael Yang committed
406
		return nil, 0, err
Michael Yang's avatar
Michael Yang committed
407
408
	}

409
	// final model type
410
411
412
	return &GGML{
		container: c,
		model:     model,
Michael Yang's avatar
Michael Yang committed
413
	}, offset, nil
414
}
Michael Yang's avatar
Michael Yang committed
415

Michael Yang's avatar
Michael Yang committed
416
417
418
419
420
func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
	embedding := f.KV().EmbeddingLength()
	heads := f.KV().HeadCount()
	headsKV := f.KV().HeadCountKV()
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array).size)
Michael Yang's avatar
Michael Yang committed
421

Michael Yang's avatar
Michael Yang committed
422
423
424
	embeddingHeads := f.KV().EmbeddingHeadCount()
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
425

Michael Yang's avatar
Michael Yang committed
426
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
427

428
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
Michael Yang's avatar
Michael Yang committed
429
	kv = uint64(float64(context*f.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
Michael Yang's avatar
Michael Yang committed
430

Michael Yang's avatar
Michael Yang committed
431
	switch f.KV().Architecture() {
Michael Yang's avatar
Michael Yang committed
432
	case "llama":
Michael Yang's avatar
Michael Yang committed
433
434
435
436
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
437
438
439

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
440
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
441
442
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
443

Michael Yang's avatar
Michael Yang committed
444
445
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
Michael Yang committed
446
			ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
Michael Yang's avatar
Michael Yang committed
447
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
448
449
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
450
451
452
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
453
454
455
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
456
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
457
458
459
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
460
461
462
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
463
		if crossAttentionLayers, ok := f.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
Michael Yang's avatar
Michael Yang committed
464
465
466
			kv = headsKV *
				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
				(2* // sizeof(float16)
Michael Yang's avatar
Michael Yang committed
467
					(f.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
Michael Yang's avatar
Michael Yang committed
468
469
470
471
472
473
474
					context +
					4* // sizeof(float32)
						uint64(crossAttentionLayers.size)* // num cross attention layers
						visionTokens*
						tiles)
		}

Michael Yang's avatar
Michael Yang committed
475
476
477
478
479
480
481
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
482
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
483
484
485
486
487
488
489
490
491
492
493
494
495
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
				ropeFreqsCount = ropeFreqsWeights.parameters()
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Patrick Devine's avatar
Patrick Devine committed
496
	case "gemma", "gemma2", "gemma3":
Michael Yang's avatar
Michael Yang committed
497
498
499
500
501
502
503
504
505
506
507
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
Michael Yang's avatar
Michael Yang committed
508
509
510
511
512
513
514
515
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
516
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
533

Michael Yang's avatar
Michael Yang committed
534
535
536
537
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
538
539
540
541
542
543
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
544
545
546
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
547
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
548
549
550
551
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
552
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
553
		)
Michael Yang's avatar
Michael Yang committed
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
580
581
	}

Michael Yang's avatar
Michael Yang committed
582
	return
Michael Yang's avatar
Michael Yang committed
583
}
584

585
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
586
587
588
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
589

Michael Yang's avatar
Michael Yang committed
590
	for name, layer := range llm.Tensors().GroupLayers() {
591
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
592
593
			for _, tensor := range layer {
				weights += tensor.Size()
594
595
			}
		}
Michael Yang's avatar
Michael Yang committed
596
	}
597

Michael Yang's avatar
Michael Yang committed
598
599
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
600
601
602
603
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
604

Michael Yang's avatar
Michael Yang committed
605
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
606

Michael Yang's avatar
Michael Yang committed
607
608
609
610
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
611

Michael Yang's avatar
Michael Yang committed
612
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
613
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
614

Michael Yang's avatar
Michael Yang committed
615
616
	switch llm.KV().Architecture() {
	case "mllama":
617
618
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
619
620
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

621
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
622
			imageSize*imageSize*numChannels*maxNumTiles +
623
624
625
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
Michael Yang's avatar
Michael Yang committed
626
	case "gemma3":
Michael Yang's avatar
Michael Yang committed
627
628
629
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
630
	}
Michael Yang's avatar
Michael Yang committed
631

632
633
634
	return weights, graphSize
}

635
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
636
637
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
638
639
640
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
641
642
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
643
644
645
646
647
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
648
649
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
650
651
652
653
654
655
656
657
658
659
660
661
662
663
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement returns the number of bytes per element for the
// given KV cache type: 1 for "q8_0", 0.5 for "q4_0", and 2 (f16) for anything
// else.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		return 1 // 1/2 of fp16
	}
	if cacheType == "q4_0" {
		return 0.5 // 1/4 of fp16
	}

	return 2 // f16 (default)
}