ggml.go 16.6 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3
4
5

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
Michael Yang's avatar
Michael Yang committed
8
	"log/slog"
9
	"slices"
Michael Yang's avatar
Michael Yang committed
10
	"strings"
11

Michael Yang's avatar
Michael Yang committed
12
	"github.com/ollama/ollama/fs/util/bufioutil"
13
14
)

Michael Yang's avatar
Michael Yang committed
15
16
17
18
// GGML pairs a container with the model decoded from it. Both are embedded,
// so their methods are promoted onto GGML.
type GGML struct {
	container
	model
}
19

Michael Yang's avatar
Michael Yang committed
20
type model interface {
Michael Yang's avatar
Michael Yang committed
21
	KV() KV
Michael Yang's avatar
Michael Yang committed
22
	Tensors() Tensors
23
24
}

25
26
// KV holds model metadata as key/value pairs. Values are stored untyped and
// are read back through the typed accessor methods on KV.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
27
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
28
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
29
30
}

31
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
32
	return kv.String("general.type", "unknown")
33
34
}

Michael Yang's avatar
Michael Yang committed
35
func (kv KV) ParameterCount() uint64 {
Michael Yang's avatar
Michael Yang committed
36
	return keyValue[uint64](kv, "general.parameter_count")
Michael Yang's avatar
Michael Yang committed
37
38
}

Michael Yang's avatar
Michael Yang committed
39
func (kv KV) FileType() fileType {
Michael Yang's avatar
Michael Yang committed
40
41
	if t := kv.Uint("general.file_type"); t > 0 {
		return fileType(t)
Michael Yang's avatar
Michael Yang committed
42
43
	}

Michael Yang's avatar
Michael Yang committed
44
	return fileTypeUnknown
Michael Yang's avatar
Michael Yang committed
45
46
47
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
48
49
50
51
52
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
53
54
55
}

func (kv KV) HeadCount() uint64 {
Michael Yang's avatar
Michael Yang committed
56
	return uint64(kv.Uint("attention.head_count"))
Michael Yang's avatar
Michael Yang committed
57
58
59
}

func (kv KV) HeadCountKV() uint64 {
Michael Yang's avatar
Michael Yang committed
60
	return uint64(kv.Uint("attention.head_count_kv", 1))
Michael Yang's avatar
Michael Yang committed
61
62
}

Michael Yang's avatar
Michael Yang committed
63
64
func (kv KV) EmbeddingHeadCount() uint64 {
	if heads := kv.HeadCount(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
65
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
66
67
68
69
70
71
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
Michael Yang's avatar
Michael Yang committed
72
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
Michael Yang's avatar
Michael Yang committed
73
74
75
}

func (kv KV) EmbeddingHeadCountV() uint64 {
Michael Yang's avatar
Michael Yang committed
76
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
Michael Yang's avatar
Michael Yang committed
77
78
}

Michael Yang's avatar
Michael Yang committed
79
func (kv KV) GQA() uint64 {
Michael Yang's avatar
Michael Yang committed
80
	return kv.HeadCount() / kv.HeadCountKV()
Michael Yang's avatar
Michael Yang committed
81
82
83
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
84
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
85
86
}

Michael Yang's avatar
Michael Yang committed
87
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
	return kv.String("tokenizer.chat_template")
}

// String looks up key as a string. The first defaultValue, if given, is
// returned when the key is absent; otherwise the fallback is "".
func (kv KV) String(key string, defaultValue ...string) string {
	candidates := append(defaultValue, "")
	return keyValue(kv, key, candidates...)
}

// Uint looks up key as a uint32. The first defaultValue, if given, is
// returned when the key is absent; otherwise the fallback is 0.
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
	candidates := append(defaultValue, 0)
	return keyValue(kv, key, candidates...)
}

// Float looks up key as a float32. The first defaultValue, if given, is
// returned when the key is absent; otherwise the fallback is 0.
func (kv KV) Float(key string, defaultValue ...float32) float32 {
	candidates := append(defaultValue, 0)
	return keyValue(kv, key, candidates...)
}

103
104
105
106
// Bool looks up key as a bool. The first defaultValue, if given, is returned
// when the key is absent; otherwise the fallback is false.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	candidates := append(defaultValue, false)
	return keyValue(kv, key, candidates...)
}

Michael Yang's avatar
Michael Yang committed
107
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
Michael Yang's avatar
Michael Yang committed
108
	return keyValue(kv, key, &array[string]{}).values
Michael Yang's avatar
Michael Yang committed
109
110
111
}

func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
Michael Yang's avatar
Michael Yang committed
112
	return keyValue(kv, key, &array[uint32]{}).values
Michael Yang's avatar
Michael Yang committed
113
114
}

Patrick Devine's avatar
Patrick Devine committed
115
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
Michael Yang's avatar
Michael Yang committed
116
	return keyValue(kv, key, &array[float32]{}).values
Patrick Devine's avatar
Patrick Devine committed
117
118
}

119
// OllamaEngineRequired reports whether the model's architecture is one of
// "gemma3" or "mistral3".
func (kv KV) OllamaEngineRequired() bool {
	return slices.Contains([]string{
		"gemma3",
		"mistral3",
	}, kv.Architecture())
}

Michael Yang's avatar
Michael Yang committed
126
127
128
129
130
131
// valueTypes constrains the value types that keyValue can return: the scalar
// types and pointers to typed arrays of them.
type valueTypes interface {
	string | uint32 | uint64 | float32 | bool |
		*array[string] | *array[uint32] | *array[uint64] | *array[float32] | *array[bool]
}

func keyValue[T valueTypes](kv KV, key string, defaultValue ...T) T {
Michael Yang's avatar
Michael Yang committed
132
133
134
135
136
137
138
139
140
141
142
143
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

	if val, ok := kv[key]; ok {
		return val.(T)
	}

	slog.Warn("key not found", "key", key, "default", defaultValue[0])
	return defaultValue[0]
}

144
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
145
	items  []*Tensor
146
	Offset uint64
Michael Yang's avatar
Michael Yang committed
147
}
Michael Yang's avatar
Michael Yang committed
148

Michael Yang's avatar
Michael Yang committed
149
150
151
152
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
153

Michael Yang's avatar
Michael Yang committed
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
174
			}
Michael Yang's avatar
Michael Yang committed
175
		}
176

Michael Yang's avatar
Michael Yang committed
177
178
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
179
180
		}

Michael Yang's avatar
Michael Yang committed
181
182
183
184
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
185
186
187
188
}

// Layer maps a tensor's name within a layer to the tensor itself.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
189
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
190
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
191
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
192
193
194
195
196
	}

	return size
}

197
// Tensor describes a single tensor of a model.
type Tensor struct {
	// Name is the tensor's dot-separated name, e.g. "blk.0.attn_q.weight".
	Name string `json:"name"`
	// Kind is the numeric element/quantization type code (0 is F32, 1 is F16, ...).
	Kind uint32 `json:"kind"`
	// Offset is omitted from JSON output.
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

208
209
210
211
212
213
214
215
// block extracts the block index from a tensor name of the form
// "blk.<index>.…". It returns -1 when the name does not match that form.
func (t Tensor) block() (n int) {
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &n)
	if err != nil {
		return -1
	}

	return n
}

216
func (t Tensor) blockSize() uint64 {
217
	switch t.Kind {
Michael Yang's avatar
Michael Yang committed
218
219
220
221
222
223
224
225
226
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
227
		return 1
Michael Yang's avatar
Michael Yang committed
228
229
230
231
232
233
234
235
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
236
		return 32
Michael Yang's avatar
Michael Yang committed
237
	default:
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
		return 256
	}
}

func (t Tensor) typeSize() uint64 {
	blockSize := t.blockSize()

	switch t.Kind {
	case 0: // FP32
		return 4
	case 1: // FP16
		return 2
	case 2: // Q4_0
		return 2 + blockSize/2
	case 3: // Q4_1
		return 2 + 2 + blockSize/2
	case 6: // Q5_0
		return 2 + 4 + blockSize/2
	case 7: // Q5_1
		return 2 + 2 + 4 + blockSize/2
	case 8: // Q8_0
		return 2 + blockSize
	case 9: // Q8_1
Michael Yang's avatar
Michael Yang committed
261
		return 2 + 2 + blockSize
262
263
264
265
266
267
268
269
270
271
272
	case 10: // Q2_K
		return blockSize/16 + blockSize/4 + 2 + 2
	case 11: // Q3_K
		return blockSize/8 + blockSize/4 + 12 + 2
	case 12: // Q4_K
		return 2 + 2 + 12 + blockSize/2
	case 13: // Q5_K
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case 14: // Q6_K
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case 15: // Q8_K
Michael Yang's avatar
Michael Yang committed
273
		return 4 + blockSize + 2*blockSize/16
274
275
276
277
278
	case 16: // IQ2_XXS
		return 2 + 2*blockSize/8
	case 17: // IQ2_XS
		return 2 + 2*blockSize/8 + blockSize/32
	case 18: // IQ3_XXS
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
		return 2 + blockSize/4 + blockSize/8
	case 19: // IQ1_S
		return 2 + blockSize/8 + blockSize/16
	case 20: // IQ4_NL
		return 2 + blockSize/2
	case 21: // IQ3_S
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case 22: // IQ2_S
		return 2 + blockSize/4 + blockSize/16
	case 23: // IQ4_XS
		return 2 + 2 + blockSize/2 + blockSize/64
	case 24: // I8
		return 1
	case 25: // I16
		return 2
	case 26: // I32
		return 4
	case 27: // I64
		return 8
	case 28: // F64
		return 8
	case 29: // IQ1_M
		return blockSize/8 + blockSize/16 + blockSize/32
Michael Yang's avatar
Michael Yang committed
302
303
	case 30: // BF16
		return 2
304
305
306
307
308
309
310
311
312
313
314
315
316
	default:
		return 0
	}
}

// parameters returns the product of all dimensions in Shape. An empty Shape
// yields 1.
func (t Tensor) parameters() uint64 {
	total := uint64(1)
	for i := range t.Shape {
		total *= t.Shape[i]
	}

	return total
}

Michael Yang's avatar
Michael Yang committed
317
// Size returns the tensor's size in bytes: the element count multiplied by
// the bytes per block, divided by the elements per block.
func (t Tensor) Size() uint64 {
	return t.parameters() * t.typeSize() / t.blockSize()
}

321
322
323
324
// Type returns the string form of the tensor's Kind, obtained by converting
// Kind to a fileType.
//
// NOTE(review): this reads a tensor kind through the fileType enumeration;
// confirm that the two share the same numbering for every kind.
func (t Tensor) Type() string {
	kind := fileType(t.Kind)
	return kind.String()
}

325
326
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
327
	Decode(io.ReadSeeker) (model, error)
328
329
330
}

// File magic numbers, as obtained by reading the first four bytes of a file
// as a little-endian uint32.
const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constants for `gguf` files (versioned, gguf). The LE value selects
	// little-endian decoding and the BE value big-endian decoding.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
344
345
// ErrUnsupportedFormat reports that a model file is in a format that is not
// supported.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
346
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
347
348
349
350
351
352
353
354
355
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
356
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
357
358
359
360
361
362
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
363
// Decode decodes a GGML model from the given reader.
364
365
366
367
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
Michael Yang's avatar
Michael Yang committed
368
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
369
370
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

371
	var magic uint32
Michael Yang's avatar
Michael Yang committed
372
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
Michael Yang's avatar
Michael Yang committed
373
		return nil, 0, err
374
375
376
	}

	var c container
377
378
	switch magic {
	case FILE_MAGIC_GGUF_LE:
379
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
380
	case FILE_MAGIC_GGUF_BE:
381
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
382
	default:
Michael Yang's avatar
Michael Yang committed
383
		return nil, 0, errors.New("invalid file magic")
384
385
	}

Michael Yang's avatar
Michael Yang committed
386
	model, err := c.Decode(rs)
387
	if err != nil {
Michael Yang's avatar
Michael Yang committed
388
		return nil, 0, err
389
390
	}

Michael Yang's avatar
Michael Yang committed
391
392
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
Michael Yang's avatar
Michael Yang committed
393
		return nil, 0, err
Michael Yang's avatar
Michael Yang committed
394
395
	}

396
	// final model type
397
398
399
	return &GGML{
		container: c,
		model:     model,
Michael Yang's avatar
Michael Yang committed
400
	}, offset, nil
401
}
Michael Yang's avatar
Michael Yang committed
402

403
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
Michael Yang's avatar
Michael Yang committed
404
405
406
	embedding := f.KV().EmbeddingLength()
	heads := f.KV().HeadCount()
	headsKV := f.KV().HeadCountKV()
Michael Yang's avatar
Michael Yang committed
407
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
408

Michael Yang's avatar
Michael Yang committed
409
410
411
	embeddingHeads := f.KV().EmbeddingHeadCount()
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
412

Michael Yang's avatar
Michael Yang committed
413
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
414

415
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
416
417
418
419
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	}
Michael Yang's avatar
Michael Yang committed
420

Michael Yang's avatar
Michael Yang committed
421
	switch f.KV().Architecture() {
Michael Yang's avatar
Michael Yang committed
422
	case "llama":
Michael Yang's avatar
Michael Yang committed
423
424
425
426
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
427
428
429

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
430
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
431
432
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
433

Michael Yang's avatar
Michael Yang committed
434
435
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
Michael Yang committed
436
			ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
Michael Yang's avatar
Michael Yang committed
437
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
438
439
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
440
441
442
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
443
444
445
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
446
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
447
448
449
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
450
451
452
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

453
454
455
456
457
458
459
460
		crossAttentionLayers := f.KV().Uints("attention.cross_attention_layers")
		for i := range kv {
			if slices.Contains(crossAttentionLayers, uint32(i)) {
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
461
462
		}

Michael Yang's avatar
Michael Yang committed
463
464
465
466
467
468
469
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
470
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
471
472
473
474
475
476
477
478
479
480
481
482
483
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
				ropeFreqsCount = ropeFreqsWeights.parameters()
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Patrick Devine's avatar
Patrick Devine committed
484
	case "gemma", "gemma2", "gemma3":
Michael Yang's avatar
Michael Yang committed
485
486
487
488
489
490
491
492
493
494
495
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
496
497
498
499
500
501
502
503
504
505
506
507
508
509

		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
510
511
512
513
514
515
516
517
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
518
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
535

Michael Yang's avatar
Michael Yang committed
536
537
538
539
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
540
541
542
543
544
545
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
546
547
548
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
549
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
550
551
552
553
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
554
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
555
		)
Michael Yang's avatar
Michael Yang committed
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
582
583
	}

Michael Yang's avatar
Michael Yang committed
584
	return
Michael Yang's avatar
Michael Yang committed
585
}
586

587
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
588
589
590
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
591

Michael Yang's avatar
Michael Yang committed
592
	for name, layer := range llm.Tensors().GroupLayers() {
593
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
594
595
			for _, tensor := range layer {
				weights += tensor.Size()
596
597
			}
		}
Michael Yang's avatar
Michael Yang committed
598
	}
599

Michael Yang's avatar
Michael Yang committed
600
601
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
602
603
604
605
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
606

Michael Yang's avatar
Michael Yang committed
607
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
608

Michael Yang's avatar
Michael Yang committed
609
610
611
612
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
613

Michael Yang's avatar
Michael Yang committed
614
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
615
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
616

Michael Yang's avatar
Michael Yang committed
617
618
	switch llm.KV().Architecture() {
	case "mllama":
619
620
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
621
622
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

623
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
624
			imageSize*imageSize*numChannels*maxNumTiles +
625
626
627
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
628
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
629
630
631
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
632
	}
Michael Yang's avatar
Michael Yang committed
633

634
635
636
	return weights, graphSize
}

637
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
638
639
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
640
641
642
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
643
644
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
645
646
647
648
649
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
650
651
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
652
653
654
655
656
657
658
659
660
661
662
663
664
665
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement maps a KV cache type to its size in bytes per
// element: 1 for "q8_0", 0.5 for "q4_0", and 2 (f16) for anything else.
func kvCacheBytesPerElement(cacheType string) float64 {
	const f16Bytes = 2.0

	if cacheType == "q8_0" {
		return f16Bytes / 2 // 1/2 of fp16
	}
	if cacheType == "q4_0" {
		return f16Bytes / 4 // 1/4 of fp16
	}

	return f16Bytes // f16 (default)
}