package ggml

import (
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"slices"
	"strings"

	"github.com/ollama/ollama/fs/util/bufioutil"
)

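// GGML is a decoded model file: the container it was read from combined with
// the model's key-value metadata and tensors.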
type GGML struct {
	container
	model
}

type model interface {
	KV() KV
	Tensors() Tensors
}

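// KV holds a model's key-value metadata. Keys that do not start with
// "general." or "tokenizer." are resolved relative to the model architecture
// (see keyValue).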
type KV map[string]any

func (kv KV) Architecture() string {
	return kv.String("general.architecture", "unknown")
}

func (kv KV) Kind() string {
	return kv.String("general.type", "unknown")
}

func (kv KV) ParameterCount() uint64 {
	return keyValue(kv, "general.parameter_count", uint64(0))
}

func (kv KV) FileType() FileType {
	if t := kv.Uint("general.file_type"); t > 0 {
		return FileType(t)
	}

	return FileTypeUnknown
}

func (kv KV) BlockCount() uint64 {
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
}

func (kv KV) HeadCount() uint64 {
	return uint64(kv.Uint("attention.head_count"))
}

func (kv KV) HeadCountKV() uint64 {
	return uint64(kv.Uint("attention.head_count_kv", 1))
}

func (kv KV) EmbeddingHeadCount() uint64 {
	if heads := kv.HeadCount(); heads > 0 {
		return kv.EmbeddingLength() / heads
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
}

func (kv KV) EmbeddingHeadCountV() uint64 {
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
}

func (kv KV) GQA() uint64 {
	return kv.HeadCount() / kv.HeadCountKV()
}

func (kv KV) ContextLength() uint64 {
	return uint64(kv.Uint("context_length"))
}

func (kv KV) ChatTemplate() string {
	return kv.String("tokenizer.chat_template")
}

func (kv KV) String(key string, defaultValue ...string) string {
	return keyValue(kv, key, append(defaultValue, "")...)
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
	return keyValue(kv, key, append(defaultValue, 0)...)
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
	return keyValue(kv, key, append(defaultValue, 0)...)
}

func (kv KV) Bool(key string, defaultValue ...bool) bool {
	return keyValue(kv, key, append(defaultValue, false)...)
}

func (kv KV) Strings(key string, defaultValue ...[]string) []string {
	return keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}).values
}

func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
	return keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}).values
}

func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
	return keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}).values
}

func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
	return keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}).values
}

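// OllamaEngineRequired reports whether the model's architecture is only
// supported by Ollama's own engine.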
func (kv KV) OllamaEngineRequired() bool {
	return slices.Contains([]string{
		"gemma3",
		"mistral3",
		"llama4",
		"mllama",
		"qwen25vl",
	}, kv.Architecture())
}

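// valueTypes is the set of scalar types that can be read from KV.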
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

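// arrayValueTypes is the set of array wrappers corresponding to valueTypes.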
type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
}

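// keyValue looks up key in kv, prefixing it with the architecture unless the
// key already starts with "general." or "tokenizer.", and returns the first
// default when the key is missing. Illustrative example (hypothetical values):
//
//	kv := KV{"general.architecture": "llama", "llama.block_count": uint32(32)}
//	kv.Uint("block_count")    // 32, resolved as "llama.block_count"
//	kv.Uint("missing_key", 7) // 7, key not present so the default is returned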
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) T {
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

	if val, ok := kv[key]; ok {
		return val.(T)
	}

	slog.Debug("key not found", "key", key, "default", defaultValue[0])
	return defaultValue[0]
}

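// Tensors holds a model's tensors along with Offset, the position of the
// tensor data within the file.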
type Tensors struct {
	items  []*Tensor
	Offset uint64
}

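// Items returns all tensors, or only those whose names start with the given
// prefix.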
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}

	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

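// GroupLayers groups tensors by layer, e.g. "blk.0.attn_q.weight" is stored as
// layers["blk.0"]["attn_q.weight"]. Tensors outside a "blk" or "mm" group are
// keyed by the first component of their name.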
func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
			}
		}

		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
		}

		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
}

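// Layer maps tensor names within a layer to their tensors.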
type Layer map[string]*Tensor

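// Size returns the total size of the layer's tensors in bytes.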
func (l Layer) Size() (size uint64) {
	for _, t := range l {
		size += t.Size()
	}

	return size
}

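// Tensor describes a single tensor: its name, data type (Kind), offset within
// the tensor data, and shape.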
type Tensor struct {
	Name   string `json:"name"`
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

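// block returns the layer number parsed from a "blk.N." tensor name, or -1 if
// the name does not belong to a block.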
func (t Tensor) block() (n int) {
	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
		return -1
	}

	return
}

func (t Tensor) blockSize() uint64 {
	return (TensorType)(t.Kind).BlockSize()
}

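// BlockSize returns the number of elements per quantization block for the
// tensor type; unquantized types have a block size of 1.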
func (t TensorType) BlockSize() uint64 {
	switch t {
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
		return 1
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
		return 32
	default:
		return 256
	}
}

func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

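// TypeSize returns the size in bytes of a single block of the tensor type.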
func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()

	switch t {
	case TensorTypeF32:
		return 4
	case TensorTypeF16:
		return 2
	case TensorTypeQ4_0:
		return 2 + blockSize/2
	case TensorTypeQ4_1:
		return 2 + 2 + blockSize/2
	case TensorTypeQ5_0:
		return 2 + 4 + blockSize/2
	case TensorTypeQ5_1:
		return 2 + 2 + 4 + blockSize/2
	case TensorTypeQ8_0:
		return 2 + blockSize
	case TensorTypeQ8_1:
		return 2 + 2 + blockSize
	case TensorTypeQ2_K:
		return blockSize/16 + blockSize/4 + 2 + 2
	case TensorTypeQ3_K:
		return blockSize/8 + blockSize/4 + 12 + 2
	case TensorTypeQ4_K:
		return 2 + 2 + 12 + blockSize/2
	case TensorTypeQ5_K:
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case TensorTypeQ6_K:
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case TensorTypeQ8_K:
		return 4 + blockSize + 2*blockSize/16
	case tensorTypeIQ2_XXS:
		return 2 + 2*blockSize/8
	case tensorTypeIQ2_XS:
		return 2 + 2*blockSize/8 + blockSize/32
	case tensorTypeIQ3_XXS:
		return 2 + blockSize/4 + blockSize/8
	case tensorTypeIQ1_S:
		return 2 + blockSize/8 + blockSize/16
	case tensorTypeIQ4_NL:
		return 2 + blockSize/2
	case tensorTypeIQ3_S:
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case tensorTypeIQ2_S:
		return 2 + blockSize/4 + blockSize/16
	case tensorTypeIQ4_XS:
		return 2 + 2 + blockSize/2 + blockSize/64
	case TensorTypeI8:
		return 1
	case TensorTypeI16:
		return 2
	case TensorTypeI32:
		return 4
	case TensorTypeI64:
		return 8
	case TensorTypeF64:
		return 8
	case tensorTypeIQ1_M:
		return blockSize/8 + blockSize/16 + blockSize/32
	case TensorTypeBF16:
		return 2
	default:
		return 0
	}
}

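// Elements returns the total number of elements in the tensor.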
func (t Tensor) Elements() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n
	}
	return count
}

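// Size returns the size of the tensor data in bytes.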
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

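// Type returns the name of the tensor's data type.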
func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

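// container is a model file format that can decode the model it contains.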
type container interface {
	Name() string
	Decode(io.ReadSeeker) (model, error)
}

const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constant for `gguf` files (versioned, gguf)
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

var ErrUnsupportedFormat = errors.New("unsupported model format")

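// DetectContentType returns the model format ("ggml", "ggmf", "ggjt", "ggla",
// or "gguf") indicated by the magic number at the start of b, or an empty
// string if the magic is not recognized. b must be at least 4 bytes long.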
func DetectContentType(b []byte) string {
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
		return "gguf"
	default:
		return ""
	}
}

// Decode decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
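//
// Illustrative usage (error handling elided; the file name is hypothetical):
//
//	f, _ := os.Open("model.gguf")
//	ggml, offset, _ := Decode(f, 1024)
//	fmt.Println(ggml.KV().Architecture(), offset)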
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

	var magic uint32
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
		return nil, 0, err
	}

	var c container
	switch magic {
	case FILE_MAGIC_GGUF_LE:
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
	case FILE_MAGIC_GGUF_BE:
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
	default:
		return nil, 0, errors.New("invalid file magic")
	}

	model, err := c.Decode(rs)
	if err != nil {
		return nil, 0, err
	}

	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
		return nil, 0, err
	}

	// final model type
	return &GGML{
		container: c,
		model:     model,
	}, offset, nil
}

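// GraphSize estimates, in bytes, the per-layer KV cache sizes (kv) and the
// compute graph sizes for partial and full GPU offload, given the context and
// batch sizes and the KV cache type.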
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
	embedding := f.KV().EmbeddingLength()
	heads := f.KV().HeadCount()
	headsKV := f.KV().HeadCountKV()
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)

	embeddingHeads := f.KV().EmbeddingHeadCount()
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()

	layers := f.Tensors().GroupLayers()

	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	}

	switch f.KV().Architecture() {
	case "llama", "llama4":
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)

		partialOffload = 4 * batch * embedding
		partialOffload += max(
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)

		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
			ff := uint64(f.KV().Uint("feed_forward_length"))
			partialOffload = max(
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
		for i := range kv {
			if slices.Contains(crossAttentionLayers, int32(i)) {
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
		}

		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
				ropeFreqsCount = ropeFreqsWeights.Elements()
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
	case "gemma", "gemma2", "gemma3":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)

		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
		)
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
	}

	return
}

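// VisionGraphSize estimates, in bytes, the weight size and compute graph size
// of the model's vision encoder. Both are zero if the model has none.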
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}

	for name, layer := range llm.Tensors().GroupLayers() {
		if name == "v" || strings.HasPrefix(name, "v.") {
			for _, tensor := range layer {
				weights += tensor.Size()
			}
		}
	}

	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}

	numChannels := uint64(llm.KV().Uint("vision.num_channels"))

	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}

	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))

	switch llm.KV().Architecture() {
	case "mllama":
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

		graphSize = 4 * (8 +
			imageSize*imageSize*numChannels*maxNumTiles +
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
	case "gemma3", "mistral3":
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))

		numPatches := maxPixels / (patchSize * patchSize)

		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
			// Patches storage (numPatches * channels * patchSize^2)
			numPatches*numChannels*patchSize*patchSize +
			// Self-attention calculations
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
	}

	return weights, graphSize
}

// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
}

// SupportsFlashAttention checks if the model supports flash attention
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
func kvCacheBytesPerElement(cacheType string) float64 {
	switch cacheType {
	case "q8_0":
		return 1 // 1/2 of fp16
	case "q4_0":
		return 0.5 // 1/4 of fp16
	default:
		return 2 // f16 (default)
	}
}