ggml.go 20.3 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3

import (
Michael Yang's avatar
Michael Yang committed
4
	"cmp"
5
6
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
7
	"fmt"
8
	"io"
Michael Yang's avatar
Michael Yang committed
9
	"log/slog"
10
	"slices"
Michael Yang's avatar
Michael Yang committed
11
	"strings"
12

Michael Yang's avatar
Michael Yang committed
13
	"github.com/ollama/ollama/fs/util/bufioutil"
14
15
)

Michael Yang's avatar
Michael Yang committed
16
17
18
type GGML struct {
	container
	model
19
	Length int64
Michael Yang's avatar
Michael Yang committed
20
}
21

Michael Yang's avatar
Michael Yang committed
22
type model interface {
Michael Yang's avatar
Michael Yang committed
23
	KV() KV
Michael Yang's avatar
Michael Yang committed
24
	Tensors() Tensors
25
26
}

27
28
// KV holds metadata values indexed by their string key. The typed accessor
// methods below look values up by key and fall back to a default when the key
// is missing or holds a value of a different type.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
29
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
30
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
31
32
}

33
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
34
	return kv.String("general.type", "unknown")
35
36
}

Michael Yang's avatar
Michael Yang committed
37
func (kv KV) ParameterCount() uint64 {
38
39
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
Michael Yang's avatar
Michael Yang committed
40
41
}

42
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
43
	if t := kv.Uint("general.file_type"); t > 0 {
44
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
45
46
	}

47
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
48
49
50
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
51
52
53
54
55
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
56
57
}

58
59
60
61
62
func (kv KV) HeadCountMax() uint64 {
	// TODO(drifkin): using the max value can cause an overestimation. In the
	// future if array values become more popular, we can adapt the more invasive
	// <https://github.com/ollama/ollama/pull/10225>
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
63
64
}

65
66
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
67
68
}

69
70
71
72
73
74
75
76
77
78
// HeadCountKVMax returns "attention.head_count_kv" when it is a scalar, the
// largest element when it is an array, and 1 when it is absent.
func (kv KV) HeadCountKVMax() uint64 {
	largest := kv.UintOrMaxArrayValue("attention.head_count_kv", 1)
	return uint64(largest)
}

// HeadCountKVMin returns "attention.head_count_kv" when it is a scalar, the
// smallest element when it is an array, and 1 when it is absent.
func (kv KV) HeadCountKVMin() uint64 {
	smallest := kv.UintOrMinArrayValue("attention.head_count_kv", 1)
	return uint64(smallest)
}

func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
79
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
80
81
82
83
84
85
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
86
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
87
88
89
}

func (kv KV) EmbeddingHeadCountV() uint64 {
90
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
91
92
93
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
94
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
95
96
}

Michael Yang's avatar
Michael Yang committed
97
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
98
99
100
101
	return kv.String("tokenizer.chat_template")
}

func (kv KV) String(key string, defaultValue ...string) string {
102
103
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
Michael Yang's avatar
Michael Yang committed
104
105
106
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
107
108
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
109
110
111
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
112
113
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
114
115
}

116
// Bool returns the bool stored under key. When the key is missing or holds
// another type it returns the first defaultValue, or false if none was given.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Appending false guarantees keyValue always receives at least one default.
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

// UintOrMaxArrayValue returns the upper of the two values reported by
// UintOrArrayValue for key.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, largest := kv.UintOrArrayValue(key, defaultValue)
	return largest
}

// UintOrMinArrayValue returns the lower of the two values reported by
// UintOrArrayValue for key.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	smallest, _ := kv.UintOrArrayValue(key, defaultValue)
	return smallest
}

// UintOrArrayValue returns the (min, max) pair for key.
//
//   - A scalar uint32 is returned as both min and max.
//   - A uint32 or int32 array yields its smallest and largest element. Negative
//     int32 elements are logged and then converted to uint32 as-is.
//   - When the key is missing, has another type, or is an array with no
//     collected elements, defaultValue is returned as both min and max.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return u32, u32
	}

	// slices.Min and slices.Max panic on an empty slice, so an array without
	// collected values is treated the same as a missing key.
	if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok && len(u32s.values) > 0 {
		return slices.Min(u32s.values), slices.Max(u32s.values)
	}

	if i32s, ok := keyValue(kv, key, &array[int32]{}); ok && len(i32s.values) > 0 {
		lo, hi := slices.Min(i32s.values), slices.Max(i32s.values)
		if lo < 0 || hi < 0 {
			slog.Warn("array values are unexpectedly negative", "key", key, "min", lo, "max", hi)
		}
		return uint32(lo), uint32(hi)
	}

	return defaultValue, defaultValue
}

Michael Yang's avatar
Michael Yang committed
150
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
151
152
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
153
154
}

Michael Yang's avatar
Michael Yang committed
155
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
156
157
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
158
159
}

Michael Yang's avatar
Michael Yang committed
160
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
161
162
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
163
164
}

Patrick Devine's avatar
Patrick Devine committed
165
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
166
167
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
Patrick Devine's avatar
Patrick Devine committed
168
169
}

Michael Yang's avatar
Michael Yang committed
170
171
172
173
174
// Bools returns the elements of the bool array stored under key. When the key
// is missing or holds another type it returns the first defaultValue, or nil
// if none was given.
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	// Appending a nil slice guarantees index 0 exists even without a default.
	fallback := append(defaultValue, []bool(nil))[0]
	found, _ := keyValue(kv, key, &array[bool]{values: fallback})
	return found.values
}

175
func (kv KV) OllamaEngineRequired() bool {
176
177
	return slices.Contains([]string{
		"gemma3",
178
		"gemma3n",
179
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
180
		"llama4",
181
		"mllama",
182
		"qwen25vl",
Michael Yang's avatar
Michael Yang committed
183
		"gptoss",
184
	}, kv.Architecture())
185
186
}

Michael Yang's avatar
Michael Yang committed
187
// valueTypes is the set of scalar types keyValue can look up.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
Michael Yang's avatar
Michael Yang committed
197
198
}

199
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
200
201
202
203
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

204
205
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
206
207
	}

208
	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
209
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
210
211
}

212
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
213
	items  []*Tensor
214
	Offset uint64
Michael Yang's avatar
Michael Yang committed
215
}
Michael Yang's avatar
Michael Yang committed
216

Michael Yang's avatar
Michael Yang committed
217
218
219
220
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
221

Michael Yang's avatar
Michael Yang committed
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
242
			}
Michael Yang's avatar
Michael Yang committed
243
		}
244

Michael Yang's avatar
Michael Yang committed
245
246
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
247
248
		}

Michael Yang's avatar
Michael Yang committed
249
250
251
252
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
253
254
255
256
}

// Layer maps a tensor name, with the layer prefix removed, to its tensor.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
257
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
258
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
259
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
260
261
262
263
264
	}

	return size
}

265
// Tensor describes a single tensor. Name, Kind and Shape are included in its
// JSON encoding; Offset and the embedded writer are not.
type Tensor struct {
	Name string `json:"name"`
	// Kind is the numeric tensor type; it is converted to TensorType when
	// sizes are computed.
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

276
277
278
279
280
281
282
283
// block returns the number parsed from a name of the form "blk.<n>.", or -1
// when the name does not match that form.
func (t Tensor) block() (n int) {
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &n)
	if err != nil {
		return -1
	}

	return n
}

284
func (t Tensor) blockSize() uint64 {
Michael Yang's avatar
Michael Yang committed
285
	return TensorType(t.Kind).BlockSize()
286
287
288
289
}

func (t TensorType) BlockSize() uint64 {
	switch t {
Michael Yang's avatar
Michael Yang committed
290
291
292
293
294
295
296
297
298
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
299
		return 1
Michael Yang's avatar
Michael Yang committed
300
301
302
	case
		2,  // Q4_0
		3,  // Q4_1
Michael Yang's avatar
Michael Yang committed
303
		4,  // MXFP4
Michael Yang's avatar
Michael Yang committed
304
305
306
307
308
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
309
		return 32
Michael Yang's avatar
Michael Yang committed
310
	default:
311
312
313
314
315
		return 256
	}
}

// typeSize returns the type size of the tensor's type.
func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()
321

322
323
	switch t {
	case TensorTypeF32:
324
		return 4
325
	case TensorTypeF16:
326
		return 2
327
	case TensorTypeQ4_0:
328
		return 2 + blockSize/2
329
	case TensorTypeQ4_1:
330
		return 2 + 2 + blockSize/2
Michael Yang's avatar
Michael Yang committed
331
332
	case TensorTypeMXFP4:
		return 1 + blockSize/2
333
	case TensorTypeQ5_0:
334
		return 2 + 4 + blockSize/2
335
	case TensorTypeQ5_1:
336
		return 2 + 2 + 4 + blockSize/2
337
	case TensorTypeQ8_0:
338
		return 2 + blockSize
339
	case TensorTypeQ8_1:
Michael Yang's avatar
Michael Yang committed
340
		return 2 + 2 + blockSize
341
	case TensorTypeQ2_K:
342
		return blockSize/16 + blockSize/4 + 2 + 2
343
	case TensorTypeQ3_K:
344
		return blockSize/8 + blockSize/4 + 12 + 2
345
	case TensorTypeQ4_K:
346
		return 2 + 2 + 12 + blockSize/2
347
	case TensorTypeQ5_K:
348
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
349
	case TensorTypeQ6_K:
350
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
351
	case TensorTypeQ8_K:
Michael Yang's avatar
Michael Yang committed
352
		return 4 + blockSize + 2*blockSize/16
353
	case tensorTypeIQ2_XXS:
354
		return 2 + 2*blockSize/8
355
	case tensorTypeIQ2_XS:
356
		return 2 + 2*blockSize/8 + blockSize/32
357
	case tensorTypeIQ3_XXS:
358
		return 2 + blockSize/4 + blockSize/8
359
	case tensorTypeIQ1_S:
360
		return 2 + blockSize/8 + blockSize/16
361
	case tensorTypeIQ4_NL:
362
		return 2 + blockSize/2
363
	case tensorTypeIQ3_S:
364
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
365
	case tensorTypeIQ2_S:
366
		return 2 + blockSize/4 + blockSize/16
367
	case tensorTypeIQ4_XS:
368
		return 2 + 2 + blockSize/2 + blockSize/64
369
	case TensorTypeI8:
370
		return 1
371
	case TensorTypeI16:
372
		return 2
373
	case TensorTypeI32:
374
		return 4
375
	case TensorTypeI64:
376
		return 8
377
	case TensorTypeF64:
378
		return 8
379
	case tensorTypeIQ1_M:
380
		return blockSize/8 + blockSize/16 + blockSize/32
381
	case TensorTypeBF16:
Michael Yang's avatar
Michael Yang committed
382
		return 2
383
384
385
386
387
	default:
		return 0
	}
}

388
// Elements returns the product of all Shape dimensions. An empty Shape
// yields 1.
func (t Tensor) Elements() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n
	}

	return count
}

Michael Yang's avatar
Michael Yang committed
396
// Size returns Elements() * typeSize() / blockSize(). Because TypeSize
// returns 0 for types it does not list, such tensors report a size of 0.
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

400
// Type returns the string form of the tensor's type.
func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

404
405
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
406
	Decode(io.ReadSeeker) (model, error)
407
408
409
}

const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constants for `gguf` files (versioned, gguf). The magic is always
	// read as a little-endian uint32; the _LE value selects little-endian
	// decoding of the rest of the file and the _BE value selects big-endian.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
423
424
// ErrUnsupportedFormat is a sentinel error carrying the message
// "unsupported model format". It is not returned by any code visible in this
// part of the file.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
425
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
426
427
428
429
430
431
432
433
434
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
435
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
436
437
438
439
440
441
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
442
// Decode decodes a GGML model from the given reader.
443
444
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
445
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
446
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
447
448
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

449
	var magic uint32
Michael Yang's avatar
Michael Yang committed
450
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
451
		return nil, err
452
453
454
	}

	var c container
455
456
	switch magic {
	case FILE_MAGIC_GGUF_LE:
457
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
458
	case FILE_MAGIC_GGUF_BE:
459
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
460
	default:
461
		return nil, errors.New("invalid file magic")
462
463
	}

Michael Yang's avatar
Michael Yang committed
464
	model, err := c.Decode(rs)
465
	if err != nil {
466
		return nil, err
467
468
	}

Michael Yang's avatar
Michael Yang committed
469
470
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
471
		return nil, err
Michael Yang's avatar
Michael Yang committed
472
473
	}

474
	// final model type
475
476
477
	return &GGML{
		container: c,
		model:     model,
478
479
		Length:    offset,
	}, nil
480
}
Michael Yang's avatar
Michael Yang committed
481

482
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
Michael Yang's avatar
Michael Yang committed
483
	embedding := f.KV().EmbeddingLength()
484
485
	heads := f.KV().HeadCountMax()
	headsKV := f.KV().HeadCountKVMax()
Michael Yang's avatar
Michael Yang committed
486
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
487

488
	embeddingHeads := f.KV().EmbeddingHeadCountMax()
Michael Yang's avatar
Michael Yang committed
489
490
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
491

Michael Yang's avatar
Michael Yang committed
492
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
493

494
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
Michael Yang's avatar
Michael Yang committed
495
	var kvTotal uint64
496
497
498
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
Michael Yang's avatar
Michael Yang committed
499
		kvTotal += kv[i]
500
	}
Michael Yang's avatar
Michael Yang committed
501

Michael Yang's avatar
Michael Yang committed
502
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
503
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
504
505
506
507
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
508
509
510

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
511
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
512
513
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
514

Michael Yang's avatar
Michael Yang committed
515
516
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
517
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
518
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
519
520
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
521
522
523
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
524
525
526
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
527
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
528
529
530
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
531
532
533
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
534
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
535
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
536
			if slices.Contains(crossAttentionLayers, int32(i)) {
537
538
539
540
541
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
542
543
		}

Michael Yang's avatar
Michael Yang committed
544
545
546
547
548
549
550
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
551
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
552
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
553
				ropeFreqsCount = ropeFreqsWeights.Elements()
Michael Yang's avatar
Michael Yang committed
554
555
556
557
558
559
560
561
562
563
564
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
565
	case "gemma", "gemma2", "gemma3", "gemma3n":
Michael Yang's avatar
Michael Yang committed
566
567
568
569
570
571
572
573
574
575
576
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
577

578
579
580
581
582
		if f.KV().Architecture() == "gemma3n" {
			fullOffload *= 4
			partialOffload *= 4
		}

583
584
585
586
587
588
589
590
591
592
593
594
595
		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
596
597
598
599
600
601
602
603
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
604
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
621

Michael Yang's avatar
Michael Yang committed
622
623
624
625
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
626
627
628
629
630
631
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
632
633
634
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
635
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
636
637
638
639
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
640
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
641
		)
Michael Yang's avatar
Michael Yang committed
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
668
669
670
671
672
673
674
675
676
677
678
679
	case "gptoss":
		kv = make([]uint64, f.KV().BlockCount())
		for i := range kv {
			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
			if i%2 == 0 {
				kv[i] *= (uint64(numParallel)*4096 + batch)
			} else {
				kv[i] *= context
			}
		}
		fullOffload = 4 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
		partialOffload = 2 * fullOffload
Michael Yang's avatar
Michael Yang committed
680
681
	}

Michael Yang's avatar
Michael Yang committed
682
	return
Michael Yang's avatar
Michael Yang committed
683
}
684

685
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
686
687
688
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
689

Michael Yang's avatar
Michael Yang committed
690
	for name, layer := range llm.Tensors().GroupLayers() {
691
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
692
693
			for _, tensor := range layer {
				weights += tensor.Size()
694
695
			}
		}
Michael Yang's avatar
Michael Yang committed
696
	}
697

Michael Yang's avatar
Michael Yang committed
698
699
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
700
701
702
703
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
704

Michael Yang's avatar
Michael Yang committed
705
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
706

Michael Yang's avatar
Michael Yang committed
707
708
709
710
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
711

Michael Yang's avatar
Michael Yang committed
712
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
713
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
714

Michael Yang's avatar
Michael Yang committed
715
716
	switch llm.KV().Architecture() {
	case "mllama":
717
718
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
719
720
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

721
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
722
			imageSize*imageSize*numChannels*maxNumTiles +
723
724
725
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
726
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
727
728
729
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
730
731
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
732
733
734

		numPatches := maxPixels / (patchSize * patchSize)

735
736
737
		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
738
739
740
			// Patches storage (numPatches * channels * patchSize^2)
			numPatches*numChannels*patchSize*patchSize +
			// Self-attention calculations
741
742
743
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
Michael Yang's avatar
memory  
Michael Yang committed
744
745
746
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
747
	}
Michael Yang's avatar
Michael Yang committed
748

749
750
751
	return weights, graphSize
}

752
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
753
754
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
755
756
757
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
758
759
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
760
761
762
763
764
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
765
766
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
767
768
769
770
771
772
773
774
775
776
777
778
779
780
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement returns the number of bytes per element for a given
// KV cache type. Unrecognised types are sized like f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		return 1 // 1/2 of fp16
	}
	if cacheType == "q4_0" {
		return 0.5 // 1/4 of fp16
	}

	return 2 // f16 (default)
}