ggml.go 21.1 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3

import (
Michael Yang's avatar
Michael Yang committed
4
	"cmp"
5
6
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
7
	"fmt"
8
	"io"
Michael Yang's avatar
Michael Yang committed
9
	"log/slog"
10
	"math"
11
	"slices"
Michael Yang's avatar
Michael Yang committed
12
	"strings"
13

14
	"github.com/ollama/ollama/format"
Michael Yang's avatar
Michael Yang committed
15
	"github.com/ollama/ollama/fs/util/bufioutil"
16
17
)

Michael Yang's avatar
Michael Yang committed
18
19
20
type GGML struct {
	container
	model
21
	Length int64
Michael Yang's avatar
Michael Yang committed
22
}
23

Michael Yang's avatar
Michael Yang committed
24
type model interface {
Michael Yang's avatar
Michael Yang committed
25
	KV() KV
Michael Yang's avatar
Michael Yang committed
26
	Tensors() Tensors
27
28
}

29
30
// KV holds model metadata as key/value pairs. Values are stored untyped and
// are read back through the typed accessors defined on KV.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
31
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
32
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
33
34
}

35
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
36
	return kv.String("general.type", "unknown")
37
38
}

Michael Yang's avatar
Michael Yang committed
39
func (kv KV) ParameterCount() uint64 {
40
41
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
Michael Yang's avatar
Michael Yang committed
42
43
}

44
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
45
	if t := kv.Uint("general.file_type"); t > 0 {
46
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
47
48
	}

49
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
50
51
52
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
53
54
55
56
57
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
58
59
}

60
61
62
63
64
func (kv KV) HeadCountMax() uint64 {
	// TODO(drifkin): using the max value can cause an overestimation. In the
	// future if array values become more popular, we can adapt the more invasive
	// <https://github.com/ollama/ollama/pull/10225>
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
65
66
}

67
68
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
69
70
}

71
72
73
74
75
76
77
78
79
80
// HeadCountKVMax returns "attention.head_count_kv". When the value is stored
// as an array, the largest element is returned; when it is missing, 1 is
// returned.
func (kv KV) HeadCountKVMax() uint64 {
	largest := kv.UintOrMaxArrayValue("attention.head_count_kv", 1)
	return uint64(largest)
}

// HeadCountKVMin returns "attention.head_count_kv". When the value is stored
// as an array, the smallest element is returned; when it is missing, 1 is
// returned.
func (kv KV) HeadCountKVMin() uint64 {
	smallest := kv.UintOrMinArrayValue("attention.head_count_kv", 1)
	return uint64(smallest)
}

func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
81
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
82
83
84
85
86
87
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
88
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
89
90
91
}

func (kv KV) EmbeddingHeadCountV() uint64 {
92
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
93
94
95
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
96
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
97
98
}

Michael Yang's avatar
Michael Yang committed
99
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
100
101
102
103
	return kv.String("tokenizer.chat_template")
}

func (kv KV) String(key string, defaultValue ...string) string {
104
105
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
Michael Yang's avatar
Michael Yang committed
106
107
108
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
109
110
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
111
112
113
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
114
115
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
116
117
}

118
// Bool returns the bool stored under key. If the key is missing or holds a
// different type, the first defaultValue is returned, or false when none is
// given.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Appending false guarantees keyValue always receives at least one default.
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

// UintOrMaxArrayValue returns the second (maximum) result of UintOrArrayValue
// for key.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, largest := kv.UintOrArrayValue(key, defaultValue)
	return largest
}

// UintOrMinArrayValue returns the first (minimum) result of UintOrArrayValue
// for key.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	smallest, _ := kv.UintOrArrayValue(key, defaultValue)
	return smallest
}

// UintOrArrayValue reads key as either a scalar uint32 or an array of uint32
// or int32 and returns the (minimum, maximum) of the stored values.
//
// A scalar is returned as both minimum and maximum. If the key is missing,
// holds another type, or holds an array with no values, defaultValue is
// returned for both results. Negative int32 elements are logged and then
// converted to uint32 as-is.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return u32, u32
	}

	// slices.Min and slices.Max panic on an empty slice, so an array without
	// values falls through to the default. NOTE(review): presumably this
	// happens when Decode skipped the values because the array exceeded
	// maxArraySize; confirm against the decoder.
	if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok && len(u32s.values) > 0 {
		return slices.Min(u32s.values), slices.Max(u32s.values)
	}

	if i32s, ok := keyValue(kv, key, &array[int32]{}); ok && len(i32s.values) > 0 {
		lo, hi := slices.Min(i32s.values), slices.Max(i32s.values)
		if lo < 0 || hi < 0 {
			slog.Warn("array values are unexpectedly negative", "key", key, "min", lo, "max", hi)
		}
		return uint32(lo), uint32(hi)
	}

	return defaultValue, defaultValue
}

Michael Yang's avatar
Michael Yang committed
152
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
153
154
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
155
156
}

Michael Yang's avatar
Michael Yang committed
157
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
158
159
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
160
161
}

Michael Yang's avatar
Michael Yang committed
162
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
163
164
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
165
166
}

Patrick Devine's avatar
Patrick Devine committed
167
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
168
169
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
Patrick Devine's avatar
Patrick Devine committed
170
171
}

Michael Yang's avatar
Michael Yang committed
172
173
174
175
176
// Bools returns the values of the bool array stored under key. If the key is
// missing or holds a different type, the first defaultValue is returned, or
// nil when none is given.
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	// Appending a nil slice makes index 0 valid even with no caller default.
	fallback := append(defaultValue, []bool(nil))[0]
	arr, _ := keyValue(kv, key, &array[bool]{values: fallback})
	return arr.values
}

177
func (kv KV) OllamaEngineRequired() bool {
178
179
	return slices.Contains([]string{
		"gemma3",
180
		"gemma3n",
181
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
182
		"llama4",
183
		"mllama",
184
		"qwen25vl",
185
		"gptoss", "gpt-oss",
186
	}, kv.Architecture())
187
188
}

Michael Yang's avatar
Michael Yang committed
189
// valueTypes constrains the scalar types that keyValue can read from a KV.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
Michael Yang's avatar
Michael Yang committed
199
200
}

201
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
202
203
204
205
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

206
207
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
208
209
	}

210
	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
211
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
212
213
}

214
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
215
	items  []*Tensor
216
	Offset uint64
Michael Yang's avatar
Michael Yang committed
217
}
Michael Yang's avatar
Michael Yang committed
218

Michael Yang's avatar
Michael Yang committed
219
220
221
222
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
223

Michael Yang's avatar
Michael Yang committed
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
244
			}
Michael Yang's avatar
Michael Yang committed
245
		}
246

Michael Yang's avatar
Michael Yang committed
247
248
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
249
250
		}

Michael Yang's avatar
Michael Yang committed
251
252
253
254
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
255
256
257
258
}

// Layer maps a tensor's name within a layer (the part of the full tensor name
// after the layer prefix) to the tensor.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
259
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
260
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
261
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
262
263
264
265
266
	}

	return size
}

267
// Tensor describes a single tensor in a model file.
type Tensor struct {
	Name string `json:"name"`
	// Kind is the numeric tensor type; it is converted to TensorType to
	// obtain block and type sizes.
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

278
279
// block returns the block number parsed from a tensor name of the form
// "blk.<n>.". Names that do not match yield math.MaxInt.
func (t Tensor) block() (n int) {
	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
		return math.MaxInt
	}

	return n
}

286
func (t Tensor) blockSize() uint64 {
Michael Yang's avatar
Michael Yang committed
287
	return TensorType(t.Kind).BlockSize()
288
289
290
291
}

func (t TensorType) BlockSize() uint64 {
	switch t {
Michael Yang's avatar
Michael Yang committed
292
	case
293
294
295
296
297
298
299
300
		TensorTypeF32,
		TensorTypeF16,
		TensorTypeI8,
		TensorTypeI16,
		TensorTypeI32,
		TensorTypeI64,
		TensorTypeF64,
		TensorTypeBF16:
301
		return 1
Michael Yang's avatar
Michael Yang committed
302
	case
303
304
305
306
307
308
309
310
		TensorTypeQ4_0,
		TensorTypeQ4_1,
		TensorTypeQ5_0,
		TensorTypeQ5_1,
		TensorTypeQ8_0,
		TensorTypeQ8_1,
		tensorTypeIQ4_NL,
		4, TensorTypeMXFP4:
311
		return 32
Michael Yang's avatar
Michael Yang committed
312
	default:
313
314
315
316
317
		return 256
	}
}

// typeSize returns the per-block byte size of the tensor's type.
func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()
323

324
325
	switch t {
	case TensorTypeF32:
326
		return 4
327
	case TensorTypeF16:
328
		return 2
329
	case TensorTypeQ4_0:
330
		return 2 + blockSize/2
331
	case TensorTypeQ4_1:
332
		return 2 + 2 + blockSize/2
333
	case TensorTypeQ5_0:
334
		return 2 + 4 + blockSize/2
335
	case TensorTypeQ5_1:
336
		return 2 + 2 + 4 + blockSize/2
337
	case TensorTypeQ8_0:
338
		return 2 + blockSize
339
	case TensorTypeQ8_1:
Michael Yang's avatar
Michael Yang committed
340
		return 2 + 2 + blockSize
341
	case TensorTypeQ2_K:
342
		return blockSize/16 + blockSize/4 + 2 + 2
343
	case TensorTypeQ3_K:
344
		return blockSize/8 + blockSize/4 + 12 + 2
345
	case TensorTypeQ4_K:
346
		return 2 + 2 + 12 + blockSize/2
347
	case TensorTypeQ5_K:
348
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
349
	case TensorTypeQ6_K:
350
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
351
	case TensorTypeQ8_K:
Michael Yang's avatar
Michael Yang committed
352
		return 4 + blockSize + 2*blockSize/16
353
	case tensorTypeIQ2_XXS:
354
		return 2 + 2*blockSize/8
355
	case tensorTypeIQ2_XS:
356
		return 2 + 2*blockSize/8 + blockSize/32
357
	case tensorTypeIQ3_XXS:
358
		return 2 + blockSize/4 + blockSize/8
359
	case tensorTypeIQ1_S:
360
		return 2 + blockSize/8 + blockSize/16
361
	case tensorTypeIQ4_NL:
362
		return 2 + blockSize/2
363
	case tensorTypeIQ3_S:
364
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
365
	case tensorTypeIQ2_S:
366
		return 2 + blockSize/4 + blockSize/16
367
	case tensorTypeIQ4_XS:
368
		return 2 + 2 + blockSize/2 + blockSize/64
369
	case TensorTypeI8:
370
		return 1
371
	case TensorTypeI16:
372
		return 2
373
	case TensorTypeI32:
374
		return 4
375
	case TensorTypeI64:
376
		return 8
377
	case TensorTypeF64:
378
		return 8
379
	case tensorTypeIQ1_M:
380
		return blockSize/8 + blockSize/16 + blockSize/32
381
	case TensorTypeBF16:
Michael Yang's avatar
Michael Yang committed
382
		return 2
383
384
	case 4, TensorTypeMXFP4:
		return 1 + blockSize/2
385
386
387
388
389
	default:
		return 0
	}
}

390
// Elements returns the product of all dimensions in Shape. A tensor with an
// empty Shape reports 1.
func (t Tensor) Elements() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n
	}

	return count
}

Michael Yang's avatar
Michael Yang committed
398
// Size returns the tensor's size in bytes: element count times bytes per
// block, divided by elements per block.
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

402
// Type returns the string form of the tensor's TensorType.
func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

406
407
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
408
	Decode(io.ReadSeeker) (model, error)
409
410
411
}

const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constant for `gguf` files (versioned, gguf)
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

// ErrUnsupportedFormat is the sentinel error for an unsupported model format.
var ErrUnsupportedFormat = errors.New("unsupported model format")

// DetectContentType returns the container format name ("ggml", "ggmf", "ggjt",
// "ggla" or "gguf") identified by the magic number in the first four bytes of
// b, read as a little-endian uint32. It returns "" when b is shorter than four
// bytes or the magic number is not recognized.
func DetectContentType(b []byte) string {
	// A magic number needs four bytes; slicing a shorter input would panic.
	if len(b) < 4 {
		return ""
	}

	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
444
// Decode decodes a GGML model from the given reader.
445
446
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
447
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
448
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
449
450
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

451
	var magic uint32
Michael Yang's avatar
Michael Yang committed
452
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
453
		return nil, err
454
455
456
	}

	var c container
457
458
	switch magic {
	case FILE_MAGIC_GGUF_LE:
459
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
460
	case FILE_MAGIC_GGUF_BE:
461
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
462
	default:
463
		return nil, errors.New("invalid file magic")
464
465
	}

Michael Yang's avatar
Michael Yang committed
466
	model, err := c.Decode(rs)
467
	if err != nil {
468
		return nil, err
469
470
	}

Michael Yang's avatar
Michael Yang committed
471
472
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
473
		return nil, err
Michael Yang's avatar
Michael Yang committed
474
475
	}

476
	// final model type
477
478
479
	return &GGML{
		container: c,
		model:     model,
480
481
		Length:    offset,
	}, nil
482
}
Michael Yang's avatar
Michael Yang committed
483

484
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
Jesse Gross's avatar
Jesse Gross committed
485
486
	context *= uint64(numParallel)

Michael Yang's avatar
Michael Yang committed
487
	embedding := f.KV().EmbeddingLength()
488
489
	heads := f.KV().HeadCountMax()
	headsKV := f.KV().HeadCountKVMax()
Michael Yang's avatar
Michael Yang committed
490
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
491

492
	embeddingHeads := f.KV().EmbeddingHeadCountMax()
Michael Yang's avatar
Michael Yang committed
493
494
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
495

Michael Yang's avatar
Michael Yang committed
496
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
497

498
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
Michael Yang's avatar
Michael Yang committed
499
	var kvTotal uint64
500
501
502
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
Michael Yang's avatar
Michael Yang committed
503
		kvTotal += kv[i]
504
	}
Michael Yang's avatar
Michael Yang committed
505

Michael Yang's avatar
Michael Yang committed
506
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
507
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
508
509
510
511
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
512
513
514

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
515
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
516
517
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
518

Michael Yang's avatar
Michael Yang committed
519
520
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
521
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
522
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
523
524
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
525
526
527
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
528
529
530
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
531
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
532
533
534
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
535
536
537
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
538
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
539
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
540
			if slices.Contains(crossAttentionLayers, int32(i)) {
541
542
543
544
545
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
546
547
		}

Michael Yang's avatar
Michael Yang committed
548
549
550
551
552
553
554
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
555
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
556
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
557
				ropeFreqsCount = ropeFreqsWeights.Elements()
Michael Yang's avatar
Michael Yang committed
558
559
560
561
562
563
564
565
566
567
568
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
569
	case "gemma", "gemma2", "gemma3", "gemma3n":
Michael Yang's avatar
Michael Yang committed
570
571
572
573
574
575
576
577
578
579
580
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
581

582
583
584
585
586
		if f.KV().Architecture() == "gemma3n" {
			fullOffload *= 4
			partialOffload *= 4
		}

587
588
589
590
591
592
593
594
595
596
597
598
599
		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
600
601
602
603
604
605
606
607
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
608
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
625

Michael Yang's avatar
Michael Yang committed
626
627
628
629
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
630
631
632
633
634
635
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
636
637
638
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
639
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
640
641
642
643
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
644
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
645
		)
Michael Yang's avatar
Michael Yang committed
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
672
	case "gptoss", "gpt-oss":
Michael Yang's avatar
Michael Yang committed
673
674
675
676
677
678
679
680
681
		kv = make([]uint64, f.KV().BlockCount())
		for i := range kv {
			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
			if i%2 == 0 {
				kv[i] *= (uint64(numParallel)*4096 + batch)
			} else {
				kv[i] *= context
			}
		}
682

683
		partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
684
685
686
687
		if useFlashAttention {
			// rough estimate of graph size with flash attention on
			partialOffload = (4*uint64(numParallel) + context>>10 + 110) * format.MebiByte
		}
Michael Yang's avatar
Michael Yang committed
688
689
	}

Michael Yang's avatar
Michael Yang committed
690
	return
Michael Yang's avatar
Michael Yang committed
691
}
692

693
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
694
695
696
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
697

Michael Yang's avatar
Michael Yang committed
698
	for name, layer := range llm.Tensors().GroupLayers() {
699
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
700
701
			for _, tensor := range layer {
				weights += tensor.Size()
702
703
			}
		}
Michael Yang's avatar
Michael Yang committed
704
	}
705

Michael Yang's avatar
Michael Yang committed
706
707
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
708
709
710
711
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
712

Michael Yang's avatar
Michael Yang committed
713
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
714

Michael Yang's avatar
Michael Yang committed
715
716
717
718
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
719

Michael Yang's avatar
Michael Yang committed
720
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
721
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
722

Michael Yang's avatar
Michael Yang committed
723
724
	switch llm.KV().Architecture() {
	case "mllama":
725
726
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
727
728
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

729
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
730
			imageSize*imageSize*numChannels*maxNumTiles +
731
732
733
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
734
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
735
736
737
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
738
739
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
740
741
742

		numPatches := maxPixels / (patchSize * patchSize)

743
744
745
		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
746
747
748
			// Patches storage (numPatches * channels * patchSize^2)
			numPatches*numChannels*patchSize*patchSize +
			// Self-attention calculations
749
750
751
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
Michael Yang's avatar
memory  
Michael Yang committed
752
753
754
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
755
	}
Michael Yang's avatar
Michael Yang committed
756

757
758
759
	return weights, graphSize
}

760
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
761
func (f GGML) SupportsKVCacheType(cacheType string) bool {
762
763
764
765
766
	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
		// gpt-oss uses attention with sinks which does not support quantized cache types
		slog.Warn("model only supports non-quantized cache types ", "mode", arch)
		return cacheType == "f16"
	}
Michael Yang's avatar
Michael Yang committed
767
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
768
769
770
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
771
772
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
773
774
775
776
777
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
778
779
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
780
781
782
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

783
784
785
786
787
788
789
// FlashAttention reports whether flash attention should be enabled for the
// model; this is the case only for the "gptoss" and "gpt-oss" architectures.
func (f GGML) FlashAttention() bool {
	switch f.KV().String("general.architecture") {
	case "gptoss", "gpt-oss":
		return true
	default:
		return false
	}
}

790
791
792
793
794
795
796
797
798
799
800
// kvCacheBytesPerElement maps a KV cache type name to the bytes used per
// cached element. Any name other than "q8_0" or "q4_0" is sized as f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		return 1 // half the size of f16
	}
	if cacheType == "q4_0" {
		return 0.5 // a quarter of the size of f16
	}
	return 2 // f16, also used for unrecognized names
}