ggml.go 21 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3

import (
Michael Yang's avatar
Michael Yang committed
4
	"cmp"
5
6
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
7
	"fmt"
8
	"io"
Michael Yang's avatar
Michael Yang committed
9
	"log/slog"
10
	"slices"
Michael Yang's avatar
Michael Yang committed
11
	"strings"
12

13
	"github.com/ollama/ollama/format"
Michael Yang's avatar
Michael Yang committed
14
	"github.com/ollama/ollama/fs/util/bufioutil"
15
16
)

Michael Yang's avatar
Michael Yang committed
17
18
19
// GGML is a decoded model file: the container that framed it, the parsed
// model metadata and tensor index, and the byte length of the decoded
// header/metadata section (tensor data begins at this offset).
type GGML struct {
	container
	model
	// Length is the reader offset where metadata decoding stopped.
	Length int64
}
22

Michael Yang's avatar
Michael Yang committed
23
// model is the minimal view of a decoded model: its key/value metadata
// and its tensor index.
type model interface {
	KV() KV
	Tensors() Tensors
}

28
29
// KV holds model metadata as loosely-typed key/value pairs; typed access
// goes through the accessor methods, which fill in defaults on miss.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
30
// Architecture returns the model architecture (e.g. "llama"), or
// "unknown" when the key is absent.
func (kv KV) Architecture() string {
	return kv.String("general.architecture", "unknown")
}

34
// Kind returns the general file type recorded in "general.type", or
// "unknown" when the key is absent.
func (kv KV) Kind() string {
	return kv.String("general.type", "unknown")
}

Michael Yang's avatar
Michael Yang committed
38
// ParameterCount returns the model's declared parameter count, or 0 when
// the key is absent or of a different type.
func (kv KV) ParameterCount() uint64 {
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
}

43
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
44
	if t := kv.Uint("general.file_type"); t > 0 {
45
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
46
47
	}

48
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
49
50
51
}

// BlockCount returns the number of transformer blocks (layers); the key
// is architecture-prefixed by keyValue.
func (kv KV) BlockCount() uint64 {
	return uint64(kv.Uint("block_count"))
}

// EmbeddingLength returns the model's hidden/embedding dimension.
func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
}

59
60
61
62
63
// HeadCountMax returns the attention head count, taking the maximum when
// the key holds a per-layer array. Defaults to 1 when absent.
func (kv KV) HeadCountMax() uint64 {
	// TODO(drifkin): using the max value can cause an overestimation. In the
	// future if array values become more popular, we can adapt the more invasive
	// <https://github.com/ollama/ollama/pull/10225>
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
}

66
67
// HeadCountMin returns the attention head count, taking the minimum when
// the key holds a per-layer array. Defaults to 1 when absent.
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
}

70
71
72
73
74
75
76
77
78
79
// HeadCountKVMax returns the KV (grouped-query) head count, taking the
// maximum when the key holds a per-layer array. Defaults to 1.
func (kv KV) HeadCountKVMax() uint64 {
	return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
}

// HeadCountKVMin returns the KV (grouped-query) head count, taking the
// minimum when the key holds a per-layer array. Defaults to 1.
func (kv KV) HeadCountKVMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count_kv", 1))
}

// EmbeddingHeadCountMax returns the largest per-head embedding dimension.
// Dividing the embedding length by the *smallest* head count yields the
// largest per-head size, hence HeadCountMin here. Returns 0 if the head
// count is unavailable.
func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
		return kv.EmbeddingLength() / heads
	}

	return 0
}

// EmbeddingHeadCountK returns the per-head key dimension, falling back to
// the derived per-head embedding size when "attention.key_length" is absent.
func (kv KV) EmbeddingHeadCountK() uint64 {
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
}

// EmbeddingHeadCountV returns the per-head value dimension, falling back to
// the derived per-head embedding size when "attention.value_length" is absent.
func (kv KV) EmbeddingHeadCountV() uint64 {
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
}

// ContextLength returns the model's trained context window size.
func (kv KV) ContextLength() uint64 {
	return uint64(kv.Uint("context_length"))
}

Michael Yang's avatar
Michael Yang committed
98
// ChatTemplate returns the embedded tokenizer chat template, or "" when
// none is recorded.
func (kv KV) ChatTemplate() string {
	return kv.String("tokenizer.chat_template")
}

// String returns the string value for key, or the optional defaultValue
// ("" if omitted) on miss. The append("") guarantees keyValue always has
// a fallback element.
func (kv KV) String(key string, defaultValue ...string) string {
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
}

// Uint returns the uint32 value for key, or the optional defaultValue
// (0 if omitted) on miss.
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
}

// Float returns the float32 value for key, or the optional defaultValue
// (0 if omitted) on miss.
func (kv KV) Float(key string, defaultValue ...float32) float32 {
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
}

117
// Bool returns the bool value for key, or the optional defaultValue
// (false if omitted) on miss.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

// UintOrMaxArrayValue returns key's scalar value, or the maximum when it
// holds an array; defaultValue on miss.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, max := kv.UintOrArrayValue(key, defaultValue)
	return max
}

// UintOrMinArrayValue returns key's scalar value, or the minimum when it
// holds an array; defaultValue on miss.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	min, _ := kv.UintOrArrayValue(key, defaultValue)
	return min
}

// UintOrArrayValue reads key as a scalar uint32 or as an array of
// uint32/int32 values and returns the (min, max) over what it found; a
// scalar yields the same value twice. Unknown or mistyped keys yield
// (defaultValue, defaultValue). Negative int32 values are logged and
// converted with uint32 wraparound.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	if scalar, ok := keyValue(kv, key, uint32(0)); ok {
		return scalar, scalar
	}

	if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
		return slices.Min(u32s.values), slices.Max(u32s.values)
	}

	if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
		lo, hi := slices.Min(i32s.values), slices.Max(i32s.values)
		if lo < 0 || hi < 0 {
			slog.Warn("array values are unexpectedly negative", "key", key, "min", lo, "max", hi)
		}
		return uint32(lo), uint32(hi)
	}

	return defaultValue, defaultValue
}

Michael Yang's avatar
Michael Yang committed
151
// Strings returns the string-array value for key, or the optional
// defaultValue (nil if omitted) on miss.
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
}

Michael Yang's avatar
Michael Yang committed
156
// Ints returns the int32-array value for key, or the optional
// defaultValue (nil if omitted) on miss.
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
}

Michael Yang's avatar
Michael Yang committed
161
// Uints returns the uint32-array value for key, or the optional
// defaultValue (nil if omitted) on miss.
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
}

Patrick Devine's avatar
Patrick Devine committed
166
// Floats returns the float32-array value for key, or the optional
// defaultValue (nil if omitted) on miss.
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
}

Michael Yang's avatar
Michael Yang committed
171
172
173
174
175
// Bools returns the bool-array value for key, or the optional
// defaultValue (nil if omitted) on miss.
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	val, _ := keyValue(kv, key, &array[bool]{values: append(defaultValue, []bool(nil))[0]})
	return val.values
}

176
func (kv KV) OllamaEngineRequired() bool {
177
178
	return slices.Contains([]string{
		"gemma3",
179
		"gemma3n",
180
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
181
		"llama4",
182
		"mllama",
183
		"qwen25vl",
184
		"gptoss", "gpt-oss",
185
	}, kv.Architecture())
186
187
}

Michael Yang's avatar
Michael Yang committed
188
// valueTypes constrains the scalar types a metadata value may hold.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

// arrayValueTypes constrains the array forms of the scalar metadata types.
type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
}

200
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
201
202
203
204
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

205
206
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
207
208
	}

209
	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
210
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
211
212
}

213
// Tensors is the tensor index of a model file. Offset is the absolute
// file offset at which tensor data begins.
type Tensors struct {
	items  []*Tensor
	Offset uint64
}
Michael Yang's avatar
Michael Yang committed
217

Michael Yang's avatar
Michael Yang committed
218
219
220
221
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
222

Michael Yang's avatar
Michael Yang committed
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

// GroupLayers buckets tensors by layer, keyed by the leading name
// component (e.g. "blk.0", "mm.1", "output"). For "blk"/"mm" tensors the
// layer index is folded into the key and the remainder of the dotted
// name becomes the per-layer tensor key.
func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
			}
		}

		// Lazily create the bucket for this layer key.
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
		}

		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
}

// Layer maps tensor name suffixes (within one layer) to tensors.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
258
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
259
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
260
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
261
262
263
264
265
	}

	return size
}

266
// Tensor describes one tensor in the model file: its name, quantization
// kind, data offset (relative to Tensors.Offset), and shape. The
// embedded WriterTo streams the raw tensor data.
type Tensor struct {
	Name   string `json:"name"`
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

277
278
279
280
281
282
283
284
// block parses the layer index from tensor names of the form "blk.N.…",
// returning -1 when the name does not match.
func (t Tensor) block() (n int) {
	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
		return -1
	}

	return
}

285
// blockSize returns the quantization block size (elements per block) for
// this tensor's type.
func (t Tensor) blockSize() uint64 {
	return TensorType(t.Kind).BlockSize()
}

// BlockSize returns the number of elements per quantization block:
// 1 for unquantized scalar types, 32 for the classic Q4/Q5/Q8 and
// IQ4_NL formats, and 256 for all K-quant and remaining IQ formats.
// The raw numeric cases mirror the ggml type enum values.
func (t TensorType) BlockSize() uint64 {
	switch t {
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
		return 1
	case
		2,  // Q4_0
		3,  // Q4_1
		4,  // MXFP4
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
		return 32
	default:
		return 256
	}
}

// typeSize returns the byte size of one quantization block for this
// tensor's type.
func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

// TypeSize returns the byte size of one quantization block of this type
// (scale/min headers plus packed element payload). These values mirror
// the ggml block struct layouts; 0 is returned for unknown types.
func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()

	switch t {
	case TensorTypeF32:
		return 4
	case TensorTypeF16:
		return 2
	case TensorTypeQ4_0:
		return 2 + blockSize/2
	case TensorTypeQ4_1:
		return 2 + 2 + blockSize/2
	case TensorTypeMXFP4, 39:
		return 1 + blockSize/2
	case TensorTypeQ5_0:
		return 2 + 4 + blockSize/2
	case TensorTypeQ5_1:
		return 2 + 2 + 4 + blockSize/2
	case TensorTypeQ8_0:
		return 2 + blockSize
	case TensorTypeQ8_1:
		return 2 + 2 + blockSize
	case TensorTypeQ2_K:
		return blockSize/16 + blockSize/4 + 2 + 2
	case TensorTypeQ3_K:
		return blockSize/8 + blockSize/4 + 12 + 2
	case TensorTypeQ4_K:
		return 2 + 2 + 12 + blockSize/2
	case TensorTypeQ5_K:
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case TensorTypeQ6_K:
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case TensorTypeQ8_K:
		return 4 + blockSize + 2*blockSize/16
	case tensorTypeIQ2_XXS:
		return 2 + 2*blockSize/8
	case tensorTypeIQ2_XS:
		return 2 + 2*blockSize/8 + blockSize/32
	case tensorTypeIQ3_XXS:
		return 2 + blockSize/4 + blockSize/8
	case tensorTypeIQ1_S:
		return 2 + blockSize/8 + blockSize/16
	case tensorTypeIQ4_NL:
		return 2 + blockSize/2
	case tensorTypeIQ3_S:
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case tensorTypeIQ2_S:
		return 2 + blockSize/4 + blockSize/16
	case tensorTypeIQ4_XS:
		return 2 + 2 + blockSize/2 + blockSize/64
	case TensorTypeI8:
		return 1
	case TensorTypeI16:
		return 2
	case TensorTypeI32:
		return 4
	case TensorTypeI64:
		return 8
	case TensorTypeF64:
		return 8
	case tensorTypeIQ1_M:
		return blockSize/8 + blockSize/16 + blockSize/32
	case TensorTypeBF16:
		return 2
	default:
		return 0
	}
}

389
// Elements returns the number of scalar elements in the tensor: the
// product of all shape dimensions (1 for an empty shape).
func (t Tensor) Elements() uint64 {
	product := uint64(1)
	for _, dim := range t.Shape {
		product *= dim
	}
	return product
}

Michael Yang's avatar
Michael Yang committed
397
// Size returns the tensor's byte size: element count scaled by bytes per
// quantization block over elements per block.
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

401
// Type returns the human-readable name of the tensor's quantization type.
func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

405
406
// container abstracts a model file framing format (e.g. GGUF) that can
// name itself and decode a model from a reader.
type container interface {
	Name() string
	Decode(io.ReadSeeker) (model, error)
}

const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constant for `gguf` files (versioned, gguf)
	// in little- and big-endian byte order respectively.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
424
425
// ErrUnsupportedFormat is returned when a file is not in a recognized
// model format.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
426
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
427
428
429
430
431
432
433
434
435
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
436
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
437
438
439
440
441
442
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
443
// Decode decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
	// Buffer reads (32 KiB) to avoid many small reads against the
	// underlying seeker while decoding metadata.
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

	var magic uint32
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
		return nil, err
	}

	// Only GGUF containers are decodable here; the legacy ggml/ggmf/ggjt
	// magics are rejected.
	var c container
	switch magic {
	case FILE_MAGIC_GGUF_LE:
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
	case FILE_MAGIC_GGUF_BE:
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
	default:
		return nil, errors.New("invalid file magic")
	}

	model, err := c.Decode(rs)
	if err != nil {
		return nil, err
	}

	// The current position marks the end of the decoded metadata.
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
		return nil, err
	}

	// final model type
	return &GGML{
		container: c,
		model:     model,
		Length:    offset,
	}, nil
}
Michael Yang's avatar
Michael Yang committed
482

483
// GraphSize estimates memory requirements for running this model:
// kv is the per-layer KV cache size in bytes, and partialOffload /
// fullOffload are rough compute-graph sizes for partial and full GPU
// offload. The per-architecture formulas below are empirical estimates
// tuned per model family.
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
	// Each parallel sequence gets its own slice of the context window.
	context *= uint64(numParallel)

	embedding := f.KV().EmbeddingLength()
	heads := f.KV().HeadCountMax()
	headsKV := f.KV().HeadCountKVMax()
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)

	embeddingHeads := f.KV().EmbeddingHeadCountMax()
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()

	layers := f.Tensors().GroupLayers()

	// Baseline KV cache: every layer caches K and V for the full context.
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
	var kvTotal uint64
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
		kvTotal += kv[i]
	}

	switch f.KV().Architecture() {
	case "llama", "llama4":
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)

		partialOffload = 4 * batch * embedding
		partialOffload += max(
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)

		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
			ff := uint64(f.KV().Uint("feed_forward_length"))
			partialOffload = max(
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

		// Cross-attention layers cache projected vision tokens (f32)
		// instead of text context.
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
		for i := range kv {
			if slices.Contains(crossAttentionLayers, int32(i)) {
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
		}

		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
				ropeFreqsCount = ropeFreqsWeights.Elements()
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
	case "gemma", "gemma2", "gemma3", "gemma3n":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)

		if f.KV().Architecture() == "gemma3n" {
			fullOffload *= 4
			partialOffload *= 4
		}

		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
		)
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
	case "gptoss", "gpt-oss":
		// Alternating layers: even layers use a fixed 4096-token window
		// per parallel sequence; odd layers use the full context.
		kv = make([]uint64, f.KV().BlockCount())
		for i := range kv {
			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
			if i%2 == 0 {
				kv[i] *= (uint64(numParallel)*4096 + batch)
			} else {
				kv[i] *= context
			}
		}

		// NOTE(review): kvTotal was accumulated from the baseline kv
		// values above, not the gptoss-specific ones just computed —
		// confirm this is intended.
		partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
		if useFlashAttention {
			// rough estimate of graph size with flash attention on
			partialOffload = (4*uint64(numParallel) + context>>10 + 110) * format.MebiByte
		}
	}

	return
}
691

692
// VisionGraphSize estimates the vision tower's weight bytes and compute
// graph size. Both are 0 when the model has no vision blocks or when the
// patch size is missing.
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}

	// Sum the sizes of all vision-tower tensors (layer keys "v" / "v.*").
	for name, layer := range llm.Tensors().GroupLayers() {
		if name == "v" || strings.HasPrefix(name, "v.") {
			for _, tensor := range layer {
				weights += tensor.Size()
			}
		}
	}

	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}

	numChannels := uint64(llm.KV().Uint("vision.num_channels"))

	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	// Models with a class embedding prepend one extra token.
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}

	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))

	switch llm.KV().Architecture() {
	case "mllama":
		// NOTE(review): (numPatches%8)%8 == numPatches%8, so this adds a
		// full extra group of 8 even when numPatches is already a
		// multiple of 8 — confirm round-up intent.
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

		graphSize = 4 * (8 +
			imageSize*imageSize*numChannels*maxNumTiles +
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
	case "gemma3", "mistral3":
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))

		// Shadows the outer numPatches: qwen25vl sizes by max pixels,
		// not by image_size.
		numPatches := maxPixels / (patchSize * patchSize)

		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
			// Patches storage (numPatches * channels * patchSize^2)
			numPatches*numChannels*patchSize*patchSize +
			// Self-attention calculations
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
	}

	return weights, graphSize
}

759
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
760
func (f GGML) SupportsKVCacheType(cacheType string) bool {
761
762
763
764
765
	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
		// gpt-oss uses attention with sinks which does not support quantized cache types
		slog.Warn("model only supports non-quantized cache types ", "mode", arch)
		return cacheType == "f16"
	}
Michael Yang's avatar
Michael Yang committed
766
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
767
768
769
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
770
771
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
772
773
774
775
776
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
777
778
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
779
780
781
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

782
783
784
785
786
787
788
// FlashAttention checks if the model should enable flash attention by
// default; currently only gpt-oss models do. Uses kv.Architecture() for
// consistency with the other architecture checks in this file (the
// "unknown" default is not in the list, so behavior is unchanged).
func (f GGML) FlashAttention() bool {
	return slices.Contains([]string{
		"gptoss", "gpt-oss",
	}, f.KV().Architecture())
}

789
790
791
792
793
794
795
796
797
798
799
// kvCacheBytesPerElement returns the number of bytes per element for a
// given KV cache type; unrecognized types fall back to f16 (2 bytes).
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		return 1 // 1/2 of fp16
	}
	if cacheType == "q4_0" {
		return 0.5 // 1/4 of fp16
	}
	return 2 // f16 (default)
}