package ggml

import (
	"cmp"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"math"
	"slices"
	"strings"

	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs/util/bufioutil"
)

type GGML struct {
	container
	model
	Length int64
}

type model interface {
	KV() KV
	Tensors() Tensors
}

type KV map[string]any

func (kv KV) Architecture() string {
	return kv.String("general.architecture", "unknown")
}

func (kv KV) Kind() string {
	return kv.String("general.type", "unknown")
}

func (kv KV) ParameterCount() uint64 {
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
}

func (kv KV) FileType() FileType {
	if t := kv.Uint("general.file_type"); t > 0 {
		return FileType(t)
	}

	return FileTypeUnknown
}

func (kv KV) BlockCount() uint64 {
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
}

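// HeadCount returns the attention head count for each block. A scalar
// "attention.head_count" applies to every block; a per-block array shorter
// than block_count is padded with a default (the single array value if the
// array has length one, otherwise 1).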
func (kv KV) HeadCount() []uint64 {
	headCountDefault := uint32(1)
	headCount := kv.UintOrArrayValueAsArray("attention.head_count", headCountDefault)
	if len(headCount) == 1 {
		headCountDefault = headCount[0]
	}
	nLayers := int(kv.BlockCount())
	if len(headCount) > nLayers {
		slog.Warn("got more elements of attention.head_count than layers", "len(headCount)", len(headCount), "layers", nLayers)
	}
	out := make([]uint64, nLayers)
	for i := range nLayers {
		if i >= len(headCount) {
			out[i] = uint64(headCountDefault)
		} else {
			out[i] = uint64(headCount[i])
		}
	}
	return out
}

func (kv KV) HeadCountMax() uint64 {
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
}

func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
}

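// HeadCountKV returns the key/value head count for each block, following the
// same scalar-or-array handling as HeadCount.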
func (kv KV) HeadCountKV() []uint64 {
	headCountKVDefault := uint32(1)
	headCountKV := kv.UintOrArrayValueAsArray("attention.head_count_kv", headCountKVDefault)
	if len(headCountKV) == 1 {
		headCountKVDefault = headCountKV[0]
	}
	nLayers := int(kv.BlockCount())
	if len(headCountKV) > nLayers {
		slog.Warn("got more elements of attention.head_count_kv than layers", "len(headCountKV)", len(headCountKV), "layers", nLayers)
	}
	out := make([]uint64, nLayers)
	for i := range nLayers {
		if i >= len(headCountKV) {
			out[i] = uint64(headCountKVDefault)
		} else {
			out[i] = uint64(headCountKV[i])
		}
	}
	return out
}

func (kv KV) HeadCountKVMax() uint64 {
	return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
}

func (kv KV) HeadCountKVMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count_kv", 1))
}

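// EmbeddingHeadCountMax returns the largest per-head embedding size, computed
// as the embedding length divided by the smallest attention head count.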
func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
		return kv.EmbeddingLength() / heads
	}

	return 0
}

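// EmbeddingHeadCountK returns the per-head key length, defaulting to the
// maximum per-head embedding size when "attention.key_length" is unset.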
func (kv KV) EmbeddingHeadCountK() uint64 {
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
}

func (kv KV) EmbeddingHeadCountV() uint64 {
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
}

func (kv KV) ContextLength() uint64 {
	return uint64(kv.Uint("context_length"))
}

func (kv KV) ChatTemplate() string {
	return kv.String("tokenizer.chat_template")
}

// ssm architecture parameters

func (kv KV) SSMConvKernel() uint64 {
	return uint64(kv.Uint("ssm.conv_kernel"))
}

func (kv KV) SSMInnerSize() uint64 {
	return uint64(kv.Uint("ssm.inner_size"))
}

func (kv KV) SSMStateSize() uint64 {
	return uint64(kv.Uint("ssm.state_size"))
}

func (kv KV) SSMGroupCount() uint64 {
	return uint64(kv.Uint("ssm.group_count"))
}

// general types

func (kv KV) String(key string, defaultValue ...string) string {
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
}

func (kv KV) Bool(key string, defaultValue ...bool) bool {
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

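// UintOrMaxArrayValue returns the maximum of the values stored at key when it
// holds an array, or the scalar (or default) value otherwise.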
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, max := kv.UintOrArrayValue(key, defaultValue)
	return max
}

func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	min, _ := kv.UintOrArrayValue(key, defaultValue)
	return min
}

func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	arrVal := kv.UintOrArrayValueAsArray(key, defaultValue)
	return slices.Min(arrVal), slices.Max(arrVal)
}

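// UintOrArrayValueAsArray normalizes the value at key to a []uint32: a scalar
// uint32 becomes a one-element slice, uint32 and int32 arrays are returned as
// is (int32 values are converted, warning on negatives), and a missing key
// yields a one-element slice containing defaultValue.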
func (kv KV) UintOrArrayValueAsArray(key string, defaultValue uint32) []uint32 {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return []uint32{u32}
	} else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
		return u32s.values
	} else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
		dst := make([]uint32, len(i32s.values))
		for i, v := range i32s.values {
			if v < 0 {
				slog.Warn("array values are unexpectedly negative", "key", key, "i", i, "v", v)
			}
			dst[i] = uint32(v)
		}
		return dst
	}

	return []uint32{defaultValue}
}

func (kv KV) Strings(key string, defaultValue ...[]string) []string {
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
}

func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
}

func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
}

func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
}

func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	val, _ := keyValue(kv, key, &array[bool]{values: append(defaultValue, []bool(nil))[0]})
	return val.values
}

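// OllamaEngineRequired reports whether the model's architecture must run on
// the Ollama engine.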
func (kv KV) OllamaEngineRequired() bool {
	return slices.Contains([]string{
		"gemma3",
		"gemma3n",
		"mistral3",
		"qwen3",
		"qwen3moe",
		"llama4",
		"mllama",
		"qwen25vl",
		"gptoss", "gpt-oss",
	}, kv.Architecture())
}

type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
}

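// keyValue looks up key in kv with the requested type, prefixing the key with
// the model architecture unless it is a "tokenizer." or "general." key. It
// returns the value and whether a typed match was found; otherwise the first
// default is returned.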
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

	if val, ok := kv[key].(T); ok {
		return val, true
	}

	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
	return defaultValue[0], false
}

type Tensors struct {
	items  []*Tensor
	Offset uint64
}

func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}

	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

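// GroupLayers groups tensors by layer prefix (for example "blk.0", "mm.0", or
// "output"), mapping each layer name to its tensors keyed by the remaining
// name suffix.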
func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
			}
		}

		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
		}

		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
}

type Layer map[string]*Tensor

func (l Layer) Size() (size uint64) {
	for _, t := range l {
		size += t.Size()
	}

	return size
}

type Tensor struct {
	Name   string `json:"name"`
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

func (t Tensor) block() (n int) {
	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
		return math.MaxInt
	}

	return
}

func (t Tensor) blockSize() uint64 {
	return TensorType(t.Kind).BlockSize()
}

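// BlockSize returns the number of elements in one quantization block for the
// tensor type: 1 for scalar types, 32 for the classic Q4/Q5/Q8 formats, and
// 256 for K-quants and IQ formats.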
func (t TensorType) BlockSize() uint64 {
	switch t {
	case
		TensorTypeF32,
		TensorTypeF16,
		TensorTypeI8,
		TensorTypeI16,
		TensorTypeI32,
		TensorTypeI64,
		TensorTypeF64,
		TensorTypeBF16:
		return 1
	case
		TensorTypeQ4_0,
		TensorTypeQ4_1,
		TensorTypeQ5_0,
		TensorTypeQ5_1,
		TensorTypeQ8_0,
		TensorTypeQ8_1,
		tensorTypeIQ4_NL,
		4, TensorTypeMXFP4:
		return 32
	default:
		return 256
	}
}

func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

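// TypeSize returns the size in bytes of a single quantization block for the
// tensor type.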
func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()

	switch t {
	case TensorTypeF32:
		return 4
	case TensorTypeF16:
		return 2
	case TensorTypeQ4_0:
		return 2 + blockSize/2
	case TensorTypeQ4_1:
		return 2 + 2 + blockSize/2
	case TensorTypeQ5_0:
		return 2 + 4 + blockSize/2
	case TensorTypeQ5_1:
		return 2 + 2 + 4 + blockSize/2
	case TensorTypeQ8_0:
		return 2 + blockSize
	case TensorTypeQ8_1:
		return 2 + 2 + blockSize
	case TensorTypeQ2_K:
		return blockSize/16 + blockSize/4 + 2 + 2
	case TensorTypeQ3_K:
		return blockSize/8 + blockSize/4 + 12 + 2
	case TensorTypeQ4_K:
		return 2 + 2 + 12 + blockSize/2
	case TensorTypeQ5_K:
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case TensorTypeQ6_K:
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case TensorTypeQ8_K:
		return 4 + blockSize + 2*blockSize/16
	case tensorTypeIQ2_XXS:
		return 2 + 2*blockSize/8
	case tensorTypeIQ2_XS:
		return 2 + 2*blockSize/8 + blockSize/32
	case tensorTypeIQ3_XXS:
		return 2 + blockSize/4 + blockSize/8
	case tensorTypeIQ1_S:
		return 2 + blockSize/8 + blockSize/16
	case tensorTypeIQ4_NL:
		return 2 + blockSize/2
	case tensorTypeIQ3_S:
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case tensorTypeIQ2_S:
		return 2 + blockSize/4 + blockSize/16
	case tensorTypeIQ4_XS:
		return 2 + 2 + blockSize/2 + blockSize/64
	case TensorTypeI8:
		return 1
	case TensorTypeI16:
		return 2
	case TensorTypeI32:
		return 4
	case TensorTypeI64:
		return 8
	case TensorTypeF64:
		return 8
	case tensorTypeIQ1_M:
		return blockSize/8 + blockSize/16 + blockSize/32
	case TensorTypeBF16:
		return 2
	case 4, TensorTypeMXFP4:
		return 1 + blockSize/2
	default:
		return 0
	}
}

func (t Tensor) Elements() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n
	}
	return count
}

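// Size returns the tensor size in bytes: element count times bytes per block,
// divided by elements per block.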
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

type container interface {
	Name() string
	Decode(io.ReadSeeker) (model, error)
}

const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constant for `gguf` files (versioned, gguf)
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

var ErrUnsupportedFormat = errors.New("unsupported model format")

func DetectContentType(b []byte) string {
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
		return "gguf"
	default:
		return ""
	}
}

// Decode decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

	var magic uint32
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
		return nil, err
	}

	var c container
	switch magic {
	case FILE_MAGIC_GGUF_LE:
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
	case FILE_MAGIC_GGUF_BE:
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
	default:
		return nil, errors.New("invalid file magic")
	}

	model, err := c.Decode(rs)
	if err != nil {
		return nil, err
	}

	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
		return nil, err
	}

	// final model type
	return &GGML{
		container: c,
		model:     model,
		Length:    offset,
	}, nil
}

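// GraphSize estimates memory requirements for running the model: kv holds the
// per-block KV cache size in bytes for the given context and parallelism,
// while partialOffload and fullOffload estimate the compute graph size for
// partial and full GPU offload respectively.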
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
	context *= uint64(numParallel)

	embedding := f.KV().EmbeddingLength()
	heads := f.KV().HeadCountMax()
	headsArr := f.KV().HeadCount()
	headsKV := f.KV().HeadCountKVMax()
	headsKVArr := f.KV().HeadCountKV()
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)

	embeddingHeads := f.KV().EmbeddingHeadCountMax()
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()

	layers := f.Tensors().GroupLayers()

	bytesPerElement := kvCacheBytesPerElement(kvCacheType)

	// Default for models unless special-cased below. These defaults mirror the
	// cache usage in llama.cpp under the assumption that models without special
	// cases below will use the llamarunner and caching will be handled by the
	// llama.cpp layer.
	//
	// This also assumes that a layer without heads or headsKV set is recurrent
	// which is usually the case. Some models (eg nemotronh) use "blocks" in
	// place of layers where some are MLP blocks that don't have any cache.
	// Models like this will need a special case below to be accurately
	// estimated.
	var kvTotal uint64
	kv = make([]uint64, f.KV().BlockCount())
	kvSizeAttn := uint64(0)
	kvSizeRecurrent := uint64(0)
	for i := range kv {
		headsL := headsArr[i]
		headsKVL := headsKVArr[i]
		if headsL > 0 && headsKVL > 0 {
			// full attention layer
			// NOTE: Assumes uniform values for all attn layers
			kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKVL) * bytesPerElement)
			kvSizeAttn += kv[i]
		} else {
			// recurrent layer
			ssmDConv := f.KV().SSMConvKernel()
			ssmDState := f.KV().SSMStateSize()
			ssmDInner := f.KV().SSMInnerSize()
			ssmNGroups := f.KV().SSMGroupCount()
			nEmbdR := uint64(0)
			if ssmDConv > 0 {
				nEmbdR = (ssmDConv - 1) * (ssmDInner + 2*ssmNGroups*ssmDState)
			}
			nEmbdS := ssmDState * ssmDInner

			// recurrent always uses F32 in llama.cpp backend
			// https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L18644
			bytesPerElementRecurrent := kvCacheBytesPerElement("f32")

			kv[i] = (nEmbdR + nEmbdS) * uint64(bytesPerElementRecurrent)
			kvSizeRecurrent += kv[i]
		}
		kvTotal += kv[i]
	}
	slog.Debug("default cache size estimate", "attention MiB", float32(kvSizeAttn)/(1024.*1024.), "attention bytes", kvSizeAttn, "recurrent MiB", float32(kvSizeRecurrent)/(1024.*1024.), "recurrent bytes", kvSizeRecurrent)

	switch f.KV().Architecture() {
	case "llama", "llama4":
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)

		partialOffload = 4 * batch * embedding
		partialOffload += max(
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)

		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
			ff := uint64(f.KV().Uint("feed_forward_length"))
			partialOffload = max(
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
		for i := range kv {
			if slices.Contains(crossAttentionLayers, int32(i)) {
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
		}

		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
				ropeFreqsCount = ropeFreqsWeights.Elements()
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
	case "gemma", "gemma2", "gemma3", "gemma3n":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)

		if f.KV().Architecture() == "gemma3n" {
			fullOffload *= 4
			partialOffload *= 4
		}

		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
		)
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
	case "gptoss", "gpt-oss":
		kv = make([]uint64, f.KV().BlockCount())
		for i := range kv {
			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
			if i%2 == 0 {
				kv[i] *= (uint64(numParallel)*4096 + batch)
			} else {
				kv[i] *= context
			}
		}

		partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
		if useFlashAttention {
			// rough estimate of graph size with flash attention on
			partialOffload = (4*uint64(numParallel) + context>>10 + 110) * format.MebiByte
		}
	}

	return
}

func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}

	for name, layer := range llm.Tensors().GroupLayers() {
		if name == "v" || strings.HasPrefix(name, "v.") {
			for _, tensor := range layer {
				weights += tensor.Size()
			}
		}
	}

	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}

	numChannels := uint64(llm.KV().Uint("vision.num_channels"))

	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}

	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))

	switch llm.KV().Architecture() {
	case "mllama":
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

		graphSize = 4 * (8 +
			imageSize*imageSize*numChannels*maxNumTiles +
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
	case "gemma3", "mistral3":
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))

		numPatches := maxPixels / (patchSize * patchSize)

		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
			// Patches storage (numPatches * channels * patchSize^2)
			numPatches*numChannels*patchSize*patchSize +
			// Self-attention calculations
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
	}

	return weights, graphSize
}

// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	if cacheType == "" || cacheType == "f16" {
		return true
	}

	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

// SupportsFlashAttention checks if the model supports flash attention
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
	if isEmbedding {
		return false
	}

	if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
		return false
	}

	// Check head counts match and are non-zero
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// FlashAttention checks if the model should enable flash attention
func (f GGML) FlashAttention() bool {
	return slices.Contains([]string{
		"gemma3",
		"gptoss", "gpt-oss",
		"qwen3",
		"qwen3moe",
	}, f.KV().String("general.architecture"))
}

// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
func kvCacheBytesPerElement(cacheType string) float64 {
	switch cacheType {
	case "q8_0":
		return 1 // 1/2 of fp16
	case "q4_0":
		return 0.5 // 1/4 of fp16
	case "f32":
		return 4 // f32 (default for recurrent)
	default:
		return 2 // f16 (default)
	}
}