ggml.go 24.5 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3

import (
Michael Yang's avatar
Michael Yang committed
4
	"cmp"
5
6
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
7
	"fmt"
8
	"io"
Michael Yang's avatar
Michael Yang committed
9
	"log/slog"
10
	"math"
11
	"slices"
Michael Yang's avatar
Michael Yang committed
12
	"strings"
13

14
	"github.com/ollama/ollama/format"
Michael Yang's avatar
Michael Yang committed
15
	"github.com/ollama/ollama/fs/util/bufioutil"
16
17
)

Michael Yang's avatar
Michael Yang committed
18
19
20
type GGML struct {
	container
	model
21
	Length int64
Michael Yang's avatar
Michael Yang committed
22
}
23

Michael Yang's avatar
Michael Yang committed
24
type model interface {
Michael Yang's avatar
Michael Yang committed
25
	KV() KV
Michael Yang's avatar
Michael Yang committed
26
	Tensors() Tensors
27
28
}

29
30
// KV is the key/value metadata section of a model file, keyed by fully
// qualified GGUF key names (e.g. "general.architecture").
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
31
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
32
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
33
34
}

35
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
36
	return kv.String("general.type", "unknown")
37
38
}

Michael Yang's avatar
Michael Yang committed
39
func (kv KV) ParameterCount() uint64 {
40
41
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
Michael Yang's avatar
Michael Yang committed
42
43
}

44
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
45
	if t := kv.Uint("general.file_type"); t > 0 {
46
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
47
48
	}

49
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
50
51
52
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
53
54
55
56
57
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
58
59
}

60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
// HeadCount returns the attention head count for every layer. The metadata
// may store attention.head_count either as a scalar (uniform across layers)
// or as a per-layer array; the result always has BlockCount() elements.
func (kv KV) HeadCount() []uint64 {
	headCountDefault := uint32(1)
	headCount := kv.UintOrArrayValueAsArray("attention.head_count", headCountDefault)
	if len(headCount) == 1 {
		// scalar value: applies uniformly to all layers
		headCountDefault = headCount[0]
	}
	nLayers := int(kv.BlockCount())
	if len(headCount) > nLayers {
		// extra entries are dropped below; warn so mismatched metadata is visible
		slog.Warn("got more elements of attention.head_count than layers", "len(headCount)", len(headCount), "layers", nLayers)
	}
	out := make([]uint64, nLayers)
	for i := range nLayers {
		if i >= len(headCount) {
			// pad missing entries with the default (or the scalar value)
			out[i] = uint64(headCountDefault)
		} else {
			out[i] = uint64(headCount[i])
		}
	}
	return out
}

81
82
func (kv KV) HeadCountMax() uint64 {
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
83
84
}

85
86
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
87
88
}

89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
// HeadCountKV returns the KV (grouped-query) head count for every layer.
// Like HeadCount, attention.head_count_kv may be a scalar or a per-layer
// array; the result always has BlockCount() elements.
func (kv KV) HeadCountKV() []uint64 {
	headCountKVDefault := uint32(1)
	headCountKV := kv.UintOrArrayValueAsArray("attention.head_count_kv", headCountKVDefault)
	if len(headCountKV) == 1 {
		// scalar value: applies uniformly to all layers
		headCountKVDefault = headCountKV[0]
	}
	nLayers := int(kv.BlockCount())
	if len(headCountKV) > nLayers {
		// fix: the warning previously named attention.head_count (copy-paste)
		slog.Warn("got more elements of attention.head_count_kv than layers", "len(headCountKV)", len(headCountKV), "layers", nLayers)
	}
	out := make([]uint64, nLayers)
	for i := range nLayers {
		if i >= len(headCountKV) {
			// pad missing entries with the default (or the scalar value)
			out[i] = uint64(headCountKVDefault)
		} else {
			out[i] = uint64(headCountKV[i])
		}
	}
	return out
}

110
111
112
113
114
115
116
117
118
119
// HeadCountKVMax returns the largest per-layer KV head count (or 1 when unset).
func (kv KV) HeadCountKVMax() uint64 {
	return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
}

// HeadCountKVMin returns the smallest per-layer KV head count (or 1 when unset).
func (kv KV) HeadCountKVMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count_kv", 1))
}

func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
120
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
121
122
123
124
125
126
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
127
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
128
129
130
}

func (kv KV) EmbeddingHeadCountV() uint64 {
131
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
132
133
134
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
135
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
136
137
}

Michael Yang's avatar
Michael Yang committed
138
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
139
140
141
	return kv.String("tokenizer.chat_template")
}

142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
// ssm architecture parameters

// SSMConvKernel returns <arch>.ssm.conv_kernel (state-space conv kernel size).
func (kv KV) SSMConvKernel() uint64 {
	return uint64(kv.Uint("ssm.conv_kernel"))
}

// SSMInnerSize returns <arch>.ssm.inner_size.
func (kv KV) SSMInnerSize() uint64 {
	return uint64(kv.Uint("ssm.inner_size"))
}

// SSMStateSize returns <arch>.ssm.state_size.
func (kv KV) SSMStateSize() uint64 {
	return uint64(kv.Uint("ssm.state_size"))
}

// SSMGroupCount returns <arch>.ssm.group_count.
func (kv KV) SSMGroupCount() uint64 {
	return uint64(kv.Uint("ssm.group_count"))
}

// general types

Michael Yang's avatar
Michael Yang committed
162
func (kv KV) String(key string, defaultValue ...string) string {
163
164
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
Michael Yang's avatar
Michael Yang committed
165
166
167
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
168
169
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
170
171
172
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
173
174
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
175
176
}

177
// Bool returns the bool value for key, or the optional default (false).
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

// UintOrMaxArrayValue returns the maximum of a scalar-or-array uint key.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, max := kv.UintOrArrayValue(key, defaultValue)
	return max
}

// UintOrMinArrayValue returns the minimum of a scalar-or-array uint key.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	min, _ := kv.UintOrArrayValue(key, defaultValue)
	return min
}

// UintOrArrayValue returns the (min, max) of a key that may be stored as a
// scalar or an array. For a scalar, min == max. The underlying array is
// never empty (defaultValue is used as fallback), so Min/Max cannot panic.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	arrVal := kv.UintOrArrayValueAsArray(key, defaultValue)
	return slices.Min(arrVal), slices.Max(arrVal)
}

// UintOrArrayValueAsArray normalizes a key that may be stored as a uint32
// scalar, a uint32 array, or an int32 array into a []uint32. Missing keys
// yield a single-element slice holding defaultValue; the result is never
// empty. Negative int32 entries are converted with wraparound and logged.
func (kv KV) UintOrArrayValueAsArray(key string, defaultValue uint32) []uint32 {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return []uint32{u32}
	} else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
		return u32s.values
	} else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
		dst := make([]uint32, len(i32s.values))
		for i, v := range i32s.values {
			if v < 0 {
				slog.Warn("array values are unexpectedly negative", "key", key, "i", i, "v", v)
			}
			dst[i] = uint32(v)
		}
		return dst
	}

	return []uint32{defaultValue}
}

Michael Yang's avatar
Michael Yang committed
216
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
217
218
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
219
220
}

Michael Yang's avatar
Michael Yang committed
221
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
222
223
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
224
225
}

Michael Yang's avatar
Michael Yang committed
226
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
227
228
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
229
230
}

Patrick Devine's avatar
Patrick Devine committed
231
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
232
233
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
Patrick Devine's avatar
Patrick Devine committed
234
235
}

Michael Yang's avatar
Michael Yang committed
236
237
238
239
240
// Bools returns the []bool value for key, or the optional default (nil).
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	val, _ := keyValue(kv, key, &array[bool]{values: append(defaultValue, []bool(nil))[0]})
	return val.values
}

241
func (kv KV) OllamaEngineRequired() bool {
242
243
	return slices.Contains([]string{
		"gemma3",
244
		"gemma3n",
245
		"mistral3",
246
		"qwen3",
Michael Yang's avatar
llama4  
Michael Yang committed
247
		"llama4",
248
		"mllama",
249
		"qwen25vl",
250
		"gptoss", "gpt-oss",
251
	}, kv.Architecture())
252
253
}

Michael Yang's avatar
Michael Yang committed
254
type valueTypes interface {
Michael Yang's avatar
Michael Yang committed
255
256
257
258
259
260
261
262
263
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
Michael Yang's avatar
Michael Yang committed
264
265
}

266
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
267
268
269
270
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

271
272
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
273
274
	}

275
	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
276
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
277
278
}

279
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
280
	items  []*Tensor
281
	Offset uint64
Michael Yang's avatar
Michael Yang committed
282
}
Michael Yang's avatar
Michael Yang committed
283

Michael Yang's avatar
Michael Yang committed
284
285
286
287
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
288

Michael Yang's avatar
Michael Yang committed
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
309
			}
Michael Yang's avatar
Michael Yang committed
310
		}
311

Michael Yang's avatar
Michael Yang committed
312
313
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
314
315
		}

Michael Yang's avatar
Michael Yang committed
316
317
318
319
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
320
321
322
323
}

// Layer maps a tensor's within-layer suffix (e.g. "attn_q.weight") to the tensor.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
324
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
325
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
326
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
327
328
329
330
331
	}

	return size
}

332
// Tensor describes one tensor in a model file: its name, element type
// (Kind), data offset, shape, and a writer for its raw data.
type Tensor struct {
	Name   string `json:"name"`
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

343
344
// block returns the layer index parsed from a "blk.N." tensor name, or
// math.MaxInt for non-layer tensors so they sort after all layers.
func (t Tensor) block() (n int) {
	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
		return math.MaxInt
	}

	return
}

351
func (t Tensor) blockSize() uint64 {
Michael Yang's avatar
Michael Yang committed
352
	return TensorType(t.Kind).BlockSize()
353
354
355
356
}

func (t TensorType) BlockSize() uint64 {
	switch t {
Michael Yang's avatar
Michael Yang committed
357
	case
358
359
360
361
362
363
364
365
		TensorTypeF32,
		TensorTypeF16,
		TensorTypeI8,
		TensorTypeI16,
		TensorTypeI32,
		TensorTypeI64,
		TensorTypeF64,
		TensorTypeBF16:
366
		return 1
Michael Yang's avatar
Michael Yang committed
367
	case
368
369
370
371
372
373
374
375
		TensorTypeQ4_0,
		TensorTypeQ4_1,
		TensorTypeQ5_0,
		TensorTypeQ5_1,
		TensorTypeQ8_0,
		TensorTypeQ8_1,
		tensorTypeIQ4_NL,
		4, TensorTypeMXFP4:
376
		return 32
Michael Yang's avatar
Michael Yang committed
377
	default:
378
379
380
381
382
		return 256
	}
}

// typeSize returns the byte size of one quantization block for this
// tensor's element type.
func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()
388

389
390
	switch t {
	case TensorTypeF32:
391
		return 4
392
	case TensorTypeF16:
393
		return 2
394
	case TensorTypeQ4_0:
395
		return 2 + blockSize/2
396
	case TensorTypeQ4_1:
397
		return 2 + 2 + blockSize/2
398
	case TensorTypeQ5_0:
399
		return 2 + 4 + blockSize/2
400
	case TensorTypeQ5_1:
401
		return 2 + 2 + 4 + blockSize/2
402
	case TensorTypeQ8_0:
403
		return 2 + blockSize
404
	case TensorTypeQ8_1:
Michael Yang's avatar
Michael Yang committed
405
		return 2 + 2 + blockSize
406
	case TensorTypeQ2_K:
407
		return blockSize/16 + blockSize/4 + 2 + 2
408
	case TensorTypeQ3_K:
409
		return blockSize/8 + blockSize/4 + 12 + 2
410
	case TensorTypeQ4_K:
411
		return 2 + 2 + 12 + blockSize/2
412
	case TensorTypeQ5_K:
413
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
414
	case TensorTypeQ6_K:
415
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
416
	case TensorTypeQ8_K:
Michael Yang's avatar
Michael Yang committed
417
		return 4 + blockSize + 2*blockSize/16
418
	case tensorTypeIQ2_XXS:
419
		return 2 + 2*blockSize/8
420
	case tensorTypeIQ2_XS:
421
		return 2 + 2*blockSize/8 + blockSize/32
422
	case tensorTypeIQ3_XXS:
423
		return 2 + blockSize/4 + blockSize/8
424
	case tensorTypeIQ1_S:
425
		return 2 + blockSize/8 + blockSize/16
426
	case tensorTypeIQ4_NL:
427
		return 2 + blockSize/2
428
	case tensorTypeIQ3_S:
429
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
430
	case tensorTypeIQ2_S:
431
		return 2 + blockSize/4 + blockSize/16
432
	case tensorTypeIQ4_XS:
433
		return 2 + 2 + blockSize/2 + blockSize/64
434
	case TensorTypeI8:
435
		return 1
436
	case TensorTypeI16:
437
		return 2
438
	case TensorTypeI32:
439
		return 4
440
	case TensorTypeI64:
441
		return 8
442
	case TensorTypeF64:
443
		return 8
444
	case tensorTypeIQ1_M:
445
		return blockSize/8 + blockSize/16 + blockSize/32
446
	case TensorTypeBF16:
Michael Yang's avatar
Michael Yang committed
447
		return 2
448
449
	case 4, TensorTypeMXFP4:
		return 1 + blockSize/2
450
451
452
453
454
	default:
		return 0
	}
}

455
// Elements returns the total element count (the product of all dimensions;
// 1 for an empty shape).
func (t Tensor) Elements() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n
	}
	return count
}

Michael Yang's avatar
Michael Yang committed
463
// Size returns the tensor's data size in bytes:
// elements * bytes-per-block / elements-per-block.
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

467
// Type returns the human-readable name of the tensor's element type.
func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

471
472
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
473
	Decode(io.ReadSeeker) (model, error)
474
475
476
}

const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constant for `gguf` files (versioned, gguf)
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
490
491
// ErrUnsupportedFormat is returned when a model file is in a format this
// package cannot decode.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
492
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
493
494
495
496
497
498
499
500
501
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
502
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
503
504
505
506
507
508
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
509
// Decode decodes a GGML model from the given reader.
510
511
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
512
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
513
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
514
515
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

516
	var magic uint32
Michael Yang's avatar
Michael Yang committed
517
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
518
		return nil, err
519
520
521
	}

	var c container
522
523
	switch magic {
	case FILE_MAGIC_GGUF_LE:
524
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
525
	case FILE_MAGIC_GGUF_BE:
526
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
527
	default:
528
		return nil, errors.New("invalid file magic")
529
530
	}

Michael Yang's avatar
Michael Yang committed
531
	model, err := c.Decode(rs)
532
	if err != nil {
533
		return nil, err
534
535
	}

Michael Yang's avatar
Michael Yang committed
536
537
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
538
		return nil, err
Michael Yang's avatar
Michael Yang committed
539
540
	}

541
	// final model type
542
543
544
	return &GGML{
		container: c,
		model:     model,
545
546
		Length:    offset,
	}, nil
547
}
Michael Yang's avatar
Michael Yang committed
548

549
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
Jesse Gross's avatar
Jesse Gross committed
550
551
	context *= uint64(numParallel)

Michael Yang's avatar
Michael Yang committed
552
	embedding := f.KV().EmbeddingLength()
553
	heads := f.KV().HeadCountMax()
554
	headsArr := f.KV().HeadCount()
555
	headsKV := f.KV().HeadCountKVMax()
556
	headsKVArr := f.KV().HeadCountKV()
Michael Yang's avatar
Michael Yang committed
557
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
558

559
	embeddingHeads := f.KV().EmbeddingHeadCountMax()
Michael Yang's avatar
Michael Yang committed
560
561
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
562

Michael Yang's avatar
Michael Yang committed
563
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
564

565
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
566
567
568
569
570
571
572
573
574
575
576

	// Default for models unless special-cased below. These defaults mirror the
	// cache usage in llama.cpp under the assumption that models without special
	// cases below will use the llamarunner and caching will be handled by the
	// llama.cpp layer.
	//
	// This also assumes that a layer without heads or headsKV set is recurrent
	// which is usually the case. Some models (eg nemotronh) use "blocks" in
	// place of layers where some are MLP blocks that don't have any cache.
	// Models like this will need a special case below to be accurately
	// estimated.
Michael Yang's avatar
Michael Yang committed
577
	var kvTotal uint64
578
	kv = make([]uint64, f.KV().BlockCount())
579
580
	kvSizeAttn := uint64(0)
	kvSizeRecurrent := uint64(0)
581
	for i := range kv {
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
		headsL := headsArr[i]
		headsKVL := headsKVArr[i]
		if headsL > 0 && headsKVL > 0 {
			// full attention layer
			// NOTE: Assumes uniform values for all attn layers
			kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKVL) * bytesPerElement)
			kvSizeAttn += kv[i]
		} else {
			// recurrent layer
			ssmDConv := f.KV().SSMConvKernel()
			ssmDState := f.KV().SSMStateSize()
			ssmDInner := f.KV().SSMInnerSize()
			ssmNGroups := f.KV().SSMGroupCount()
			nEmbdR := uint64(0)
			if ssmDConv > 0 {
				nEmbdR = (ssmDConv - 1) * (ssmDInner + 2*ssmNGroups*ssmDState)
			}
			nEmbdS := ssmDState * ssmDInner

			// recurrent always uses F32 in llama.cpp backend
			// https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L18644
			bytesPerElementRecurrent := kvCacheBytesPerElement("f32")

			kv[i] = (nEmbdR + nEmbdS) * uint64(bytesPerElementRecurrent)
			kvSizeRecurrent += kv[i]
		}
Michael Yang's avatar
Michael Yang committed
608
		kvTotal += kv[i]
609
	}
610
	slog.Debug("default cache size estimate", "attention MiB", float32(kvSizeAttn)/(1024.*1024.), "attention bytes", kvSizeAttn, "recurrent MiB", float32(kvSizeRecurrent)/(1024.*1024.), "recurrent bytes", kvSizeRecurrent)
Michael Yang's avatar
Michael Yang committed
611

Michael Yang's avatar
Michael Yang committed
612
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
613
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
614
615
616
617
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
618
619
620

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
621
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
622
623
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
624

Michael Yang's avatar
Michael Yang committed
625
626
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
627
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
628
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
629
630
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
631
632
633
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
634
635
636
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
637
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
638
639
640
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
641
642
643
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
644
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
645
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
646
			if slices.Contains(crossAttentionLayers, int32(i)) {
647
648
649
650
651
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
652
653
		}

Michael Yang's avatar
Michael Yang committed
654
655
656
657
658
659
660
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
661
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
662
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
663
				ropeFreqsCount = ropeFreqsWeights.Elements()
Michael Yang's avatar
Michael Yang committed
664
665
666
667
668
669
670
671
672
673
674
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
675
	case "gemma", "gemma2", "gemma3", "gemma3n":
Michael Yang's avatar
Michael Yang committed
676
677
678
679
680
681
682
683
684
685
686
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
687

688
689
690
691
692
		if f.KV().Architecture() == "gemma3n" {
			fullOffload *= 4
			partialOffload *= 4
		}

693
694
695
696
697
698
699
700
701
702
703
704
705
		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
706
707
708
709
710
711
712
713
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
714
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
731

Michael Yang's avatar
Michael Yang committed
732
733
734
735
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
736
737
738
739
740
741
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
742
743
744
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
745
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
746
747
748
749
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
750
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
751
		)
Michael Yang's avatar
Michael Yang committed
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
778
	case "gptoss", "gpt-oss":
Michael Yang's avatar
Michael Yang committed
779
780
781
782
783
784
785
786
787
		kv = make([]uint64, f.KV().BlockCount())
		for i := range kv {
			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
			if i%2 == 0 {
				kv[i] *= (uint64(numParallel)*4096 + batch)
			} else {
				kv[i] *= context
			}
		}
788

789
		partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
790
791
792
793
		if useFlashAttention {
			// rough estimate of graph size with flash attention on
			partialOffload = (4*uint64(numParallel) + context>>10 + 110) * format.MebiByte
		}
Michael Yang's avatar
Michael Yang committed
794
795
	}

Michael Yang's avatar
Michael Yang committed
796
	return
Michael Yang's avatar
Michael Yang committed
797
}
798

799
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
800
801
802
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
803

Michael Yang's avatar
Michael Yang committed
804
	for name, layer := range llm.Tensors().GroupLayers() {
805
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
806
807
			for _, tensor := range layer {
				weights += tensor.Size()
808
809
			}
		}
Michael Yang's avatar
Michael Yang committed
810
	}
811

Michael Yang's avatar
Michael Yang committed
812
813
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
814
815
816
817
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
818

Michael Yang's avatar
Michael Yang committed
819
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
820

Michael Yang's avatar
Michael Yang committed
821
822
823
824
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
825

Michael Yang's avatar
Michael Yang committed
826
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
827
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
828

Michael Yang's avatar
Michael Yang committed
829
830
	switch llm.KV().Architecture() {
	case "mllama":
831
832
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
833
834
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

835
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
836
			imageSize*imageSize*numChannels*maxNumTiles +
837
838
839
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
840
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
841
842
843
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
844
845
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
846
847
848

		numPatches := maxPixels / (patchSize * patchSize)

849
850
851
		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
852
853
854
			// Patches storage (numPatches * channels * patchSize^2)
			numPatches*numChannels*patchSize*patchSize +
			// Self-attention calculations
855
856
857
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
Michael Yang's avatar
memory  
Michael Yang committed
858
859
860
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
861
	}
Michael Yang's avatar
Michael Yang committed
862

863
864
865
	return weights, graphSize
}

866
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
867
func (f GGML) SupportsKVCacheType(cacheType string) bool {
868
869
870
871
	if cacheType == "" || cacheType == "f16" {
		return true
	}

872
873
	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
		// gpt-oss uses attention with sinks which does not support quantized cache types
874
875
		slog.Warn("model only supports non-quantized cache types", "model", arch)
		return false
876
	}
877
	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
878
879
880
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
881
882
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
883
884
885
886
	if isEmbedding {
		return false
	}

887
888
889
890
	if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
		return false
	}

891
	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
892
893
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
894
895
896
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

897
898
899
900
901
902
903
// FlashAttention checks if the model should enable flash attention
func (f GGML) FlashAttention() bool {
	// Architectures for which flash attention is enabled by default.
	switch f.KV().String("general.architecture") {
	case "gptoss", "gpt-oss":
		return true
	}
	return false
}

904
905
906
907
908
909
910
// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		return 1 // 1/2 of fp16
	}
	if cacheType == "q4_0" {
		return 0.5 // 1/4 of fp16
	}
	if cacheType == "f32" {
		return 4 // f32 (default for recurrent)
	}
	return 2 // f16 (default)
}