ggml.go 22.7 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3

import (
Michael Yang's avatar
Michael Yang committed
4
	"cmp"
5
6
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
7
	"fmt"
8
	"io"
Michael Yang's avatar
Michael Yang committed
9
	"log/slog"
10
	"math"
11
	"slices"
Michael Yang's avatar
Michael Yang committed
12
	"strings"
13

14
	"github.com/ollama/ollama/format"
Michael Yang's avatar
Michael Yang committed
15
	"github.com/ollama/ollama/fs/util/bufioutil"
16
	"github.com/ollama/ollama/ml"
17
18
)

Michael Yang's avatar
Michael Yang committed
19
20
21
// GGML is a decoded model file: the container it was read from, the decoded
// model contents, and the reader position at which decoding finished.
type GGML struct {
	container
	model
	// Length is the reader offset reported after the model was decoded.
	Length int64
}
24

Michael Yang's avatar
Michael Yang committed
25
// model exposes the decoded contents of a model file: its key/value metadata
// and its tensor descriptions.
type model interface {
	KV() KV
	Tensors() Tensors
}

30
31
// KV holds model metadata as a map from key name to a dynamically typed value.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
32
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
33
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
34
35
}

36
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
37
	return kv.String("general.type", "unknown")
38
39
}

Michael Yang's avatar
Michael Yang committed
40
func (kv KV) ParameterCount() uint64 {
41
42
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
Michael Yang's avatar
Michael Yang committed
43
44
}

45
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
46
	if t := kv.Uint("general.file_type"); t > 0 {
47
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
48
49
	}

50
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
51
52
53
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
54
55
56
57
58
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
59
60
}

61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
// HeadCount returns one attention head count per block.
//
// "attention.head_count" may be stored as a scalar or as an array. A single
// value is applied to every block; with an array, blocks beyond its end get
// the default of 1, and a warning is logged when the array is longer than the
// block count.
func (kv KV) HeadCount() []uint64 {
	fill := uint32(1)
	perLayer := kv.UintOrArrayValueAsArray("attention.head_count", fill)
	if len(perLayer) == 1 {
		// A lone value is the model-wide head count.
		fill = perLayer[0]
	}

	nLayers := int(kv.BlockCount())
	if len(perLayer) > nLayers {
		slog.Warn("got more elements of attention.head_count than layers", "len(headCount)", len(perLayer), "layers", nLayers)
	}

	out := make([]uint64, nLayers)
	for i := range out {
		if i < len(perLayer) {
			out[i] = uint64(perLayer[i])
			continue
		}
		out[i] = uint64(fill)
	}
	return out
}

82
83
func (kv KV) HeadCountMax() uint64 {
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
84
85
}

86
87
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
88
89
}

90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
// HeadCountKV returns one key/value head count per block.
//
// "attention.head_count_kv" may be stored as a scalar or as an array. A
// single value is applied to every block; with an array, blocks beyond its
// end get the default of 1, and a warning is logged when the array is longer
// than the block count.
func (kv KV) HeadCountKV() []uint64 {
	headCountKVDefault := uint32(1)
	headCountKV := kv.UintOrArrayValueAsArray("attention.head_count_kv", headCountKVDefault)
	if len(headCountKV) == 1 {
		// A lone value is the model-wide KV head count.
		headCountKVDefault = headCountKV[0]
	}

	nLayers := int(kv.BlockCount())
	if len(headCountKV) > nLayers {
		// Name the key that was actually read so the warning is not confused
		// with the one emitted for attention.head_count.
		slog.Warn("got more elements of attention.head_count_kv than layers", "len(headCountKV)", len(headCountKV), "layers", nLayers)
	}

	out := make([]uint64, nLayers)
	for i := range out {
		if i >= len(headCountKV) {
			out[i] = uint64(headCountKVDefault)
		} else {
			out[i] = uint64(headCountKV[i])
		}
	}
	return out
}

111
112
113
114
115
116
117
118
119
120
// HeadCountKVMax reports the largest "attention.head_count_kv" value
// (default 1).
func (kv KV) HeadCountKVMax() uint64 {
	largest := kv.UintOrMaxArrayValue("attention.head_count_kv", 1)
	return uint64(largest)
}

// HeadCountKVMin reports the smallest "attention.head_count_kv" value
// (default 1).
func (kv KV) HeadCountKVMin() uint64 {
	smallest := kv.UintOrMinArrayValue("attention.head_count_kv", 1)
	return uint64(smallest)
}

func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
121
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
122
123
124
125
126
127
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
128
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
129
130
131
}

func (kv KV) EmbeddingHeadCountV() uint64 {
132
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
133
134
135
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
136
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
137
138
}

Michael Yang's avatar
Michael Yang committed
139
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
140
141
142
	return kv.String("tokenizer.chat_template")
}

143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
// ssm architecture parameters

// SSMConvKernel reports the architecture-scoped "ssm.conv_kernel" value
// (0 when unset).
func (kv KV) SSMConvKernel() uint64 {
	kernel := kv.Uint("ssm.conv_kernel")
	return uint64(kernel)
}

// SSMInnerSize reports the architecture-scoped "ssm.inner_size" value
// (0 when unset).
func (kv KV) SSMInnerSize() uint64 {
	inner := kv.Uint("ssm.inner_size")
	return uint64(inner)
}

// SSMStateSize reports the architecture-scoped "ssm.state_size" value
// (0 when unset).
func (kv KV) SSMStateSize() uint64 {
	state := kv.Uint("ssm.state_size")
	return uint64(state)
}

// SSMGroupCount reports the architecture-scoped "ssm.group_count" value
// (0 when unset).
func (kv KV) SSMGroupCount() uint64 {
	groups := kv.Uint("ssm.group_count")
	return uint64(groups)
}

// general types

Michael Yang's avatar
Michael Yang committed
163
func (kv KV) String(key string, defaultValue ...string) string {
164
165
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
Michael Yang's avatar
Michael Yang committed
166
167
168
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
169
170
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
171
172
173
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
174
175
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
176
177
}

178
// Bool returns the bool stored under key. When the key is missing or not a
// bool it returns the first supplied default, or false if none was given.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Appending the zero value guarantees keyValue always has a fallback.
	candidates := append(defaultValue, false)
	b, _ := keyValue(kv, key, candidates...)
	return b
}

// UintOrMaxArrayValue returns the largest value stored under key, which may
// be a scalar or an array; defaultValue is used when the key is unset.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, hi := kv.UintOrArrayValue(key, defaultValue)
	return hi
}

// UintOrMinArrayValue returns the smallest value stored under key, which may
// be a scalar or an array; defaultValue is used when the key is unset.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	lo, _ := kv.UintOrArrayValue(key, defaultValue)
	return lo
}

// UintOrArrayValue returns the minimum and maximum (in that order) of the
// values stored under key, which may be a scalar or an array.
//
// defaultValue is returned for both results when the key is unset, and also
// when the stored array holds no values: slices.Min and slices.Max panic on
// an empty slice, so that case is handled explicitly.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	arrVal := kv.UintOrArrayValueAsArray(key, defaultValue)
	if len(arrVal) == 0 {
		return defaultValue, defaultValue
	}

	return slices.Min(arrVal), slices.Max(arrVal)
}

// UintOrArrayValueAsArray returns the value stored under key as a []uint32.
//
// The lookup is tried, in order, as a uint32 scalar (wrapped in a one-element
// slice), as a uint32 array, and as an int32 array whose elements are
// converted to uint32 (negative elements are logged but still converted).
// When none match, a one-element slice holding defaultValue is returned.
func (kv KV) UintOrArrayValueAsArray(key string, defaultValue uint32) []uint32 {
	if scalar, ok := keyValue(kv, key, uint32(0)); ok {
		return []uint32{scalar}
	}

	if unsigned, ok := keyValue(kv, key, &array[uint32]{}); ok {
		return unsigned.values
	}

	signed, ok := keyValue(kv, key, &array[int32]{})
	if !ok {
		return []uint32{defaultValue}
	}

	converted := make([]uint32, len(signed.values))
	for i, v := range signed.values {
		if v < 0 {
			slog.Warn("array values are unexpectedly negative", "key", key, "i", i, "v", v)
		}
		converted[i] = uint32(v)
	}
	return converted
}

Michael Yang's avatar
Michael Yang committed
217
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
218
219
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
220
221
}

Michael Yang's avatar
Michael Yang committed
222
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
223
224
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
225
226
}

Michael Yang's avatar
Michael Yang committed
227
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
228
229
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
230
231
}

Patrick Devine's avatar
Patrick Devine committed
232
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
233
234
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
Patrick Devine's avatar
Patrick Devine committed
235
236
}

Michael Yang's avatar
Michael Yang committed
237
238
239
240
241
// Bools returns the bool array stored under key, or the first supplied
// default (nil if none was given) when the key is missing or has another type.
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	fallback := append(defaultValue, []bool(nil))[0]
	arr, _ := keyValue(kv, key, &array[bool]{values: fallback})
	return arr.values
}

242
func (kv KV) OllamaEngineRequired() bool {
243
	return slices.Contains([]string{
244
245
246
		"bert",
		"deepseek2",
		"deepseekocr",
247
		"gemma3",
248
		"gemma3n",
249
		"gptoss", "gpt-oss",
Michael Yang's avatar
llama4  
Michael Yang committed
250
		"llama4",
251
		"mistral3",
252
		"mllama",
253
254
		"nomic-bert",
		"olmo3",
255
		"qwen25vl",
256
257
		"qwen3", "qwen3moe",
		"qwen3vl", "qwen3vlmoe",
258
	}, kv.Architecture())
259
260
}

Michael Yang's avatar
Michael Yang committed
261
// valueTypes is the type constraint listing the scalar types that keyValue
// can return.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

// arrayValueTypes is the type constraint listing the array pointer types that
// keyValue can return, one per scalar element type.
type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
}

273
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
274
275
276
277
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

278
279
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
280
281
	}

282
	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
283
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
284
285
}

286
// Tensors is the collection of tensor descriptions read from a model file.
type Tensors struct {
	// items holds the tensors; use Items to read them.
	items []*Tensor
	// Offset is a byte offset associated with the tensor data.
	// NOTE(review): presumably where the tensor data section starts in the
	// file; confirm against the decoder.
	Offset uint64
}
Michael Yang's avatar
Michael Yang committed
290

Michael Yang's avatar
Michael Yang committed
291
292
293
294
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
295

Michael Yang's avatar
Michael Yang committed
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
316
			}
Michael Yang's avatar
Michael Yang committed
317
		}
318

Michael Yang's avatar
Michael Yang committed
319
320
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
321
322
		}

Michael Yang's avatar
Michael Yang committed
323
324
325
326
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
327
328
329
330
}

// Layer maps a tensor's name within a layer to the tensor itself.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
331
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
332
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
333
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
334
335
336
337
338
	}

	return size
}

339
// Tensor describes a single tensor of a model file.
type Tensor struct {
	// Name is the tensor's dot-separated name, e.g. "blk.0.attn_q.weight".
	Name string `json:"name"`
	// Kind is the numeric tensor type; it is interpreted via TensorType(Kind).
	Kind uint32 `json:"kind"`
	// Offset is excluded from JSON output.
	// NOTE(review): presumably the tensor's byte offset within the file's
	// tensor data; confirm against the decoder.
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

350
351
// block parses the block number out of a name of the form "blk.<n>.…". Names
// that do not match that pattern yield math.MaxInt.
func (t Tensor) block() (n int) {
	var index int
	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &index); err != nil {
		return math.MaxInt
	}

	return index
}

358
func (t Tensor) blockSize() uint64 {
Michael Yang's avatar
Michael Yang committed
359
	return TensorType(t.Kind).BlockSize()
360
361
362
363
}

// BlockSize returns the number of elements that make up one block of tensor
// type t: 1 for the plain float and integer types, 32 for the quantized types
// listed in the second case, and 256 for every other type.
func (t TensorType) BlockSize() uint64 {
	switch t {
	case
		TensorTypeF32,
		TensorTypeF16,
		TensorTypeI8,
		TensorTypeI16,
		TensorTypeI32,
		TensorTypeI64,
		TensorTypeF64,
		TensorTypeBF16:
		return 1
	case
		TensorTypeQ4_0,
		TensorTypeQ4_1,
		TensorTypeQ5_0,
		TensorTypeQ5_1,
		TensorTypeQ8_0,
		TensorTypeQ8_1,
		tensorTypeIQ4_NL,
		// NOTE(review): the raw type id 4 is handled like MXFP4 here and in
		// TypeSize; presumably a retired id reused for MXFP4 data - confirm.
		4, TensorTypeMXFP4:
		return 32
	default:
		return 256
	}
}

// typeSize returns the per-block byte size of the tensor's type.
func (t Tensor) typeSize() uint64 {
	kind := TensorType(t.Kind)
	return kind.TypeSize()
}

// TypeSize returns the number of bytes occupied by one block of tensor type
// t, where a block holds t.BlockSize() elements. For the plain float and
// integer types (block size 1) this is the size of a single element. Unknown
// types yield 0.
func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()

	switch t {
	case TensorTypeF32:
		return 4
	case TensorTypeF16:
		return 2
	case TensorTypeQ4_0:
		return 2 + blockSize/2
	case TensorTypeQ4_1:
		return 2 + 2 + blockSize/2
	case TensorTypeQ5_0:
		return 2 + 4 + blockSize/2
	case TensorTypeQ5_1:
		return 2 + 2 + 4 + blockSize/2
	case TensorTypeQ8_0:
		return 2 + blockSize
	case TensorTypeQ8_1:
		return 2 + 2 + blockSize
	case TensorTypeQ2_K:
		return blockSize/16 + blockSize/4 + 2 + 2
	case TensorTypeQ3_K:
		return blockSize/8 + blockSize/4 + 12 + 2
	case TensorTypeQ4_K:
		return 2 + 2 + 12 + blockSize/2
	case TensorTypeQ5_K:
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case TensorTypeQ6_K:
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case TensorTypeQ8_K:
		return 4 + blockSize + 2*blockSize/16
	case tensorTypeIQ2_XXS:
		return 2 + 2*blockSize/8
	case tensorTypeIQ2_XS:
		return 2 + 2*blockSize/8 + blockSize/32
	case tensorTypeIQ3_XXS:
		return 2 + blockSize/4 + blockSize/8
	case tensorTypeIQ1_S:
		return 2 + blockSize/8 + blockSize/16
	case tensorTypeIQ4_NL:
		return 2 + blockSize/2
	case tensorTypeIQ3_S:
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case tensorTypeIQ2_S:
		return 2 + blockSize/4 + blockSize/16
	case tensorTypeIQ4_XS:
		return 2 + 2 + blockSize/2 + blockSize/64
	case TensorTypeI8:
		return 1
	case TensorTypeI16:
		return 2
	case TensorTypeI32:
		return 4
	case TensorTypeI64:
		return 8
	case TensorTypeF64:
		return 8
	case tensorTypeIQ1_M:
		return blockSize/8 + blockSize/16 + blockSize/32
	case TensorTypeBF16:
		return 2
	// The raw type id 4 is sized exactly like MXFP4, mirroring BlockSize.
	case 4, TensorTypeMXFP4:
		return 1 + blockSize/2
	default:
		return 0
	}
}

462
// Elements returns the product of all dimensions in Shape. A tensor with an
// empty shape has 1 element.
func (t Tensor) Elements() uint64 {
	total := uint64(1)
	for i := range t.Shape {
		total *= t.Shape[i]
	}
	return total
}

Michael Yang's avatar
Michael Yang committed
470
// Size returns the tensor's size in bytes: the element count multiplied by
// the per-block byte size and divided by the number of elements per block.
func (t Tensor) Size() uint64 {
	elements := t.Elements()
	return elements * t.typeSize() / t.blockSize()
}

474
// Type returns the string form of the tensor's type.
func (t Tensor) Type() string {
	kind := TensorType(t.Kind)
	return kind.String()
}

478
479
// container is a model file format that can name itself and decode a model
// from a seekable reader.
type container interface {
	Name() string
	Decode(io.ReadSeeker) (model, error)
}

const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constant for `gguf` files (versioned, gguf)
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

// ErrUnsupportedFormat is the sentinel error for model files in a format that
// is not supported.
var ErrUnsupportedFormat = errors.New("unsupported model format")

// DetectContentType inspects the first four bytes of b, read as a
// little-endian uint32, and returns the name of the matching model format:
// "ggml", "ggmf", "ggjt", "ggla" or "gguf". It returns "" when the magic
// number is not recognised or when b holds fewer than four bytes.
func DetectContentType(b []byte) string {
	// A magic number needs four bytes; slicing a shorter input would panic.
	if len(b) < 4 {
		return ""
	}

	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
516
// Decode decodes a GGML model from the given reader.
517
518
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
519
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
520
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
521
522
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

523
	var magic uint32
Michael Yang's avatar
Michael Yang committed
524
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
525
		return nil, err
526
527
528
	}

	var c container
529
530
	switch magic {
	case FILE_MAGIC_GGUF_LE:
531
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
532
	case FILE_MAGIC_GGUF_BE:
533
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
534
	default:
535
		return nil, errors.New("invalid file magic")
536
537
	}

Michael Yang's avatar
Michael Yang committed
538
	model, err := c.Decode(rs)
539
	if err != nil {
540
		return nil, err
541
542
	}

Michael Yang's avatar
Michael Yang committed
543
544
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
545
		return nil, err
Michael Yang's avatar
Michael Yang committed
546
547
	}

548
	// final model type
549
550
551
	return &GGML{
		container: c,
		model:     model,
552
553
		Length:    offset,
	}, nil
554
}
Michael Yang's avatar
Michael Yang committed
555

556
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) {
Jesse Gross's avatar
Jesse Gross committed
557
558
	context *= uint64(numParallel)

Michael Yang's avatar
Michael Yang committed
559
	embedding := f.KV().EmbeddingLength()
560
	heads := f.KV().HeadCountMax()
561
	headsArr := f.KV().HeadCount()
562
	headsKV := f.KV().HeadCountKVMax()
563
	headsKVArr := f.KV().HeadCountKV()
Michael Yang's avatar
Michael Yang committed
564
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
565

566
	embeddingHeads := f.KV().EmbeddingHeadCountMax()
Michael Yang's avatar
Michael Yang committed
567
568
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
569

Michael Yang's avatar
Michael Yang committed
570
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
571

572
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
573
574
575
576
577
578
579
580
581
582
583

	// Default for models unless special-cased below. These defaults mirror the
	// cache usage in llama.cpp under the assumption that models without special
	// cases below will use the llamarunner and caching will be handled by the
	// llama.cpp layer.
	//
	// This also assumes that a layer without heads or headsKV set is recurrent
	// which is usually the case. Some models (eg nemotronh) use "blocks" in
	// place of layers where some are MLP blocks that don't have any cache.
	// Models like this will need a special case below to be accurately
	// estimated.
Michael Yang's avatar
Michael Yang committed
584
	var kvTotal uint64
585
	kv = make([]uint64, f.KV().BlockCount())
586
587
	kvSizeAttn := uint64(0)
	kvSizeRecurrent := uint64(0)
588
	for i := range kv {
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
		headsL := headsArr[i]
		headsKVL := headsKVArr[i]
		if headsL > 0 && headsKVL > 0 {
			// full attention layer
			// NOTE: Assumes uniform values for all attn layers
			kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKVL) * bytesPerElement)
			kvSizeAttn += kv[i]
		} else {
			// recurrent layer
			ssmDConv := f.KV().SSMConvKernel()
			ssmDState := f.KV().SSMStateSize()
			ssmDInner := f.KV().SSMInnerSize()
			ssmNGroups := f.KV().SSMGroupCount()
			nEmbdR := uint64(0)
			if ssmDConv > 0 {
				nEmbdR = (ssmDConv - 1) * (ssmDInner + 2*ssmNGroups*ssmDState)
			}
			nEmbdS := ssmDState * ssmDInner

			// recurrent always uses F32 in llama.cpp backend
			// https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L18644
			bytesPerElementRecurrent := kvCacheBytesPerElement("f32")

			kv[i] = (nEmbdR + nEmbdS) * uint64(bytesPerElementRecurrent)
			kvSizeRecurrent += kv[i]
		}
Michael Yang's avatar
Michael Yang committed
615
		kvTotal += kv[i]
616
	}
617
	slog.Debug("default cache size estimate", "attention MiB", float32(kvSizeAttn)/(1024.*1024.), "attention bytes", kvSizeAttn, "recurrent MiB", float32(kvSizeRecurrent)/(1024.*1024.), "recurrent bytes", kvSizeRecurrent)
Michael Yang's avatar
Michael Yang committed
618

Michael Yang's avatar
Michael Yang committed
619
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
620
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
621
622
623
624
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
625
626
627

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
628
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
629
630
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
631

Michael Yang's avatar
Michael Yang committed
632
633
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
634
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
635
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
636
637
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
638
639
640
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
641
642
643
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
644
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
645
646
647
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
648
649
650
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
651
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
652
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
653
			if slices.Contains(crossAttentionLayers, int32(i)) {
654
655
656
657
658
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
659
660
		}

Michael Yang's avatar
Michael Yang committed
661
662
663
664
665
666
667
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
668
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
669
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
670
				ropeFreqsCount = ropeFreqsWeights.Elements()
Michael Yang's avatar
Michael Yang committed
671
672
673
674
675
676
677
678
679
680
681
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
682
	case "gemma", "gemma2", "gemma3", "gemma3n":
Michael Yang's avatar
Michael Yang committed
683
684
685
686
687
688
689
690
691
692
693
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
694

695
696
697
698
699
		if f.KV().Architecture() == "gemma3n" {
			fullOffload *= 4
			partialOffload *= 4
		}

700
701
702
703
704
705
706
707
708
709
710
711
712
		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
713
714
715
716
717
718
719
720
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
721
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
738

Michael Yang's avatar
Michael Yang committed
739
740
741
742
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
743
744
745
746
747
748
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
749
750
751
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
752
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
753
754
755
756
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
757
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
758
		)
Michael Yang's avatar
Michael Yang committed
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
785
	case "gptoss", "gpt-oss":
Michael Yang's avatar
Michael Yang committed
786
787
788
789
790
791
792
793
794
		kv = make([]uint64, f.KV().BlockCount())
		for i := range kv {
			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
			if i%2 == 0 {
				kv[i] *= (uint64(numParallel)*4096 + batch)
			} else {
				kv[i] *= context
			}
		}
795

796
		partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
797
		if useFlashAttention == ml.FlashAttentionEnabled {
798
799
800
			// rough estimate of graph size with flash attention on
			partialOffload = (4*uint64(numParallel) + context>>10 + 110) * format.MebiByte
		}
Michael Yang's avatar
Michael Yang committed
801
802
	}

Michael Yang's avatar
Michael Yang committed
803
	return
Michael Yang's avatar
Michael Yang committed
804
}
805
806

// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
807
func (f GGML) SupportsKVCacheType(cacheType string) bool {
808
809
810
811
812
	if cacheType == "" || cacheType == "f16" {
		return true
	}

	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
813
814
}

815
816
817
818
819
820
821
822
// KVCacheTypeIsQuantized checks if the requested cache type is a quantized
// type. The empty string, "f16", "f32" and "bf16" are not quantized; every
// other value is reported as quantized.
func (f GGML) KVCacheTypeIsQuantized(cacheType string) bool {
	switch cacheType {
	case "", "f16", "f32", "bf16":
		return false
	default:
		return true
	}
}

823
// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
824
825
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
826
827
828
829
	if isEmbedding {
		return false
	}

830
831
832
833
	if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
		return false
	}

834
	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
835
836
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
837
838
839
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

840
841
842
// FlashAttention checks if the model should enable flash attention
func (f GGML) FlashAttention() bool {
	return slices.Contains([]string{
843
		"bert",
844
		"gemma3",
845
		"gptoss", "gpt-oss",
846
		"mistral3",
847
		"olmo3",
848
849
		"qwen3", "qwen3moe",
		"qwen3vl", "qwen3vlmoe",
850
851
852
	}, f.KV().String("general.architecture"))
}

853
854
855
856
857
858
859
// kvCacheBytesPerElement returns the number of bytes per element for a given
// KV cache type. Unrecognised types, including the empty string, are sized as
// f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		return 1 // 1/2 of fp16
	}
	if cacheType == "q4_0" {
		return 0.5 // 1/4 of fp16
	}
	if cacheType == "f32" {
		return 4 // f32 (default for recurrent)
	}
	return 2 // f16 (default)
}