package ggml

import (
	"cmp"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"iter"
	"log/slog"
	"maps"
	"math"
	"slices"
	"strings"

	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs/util/bufioutil"
	"github.com/ollama/ollama/ml"
)

type GGML struct {
	container
	model
	Length int64
}

type model interface {
	KV() KV
	Tensors() Tensors
}

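// KV holds the key/value metadata pairs decoded from a GGUF file, keyed by
// the fully qualified GGUF key (for example "general.architecture").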
type KV map[string]any

func (kv KV) Architecture() string {
	return kv.String("general.architecture", "unknown")
}

func (kv KV) Kind() string {
	return kv.String("general.type", "unknown")
}

func (kv KV) ParameterCount() uint64 {
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
}

func (kv KV) FileType() FileType {
	if t := kv.Uint("general.file_type"); t > 0 {
		return FileType(t)
	}

	return FileTypeUnknown
}

func (kv KV) BlockCount() uint64 {
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
}

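// HeadCount returns the attention head count for every block. The
// "attention.head_count" key may hold a single value or a per-layer array; a
// scalar is broadcast to all blocks and any missing entries fall back to the
// default.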
func (kv KV) HeadCount() []uint64 {
	headCountDefault := uint32(1)
	headCount := kv.UintOrArrayValueAsArray("attention.head_count", headCountDefault)
	if len(headCount) == 1 {
		headCountDefault = headCount[0]
	}
	nLayers := int(kv.BlockCount())
	if len(headCount) > nLayers {
		slog.Warn("got more elements of attention.head_count than layers", "len(headCount)", len(headCount), "layers", nLayers)
	}
	out := make([]uint64, nLayers)
	for i := range nLayers {
		if i >= len(headCount) {
			out[i] = uint64(headCountDefault)
		} else {
			out[i] = uint64(headCount[i])
		}
	}
	return out
}

func (kv KV) HeadCountMax() uint64 {
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
}

func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
}

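// HeadCountKV returns the key/value head count for every block, expanding a
// scalar "attention.head_count_kv" value the same way HeadCount expands
// "attention.head_count".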
func (kv KV) HeadCountKV() []uint64 {
	headCountKVDefault := uint32(1)
	headCountKV := kv.UintOrArrayValueAsArray("attention.head_count_kv", headCountKVDefault)
	if len(headCountKV) == 1 {
		headCountKVDefault = headCountKV[0]
	}
	nLayers := int(kv.BlockCount())
	if len(headCountKV) > nLayers {
		slog.Warn("got more elements of attention.head_count_kv than layers", "len(headCountKV)", len(headCountKV), "layers", nLayers)
	}
	out := make([]uint64, nLayers)
	for i := range nLayers {
		if i >= len(headCountKV) {
			out[i] = uint64(headCountKVDefault)
		} else {
			out[i] = uint64(headCountKV[i])
		}
	}
	return out
}

func (kv KV) HeadCountKVMax() uint64 {
	return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
}

func (kv KV) HeadCountKVMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count_kv", 1))
}

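// EmbeddingHeadCountMax returns the largest per-head embedding size,
// computed as the embedding length divided by the smallest head count.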
func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
		return kv.EmbeddingLength() / heads
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
}

func (kv KV) EmbeddingHeadCountV() uint64 {
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
}

func (kv KV) ContextLength() uint64 {
	return uint64(kv.Uint("context_length"))
}

func (kv KV) ChatTemplate() string {
	return kv.String("tokenizer.chat_template")
}

// ssm architecture parameters

func (kv KV) SSMConvKernel() uint64 {
	return uint64(kv.Uint("ssm.conv_kernel"))
}

func (kv KV) SSMInnerSize() uint64 {
	return uint64(kv.Uint("ssm.inner_size"))
}

func (kv KV) SSMStateSize() uint64 {
	return uint64(kv.Uint("ssm.state_size"))
}

func (kv KV) SSMGroupCount() uint64 {
	return uint64(kv.Uint("ssm.group_count"))
}

// general types

func (kv KV) String(key string, defaultValue ...string) string {
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
}

func (kv KV) Bool(key string, defaultValue ...bool) bool {
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, max := kv.UintOrArrayValue(key, defaultValue)
	return max
}

func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	min, _ := kv.UintOrArrayValue(key, defaultValue)
	return min
}

func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	arrVal := kv.UintOrArrayValueAsArray(key, defaultValue)
	return slices.Min(arrVal), slices.Max(arrVal)
}

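// UintOrArrayValueAsArray reads a key stored either as a single uint32, an
// array of uint32, or an array of int32 and normalizes it to a []uint32. A
// missing key yields a single-element slice holding defaultValue; for
// example, a scalar head count of 32 is returned as []uint32{32}.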
func (kv KV) UintOrArrayValueAsArray(key string, defaultValue uint32) []uint32 {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return []uint32{u32}
	} else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
		return u32s.values
	} else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
		dst := make([]uint32, len(i32s.values))
		for i, v := range i32s.values {
			if v < 0 {
				slog.Warn("array values are unexpectedly negative", "key", key, "i", i, "v", v)
			}
			dst[i] = uint32(v)
		}
		return dst
	}

	return []uint32{defaultValue}
}

func (kv KV) Strings(key string, defaultValue ...[]string) []string {
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
}

func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
}

func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
}

func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
}

func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	val, _ := keyValue(kv, key, &array[bool]{values: append(defaultValue, []bool(nil))[0]})
	return val.values
}

func (kv KV) Len() int {
	return len(kv)
}

func (kv KV) Keys() iter.Seq[string] {
	return maps.Keys(kv)
}

func (kv KV) Value(key string) any {
	return kv[key]
}

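// OllamaEngineRequired reports whether the model's architecture must run on
// the Ollama engine rather than the llama.cpp runner.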
func (kv KV) OllamaEngineRequired() bool {
	return slices.Contains([]string{
		"bert",
		"deepseek2",
		"deepseekocr",
		"gemma3",
		"gemma3n",
		"gptoss", "gpt-oss",
		"llama4",
		"mistral3",
		"mllama",
		"nomic-bert",
		"olmo3",
		"qwen25vl",
		"qwen3", "qwen3moe",
		"qwen3vl", "qwen3vlmoe",
	}, kv.Architecture())
}

type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
}

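// keyValue looks up key in kv as type T and reports whether a value of that
// type was present. Keys outside the "general." and "tokenizer." namespaces
// are prefixed with the model architecture, so kv.Uint("block_count")
// resolves to "<architecture>.block_count".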
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

	if val, ok := kv[key].(T); ok {
		return val, true
	}

	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
	return defaultValue[0], false
}

type Tensors struct {
	items  []*Tensor
	Offset uint64
}

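// Items returns all tensors, or only those whose names start with the given
// prefix when one is supplied.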
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}

	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

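// GroupLayers groups tensors by layer, so "blk.0.attn_q.weight" is stored as
// layers["blk.0"]["attn_q.weight"]. Tensors without a "blk" or "mm" prefix
// are grouped under the first element of their name.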
func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
			}
		}

		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
		}

		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
}

type Layer map[string]*Tensor

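// Size returns the combined size in bytes of all tensors in the layer.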
func (l Layer) Size() (size uint64) {
	for _, t := range l {
		size += t.Size()
	}

	return size
}

type Tensor struct {
	Name   string `json:"name"`
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

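// block returns the layer index parsed from a "blk.N." tensor name, or
// math.MaxInt for tensors that do not belong to a numbered block.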
func (t Tensor) block() (n int) {
	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
		return math.MaxInt
	}

	return
}

func (t Tensor) blockSize() uint64 {
	return TensorType(t.Kind).BlockSize()
}

func (t TensorType) BlockSize() uint64 {
	switch t {
	case
		TensorTypeF32,
		TensorTypeF16,
		TensorTypeI8,
		TensorTypeI16,
		TensorTypeI32,
		TensorTypeI64,
		TensorTypeF64,
		TensorTypeBF16:
		return 1
	case
		TensorTypeQ4_0,
		TensorTypeQ4_1,
		TensorTypeQ5_0,
		TensorTypeQ5_1,
		TensorTypeQ8_0,
		TensorTypeQ8_1,
		tensorTypeIQ4_NL,
		4, TensorTypeMXFP4:
		return 32
	default:
		return 256
	}
}

func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

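// TypeSize returns the size in bytes of one block of the tensor type. For a
// quantized type this covers BlockSize() elements; Q4_0, for instance, packs
// 32 elements into 18 bytes (a 2-byte scale plus 16 bytes of 4-bit weights).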
func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()

	switch t {
	case TensorTypeF32:
		return 4
	case TensorTypeF16:
		return 2
	case TensorTypeQ4_0:
		return 2 + blockSize/2
	case TensorTypeQ4_1:
		return 2 + 2 + blockSize/2
	case TensorTypeQ5_0:
		return 2 + 4 + blockSize/2
	case TensorTypeQ5_1:
		return 2 + 2 + 4 + blockSize/2
	case TensorTypeQ8_0:
		return 2 + blockSize
	case TensorTypeQ8_1:
		return 2 + 2 + blockSize
	case TensorTypeQ2_K:
		return blockSize/16 + blockSize/4 + 2 + 2
	case TensorTypeQ3_K:
		return blockSize/8 + blockSize/4 + 12 + 2
	case TensorTypeQ4_K:
		return 2 + 2 + 12 + blockSize/2
	case TensorTypeQ5_K:
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case TensorTypeQ6_K:
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case TensorTypeQ8_K:
		return 4 + blockSize + 2*blockSize/16
	case tensorTypeIQ2_XXS:
		return 2 + 2*blockSize/8
	case tensorTypeIQ2_XS:
		return 2 + 2*blockSize/8 + blockSize/32
	case tensorTypeIQ3_XXS:
		return 2 + blockSize/4 + blockSize/8
	case tensorTypeIQ1_S:
		return 2 + blockSize/8 + blockSize/16
	case tensorTypeIQ4_NL:
		return 2 + blockSize/2
	case tensorTypeIQ3_S:
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case tensorTypeIQ2_S:
		return 2 + blockSize/4 + blockSize/16
	case tensorTypeIQ4_XS:
		return 2 + 2 + blockSize/2 + blockSize/64
	case TensorTypeI8:
		return 1
	case TensorTypeI16:
		return 2
	case TensorTypeI32:
		return 4
	case TensorTypeI64:
		return 8
	case TensorTypeF64:
		return 8
	case tensorTypeIQ1_M:
		return blockSize/8 + blockSize/16 + blockSize/32
	case TensorTypeBF16:
		return 2
	case 4, TensorTypeMXFP4:
		return 1 + blockSize/2
	default:
		return 0
	}
}

func (t Tensor) Elements() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n
	}
	return count
}

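// Size returns the tensor's size in bytes: the element count multiplied by
// the bytes per block and divided by the elements per block. As an
// illustration, a 4096x4096 Q4_0 tensor occupies 4096*4096*18/32 = 9,437,184
// bytes.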
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

type container interface {
	Name() string
	Decode(io.ReadSeeker) (model, error)
}

const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constant for `gguf` files (versioned, gguf)
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

var ErrUnsupportedFormat = errors.New("unsupported model format")

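// DetectContentType inspects the leading magic bytes of b and returns the
// container name ("ggml", "ggmf", "ggjt", "ggla" or "gguf"), or an empty
// string if the magic is not recognized.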
func DetectContentType(b []byte) string {
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
		return "gguf"
	default:
		return ""
	}
}

// Decode decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

	var magic uint32
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
		return nil, err
	}

	var c container
	switch magic {
	case FILE_MAGIC_GGUF_LE:
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
	case FILE_MAGIC_GGUF_BE:
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
	default:
		return nil, errors.New("invalid file magic")
	}

	model, err := c.Decode(rs)
	if err != nil {
		return nil, err
	}

	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
		return nil, err
	}

	// final model type
	return &GGML{
		container: c,
		model:     model,
		Length:    offset,
	}, nil
}

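// GraphSize estimates memory use for the model: the per-layer KV cache sizes
// for the given context, batch, and cache type, plus rough compute graph
// sizes for partial and full offload. The architecture-specific cases below
// refine these defaults.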
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) {
	context *= uint64(numParallel)

	embedding := f.KV().EmbeddingLength()
	heads := f.KV().HeadCountMax()
	headsArr := f.KV().HeadCount()
	headsKV := f.KV().HeadCountKVMax()
	headsKVArr := f.KV().HeadCountKV()
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)

	embeddingHeads := f.KV().EmbeddingHeadCountMax()
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()

	layers := f.Tensors().GroupLayers()

	bytesPerElement := kvCacheBytesPerElement(kvCacheType)

	// Default for models unless special-cased below. These defaults mirror the
	// cache usage in llama.cpp under the assumption that models without special
	// cases below will use the llamarunner and caching will be handled by the
	// llama.cpp layer.
	//
	// This also assumes that a layer without heads or headsKV set is recurrent
	// which is usually the case. Some models (eg nemotronh) use "blocks" in
	// place of layers where some are MLP blocks that don't have any cache.
	// Models like this will need a special case below to be accurately
	// estimated.
	var kvTotal uint64
	kv = make([]uint64, f.KV().BlockCount())
	kvSizeAttn := uint64(0)
	kvSizeRecurrent := uint64(0)
	for i := range kv {
		headsL := headsArr[i]
		headsKVL := headsKVArr[i]
		if headsL > 0 && headsKVL > 0 {
			// full attention layer
			// NOTE: Assumes uniform values for all attn layers
			kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKVL) * bytesPerElement)
			kvSizeAttn += kv[i]
		} else {
			// recurrent layer
			ssmDConv := f.KV().SSMConvKernel()
			ssmDState := f.KV().SSMStateSize()
			ssmDInner := f.KV().SSMInnerSize()
			ssmNGroups := f.KV().SSMGroupCount()
			nEmbdR := uint64(0)
			if ssmDConv > 0 {
				nEmbdR = (ssmDConv - 1) * (ssmDInner + 2*ssmNGroups*ssmDState)
			}
			nEmbdS := ssmDState * ssmDInner

			// recurrent always uses F32 in llama.cpp backend
			// https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L18644
			bytesPerElementRecurrent := kvCacheBytesPerElement("f32")

			kv[i] = (nEmbdR + nEmbdS) * uint64(bytesPerElementRecurrent)
			kvSizeRecurrent += kv[i]
		}
		kvTotal += kv[i]
	}
	slog.Debug("default cache size estimate", "attention MiB", float32(kvSizeAttn)/(1024.*1024.), "attention bytes", kvSizeAttn, "recurrent MiB", float32(kvSizeRecurrent)/(1024.*1024.), "recurrent bytes", kvSizeRecurrent)

	switch f.KV().Architecture() {
	case "llama", "llama4":
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)

		partialOffload = 4 * batch * embedding
		partialOffload += max(
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)

		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
			ff := uint64(f.KV().Uint("feed_forward_length"))
			partialOffload = max(
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
		for i := range kv {
			if slices.Contains(crossAttentionLayers, int32(i)) {
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
		}

		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
				ropeFreqsCount = ropeFreqsWeights.Elements()
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
	case "gemma", "gemma2", "gemma3", "gemma3n":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)

		if f.KV().Architecture() == "gemma3n" {
			fullOffload *= 4
			partialOffload *= 4
		}

		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
		)
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
	case "gptoss", "gpt-oss":
		kv = make([]uint64, f.KV().BlockCount())
		for i := range kv {
			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
			if i%2 == 0 {
				kv[i] *= (uint64(numParallel)*4096 + batch)
			} else {
				kv[i] *= context
			}
		}

		partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
		if useFlashAttention == ml.FlashAttentionEnabled {
			// rough estimate of graph size with flash attention on
			partialOffload = (4*uint64(numParallel) + context>>10 + 110) * format.MebiByte
		}
	}

	return
}

// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	if cacheType == "" || cacheType == "f16" {
		return true
	}

	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

// KVCacheTypeIsQuantized checks if the requested cache type is a quantized type
func (f GGML) KVCacheTypeIsQuantized(cacheType string) bool {
	if cacheType == "" || cacheType == "f16" || cacheType == "f32" || cacheType == "bf16" {
		return false
	}
	return true
}

// SupportsFlashAttention checks if the model supports flash attention
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
	if isEmbedding {
		return false
	}

	if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
		return false
	}

	// Check head counts match and are non-zero
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// FlashAttention checks if the model should enable flash attention
func (f GGML) FlashAttention() bool {
	return slices.Contains([]string{
		"bert",
		"gemma3",
		"gptoss", "gpt-oss",
		"mistral3",
		"olmo3",
		"qwen3", "qwen3moe",
		"qwen3vl", "qwen3vlmoe",
	}, f.KV().String("general.architecture"))
}

// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
func kvCacheBytesPerElement(cacheType string) float64 {
	switch cacheType {
	case "q8_0":
		return 1 // 1/2 of fp16
	case "q4_0":
		return 0.5 // 1/4 of fp16
	case "f32":
		return 4 // f32 (default for recurrent)
	default:
		return 2 // f16 (default)
	}
}