ggml.go 19.8 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3
4
5

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
Michael Yang's avatar
Michael Yang committed
8
	"log/slog"
9
	"slices"
Michael Yang's avatar
Michael Yang committed
10
	"strings"
11

Michael Yang's avatar
Michael Yang committed
12
	"github.com/ollama/ollama/fs/util/bufioutil"
13
14
)

Michael Yang's avatar
Michael Yang committed
15
16
17
type GGML struct {
	container
	model
18
	Length int64
Michael Yang's avatar
Michael Yang committed
19
}
20

Michael Yang's avatar
Michael Yang committed
21
type model interface {
Michael Yang's avatar
Michael Yang committed
22
	KV() KV
Michael Yang's avatar
Michael Yang committed
23
	Tensors() Tensors
24
25
}

26
27
// KV holds model metadata values keyed by metadata key name. The typed
// accessors below look values up by key and fall back to a default when the
// key is absent or holds a value of a different type.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
28
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
29
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
30
31
}

32
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
33
	return kv.String("general.type", "unknown")
34
35
}

Michael Yang's avatar
Michael Yang committed
36
func (kv KV) ParameterCount() uint64 {
37
38
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
Michael Yang's avatar
Michael Yang committed
39
40
}

41
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
42
	if t := kv.Uint("general.file_type"); t > 0 {
43
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
44
45
	}

46
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
47
48
49
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
50
51
52
53
54
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
55
56
}

57
58
59
60
61
func (kv KV) HeadCountMax() uint64 {
	// TODO(drifkin): using the max value can cause an overestimation. In the
	// future if array values become more popular, we can adapt the more invasive
	// <https://github.com/ollama/ollama/pull/10225>
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
62
63
}

64
65
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
66
67
}

68
69
70
71
72
73
74
75
76
77
// HeadCountKVMax returns "attention.head_count_kv". When the key holds an
// array, the largest element is returned; when the key is absent the result
// is 1.
func (kv KV) HeadCountKVMax() uint64 {
	_, largest := kv.UintOrArrayValue("attention.head_count_kv", 1)
	return uint64(largest)
}

// HeadCountKVMin returns "attention.head_count_kv". When the key holds an
// array, the smallest element is returned; when the key is absent the result
// is 1.
func (kv KV) HeadCountKVMin() uint64 {
	smallest, _ := kv.UintOrArrayValue("attention.head_count_kv", 1)
	return uint64(smallest)
}

func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
78
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
79
80
81
82
83
84
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
85
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
86
87
88
}

func (kv KV) EmbeddingHeadCountV() uint64 {
89
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
90
91
92
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
93
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
94
95
}

Michael Yang's avatar
Michael Yang committed
96
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
97
98
99
100
	return kv.String("tokenizer.chat_template")
}

func (kv KV) String(key string, defaultValue ...string) string {
101
102
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
Michael Yang's avatar
Michael Yang committed
103
104
105
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
106
107
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
108
109
110
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
111
112
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
113
114
}

115
// Bool returns the bool stored under key. When the key is absent or holds a
// different type, the first defaultValue is returned, or false if none was
// given.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Appending the zero value guarantees keyValue always has a fallback.
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

// UintOrMaxArrayValue returns the upper value reported by UintOrArrayValue
// for key: the scalar itself, the largest array element, or defaultValue.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, largest := kv.UintOrArrayValue(key, defaultValue)
	return largest
}

// UintOrMinArrayValue returns the lower value reported by UintOrArrayValue
// for key: the scalar itself, the smallest array element, or defaultValue.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	smallest, _ := kv.UintOrArrayValue(key, defaultValue)
	return smallest
}

// UintOrArrayValue returns the (min, max) of the value stored under key.
//
// A scalar uint32 is returned as both min and max. A uint32 or int32 array
// yields its smallest and largest elements; negative int32 elements are
// logged and then converted to uint32 as-is. When the key is absent, holds
// another type, or holds an array with no values, defaultValue is returned
// for both results.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return u32, u32
	}

	// slices.Min and slices.Max panic on an empty slice, so an array without
	// any values is treated the same as a missing key.
	if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok && len(u32s.values) > 0 {
		return slices.Min(u32s.values), slices.Max(u32s.values)
	}

	if i32s, ok := keyValue(kv, key, &array[int32]{}); ok && len(i32s.values) > 0 {
		lo, hi := slices.Min(i32s.values), slices.Max(i32s.values)
		if lo < 0 || hi < 0 {
			slog.Warn("array values are unexpectedly negative", "key", key, "min", lo, "max", hi)
		}
		return uint32(lo), uint32(hi)
	}

	return defaultValue, defaultValue
}

Michael Yang's avatar
Michael Yang committed
149
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
150
151
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
152
153
}

Michael Yang's avatar
Michael Yang committed
154
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
155
156
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
157
158
}

Michael Yang's avatar
Michael Yang committed
159
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
160
161
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
162
163
}

Patrick Devine's avatar
Patrick Devine committed
164
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
165
166
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
Patrick Devine's avatar
Patrick Devine committed
167
168
}

Michael Yang's avatar
Michael Yang committed
169
170
171
172
173
// Bools returns the bool array stored under key. When the key is absent or
// holds a different type, the first defaultValue is returned, or nil if none
// was given.
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	var fallback []bool
	if len(defaultValue) > 0 {
		fallback = defaultValue[0]
	}

	found, _ := keyValue(kv, key, &array[bool]{values: fallback})
	return found.values
}

174
func (kv KV) OllamaEngineRequired() bool {
175
176
	return slices.Contains([]string{
		"gemma3",
177
		"gemma3n",
178
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
179
		"llama4",
180
		"mllama",
181
		"qwen25vl",
Michael Yang's avatar
Michael Yang committed
182
		"gptoss",
183
	}, kv.Architecture())
184
185
}

Michael Yang's avatar
Michael Yang committed
186
// valueTypes constrains keyValue to the scalar types a metadata value may
// hold.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
Michael Yang's avatar
Michael Yang committed
196
197
}

198
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
199
200
201
202
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

203
204
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
205
206
	}

207
	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
208
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
209
210
}

211
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
212
	items  []*Tensor
213
	Offset uint64
Michael Yang's avatar
Michael Yang committed
214
}
Michael Yang's avatar
Michael Yang committed
215

Michael Yang's avatar
Michael Yang committed
216
217
218
219
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
220

Michael Yang's avatar
Michael Yang committed
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
241
			}
Michael Yang's avatar
Michael Yang committed
242
		}
243

Michael Yang's avatar
Michael Yang committed
244
245
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
246
247
		}

Michael Yang's avatar
Michael Yang committed
248
249
250
251
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
252
253
254
255
}

// Layer maps a tensor's name within a layer (the part of the tensor name
// after the layer prefix, as produced by GroupLayers) to the tensor.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
256
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
257
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
258
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
259
260
261
262
263
	}

	return size
}

264
// Tensor describes a single tensor of a model.
type Tensor struct {
	// Name is the tensor's name, e.g. "blk.0.attn_q.weight".
	Name string `json:"name"`
	// Kind is the numeric tensor type; it is interpreted as a TensorType.
	Kind uint32 `json:"kind"`
	// Offset is omitted from JSON.
	// NOTE(review): presumably the tensor's data offset; confirm against the
	// decoder.
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

275
276
277
278
279
280
281
282
// block returns the block number parsed from a tensor name of the form
// "blk.<n>.…", or -1 when the name does not match that form.
func (t Tensor) block() (n int) {
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &n)
	if err != nil {
		return -1
	}

	return n
}

283
// blockSize returns the block size of the tensor's type.
func (t Tensor) blockSize() uint64 {
	return (TensorType)(t.Kind).BlockSize()
}

func (t TensorType) BlockSize() uint64 {
	switch t {
Michael Yang's avatar
Michael Yang committed
289
290
291
292
293
294
295
296
297
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
298
		return 1
Michael Yang's avatar
Michael Yang committed
299
300
301
302
303
304
305
306
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
307
		return 32
Michael Yang's avatar
Michael Yang committed
308
	default:
309
310
311
312
313
		return 256
	}
}

// typeSize returns the size in bytes of one block of the tensor's type.
func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()
319

320
321
	switch t {
	case TensorTypeF32:
322
		return 4
323
	case TensorTypeF16:
324
		return 2
325
	case TensorTypeQ4_0:
326
		return 2 + blockSize/2
327
	case TensorTypeQ4_1:
328
		return 2 + 2 + blockSize/2
329
	case TensorTypeQ5_0:
330
		return 2 + 4 + blockSize/2
331
	case TensorTypeQ5_1:
332
		return 2 + 2 + 4 + blockSize/2
333
	case TensorTypeQ8_0:
334
		return 2 + blockSize
335
	case TensorTypeQ8_1:
Michael Yang's avatar
Michael Yang committed
336
		return 2 + 2 + blockSize
337
	case TensorTypeQ2_K:
338
		return blockSize/16 + blockSize/4 + 2 + 2
339
	case TensorTypeQ3_K:
340
		return blockSize/8 + blockSize/4 + 12 + 2
341
	case TensorTypeQ4_K:
342
		return 2 + 2 + 12 + blockSize/2
343
	case TensorTypeQ5_K:
344
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
345
	case TensorTypeQ6_K:
346
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
347
	case TensorTypeQ8_K:
Michael Yang's avatar
Michael Yang committed
348
		return 4 + blockSize + 2*blockSize/16
349
	case tensorTypeIQ2_XXS:
350
		return 2 + 2*blockSize/8
351
	case tensorTypeIQ2_XS:
352
		return 2 + 2*blockSize/8 + blockSize/32
353
	case tensorTypeIQ3_XXS:
354
		return 2 + blockSize/4 + blockSize/8
355
	case tensorTypeIQ1_S:
356
		return 2 + blockSize/8 + blockSize/16
357
	case tensorTypeIQ4_NL:
358
		return 2 + blockSize/2
359
	case tensorTypeIQ3_S:
360
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
361
	case tensorTypeIQ2_S:
362
		return 2 + blockSize/4 + blockSize/16
363
	case tensorTypeIQ4_XS:
364
		return 2 + 2 + blockSize/2 + blockSize/64
365
	case TensorTypeI8:
366
		return 1
367
	case TensorTypeI16:
368
		return 2
369
	case TensorTypeI32:
370
		return 4
371
	case TensorTypeI64:
372
		return 8
373
	case TensorTypeF64:
374
		return 8
375
	case tensorTypeIQ1_M:
376
		return blockSize/8 + blockSize/16 + blockSize/32
377
	case TensorTypeBF16:
Michael Yang's avatar
Michael Yang committed
378
		return 2
379
380
381
382
383
	default:
		return 0
	}
}

384
// Elements returns the total number of elements in the tensor: the product of
// all dimensions in Shape. An empty Shape yields 1.
func (t Tensor) Elements() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n
	}
	return count
}

Michael Yang's avatar
Michael Yang committed
392
// Size returns the tensor's size in bytes: the element count multiplied by
// the per-block byte size and divided by the number of elements per block.
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

396
// Type returns the string form of the tensor's type.
func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

400
401
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
402
	Decode(io.ReadSeeker) (model, error)
403
404
405
}

// File magic numbers, as they read when the first four bytes of a file are
// decoded as a little-endian uint32.
const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constants for `gguf` files (versioned, gguf), for little-endian
	// and big-endian files respectively.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
419
420
// ErrUnsupportedFormat is a sentinel error for model files in a format that
// is not supported. It is not returned by any code in this file.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
421
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
422
423
424
425
426
427
428
429
430
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
431
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
432
433
434
435
436
437
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
438
// Decode decodes a GGML model from the given reader.
439
440
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
441
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
442
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
443
444
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

445
	var magic uint32
Michael Yang's avatar
Michael Yang committed
446
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
447
		return nil, err
448
449
450
	}

	var c container
451
452
	switch magic {
	case FILE_MAGIC_GGUF_LE:
453
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
454
	case FILE_MAGIC_GGUF_BE:
455
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
456
	default:
457
		return nil, errors.New("invalid file magic")
458
459
	}

Michael Yang's avatar
Michael Yang committed
460
	model, err := c.Decode(rs)
461
	if err != nil {
462
		return nil, err
463
464
	}

Michael Yang's avatar
Michael Yang committed
465
466
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
467
		return nil, err
Michael Yang's avatar
Michael Yang committed
468
469
	}

470
	// final model type
471
472
473
	return &GGML{
		container: c,
		model:     model,
474
475
		Length:    offset,
	}, nil
476
}
Michael Yang's avatar
Michael Yang committed
477

478
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
Michael Yang's avatar
Michael Yang committed
479
	embedding := f.KV().EmbeddingLength()
480
481
	heads := f.KV().HeadCountMax()
	headsKV := f.KV().HeadCountKVMax()
Michael Yang's avatar
Michael Yang committed
482
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
483

484
	embeddingHeads := f.KV().EmbeddingHeadCountMax()
Michael Yang's avatar
Michael Yang committed
485
486
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
487

Michael Yang's avatar
Michael Yang committed
488
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
489

490
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
491
492
493
494
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	}
Michael Yang's avatar
Michael Yang committed
495

Michael Yang's avatar
Michael Yang committed
496
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
497
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
498
499
500
501
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
502
503
504

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
505
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
506
507
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
508

Michael Yang's avatar
Michael Yang committed
509
510
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
511
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
512
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
513
514
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
515
516
517
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
518
519
520
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
521
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
522
523
524
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
525
526
527
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
528
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
529
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
530
			if slices.Contains(crossAttentionLayers, int32(i)) {
531
532
533
534
535
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
536
537
		}

Michael Yang's avatar
Michael Yang committed
538
539
540
541
542
543
544
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
545
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
546
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
547
				ropeFreqsCount = ropeFreqsWeights.Elements()
Michael Yang's avatar
Michael Yang committed
548
549
550
551
552
553
554
555
556
557
558
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
559
	case "gemma", "gemma2", "gemma3", "gemma3n":
Michael Yang's avatar
Michael Yang committed
560
561
562
563
564
565
566
567
568
569
570
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
571

572
573
574
575
576
		if f.KV().Architecture() == "gemma3n" {
			fullOffload *= 4
			partialOffload *= 4
		}

577
578
579
580
581
582
583
584
585
586
587
588
589
		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
590
591
592
593
594
595
596
597
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
598
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
615

Michael Yang's avatar
Michael Yang committed
616
617
618
619
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
620
621
622
623
624
625
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
626
627
628
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
629
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
630
631
632
633
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
634
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
635
		)
Michael Yang's avatar
Michael Yang committed
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
662
663
	}

Michael Yang's avatar
Michael Yang committed
664
	return
Michael Yang's avatar
Michael Yang committed
665
}
666

667
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
668
669
670
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
671

Michael Yang's avatar
Michael Yang committed
672
	for name, layer := range llm.Tensors().GroupLayers() {
673
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
674
675
			for _, tensor := range layer {
				weights += tensor.Size()
676
677
			}
		}
Michael Yang's avatar
Michael Yang committed
678
	}
679

Michael Yang's avatar
Michael Yang committed
680
681
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
682
683
684
685
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
686

Michael Yang's avatar
Michael Yang committed
687
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
688

Michael Yang's avatar
Michael Yang committed
689
690
691
692
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
693

Michael Yang's avatar
Michael Yang committed
694
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
695
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
696

Michael Yang's avatar
Michael Yang committed
697
698
	switch llm.KV().Architecture() {
	case "mllama":
699
700
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
701
702
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

703
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
704
			imageSize*imageSize*numChannels*maxNumTiles +
705
706
707
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
708
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
709
710
711
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
712
713
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
714
715
716

		numPatches := maxPixels / (patchSize * patchSize)

717
718
719
		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
720
721
722
			// Patches storage (numPatches * channels * patchSize^2)
			numPatches*numChannels*patchSize*patchSize +
			// Self-attention calculations
723
724
725
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
Michael Yang's avatar
memory  
Michael Yang committed
726
727
728
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
729
	}
Michael Yang's avatar
Michael Yang committed
730

731
732
733
	return weights, graphSize
}

734
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
735
736
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
737
738
739
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
740
741
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
742
743
744
745
746
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
747
748
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
749
750
751
752
753
754
755
756
757
758
759
760
761
762
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement returns the number of bytes per element for a given
// KV cache type. Any type other than "q8_0" or "q4_0" is sized as f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		return 1 // 1/2 of fp16
	}
	if cacheType == "q4_0" {
		return 0.5 // 1/4 of fp16
	}

	return 2 // f16 (default)
}