ggml.go 19.5 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3
4
5

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
Michael Yang's avatar
Michael Yang committed
8
	"log/slog"
9
	"slices"
Michael Yang's avatar
Michael Yang committed
10
	"strings"
11

Michael Yang's avatar
Michael Yang committed
12
	"github.com/ollama/ollama/fs/util/bufioutil"
13
14
)

Michael Yang's avatar
Michael Yang committed
15
16
17
type GGML struct {
	container
	model
18
	Length int64
Michael Yang's avatar
Michael Yang committed
19
}
20

Michael Yang's avatar
Michael Yang committed
21
type model interface {
Michael Yang's avatar
Michael Yang committed
22
	KV() KV
Michael Yang's avatar
Michael Yang committed
23
	Tensors() Tensors
24
25
}

26
27
// KV holds model metadata keyed by name. Values are stored untyped; the typed
// accessor methods return a default when a key is missing or has another type.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
28
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
29
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
30
31
}

32
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
33
	return kv.String("general.type", "unknown")
34
35
}

Michael Yang's avatar
Michael Yang committed
36
func (kv KV) ParameterCount() uint64 {
37
38
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
Michael Yang's avatar
Michael Yang committed
39
40
}

41
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
42
	if t := kv.Uint("general.file_type"); t > 0 {
43
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
44
45
	}

46
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
47
48
49
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
50
51
52
53
54
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
55
56
}

57
58
59
60
61
func (kv KV) HeadCountMax() uint64 {
	// TODO(drifkin): using the max value can cause an overestimation. In the
	// future if array values become more popular, we can adapt the more invasive
	// <https://github.com/ollama/ollama/pull/10225>
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
62
63
}

64
65
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
66
67
}

68
69
70
71
72
73
74
75
76
77
// HeadCountKVMax returns "attention.head_count_kv" when it is a scalar, the
// largest element when it is an array, and 1 when it is absent.
func (kv KV) HeadCountKVMax() uint64 {
	const key = "attention.head_count_kv"
	largest := kv.UintOrMaxArrayValue(key, 1)
	return uint64(largest)
}

// HeadCountKVMin returns "attention.head_count_kv" when it is a scalar, the
// smallest element when it is an array, and 1 when it is absent.
func (kv KV) HeadCountKVMin() uint64 {
	const key = "attention.head_count_kv"
	smallest := kv.UintOrMinArrayValue(key, 1)
	return uint64(smallest)
}

func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
78
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
79
80
81
82
83
84
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
85
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
86
87
88
}

func (kv KV) EmbeddingHeadCountV() uint64 {
89
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
90
91
92
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
93
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
94
95
}

Michael Yang's avatar
Michael Yang committed
96
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
97
98
99
100
	return kv.String("tokenizer.chat_template")
}

func (kv KV) String(key string, defaultValue ...string) string {
101
102
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
Michael Yang's avatar
Michael Yang committed
103
104
105
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
106
107
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
108
109
110
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
111
112
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
113
114
}

115
// Bool returns the bool stored under key. When the key is missing or is not a
// bool, the first defaultValue is returned, or false if none was given.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Appending false guarantees keyValue always receives a fallback.
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

// UintOrMaxArrayValue returns the upper bound reported by UintOrArrayValue:
// the scalar itself, the largest array element, or defaultValue.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, largest := kv.UintOrArrayValue(key, defaultValue)
	return largest
}

// UintOrMinArrayValue returns the lower bound reported by UintOrArrayValue:
// the scalar itself, the smallest array element, or defaultValue.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	smallest, _ := kv.UintOrArrayValue(key, defaultValue)
	return smallest
}

// UintOrArrayValue returns the (minimum, maximum) of the value stored under
// key. A uint32 scalar is returned as both bounds. A uint32 or int32 array
// yields its smallest and largest elements. If the key is missing, has
// another type, or is an array with no elements, defaultValue is returned
// for both bounds.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return u32, u32
	}

	// slices.Min and slices.Max panic on an empty slice, so an array without
	// elements falls through to the default.
	if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok && u32s != nil && len(u32s.values) > 0 {
		return slices.Min(u32s.values), slices.Max(u32s.values)
	}

	if i32s, ok := keyValue(kv, key, &array[int32]{}); ok && i32s != nil && len(i32s.values) > 0 {
		lo := slices.Min(i32s.values)
		hi := slices.Max(i32s.values)
		if lo < 0 || hi < 0 {
			// Negative values wrap when converted below; warn but keep going.
			slog.Warn("array values are unexpectedly negative", "key", key, "min", lo, "max", hi)
		}
		return uint32(lo), uint32(hi)
	}

	return defaultValue, defaultValue
}

Michael Yang's avatar
Michael Yang committed
149
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
150
151
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
152
153
}

Michael Yang's avatar
Michael Yang committed
154
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
155
156
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
157
158
}

Michael Yang's avatar
Michael Yang committed
159
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
160
161
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
162
163
}

Patrick Devine's avatar
Patrick Devine committed
164
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
165
166
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
Patrick Devine's avatar
Patrick Devine committed
167
168
}

169
func (kv KV) OllamaEngineRequired() bool {
170
171
172
	return slices.Contains([]string{
		"gemma3",
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
173
		"llama4",
174
		"mllama",
175
		"qwen25vl",
176
	}, kv.Architecture())
177
178
}

Michael Yang's avatar
Michael Yang committed
179
// valueTypes is the set of scalar types keyValue can extract from a KV.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
Michael Yang's avatar
Michael Yang committed
189
190
}

191
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
192
193
194
195
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

196
197
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
198
199
	}

200
	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
201
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
202
203
}

204
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
205
	items  []*Tensor
206
	Offset uint64
Michael Yang's avatar
Michael Yang committed
207
}
Michael Yang's avatar
Michael Yang committed
208

Michael Yang's avatar
Michael Yang committed
209
210
211
212
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
213

Michael Yang's avatar
Michael Yang committed
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
234
			}
Michael Yang's avatar
Michael Yang committed
235
		}
236

Michael Yang's avatar
Michael Yang committed
237
238
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
239
240
		}

Michael Yang's avatar
Michael Yang committed
241
242
243
244
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
245
246
247
248
}

// Layer maps a tensor's name within a layer (the part of the full name that
// follows the layer prefix) to the tensor itself.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
249
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
250
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
251
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
252
253
254
255
256
	}

	return size
}

257
// Tensor describes one tensor in a model file. Offset and the embedded
// io.WriterTo are excluded from JSON output.
type Tensor struct {
	Name   string `json:"name"`
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

268
269
270
271
272
273
274
275
// block extracts the block index from a name of the form "blk.<n>.…".
// It returns -1 when the name does not match that pattern.
func (t Tensor) block() (n int) {
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &n)
	if err != nil {
		return -1
	}

	return n
}

276
// blockSize returns the BlockSize of the tensor's Kind interpreted as a
// TensorType.
func (t Tensor) blockSize() uint64 {
	return (TensorType)(t.Kind).BlockSize()
}

func (t TensorType) BlockSize() uint64 {
	switch t {
Michael Yang's avatar
Michael Yang committed
282
283
284
285
286
287
288
289
290
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
291
		return 1
Michael Yang's avatar
Michael Yang committed
292
293
294
295
296
297
298
299
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
300
		return 32
Michael Yang's avatar
Michael Yang committed
301
	default:
302
303
304
305
306
		return 256
	}
}

// typeSize returns the TypeSize of the tensor's Kind interpreted as a
// TensorType.
func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()
312

313
314
	switch t {
	case TensorTypeF32:
315
		return 4
316
	case TensorTypeF16:
317
		return 2
318
	case TensorTypeQ4_0:
319
		return 2 + blockSize/2
320
	case TensorTypeQ4_1:
321
		return 2 + 2 + blockSize/2
322
	case TensorTypeQ5_0:
323
		return 2 + 4 + blockSize/2
324
	case TensorTypeQ5_1:
325
		return 2 + 2 + 4 + blockSize/2
326
	case TensorTypeQ8_0:
327
		return 2 + blockSize
328
	case TensorTypeQ8_1:
Michael Yang's avatar
Michael Yang committed
329
		return 2 + 2 + blockSize
330
	case TensorTypeQ2_K:
331
		return blockSize/16 + blockSize/4 + 2 + 2
332
	case TensorTypeQ3_K:
333
		return blockSize/8 + blockSize/4 + 12 + 2
334
	case TensorTypeQ4_K:
335
		return 2 + 2 + 12 + blockSize/2
336
	case TensorTypeQ5_K:
337
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
338
	case TensorTypeQ6_K:
339
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
340
	case TensorTypeQ8_K:
Michael Yang's avatar
Michael Yang committed
341
		return 4 + blockSize + 2*blockSize/16
342
	case tensorTypeIQ2_XXS:
343
		return 2 + 2*blockSize/8
344
	case tensorTypeIQ2_XS:
345
		return 2 + 2*blockSize/8 + blockSize/32
346
	case tensorTypeIQ3_XXS:
347
		return 2 + blockSize/4 + blockSize/8
348
	case tensorTypeIQ1_S:
349
		return 2 + blockSize/8 + blockSize/16
350
	case tensorTypeIQ4_NL:
351
		return 2 + blockSize/2
352
	case tensorTypeIQ3_S:
353
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
354
	case tensorTypeIQ2_S:
355
		return 2 + blockSize/4 + blockSize/16
356
	case tensorTypeIQ4_XS:
357
		return 2 + 2 + blockSize/2 + blockSize/64
358
	case TensorTypeI8:
359
		return 1
360
	case TensorTypeI16:
361
		return 2
362
	case TensorTypeI32:
363
		return 4
364
	case TensorTypeI64:
365
		return 8
366
	case TensorTypeF64:
367
		return 8
368
	case tensorTypeIQ1_M:
369
		return blockSize/8 + blockSize/16 + blockSize/32
370
	case TensorTypeBF16:
Michael Yang's avatar
Michael Yang committed
371
		return 2
372
373
374
375
376
	default:
		return 0
	}
}

377
// Elements returns the product of all Shape dimensions. A tensor with an
// empty Shape reports 1.
func (t Tensor) Elements() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n
	}
	return count
}

Michael Yang's avatar
Michael Yang committed
385
// Size returns the tensor's size in bytes: the element count multiplied by
// the bytes per block and divided by the elements per block.
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

389
// Type returns the string form of the tensor's Kind interpreted as a
// TensorType.
func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

393
394
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
395
	Decode(io.ReadSeeker) (model, error)
396
397
398
}

// File magic numbers, compared against the first four bytes of a file read
// as a little-endian uint32.
const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676c61
	// Magic constants for `gguf` files (versioned, gguf), in little-endian
	// and big-endian byte order.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
412
413
// ErrUnsupportedFormat is a sentinel error for model files in a format that
// is not supported.
// NOTE(review): nothing in the visible part of this file returns it — confirm
// where it is used.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
414
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
415
416
417
418
419
420
421
422
423
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
424
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
425
426
427
428
429
430
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
431
// Decode decodes a GGML model from the given reader.
432
433
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
434
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
435
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
436
437
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

438
	var magic uint32
Michael Yang's avatar
Michael Yang committed
439
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
440
		return nil, err
441
442
443
	}

	var c container
444
445
	switch magic {
	case FILE_MAGIC_GGUF_LE:
446
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
447
	case FILE_MAGIC_GGUF_BE:
448
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
449
	default:
450
		return nil, errors.New("invalid file magic")
451
452
	}

Michael Yang's avatar
Michael Yang committed
453
	model, err := c.Decode(rs)
454
	if err != nil {
455
		return nil, err
456
457
	}

Michael Yang's avatar
Michael Yang committed
458
459
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
460
		return nil, err
Michael Yang's avatar
Michael Yang committed
461
462
	}

463
	// final model type
464
465
466
	return &GGML{
		container: c,
		model:     model,
467
468
		Length:    offset,
	}, nil
469
}
Michael Yang's avatar
Michael Yang committed
470

471
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
Michael Yang's avatar
Michael Yang committed
472
	embedding := f.KV().EmbeddingLength()
473
474
	heads := f.KV().HeadCountMax()
	headsKV := f.KV().HeadCountKVMax()
Michael Yang's avatar
Michael Yang committed
475
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
476

477
	embeddingHeads := f.KV().EmbeddingHeadCountMax()
Michael Yang's avatar
Michael Yang committed
478
479
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
480

Michael Yang's avatar
Michael Yang committed
481
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
482

483
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
484
485
486
487
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	}
Michael Yang's avatar
Michael Yang committed
488

Michael Yang's avatar
Michael Yang committed
489
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
490
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
491
492
493
494
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
495
496
497

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
498
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
499
500
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
501

Michael Yang's avatar
Michael Yang committed
502
503
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
504
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
505
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
506
507
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
508
509
510
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
511
512
513
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
514
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
515
516
517
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
518
519
520
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
521
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
522
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
523
			if slices.Contains(crossAttentionLayers, int32(i)) {
524
525
526
527
528
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
529
530
		}

Michael Yang's avatar
Michael Yang committed
531
532
533
534
535
536
537
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
538
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
539
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
540
				ropeFreqsCount = ropeFreqsWeights.Elements()
Michael Yang's avatar
Michael Yang committed
541
542
543
544
545
546
547
548
549
550
551
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Patrick Devine's avatar
Patrick Devine committed
552
	case "gemma", "gemma2", "gemma3":
Michael Yang's avatar
Michael Yang committed
553
554
555
556
557
558
559
560
561
562
563
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
564
565
566
567
568
569
570
571
572
573
574
575
576
577

		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
578
579
580
581
582
583
584
585
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
586
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
603

Michael Yang's avatar
Michael Yang committed
604
605
606
607
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
608
609
610
611
612
613
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
614
615
616
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
617
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
618
619
620
621
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
622
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
623
		)
Michael Yang's avatar
Michael Yang committed
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
650
651
	}

Michael Yang's avatar
Michael Yang committed
652
	return
Michael Yang's avatar
Michael Yang committed
653
}
654

655
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
656
657
658
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
659

Michael Yang's avatar
Michael Yang committed
660
	for name, layer := range llm.Tensors().GroupLayers() {
661
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
662
663
			for _, tensor := range layer {
				weights += tensor.Size()
664
665
			}
		}
Michael Yang's avatar
Michael Yang committed
666
	}
667

Michael Yang's avatar
Michael Yang committed
668
669
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
670
671
672
673
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
674

Michael Yang's avatar
Michael Yang committed
675
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
676

Michael Yang's avatar
Michael Yang committed
677
678
679
680
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
681

Michael Yang's avatar
Michael Yang committed
682
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
683
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
684

Michael Yang's avatar
Michael Yang committed
685
686
	switch llm.KV().Architecture() {
	case "mllama":
687
688
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
689
690
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

691
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
692
			imageSize*imageSize*numChannels*maxNumTiles +
693
694
695
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
696
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
697
698
699
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
700
701
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
702
703
704

		numPatches := maxPixels / (patchSize * patchSize)

705
706
707
		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
708
709
710
			// Patches storage (numPatches * channels * patchSize^2)
			numPatches*numChannels*patchSize*patchSize +
			// Self-attention calculations
711
712
713
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
Michael Yang's avatar
memory  
Michael Yang committed
714
715
716
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
717
	}
Michael Yang's avatar
Michael Yang committed
718

719
720
721
	return weights, graphSize
}

722
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
723
724
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
725
726
727
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
728
729
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
730
731
732
733
734
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
735
736
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
737
738
739
740
741
742
743
744
745
746
747
748
749
750
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement maps a KV cache type name to its per-element size in
// bytes. Any name other than "q8_0" or "q4_0" is sized as f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		// half the size of fp16
		return 1
	}
	if cacheType == "q4_0" {
		// a quarter of the size of fp16
		return 0.5
	}

	// f16, which is also the fallback
	return 2
}