ggml.go 20.2 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3

import (
Michael Yang's avatar
Michael Yang committed
4
	"cmp"
5
6
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
7
	"fmt"
8
	"io"
Michael Yang's avatar
Michael Yang committed
9
	"log/slog"
10
	"slices"
Michael Yang's avatar
Michael Yang committed
11
	"strings"
12

Michael Yang's avatar
Michael Yang committed
13
	"github.com/ollama/ollama/fs/util/bufioutil"
14
15
)

Michael Yang's avatar
Michael Yang committed
16
17
18
// GGML bundles the container a model file was decoded with, the decoded model
// contents, and the reader position reached once decoding finished.
type GGML struct {
	container
	model
	// Length is the reader offset recorded by Decode after the container
	// finished decoding.
	Length int64
}
21

Michael Yang's avatar
Michael Yang committed
22
// model gives access to a decoded file's key/value metadata and its tensors.
type model interface {
	// KV returns the metadata key/value pairs.
	KV() KV
	// Tensors returns the tensor listing.
	Tensors() Tensors
}

27
28
// KV holds model metadata keyed by name; values are read back through the
// typed accessor methods below.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
29
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
30
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
31
32
}

33
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
34
	return kv.String("general.type", "unknown")
35
36
}

Michael Yang's avatar
Michael Yang committed
37
func (kv KV) ParameterCount() uint64 {
38
39
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
Michael Yang's avatar
Michael Yang committed
40
41
}

42
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
43
	if t := kv.Uint("general.file_type"); t > 0 {
44
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
45
46
	}

47
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
48
49
50
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
51
52
53
54
55
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
56
57
}

58
59
60
61
62
func (kv KV) HeadCountMax() uint64 {
	// TODO(drifkin): using the max value can cause an overestimation. In the
	// future if array values become more popular, we can adapt the more invasive
	// <https://github.com/ollama/ollama/pull/10225>
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
63
64
}

65
66
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
67
68
}

69
70
71
72
73
74
75
76
77
78
// HeadCountKVMax returns "attention.head_count_kv" when it is a scalar, the
// largest element when it is an array, and 1 when it is missing.
func (kv KV) HeadCountKVMax() uint64 {
	largest := kv.UintOrMaxArrayValue("attention.head_count_kv", 1)
	return uint64(largest)
}

// HeadCountKVMin returns "attention.head_count_kv" when it is a scalar, the
// smallest element when it is an array, and 1 when it is missing.
func (kv KV) HeadCountKVMin() uint64 {
	smallest := kv.UintOrMinArrayValue("attention.head_count_kv", 1)
	return uint64(smallest)
}

func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
79
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
80
81
82
83
84
85
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
86
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
87
88
89
}

func (kv KV) EmbeddingHeadCountV() uint64 {
90
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
91
92
93
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
94
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
95
96
}

Michael Yang's avatar
Michael Yang committed
97
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
98
99
100
101
	return kv.String("tokenizer.chat_template")
}

func (kv KV) String(key string, defaultValue ...string) string {
102
103
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
Michael Yang's avatar
Michael Yang committed
104
105
106
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
107
108
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
109
110
111
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
112
113
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
114
115
}

116
// Bool returns the bool stored under key. When the key is missing or holds a
// different type it returns the first element of defaultValue, or false if no
// default was given. Key lookup follows keyValue.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Pick the fallback directly instead of appending to defaultValue: append
	// could write into spare capacity of a slice the caller passed with `...`.
	var fallback bool
	if len(defaultValue) > 0 {
		fallback = defaultValue[0]
	}

	// The found flag is not needed; a miss already yields the fallback.
	val, _ := keyValue(kv, key, fallback)
	return val
}

// UintOrMaxArrayValue returns the upper value reported by UintOrArrayValue for
// key: the scalar itself, the largest array element, or defaultValue.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, hi := kv.UintOrArrayValue(key, defaultValue)
	return hi
}

// UintOrMinArrayValue returns the lower value reported by UintOrArrayValue for
// key: the scalar itself, the smallest array element, or defaultValue.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	lo, _ := kv.UintOrArrayValue(key, defaultValue)
	return lo
}

// UintOrArrayValue returns the (min, max) pair for key.
//
// A uint32 scalar is returned as both min and max. A uint32 or int32 array
// yields its smallest and largest elements; int32 elements are converted to
// uint32, with a warning logged if any are negative. If the key is missing,
// has another type, or is an array with no values, defaultValue is returned
// for both.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return u32, u32
	}

	// slices.Min and slices.Max panic on an empty slice, so an array without
	// values falls through to the default.
	if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok && u32s != nil && len(u32s.values) > 0 {
		return slices.Min(u32s.values), slices.Max(u32s.values)
	}

	if i32s, ok := keyValue(kv, key, &array[int32]{}); ok && i32s != nil && len(i32s.values) > 0 {
		lo := slices.Min(i32s.values)
		hi := slices.Max(i32s.values)
		if lo < 0 || hi < 0 {
			slog.Warn("array values are unexpectedly negative", "key", key, "min", lo, "max", hi)
		}
		return uint32(lo), uint32(hi)
	}

	return defaultValue, defaultValue
}

Michael Yang's avatar
Michael Yang committed
150
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
151
152
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
153
154
}

Michael Yang's avatar
Michael Yang committed
155
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
156
157
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
158
159
}

Michael Yang's avatar
Michael Yang committed
160
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
161
162
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
163
164
}

Patrick Devine's avatar
Patrick Devine committed
165
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
166
167
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
Patrick Devine's avatar
Patrick Devine committed
168
169
}

Michael Yang's avatar
Michael Yang committed
170
171
172
173
174
// Bools returns the values of the bool array stored under key. When the key is
// missing or holds a different type it returns the first element of
// defaultValue, or nil if no default was given.
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	// Pick the fallback directly instead of appending to defaultValue: append
	// could write into spare capacity of a slice the caller passed with `...`.
	var fallback []bool
	if len(defaultValue) > 0 {
		fallback = defaultValue[0]
	}

	// The found flag is not needed; a miss already yields the fallback array.
	val, _ := keyValue(kv, key, &array[bool]{values: fallback})
	return val.values
}

175
func (kv KV) OllamaEngineRequired() bool {
176
177
	return slices.Contains([]string{
		"gemma3",
178
		"gemma3n",
179
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
180
		"llama4",
181
		"mllama",
182
		"qwen25vl",
Michael Yang's avatar
Michael Yang committed
183
		"gptoss",
184
	}, kv.Architecture())
185
186
}

Michael Yang's avatar
Michael Yang committed
187
// valueTypes is the set of scalar types that keyValue can look up.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

// arrayValueTypes is the set of array pointer types that keyValue can look
// up, one per scalar type in valueTypes.
type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
}

199
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
200
201
202
203
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

204
205
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
206
207
	}

208
	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
209
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
210
211
}

212
// Tensors is a list of tensors together with an offset value.
type Tensors struct {
	// items is the underlying tensor list returned by Items.
	items []*Tensor
	// Offset is not used in this file.
	// NOTE(review): presumably where tensor data begins in the file — confirm.
	Offset uint64
}
Michael Yang's avatar
Michael Yang committed
216

Michael Yang's avatar
Michael Yang committed
217
218
219
220
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
221

Michael Yang's avatar
Michael Yang committed
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
242
			}
Michael Yang's avatar
Michael Yang committed
243
		}
244

Michael Yang's avatar
Michael Yang committed
245
246
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
247
248
		}

Michael Yang's avatar
Michael Yang committed
249
250
251
252
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
253
254
255
256
}

// Layer maps the layer-relative tensor name to its tensor, as built by
// GroupLayers.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
257
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
258
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
259
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
260
261
262
263
264
	}

	return size
}

265
// Tensor describes a single tensor: its name, numeric type code, offset and
// shape. Offset and the embedded io.WriterTo are excluded from JSON output.
type Tensor struct {
	Name string `json:"name"`
	// Kind is the numeric tensor type; methods convert it to TensorType.
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

276
277
278
279
280
281
282
283
// block parses the number out of a tensor name that starts with
// "blk.<number>." and returns it, or -1 when the name does not match.
func (t Tensor) block() (n int) {
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &n)
	if err != nil {
		return -1
	}

	return n
}

284
// blockSize returns the block size of the tensor's type.
func (t Tensor) blockSize() uint64 {
	kind := TensorType(t.Kind)
	return kind.BlockSize()
}

func (t TensorType) BlockSize() uint64 {
	switch t {
Michael Yang's avatar
Michael Yang committed
290
291
292
293
294
295
296
297
298
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
299
		return 1
Michael Yang's avatar
Michael Yang committed
300
301
302
303
304
305
306
307
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
308
		return 32
Michael Yang's avatar
Michael Yang committed
309
	default:
310
311
312
313
314
		return 256
	}
}

// typeSize returns the type size of the tensor's type.
func (t Tensor) typeSize() uint64 {
	kind := TensorType(t.Kind)
	return kind.TypeSize()
}

func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()
320

321
322
	switch t {
	case TensorTypeF32:
323
		return 4
324
	case TensorTypeF16:
325
		return 2
326
	case TensorTypeQ4_0:
327
		return 2 + blockSize/2
328
	case TensorTypeQ4_1:
329
		return 2 + 2 + blockSize/2
330
	case TensorTypeQ5_0:
331
		return 2 + 4 + blockSize/2
332
	case TensorTypeQ5_1:
333
		return 2 + 2 + 4 + blockSize/2
334
	case TensorTypeQ8_0:
335
		return 2 + blockSize
336
	case TensorTypeQ8_1:
Michael Yang's avatar
Michael Yang committed
337
		return 2 + 2 + blockSize
338
	case TensorTypeQ2_K:
339
		return blockSize/16 + blockSize/4 + 2 + 2
340
	case TensorTypeQ3_K:
341
		return blockSize/8 + blockSize/4 + 12 + 2
342
	case TensorTypeQ4_K:
343
		return 2 + 2 + 12 + blockSize/2
344
	case TensorTypeQ5_K:
345
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
346
	case TensorTypeQ6_K:
347
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
348
	case TensorTypeQ8_K:
Michael Yang's avatar
Michael Yang committed
349
		return 4 + blockSize + 2*blockSize/16
350
	case tensorTypeIQ2_XXS:
351
		return 2 + 2*blockSize/8
352
	case tensorTypeIQ2_XS:
353
		return 2 + 2*blockSize/8 + blockSize/32
354
	case tensorTypeIQ3_XXS:
355
		return 2 + blockSize/4 + blockSize/8
356
	case tensorTypeIQ1_S:
357
		return 2 + blockSize/8 + blockSize/16
358
	case tensorTypeIQ4_NL:
359
		return 2 + blockSize/2
360
	case tensorTypeIQ3_S:
361
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
362
	case tensorTypeIQ2_S:
363
		return 2 + blockSize/4 + blockSize/16
364
	case tensorTypeIQ4_XS:
365
		return 2 + 2 + blockSize/2 + blockSize/64
366
	case TensorTypeI8:
367
		return 1
368
	case TensorTypeI16:
369
		return 2
370
	case TensorTypeI32:
371
		return 4
372
	case TensorTypeI64:
373
		return 8
374
	case TensorTypeF64:
375
		return 8
376
	case tensorTypeIQ1_M:
377
		return blockSize/8 + blockSize/16 + blockSize/32
378
	case TensorTypeBF16:
Michael Yang's avatar
Michael Yang committed
379
		return 2
380
381
382
383
384
	default:
		return 0
	}
}

385
// Elements returns the product of all dimensions in Shape. A tensor with an
// empty shape has 1 element.
func (t Tensor) Elements() uint64 {
	total := uint64(1)
	for i := range t.Shape {
		total *= t.Shape[i]
	}

	return total
}

Michael Yang's avatar
Michael Yang committed
393
// Size returns the element count multiplied by the type size and divided by
// the block size of the tensor's type.
func (t Tensor) Size() uint64 {
	elements := t.Elements()
	perBlock := t.typeSize()
	return elements * perBlock / t.blockSize()
}

397
// Type returns the string form of the tensor's type.
func (t Tensor) Type() string {
	kind := TensorType(t.Kind)
	return kind.String()
}

401
402
// container is a file format that can decode a model from a seekable reader.
type container interface {
	// Name returns the container's name.
	Name() string
	// Decode reads a model from the reader.
	Decode(io.ReadSeeker) (model, error)
}

// File magic values, compared against the first four bytes of a file read as
// a little-endian uint32.
const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constants for `gguf` files (versioned, gguf). The LE value is what
	// the ASCII bytes "GGUF" decode to as a little-endian uint32; the BE value
	// is the same four bytes in reverse order.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
420
421
// ErrUnsupportedFormat is a sentinel error for unsupported model formats. It
// is not returned by any code in this file.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
422
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
423
424
425
426
427
428
429
430
431
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
432
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
433
434
435
436
437
438
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
439
// Decode decodes a GGML model from the given reader.
440
441
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
442
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
443
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
444
445
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

446
	var magic uint32
Michael Yang's avatar
Michael Yang committed
447
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
448
		return nil, err
449
450
451
	}

	var c container
452
453
	switch magic {
	case FILE_MAGIC_GGUF_LE:
454
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
455
	case FILE_MAGIC_GGUF_BE:
456
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
457
	default:
458
		return nil, errors.New("invalid file magic")
459
460
	}

Michael Yang's avatar
Michael Yang committed
461
	model, err := c.Decode(rs)
462
	if err != nil {
463
		return nil, err
464
465
	}

Michael Yang's avatar
Michael Yang committed
466
467
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
468
		return nil, err
Michael Yang's avatar
Michael Yang committed
469
470
	}

471
	// final model type
472
473
474
	return &GGML{
		container: c,
		model:     model,
475
476
		Length:    offset,
	}, nil
477
}
Michael Yang's avatar
Michael Yang committed
478

479
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
Michael Yang's avatar
Michael Yang committed
480
	embedding := f.KV().EmbeddingLength()
481
482
	heads := f.KV().HeadCountMax()
	headsKV := f.KV().HeadCountKVMax()
Michael Yang's avatar
Michael Yang committed
483
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
484

485
	embeddingHeads := f.KV().EmbeddingHeadCountMax()
Michael Yang's avatar
Michael Yang committed
486
487
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
488

Michael Yang's avatar
Michael Yang committed
489
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
490

491
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
Michael Yang's avatar
Michael Yang committed
492
	var kvTotal uint64
493
494
495
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
Michael Yang's avatar
Michael Yang committed
496
		kvTotal += kv[i]
497
	}
Michael Yang's avatar
Michael Yang committed
498

Michael Yang's avatar
Michael Yang committed
499
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
500
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
501
502
503
504
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
505
506
507

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
508
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
509
510
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
511

Michael Yang's avatar
Michael Yang committed
512
513
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
514
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
515
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
516
517
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
518
519
520
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
521
522
523
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
524
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
525
526
527
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
528
529
530
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
531
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
532
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
533
			if slices.Contains(crossAttentionLayers, int32(i)) {
534
535
536
537
538
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
539
540
		}

Michael Yang's avatar
Michael Yang committed
541
542
543
544
545
546
547
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
548
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
549
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
550
				ropeFreqsCount = ropeFreqsWeights.Elements()
Michael Yang's avatar
Michael Yang committed
551
552
553
554
555
556
557
558
559
560
561
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
562
	case "gemma", "gemma2", "gemma3", "gemma3n":
Michael Yang's avatar
Michael Yang committed
563
564
565
566
567
568
569
570
571
572
573
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
574

575
576
577
578
579
		if f.KV().Architecture() == "gemma3n" {
			fullOffload *= 4
			partialOffload *= 4
		}

580
581
582
583
584
585
586
587
588
589
590
591
592
		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
593
594
595
596
597
598
599
600
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
601
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
618

Michael Yang's avatar
Michael Yang committed
619
620
621
622
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
623
624
625
626
627
628
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
629
630
631
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
632
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
633
634
635
636
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
637
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
638
		)
Michael Yang's avatar
Michael Yang committed
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
665
666
667
668
669
670
671
672
673
674
675
676
	case "gptoss":
		kv = make([]uint64, f.KV().BlockCount())
		for i := range kv {
			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
			if i%2 == 0 {
				kv[i] *= (4096 + batch)
			} else {
				kv[i] *= context
			}
		}
		fullOffload = 4 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
		partialOffload = 2 * fullOffload
Michael Yang's avatar
Michael Yang committed
677
678
	}

Michael Yang's avatar
Michael Yang committed
679
	return
Michael Yang's avatar
Michael Yang committed
680
}
681

682
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
683
684
685
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
686

Michael Yang's avatar
Michael Yang committed
687
	for name, layer := range llm.Tensors().GroupLayers() {
688
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
689
690
			for _, tensor := range layer {
				weights += tensor.Size()
691
692
			}
		}
Michael Yang's avatar
Michael Yang committed
693
	}
694

Michael Yang's avatar
Michael Yang committed
695
696
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
697
698
699
700
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
701

Michael Yang's avatar
Michael Yang committed
702
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
703

Michael Yang's avatar
Michael Yang committed
704
705
706
707
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
708

Michael Yang's avatar
Michael Yang committed
709
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
710
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
711

Michael Yang's avatar
Michael Yang committed
712
713
	switch llm.KV().Architecture() {
	case "mllama":
714
715
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
716
717
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

718
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
719
			imageSize*imageSize*numChannels*maxNumTiles +
720
721
722
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
723
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
724
725
726
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
727
728
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
729
730
731

		numPatches := maxPixels / (patchSize * patchSize)

732
733
734
		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
735
736
737
			// Patches storage (numPatches * channels * patchSize^2)
			numPatches*numChannels*patchSize*patchSize +
			// Self-attention calculations
738
739
740
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
Michael Yang's avatar
memory  
Michael Yang committed
741
742
743
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
744
	}
Michael Yang's avatar
Michael Yang committed
745

746
747
748
	return weights, graphSize
}

749
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
750
751
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
752
753
754
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
755
756
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
757
758
759
760
761
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
762
763
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
764
765
766
767
768
769
770
771
772
773
774
775
776
777
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement returns the number of bytes per element for a given
// KV cache type. Any unrecognized type is sized like f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		return 1 // 1/2 of fp16
	}
	if cacheType == "q4_0" {
		return 0.5 // 1/4 of fp16
	}

	return 2 // f16 (default)
}