ggml.go 19.8 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3
4
5

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
Michael Yang's avatar
Michael Yang committed
8
	"log/slog"
9
	"slices"
Michael Yang's avatar
Michael Yang committed
10
	"strings"
11

Michael Yang's avatar
Michael Yang committed
12
	"github.com/ollama/ollama/fs/util/bufioutil"
13
14
)

Michael Yang's avatar
Michael Yang committed
15
16
17
type GGML struct {
	container
	model
18
	Length int64
Michael Yang's avatar
Michael Yang committed
19
}
20

Michael Yang's avatar
Michael Yang committed
21
type model interface {
Michael Yang's avatar
Michael Yang committed
22
	KV() KV
Michael Yang's avatar
Michael Yang committed
23
	Tensors() Tensors
24
25
}

26
27
// KV holds model metadata as a map from key name to an untyped value.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
28
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
29
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
30
31
}

32
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
33
	return kv.String("general.type", "unknown")
34
35
}

Michael Yang's avatar
Michael Yang committed
36
func (kv KV) ParameterCount() uint64 {
37
38
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
Michael Yang's avatar
Michael Yang committed
39
40
}

41
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
42
	if t := kv.Uint("general.file_type"); t > 0 {
43
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
44
45
	}

46
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
47
48
49
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
50
51
52
53
54
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
55
56
}

57
58
59
60
61
func (kv KV) HeadCountMax() uint64 {
	// TODO(drifkin): using the max value can cause an overestimation. In the
	// future if array values become more popular, we can adapt the more invasive
	// <https://github.com/ollama/ollama/pull/10225>
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
62
63
}

64
65
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
66
67
}

68
69
70
71
72
73
74
75
76
77
// HeadCountKVMax returns "attention.head_count_kv". For an array value the
// largest element is used; a missing key yields 1.
func (kv KV) HeadCountKVMax() uint64 {
	largest := kv.UintOrMaxArrayValue("attention.head_count_kv", 1)
	return uint64(largest)
}

// HeadCountKVMin returns "attention.head_count_kv". For an array value the
// smallest element is used; a missing key yields 1.
func (kv KV) HeadCountKVMin() uint64 {
	smallest := kv.UintOrMinArrayValue("attention.head_count_kv", 1)
	return uint64(smallest)
}

func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
78
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
79
80
81
82
83
84
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
85
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
86
87
88
}

func (kv KV) EmbeddingHeadCountV() uint64 {
89
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
90
91
92
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
93
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
94
95
}

Michael Yang's avatar
Michael Yang committed
96
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
97
98
99
100
	return kv.String("tokenizer.chat_template")
}

func (kv KV) String(key string, defaultValue ...string) string {
101
102
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
Michael Yang's avatar
Michael Yang committed
103
104
105
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
106
107
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
108
109
110
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
111
112
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
113
114
}

115
// Bool returns the bool stored under key. When the key is missing or holds
// another type, the first defaultValue is returned, or false if no default
// was supplied.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Appending false guarantees keyValue always receives at least one default.
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

// UintOrMaxArrayValue returns the upper bound reported by UintOrArrayValue
// for key: the scalar itself, the largest array element, or defaultValue.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, upper := kv.UintOrArrayValue(key, defaultValue)
	return upper
}

// UintOrMinArrayValue returns the lower bound reported by UintOrArrayValue
// for key: the scalar itself, the smallest array element, or defaultValue.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	lower, _ := kv.UintOrArrayValue(key, defaultValue)
	return lower
}

// UintOrArrayValue returns the (min, max) pair for key.
//
//   - A uint32 scalar is returned as both min and max.
//   - A non-empty uint32 or int32 array yields its smallest and largest
//     element; int32 elements are converted to uint32 and a warning is logged
//     if any of them is negative.
//   - Otherwise, including for an array that carries no values,
//     defaultValue is returned for both.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return u32, u32
	}

	// slices.Min and slices.Max panic on an empty slice, so an array without
	// collected values is treated the same as a missing key.
	if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok && u32s != nil && len(u32s.values) > 0 {
		return slices.Min(u32s.values), slices.Max(u32s.values)
	}

	if i32s, ok := keyValue(kv, key, &array[int32]{}); ok && i32s != nil && len(i32s.values) > 0 {
		lo, hi := slices.Min(i32s.values), slices.Max(i32s.values)
		if lo < 0 || hi < 0 {
			slog.Warn("array values are unexpectedly negative", "key", key, "min", lo, "max", hi)
		}
		return uint32(lo), uint32(hi)
	}

	return defaultValue, defaultValue
}

Michael Yang's avatar
Michael Yang committed
149
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
150
151
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
152
153
}

Michael Yang's avatar
Michael Yang committed
154
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
155
156
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
157
158
}

Michael Yang's avatar
Michael Yang committed
159
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
160
161
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
162
163
}

Patrick Devine's avatar
Patrick Devine committed
164
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
165
166
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
Patrick Devine's avatar
Patrick Devine committed
167
168
}

Michael Yang's avatar
Michael Yang committed
169
170
171
172
173
// Bools returns the values of the bool array stored under key. When the key
// is missing or holds another type, the first defaultValue is returned, or
// nil if no default was supplied.
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	// The trailing nil makes index 0 valid even without a caller default.
	fallback := append(defaultValue, []bool(nil))[0]
	arr, _ := keyValue(kv, key, &array[bool]{values: fallback})
	return arr.values
}

174
func (kv KV) OllamaEngineRequired() bool {
175
176
	return slices.Contains([]string{
		"gemma3",
177
		"gemma3n",
178
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
179
		"llama4",
180
		"mllama",
181
		"qwen25vl",
182
	}, kv.Architecture())
183
184
}

Michael Yang's avatar
Michael Yang committed
185
// valueTypes constrains keyValue to the scalar types a KV entry may hold.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
Michael Yang's avatar
Michael Yang committed
195
196
}

197
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
198
199
200
201
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

202
203
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
204
205
	}

206
	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
207
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
208
209
}

210
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
211
	items  []*Tensor
212
	Offset uint64
Michael Yang's avatar
Michael Yang committed
213
}
Michael Yang's avatar
Michael Yang committed
214

Michael Yang's avatar
Michael Yang committed
215
216
217
218
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
219

Michael Yang's avatar
Michael Yang committed
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
240
			}
Michael Yang's avatar
Michael Yang committed
241
		}
242

Michael Yang's avatar
Michael Yang committed
243
244
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
245
246
		}

Michael Yang's avatar
Michael Yang committed
247
248
249
250
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
251
252
253
254
}

// Layer maps a tensor's name within a layer to its descriptor.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
255
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
256
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
257
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
258
259
260
261
262
	}

	return size
}

263
// Tensor describes a single tensor of a model.
type Tensor struct {
	Name string `json:"name"`
	// Kind is the numeric tensor type; it is converted to TensorType where
	// block and type sizes are needed.
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

274
275
276
277
278
279
280
281
// block parses the block index out of a tensor name of the form
// "blk.<n>." and returns it, or -1 when the name does not match.
func (t Tensor) block() (n int) {
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &n)
	if err != nil {
		// n may have been partially assigned; report "no block" explicitly.
		n = -1
	}

	return n
}

282
// blockSize returns the block size of the tensor's type; see
// TensorType.BlockSize.
func (t Tensor) blockSize() uint64 {
	return (TensorType)(t.Kind).BlockSize()
}

func (t TensorType) BlockSize() uint64 {
	switch t {
Michael Yang's avatar
Michael Yang committed
288
289
290
291
292
293
294
295
296
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
297
		return 1
Michael Yang's avatar
Michael Yang committed
298
299
300
301
302
303
304
305
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
306
		return 32
Michael Yang's avatar
Michael Yang committed
307
	default:
308
309
310
311
312
		return 256
	}
}

// typeSize returns the size in bytes of one block of the tensor's type; see
// TensorType.TypeSize.
func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()
318

319
320
	switch t {
	case TensorTypeF32:
321
		return 4
322
	case TensorTypeF16:
323
		return 2
324
	case TensorTypeQ4_0:
325
		return 2 + blockSize/2
326
	case TensorTypeQ4_1:
327
		return 2 + 2 + blockSize/2
328
	case TensorTypeQ5_0:
329
		return 2 + 4 + blockSize/2
330
	case TensorTypeQ5_1:
331
		return 2 + 2 + 4 + blockSize/2
332
	case TensorTypeQ8_0:
333
		return 2 + blockSize
334
	case TensorTypeQ8_1:
Michael Yang's avatar
Michael Yang committed
335
		return 2 + 2 + blockSize
336
	case TensorTypeQ2_K:
337
		return blockSize/16 + blockSize/4 + 2 + 2
338
	case TensorTypeQ3_K:
339
		return blockSize/8 + blockSize/4 + 12 + 2
340
	case TensorTypeQ4_K:
341
		return 2 + 2 + 12 + blockSize/2
342
	case TensorTypeQ5_K:
343
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
344
	case TensorTypeQ6_K:
345
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
346
	case TensorTypeQ8_K:
Michael Yang's avatar
Michael Yang committed
347
		return 4 + blockSize + 2*blockSize/16
348
	case tensorTypeIQ2_XXS:
349
		return 2 + 2*blockSize/8
350
	case tensorTypeIQ2_XS:
351
		return 2 + 2*blockSize/8 + blockSize/32
352
	case tensorTypeIQ3_XXS:
353
		return 2 + blockSize/4 + blockSize/8
354
	case tensorTypeIQ1_S:
355
		return 2 + blockSize/8 + blockSize/16
356
	case tensorTypeIQ4_NL:
357
		return 2 + blockSize/2
358
	case tensorTypeIQ3_S:
359
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
360
	case tensorTypeIQ2_S:
361
		return 2 + blockSize/4 + blockSize/16
362
	case tensorTypeIQ4_XS:
363
		return 2 + 2 + blockSize/2 + blockSize/64
364
	case TensorTypeI8:
365
		return 1
366
	case TensorTypeI16:
367
		return 2
368
	case TensorTypeI32:
369
		return 4
370
	case TensorTypeI64:
371
		return 8
372
	case TensorTypeF64:
373
		return 8
374
	case tensorTypeIQ1_M:
375
		return blockSize/8 + blockSize/16 + blockSize/32
376
	case TensorTypeBF16:
Michael Yang's avatar
Michael Yang committed
377
		return 2
378
379
380
381
382
	default:
		return 0
	}
}

383
// Elements returns the product of all dimensions in Shape. An empty Shape
// yields 1.
func (t Tensor) Elements() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n
	}
	return count
}

Michael Yang's avatar
Michael Yang committed
391
// Size returns the tensor's size in bytes: the element count multiplied by
// the per-block byte size and divided by the number of elements per block.
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

395
// Type returns the string form of the tensor's type.
func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

399
400
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
401
	Decode(io.ReadSeeker) (model, error)
402
403
404
}

const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constants for `gguf` files (versioned, gguf). Both values are
	// compared against the leading four bytes read as a little-endian
	// uint32; the _BE value is the byte-swapped form of the _LE value.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

// ErrUnsupportedFormat is returned for model files in a format that is not
// supported.
var ErrUnsupportedFormat = errors.New("unsupported model format")

// DetectContentType inspects the first four bytes of b, read as a
// little-endian uint32, and returns the matching format name: "ggml",
// "ggmf", "ggjt", "ggla" or "gguf". It returns "" when the magic is not
// recognized or when b holds fewer than four bytes.
func DetectContentType(b []byte) string {
	// Slicing b[:4] would panic on short input; treat it as unrecognized.
	if len(b) < 4 {
		return ""
	}

	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
437
// Decode decodes a GGML model from the given reader.
438
439
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
440
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
441
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
442
443
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

444
	var magic uint32
Michael Yang's avatar
Michael Yang committed
445
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
446
		return nil, err
447
448
449
	}

	var c container
450
451
	switch magic {
	case FILE_MAGIC_GGUF_LE:
452
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
453
	case FILE_MAGIC_GGUF_BE:
454
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
455
	default:
456
		return nil, errors.New("invalid file magic")
457
458
	}

Michael Yang's avatar
Michael Yang committed
459
	model, err := c.Decode(rs)
460
	if err != nil {
461
		return nil, err
462
463
	}

Michael Yang's avatar
Michael Yang committed
464
465
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
466
		return nil, err
Michael Yang's avatar
Michael Yang committed
467
468
	}

469
	// final model type
470
471
472
	return &GGML{
		container: c,
		model:     model,
473
474
		Length:    offset,
	}, nil
475
}
Michael Yang's avatar
Michael Yang committed
476

477
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
Michael Yang's avatar
Michael Yang committed
478
	embedding := f.KV().EmbeddingLength()
479
480
	heads := f.KV().HeadCountMax()
	headsKV := f.KV().HeadCountKVMax()
Michael Yang's avatar
Michael Yang committed
481
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
482

483
	embeddingHeads := f.KV().EmbeddingHeadCountMax()
Michael Yang's avatar
Michael Yang committed
484
485
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
486

Michael Yang's avatar
Michael Yang committed
487
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
488

489
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
490
491
492
493
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	}
Michael Yang's avatar
Michael Yang committed
494

Michael Yang's avatar
Michael Yang committed
495
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
496
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
497
498
499
500
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
501
502
503

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
504
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
505
506
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
507

Michael Yang's avatar
Michael Yang committed
508
509
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
510
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
511
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
512
513
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
514
515
516
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
517
518
519
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
520
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
521
522
523
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
524
525
526
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
527
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
528
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
529
			if slices.Contains(crossAttentionLayers, int32(i)) {
530
531
532
533
534
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
535
536
		}

Michael Yang's avatar
Michael Yang committed
537
538
539
540
541
542
543
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
544
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
545
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
546
				ropeFreqsCount = ropeFreqsWeights.Elements()
Michael Yang's avatar
Michael Yang committed
547
548
549
550
551
552
553
554
555
556
557
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
558
	case "gemma", "gemma2", "gemma3", "gemma3n":
Michael Yang's avatar
Michael Yang committed
559
560
561
562
563
564
565
566
567
568
569
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
570

571
572
573
574
575
		if f.KV().Architecture() == "gemma3n" {
			fullOffload *= 4
			partialOffload *= 4
		}

576
577
578
579
580
581
582
583
584
585
586
587
588
		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
589
590
591
592
593
594
595
596
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
597
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
614

Michael Yang's avatar
Michael Yang committed
615
616
617
618
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
619
620
621
622
623
624
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
625
626
627
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
628
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
629
630
631
632
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
633
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
634
		)
Michael Yang's avatar
Michael Yang committed
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
661
662
	}

Michael Yang's avatar
Michael Yang committed
663
	return
Michael Yang's avatar
Michael Yang committed
664
}
665

666
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
667
668
669
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
670

Michael Yang's avatar
Michael Yang committed
671
	for name, layer := range llm.Tensors().GroupLayers() {
672
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
673
674
			for _, tensor := range layer {
				weights += tensor.Size()
675
676
			}
		}
Michael Yang's avatar
Michael Yang committed
677
	}
678

Michael Yang's avatar
Michael Yang committed
679
680
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
681
682
683
684
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
685

Michael Yang's avatar
Michael Yang committed
686
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
687

Michael Yang's avatar
Michael Yang committed
688
689
690
691
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
692

Michael Yang's avatar
Michael Yang committed
693
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
694
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
695

Michael Yang's avatar
Michael Yang committed
696
697
	switch llm.KV().Architecture() {
	case "mllama":
698
699
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
700
701
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

702
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
703
			imageSize*imageSize*numChannels*maxNumTiles +
704
705
706
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
707
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
708
709
710
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
711
712
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
713
714
715

		numPatches := maxPixels / (patchSize * patchSize)

716
717
718
		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
719
720
721
			// Patches storage (numPatches * channels * patchSize^2)
			numPatches*numChannels*patchSize*patchSize +
			// Self-attention calculations
722
723
724
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
Michael Yang's avatar
memory  
Michael Yang committed
725
726
727
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
728
	}
Michael Yang's avatar
Michael Yang committed
729

730
731
732
	return weights, graphSize
}

733
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
734
735
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
736
737
738
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
739
740
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
741
742
743
744
745
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
746
747
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
748
749
750
751
752
753
754
755
756
757
758
759
760
761
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement maps a KV cache type name to the number of bytes
// used per cached element. Any name other than "q8_0" or "q4_0" is sized
// like f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		return 1 // half the size of f16
	}

	if cacheType == "q4_0" {
		return 0.5 // a quarter of the size of f16
	}

	// f16, and the fallback for every other name
	return 2
}