ggml.go 19 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3
4
5

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
Michael Yang's avatar
Michael Yang committed
8
	"log/slog"
9
	"slices"
Michael Yang's avatar
Michael Yang committed
10
	"strings"
11

Michael Yang's avatar
Michael Yang committed
12
	"github.com/ollama/ollama/fs/util/bufioutil"
13
14
)

Michael Yang's avatar
Michael Yang committed
15
16
17
18
// GGML bundles a decoded model with the container it was read from.
// Both are embedded, so their methods are promoted onto GGML.
type GGML struct {
	container
	model
}
19

Michael Yang's avatar
Michael Yang committed
20
type model interface {
Michael Yang's avatar
Michael Yang committed
21
	KV() KV
Michael Yang's avatar
Michael Yang committed
22
	Tensors() Tensors
23
24
}

25
26
// KV holds metadata keyed by name. Values are stored untyped and are read
// back through the typed accessor methods defined on KV.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
27
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
28
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
29
30
}

31
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
32
	return kv.String("general.type", "unknown")
33
34
}

Michael Yang's avatar
Michael Yang committed
35
func (kv KV) ParameterCount() uint64 {
36
37
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
Michael Yang's avatar
Michael Yang committed
38
39
}

40
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
41
	if t := kv.Uint("general.file_type"); t > 0 {
42
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
43
44
	}

45
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
46
47
48
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
49
50
51
52
53
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
54
55
}

56
57
58
59
60
func (kv KV) HeadCountMax() uint64 {
	// TODO(drifkin): using the max value can cause an overestimation. In the
	// future if array values become more popular, we can adapt the more invasive
	// <https://github.com/ollama/ollama/pull/10225>
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
61
62
}

63
64
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
65
66
}

67
68
69
70
71
72
73
74
75
76
// HeadCountKVMax reports "attention.head_count_kv": the scalar itself, the
// largest element when the key holds an array, or 1 when the key is missing.
func (kv KV) HeadCountKVMax() uint64 {
	count := kv.UintOrMaxArrayValue("attention.head_count_kv", 1)
	return uint64(count)
}

// HeadCountKVMin reports "attention.head_count_kv": the scalar itself, the
// smallest element when the key holds an array, or 1 when the key is missing.
func (kv KV) HeadCountKVMin() uint64 {
	count := kv.UintOrMinArrayValue("attention.head_count_kv", 1)
	return uint64(count)
}

func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
77
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
78
79
80
81
82
83
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
84
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
85
86
87
}

func (kv KV) EmbeddingHeadCountV() uint64 {
88
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
89
90
91
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
92
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
93
94
}

Michael Yang's avatar
Michael Yang committed
95
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
96
97
98
99
	return kv.String("tokenizer.chat_template")
}

func (kv KV) String(key string, defaultValue ...string) string {
100
101
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
Michael Yang's avatar
Michael Yang committed
102
103
104
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
105
106
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
107
108
109
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
110
111
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
112
113
}

114
// Bool returns the bool stored under key. When the key is absent or holds a
// different type, the first defaultValue is returned, or false if none was
// given. Extra default values are ignored.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Pick the fallback explicitly rather than appending to the caller's
	// variadic slice, which could write into its backing array.
	var fallback bool
	if len(defaultValue) > 0 {
		fallback = defaultValue[0]
	}

	val, _ := keyValue(kv, key, fallback)
	return val
}

// UintOrMaxArrayValue is the upper half of UintOrArrayValue: the scalar
// stored under key, the largest element of an array stored there, or
// defaultValue when neither is present.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, largest := kv.UintOrArrayValue(key, defaultValue)
	return largest
}

// UintOrMinArrayValue is the lower half of UintOrArrayValue: the scalar
// stored under key, the smallest element of an array stored there, or
// defaultValue when neither is present.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	smallest, _ := kv.UintOrArrayValue(key, defaultValue)
	return smallest
}

// UintOrArrayValue returns the (min, max) pair for key.
//
//   - If key holds a uint32, both results are that value.
//   - If key holds a non-empty uint32 or int32 array, the results are its
//     smallest and largest elements. Negative int32 elements are logged and
//     then converted to uint32 as-is.
//   - Otherwise (key absent, a different type, or an array with no collected
//     values) both results are defaultValue.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return u32, u32
	}

	// slices.Min and slices.Max panic on an empty slice, so an array without
	// collected values falls through to the default.
	if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok && u32s != nil && len(u32s.values) > 0 {
		return slices.Min(u32s.values), slices.Max(u32s.values)
	}

	if i32s, ok := keyValue(kv, key, &array[int32]{}); ok && i32s != nil && len(i32s.values) > 0 {
		lo, hi := slices.Min(i32s.values), slices.Max(i32s.values)
		if lo < 0 || hi < 0 {
			slog.Warn("array values are unexpectedly negative", "key", key, "min", lo, "max", hi)
		}
		return uint32(lo), uint32(hi)
	}

	return defaultValue, defaultValue
}

Michael Yang's avatar
Michael Yang committed
148
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
149
150
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
151
152
}

Michael Yang's avatar
Michael Yang committed
153
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
154
155
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
156
157
}

Michael Yang's avatar
Michael Yang committed
158
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
159
160
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
161
162
}

Patrick Devine's avatar
Patrick Devine committed
163
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
164
165
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
Patrick Devine's avatar
Patrick Devine committed
166
167
}

168
func (kv KV) OllamaEngineRequired() bool {
169
170
171
	return slices.Contains([]string{
		"gemma3",
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
172
		"llama4",
173
	}, kv.Architecture())
174
175
}

Michael Yang's avatar
Michael Yang committed
176
// valueTypes is the set of scalar types keyValue can look up.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
Michael Yang's avatar
Michael Yang committed
186
187
}

188
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
189
190
191
192
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

193
194
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
195
196
	}

197
	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
198
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
199
200
}

201
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
202
	items  []*Tensor
203
	Offset uint64
Michael Yang's avatar
Michael Yang committed
204
}
Michael Yang's avatar
Michael Yang committed
205

Michael Yang's avatar
Michael Yang committed
206
207
208
209
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
210

Michael Yang's avatar
Michael Yang committed
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
231
			}
Michael Yang's avatar
Michael Yang committed
232
		}
233

Michael Yang's avatar
Michael Yang committed
234
235
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
236
237
		}

Michael Yang's avatar
Michael Yang committed
238
239
240
241
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
242
243
244
245
}

// Layer maps a tensor's name within a layer to the tensor itself.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
246
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
247
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
248
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
249
250
251
252
253
	}

	return size
}

254
// Tensor describes a single tensor.
type Tensor struct {
	// Name is the dot-separated tensor name, e.g. "blk.0.attn_q.weight".
	Name string `json:"name"`
	// Kind is the numeric tensor type; methods convert it to TensorType.
	Kind uint32 `json:"kind"`
	// Offset is excluded from JSON output.
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	// WriterTo is embedded and excluded from JSON output.
	io.WriterTo `json:"-"`
}

265
266
267
268
269
270
271
272
// block extracts the block index from a tensor name of the form
// "blk.<n>.<rest>". It yields -1 for names that do not match.
func (t Tensor) block() int {
	var index int
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &index)
	if err != nil {
		return -1
	}

	return index
}

273
// blockSize returns the block size of the tensor's type; see
// TensorType.BlockSize.
func (t Tensor) blockSize() uint64 {
	return TensorType(t.Kind).BlockSize()
}

func (t TensorType) BlockSize() uint64 {
	switch t {
Michael Yang's avatar
Michael Yang committed
279
280
281
282
283
284
285
286
287
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
288
		return 1
Michael Yang's avatar
Michael Yang committed
289
290
291
292
293
294
295
296
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
297
		return 32
Michael Yang's avatar
Michael Yang committed
298
	default:
299
300
301
302
303
		return 256
	}
}

// typeSize returns the per-block byte size of the tensor's type; see
// TensorType.TypeSize.
func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()
309

310
311
	switch t {
	case TensorTypeF32:
312
		return 4
313
	case TensorTypeF16:
314
		return 2
315
	case TensorTypeQ4_0:
316
		return 2 + blockSize/2
317
	case TensorTypeQ4_1:
318
		return 2 + 2 + blockSize/2
319
	case TensorTypeQ5_0:
320
		return 2 + 4 + blockSize/2
321
	case TensorTypeQ5_1:
322
		return 2 + 2 + 4 + blockSize/2
323
	case TensorTypeQ8_0:
324
		return 2 + blockSize
325
	case TensorTypeQ8_1:
Michael Yang's avatar
Michael Yang committed
326
		return 2 + 2 + blockSize
327
	case TensorTypeQ2_K:
328
		return blockSize/16 + blockSize/4 + 2 + 2
329
	case TensorTypeQ3_K:
330
		return blockSize/8 + blockSize/4 + 12 + 2
331
	case TensorTypeQ4_K:
332
		return 2 + 2 + 12 + blockSize/2
333
	case TensorTypeQ5_K:
334
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
335
	case TensorTypeQ6_K:
336
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
337
	case TensorTypeQ8_K:
Michael Yang's avatar
Michael Yang committed
338
		return 4 + blockSize + 2*blockSize/16
339
	case tensorTypeIQ2_XXS:
340
		return 2 + 2*blockSize/8
341
	case tensorTypeIQ2_XS:
342
		return 2 + 2*blockSize/8 + blockSize/32
343
	case tensorTypeIQ3_XXS:
344
		return 2 + blockSize/4 + blockSize/8
345
	case tensorTypeIQ1_S:
346
		return 2 + blockSize/8 + blockSize/16
347
	case tensorTypeIQ4_NL:
348
		return 2 + blockSize/2
349
	case tensorTypeIQ3_S:
350
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
351
	case tensorTypeIQ2_S:
352
		return 2 + blockSize/4 + blockSize/16
353
	case tensorTypeIQ4_XS:
354
		return 2 + 2 + blockSize/2 + blockSize/64
355
	case TensorTypeI8:
356
		return 1
357
	case TensorTypeI16:
358
		return 2
359
	case TensorTypeI32:
360
		return 4
361
	case TensorTypeI64:
362
		return 8
363
	case TensorTypeF64:
364
		return 8
365
	case tensorTypeIQ1_M:
366
		return blockSize/8 + blockSize/16 + blockSize/32
367
	case TensorTypeBF16:
Michael Yang's avatar
Michael Yang committed
368
		return 2
369
370
371
372
373
	default:
		return 0
	}
}

374
// Elements returns the product of all dimensions in Shape. A tensor with an
// empty Shape has 1 element. The product is not checked for overflow.
func (t Tensor) Elements() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n
	}
	return count
}

Michael Yang's avatar
Michael Yang committed
382
// Size returns the tensor's size in bytes: element count times bytes per
// block, divided by elements per block. It is 0 for tensor types that
// TypeSize does not know.
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

386
// Type returns the string form of the tensor's TensorType.
func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

390
391
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
392
	Decode(io.ReadSeeker) (model, error)
393
394
395
}

// File magic numbers, compared against the first four bytes of a file read
// as a little-endian uint32.
const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constants for `gguf` files (versioned, gguf). Decode treats a
	// file matching the LE value as little-endian and one matching the BE
	// value as big-endian.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
409
410
// ErrUnsupportedFormat is a sentinel error for model formats that are not
// supported. Compare against it with errors.Is.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
411
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
412
413
414
415
416
417
418
419
420
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
421
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
422
423
424
425
426
427
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
428
// Decode decodes a GGML model from the given reader.
429
430
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
431
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
Michael Yang's avatar
Michael Yang committed
432
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
433
434
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

435
	var magic uint32
Michael Yang's avatar
Michael Yang committed
436
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
Michael Yang's avatar
Michael Yang committed
437
		return nil, 0, err
438
439
440
	}

	var c container
441
442
	switch magic {
	case FILE_MAGIC_GGUF_LE:
443
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
444
	case FILE_MAGIC_GGUF_BE:
445
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
446
	default:
Michael Yang's avatar
Michael Yang committed
447
		return nil, 0, errors.New("invalid file magic")
448
449
	}

Michael Yang's avatar
Michael Yang committed
450
	model, err := c.Decode(rs)
451
	if err != nil {
Michael Yang's avatar
Michael Yang committed
452
		return nil, 0, err
453
454
	}

Michael Yang's avatar
Michael Yang committed
455
456
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
Michael Yang's avatar
Michael Yang committed
457
		return nil, 0, err
Michael Yang's avatar
Michael Yang committed
458
459
	}

460
	// final model type
461
462
463
	return &GGML{
		container: c,
		model:     model,
Michael Yang's avatar
Michael Yang committed
464
	}, offset, nil
465
}
Michael Yang's avatar
Michael Yang committed
466

467
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
Michael Yang's avatar
Michael Yang committed
468
	embedding := f.KV().EmbeddingLength()
469
470
	heads := f.KV().HeadCountMax()
	headsKV := f.KV().HeadCountKVMax()
Michael Yang's avatar
Michael Yang committed
471
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
472

473
	embeddingHeads := f.KV().EmbeddingHeadCountMax()
Michael Yang's avatar
Michael Yang committed
474
475
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
476

Michael Yang's avatar
Michael Yang committed
477
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
478

479
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
480
481
482
483
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	}
Michael Yang's avatar
Michael Yang committed
484

Michael Yang's avatar
Michael Yang committed
485
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
486
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
487
488
489
490
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
491
492
493

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
494
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
495
496
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
497

Michael Yang's avatar
Michael Yang committed
498
499
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
500
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
501
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
502
503
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
504
505
506
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
507
508
509
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
510
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
511
512
513
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
514
515
516
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
517
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
518
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
519
			if slices.Contains(crossAttentionLayers, int32(i)) {
520
521
522
523
524
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
525
526
		}

Michael Yang's avatar
Michael Yang committed
527
528
529
530
531
532
533
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
534
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
535
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
536
				ropeFreqsCount = ropeFreqsWeights.Elements()
Michael Yang's avatar
Michael Yang committed
537
538
539
540
541
542
543
544
545
546
547
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Patrick Devine's avatar
Patrick Devine committed
548
	case "gemma", "gemma2", "gemma3":
Michael Yang's avatar
Michael Yang committed
549
550
551
552
553
554
555
556
557
558
559
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
560
561
562
563
564
565
566
567
568
569
570
571
572
573

		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
574
575
576
577
578
579
580
581
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
582
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
599

Michael Yang's avatar
Michael Yang committed
600
601
602
603
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
604
605
606
607
608
609
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
610
611
612
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
613
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
614
615
616
617
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
618
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
619
		)
Michael Yang's avatar
Michael Yang committed
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
646
647
	}

Michael Yang's avatar
Michael Yang committed
648
	return
Michael Yang's avatar
Michael Yang committed
649
}
650

651
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
652
653
654
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
655

Michael Yang's avatar
Michael Yang committed
656
	for name, layer := range llm.Tensors().GroupLayers() {
657
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
658
659
			for _, tensor := range layer {
				weights += tensor.Size()
660
661
			}
		}
Michael Yang's avatar
Michael Yang committed
662
	}
663

Michael Yang's avatar
Michael Yang committed
664
665
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
666
667
668
669
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
670

Michael Yang's avatar
Michael Yang committed
671
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
672

Michael Yang's avatar
Michael Yang committed
673
674
675
676
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
677

Michael Yang's avatar
Michael Yang committed
678
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
679
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
680

Michael Yang's avatar
Michael Yang committed
681
682
	switch llm.KV().Architecture() {
	case "mllama":
683
684
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
685
686
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

687
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
688
			imageSize*imageSize*numChannels*maxNumTiles +
689
690
691
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
692
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
693
694
695
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
Michael Yang's avatar
memory  
Michael Yang committed
696
697
698
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
699
	}
Michael Yang's avatar
Michael Yang committed
700

701
702
703
	return weights, graphSize
}

704
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
705
706
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
707
708
709
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
710
711
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
712
713
714
715
716
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
717
718
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
719
720
721
722
723
724
725
726
727
728
729
730
731
732
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement maps a KV cache type name to the bytes each cached
// element occupies. Unrecognised names are treated as f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		// half the size of fp16
		return 1
	}

	if cacheType == "q4_0" {
		// a quarter the size of fp16
		return 0.5
	}

	// f16, and the fallback for everything else
	return 2
}