ggml.go 18.7 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3
4
5

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
Michael Yang's avatar
Michael Yang committed
8
	"log/slog"
9
	"slices"
Michael Yang's avatar
Michael Yang committed
10
	"strings"
11

Michael Yang's avatar
Michael Yang committed
12
	"github.com/ollama/ollama/fs/util/bufioutil"
13
14
)

Michael Yang's avatar
Michael Yang committed
15
16
17
18
// GGML pairs a decoded model with the container it was decoded from. Both
// are embedded, so their methods are promoted onto GGML.
type GGML struct {
	container
	model
}
19

Michael Yang's avatar
Michael Yang committed
20
type model interface {
Michael Yang's avatar
Michael Yang committed
21
	KV() KV
Michael Yang's avatar
Michael Yang committed
22
	Tensors() Tensors
23
24
}

25
26
// KV holds model metadata as a map from key name to an untyped value. The
// typed accessor methods on KV look values up by key and fall back to a
// default when the key is absent or holds a different type.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
27
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
28
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
29
30
}

31
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
32
	return kv.String("general.type", "unknown")
33
34
}

Michael Yang's avatar
Michael Yang committed
35
func (kv KV) ParameterCount() uint64 {
36
37
	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
	return val
Michael Yang's avatar
Michael Yang committed
38
39
}

Michael Yang's avatar
Michael Yang committed
40
func (kv KV) FileType() fileType {
Michael Yang's avatar
Michael Yang committed
41
42
	if t := kv.Uint("general.file_type"); t > 0 {
		return fileType(t)
Michael Yang's avatar
Michael Yang committed
43
44
	}

Michael Yang's avatar
Michael Yang committed
45
	return fileTypeUnknown
Michael Yang's avatar
Michael Yang committed
46
47
48
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
49
50
51
52
53
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
54
55
}

56
57
58
59
60
func (kv KV) HeadCountMax() uint64 {
	// TODO(drifkin): using the max value can cause an overestimation. In the
	// future if array values become more popular, we can adapt the more invasive
	// <https://github.com/ollama/ollama/pull/10225>
	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
61
62
}

63
64
func (kv KV) HeadCountMin() uint64 {
	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
Michael Yang's avatar
Michael Yang committed
65
66
}

67
68
69
70
71
72
73
74
75
76
// HeadCountKVMax reports "attention.head_count_kv": the scalar itself, or the
// largest array element. A missing key yields 1.
func (kv KV) HeadCountKVMax() uint64 {
	n := kv.UintOrMaxArrayValue("attention.head_count_kv", 1)
	return uint64(n)
}

// HeadCountKVMin reports "attention.head_count_kv": the scalar itself, or the
// smallest array element. A missing key yields 1.
func (kv KV) HeadCountKVMin() uint64 {
	n := kv.UintOrMinArrayValue("attention.head_count_kv", 1)
	return uint64(n)
}

func (kv KV) EmbeddingHeadCountMax() uint64 {
	if heads := kv.HeadCountMin(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
77
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
78
79
80
81
82
83
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
84
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
85
86
87
}

func (kv KV) EmbeddingHeadCountV() uint64 {
88
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
Michael Yang's avatar
Michael Yang committed
89
90
91
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
92
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
93
94
}

Michael Yang's avatar
Michael Yang committed
95
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
96
97
98
99
	return kv.String("tokenizer.chat_template")
}

func (kv KV) String(key string, defaultValue ...string) string {
100
101
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
Michael Yang's avatar
Michael Yang committed
102
103
104
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
105
106
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
107
108
109
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
110
111
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
Michael Yang's avatar
Michael Yang committed
112
113
}

114
// Bool returns the bool stored under key. When the key is missing or holds
// another type, it returns the first defaultValue if one was given and false
// otherwise.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Appending false guarantees keyValue always has a fallback to return.
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

// UintOrMaxArrayValue yields the upper bound reported by UintOrArrayValue:
// the scalar value, the largest array element, or defaultValue.
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
	_, largest := kv.UintOrArrayValue(key, defaultValue)
	return largest
}

// UintOrMinArrayValue yields the lower bound reported by UintOrArrayValue:
// the scalar value, the smallest array element, or defaultValue.
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
	smallest, _ := kv.UintOrArrayValue(key, defaultValue)
	return smallest
}

// UintOrArrayValue returns the (min, max) pair for key.
//
// The key may hold a scalar uint32, in which case both results equal it, or
// an array of uint32 or int32, in which case the smallest and largest
// elements are returned. When the key is missing, holds another type, or
// holds an array with no values, both results are defaultValue.
//
// Negative int32 elements are logged and then converted to uint32 as-is.
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
	if u32, ok := keyValue(kv, key, uint32(0)); ok {
		return u32, u32
	}

	if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
		// slices.Min and slices.Max panic on an empty slice, so an array
		// without values falls back to the default.
		if u32s == nil || len(u32s.values) == 0 {
			return defaultValue, defaultValue
		}

		return slices.Min(u32s.values), slices.Max(u32s.values)
	}

	if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
		if i32s == nil || len(i32s.values) == 0 {
			return defaultValue, defaultValue
		}

		lo := slices.Min(i32s.values)
		hi := slices.Max(i32s.values)
		if lo < 0 || hi < 0 {
			slog.Warn("array values are unexpectedly negative", "key", key, "min", lo, "max", hi)
		}

		return uint32(lo), uint32(hi)
	}

	return defaultValue, defaultValue
}

Michael Yang's avatar
Michael Yang committed
148
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
149
150
	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
151
152
}

Michael Yang's avatar
Michael Yang committed
153
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
154
155
	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
156
157
}

Michael Yang's avatar
Michael Yang committed
158
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
159
160
	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
	return val.values
Michael Yang's avatar
Michael Yang committed
161
162
}

Patrick Devine's avatar
Patrick Devine committed
163
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
164
165
	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
	return val.values
Patrick Devine's avatar
Patrick Devine committed
166
167
}

168
func (kv KV) OllamaEngineRequired() bool {
169
170
171
	return slices.Contains([]string{
		"gemma3",
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
172
		"llama4",
173
	}, kv.Architecture())
174
175
}

Michael Yang's avatar
Michael Yang committed
176
// valueTypes is the type constraint listing the scalar types keyValue can
// look up.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
Michael Yang's avatar
Michael Yang committed
186
187
}

188
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
Michael Yang's avatar
Michael Yang committed
189
190
191
192
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

193
194
	if val, ok := kv[key].(T); ok {
		return val, true
Michael Yang's avatar
Michael Yang committed
195
196
	}

197
198
	slog.Warn("key with type not found", "key", key, "default", defaultValue[0])
	return defaultValue[0], false
Michael Yang's avatar
Michael Yang committed
199
200
}

201
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
202
	items  []*Tensor
203
	Offset uint64
Michael Yang's avatar
Michael Yang committed
204
}
Michael Yang's avatar
Michael Yang committed
205

Michael Yang's avatar
Michael Yang committed
206
207
208
209
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
210

Michael Yang's avatar
Michael Yang committed
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
231
			}
Michael Yang's avatar
Michael Yang committed
232
		}
233

Michael Yang's avatar
Michael Yang committed
234
235
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
236
237
		}

Michael Yang's avatar
Michael Yang committed
238
239
240
241
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
242
243
244
245
}

// Layer maps a tensor's name within a layer (the part of the full name after
// the layer key) to the tensor.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
246
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
247
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
248
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
249
250
251
252
253
	}

	return size
}

254
// Tensor describes a single tensor in a model file.
type Tensor struct {
	Name string `json:"name"`
	// Kind is the numeric type id used by blockSize and typeSize.
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

265
266
267
268
269
270
271
272
// block extracts the block index from a tensor name that begins with
// "blk.<n>.". It returns -1 when the name does not match that pattern.
func (t Tensor) block() (n int) {
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &n)
	if err == nil {
		return n
	}

	return -1
}

273
func (t Tensor) blockSize() uint64 {
274
	switch t.Kind {
Michael Yang's avatar
Michael Yang committed
275
276
277
278
279
280
281
282
283
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
284
		return 1
Michael Yang's avatar
Michael Yang committed
285
286
287
288
289
290
291
292
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
293
		return 32
Michael Yang's avatar
Michael Yang committed
294
	default:
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
		return 256
	}
}

func (t Tensor) typeSize() uint64 {
	blockSize := t.blockSize()

	switch t.Kind {
	case 0: // FP32
		return 4
	case 1: // FP16
		return 2
	case 2: // Q4_0
		return 2 + blockSize/2
	case 3: // Q4_1
		return 2 + 2 + blockSize/2
	case 6: // Q5_0
		return 2 + 4 + blockSize/2
	case 7: // Q5_1
		return 2 + 2 + 4 + blockSize/2
	case 8: // Q8_0
		return 2 + blockSize
	case 9: // Q8_1
Michael Yang's avatar
Michael Yang committed
318
		return 2 + 2 + blockSize
319
320
321
322
323
324
325
326
327
328
329
	case 10: // Q2_K
		return blockSize/16 + blockSize/4 + 2 + 2
	case 11: // Q3_K
		return blockSize/8 + blockSize/4 + 12 + 2
	case 12: // Q4_K
		return 2 + 2 + 12 + blockSize/2
	case 13: // Q5_K
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case 14: // Q6_K
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case 15: // Q8_K
Michael Yang's avatar
Michael Yang committed
330
		return 4 + blockSize + 2*blockSize/16
331
332
333
334
335
	case 16: // IQ2_XXS
		return 2 + 2*blockSize/8
	case 17: // IQ2_XS
		return 2 + 2*blockSize/8 + blockSize/32
	case 18: // IQ3_XXS
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
		return 2 + blockSize/4 + blockSize/8
	case 19: // IQ1_S
		return 2 + blockSize/8 + blockSize/16
	case 20: // IQ4_NL
		return 2 + blockSize/2
	case 21: // IQ3_S
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case 22: // IQ2_S
		return 2 + blockSize/4 + blockSize/16
	case 23: // IQ4_XS
		return 2 + 2 + blockSize/2 + blockSize/64
	case 24: // I8
		return 1
	case 25: // I16
		return 2
	case 26: // I32
		return 4
	case 27: // I64
		return 8
	case 28: // F64
		return 8
	case 29: // IQ1_M
		return blockSize/8 + blockSize/16 + blockSize/32
Michael Yang's avatar
Michael Yang committed
359
360
	case 30: // BF16
		return 2
361
362
363
364
365
366
367
368
369
370
371
372
373
	default:
		return 0
	}
}

// parameters multiplies together every dimension in Shape. A tensor with an
// empty Shape yields 1.
func (t Tensor) parameters() uint64 {
	total := uint64(1)
	for i := range t.Shape {
		total *= t.Shape[i]
	}

	return total
}

Michael Yang's avatar
Michael Yang committed
374
// Size returns the tensor's size in bytes: the element count multiplied by
// the per-block byte size and divided by the number of elements per block.
func (t Tensor) Size() uint64 {
	return t.parameters() * t.typeSize() / t.blockSize()
}

378
379
380
381
// Type converts the tensor's Kind to a fileType and returns its string form.
func (t Tensor) Type() string {
	kind := fileType(t.Kind)
	return kind.String()
}

382
383
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
384
	Decode(io.ReadSeeker) (model, error)
385
386
387
}

const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constant for `gguf` files (versioned, gguf)
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

// ErrUnsupportedFormat is a sentinel error for unsupported model formats.
var ErrUnsupportedFormat = errors.New("unsupported model format")

// DetectContentType inspects the first four bytes of b, read as a
// little-endian uint32, and returns the matching format name: "ggml",
// "ggmf", "ggjt", "ggla" or "gguf". It returns "" when the magic is not
// recognised or b holds fewer than four bytes.
func DetectContentType(b []byte) string {
	// Slicing b[:4] would panic on shorter input; treat it as unknown.
	if len(b) < 4 {
		return ""
	}

	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
420
// Decode decodes a GGML model from the given reader.
421
422
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
423
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
Michael Yang's avatar
Michael Yang committed
424
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
425
426
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

427
	var magic uint32
Michael Yang's avatar
Michael Yang committed
428
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
Michael Yang's avatar
Michael Yang committed
429
		return nil, 0, err
430
431
432
	}

	var c container
433
434
	switch magic {
	case FILE_MAGIC_GGUF_LE:
435
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
436
	case FILE_MAGIC_GGUF_BE:
437
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
438
	default:
Michael Yang's avatar
Michael Yang committed
439
		return nil, 0, errors.New("invalid file magic")
440
441
	}

Michael Yang's avatar
Michael Yang committed
442
	model, err := c.Decode(rs)
443
	if err != nil {
Michael Yang's avatar
Michael Yang committed
444
		return nil, 0, err
445
446
	}

Michael Yang's avatar
Michael Yang committed
447
448
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
Michael Yang's avatar
Michael Yang committed
449
		return nil, 0, err
Michael Yang's avatar
Michael Yang committed
450
451
	}

452
	// final model type
453
454
455
	return &GGML{
		container: c,
		model:     model,
Michael Yang's avatar
Michael Yang committed
456
	}, offset, nil
457
}
Michael Yang's avatar
Michael Yang committed
458

459
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
Michael Yang's avatar
Michael Yang committed
460
	embedding := f.KV().EmbeddingLength()
461
462
	heads := f.KV().HeadCountMax()
	headsKV := f.KV().HeadCountKVMax()
Michael Yang's avatar
Michael Yang committed
463
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
464

465
	embeddingHeads := f.KV().EmbeddingHeadCountMax()
Michael Yang's avatar
Michael Yang committed
466
467
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
468

Michael Yang's avatar
Michael Yang committed
469
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
470

471
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
472
473
474
475
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	}
Michael Yang's avatar
Michael Yang committed
476

Michael Yang's avatar
Michael Yang committed
477
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
478
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
479
480
481
482
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
483
484
485

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
486
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
487
488
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
489

Michael Yang's avatar
Michael Yang committed
490
491
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
492
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
493
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
494
495
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
496
497
498
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
499
500
501
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
502
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
503
504
505
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
506
507
508
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
509
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
510
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
511
			if slices.Contains(crossAttentionLayers, int32(i)) {
512
513
514
515
516
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
517
518
		}

Michael Yang's avatar
Michael Yang committed
519
520
521
522
523
524
525
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
526
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
527
528
529
530
531
532
533
534
535
536
537
538
539
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
				ropeFreqsCount = ropeFreqsWeights.parameters()
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Patrick Devine's avatar
Patrick Devine committed
540
	case "gemma", "gemma2", "gemma3":
Michael Yang's avatar
Michael Yang committed
541
542
543
544
545
546
547
548
549
550
551
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
552
553
554
555
556
557
558
559
560
561
562
563
564
565

		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
566
567
568
569
570
571
572
573
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
574
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
591

Michael Yang's avatar
Michael Yang committed
592
593
594
595
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
596
597
598
599
600
601
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
602
603
604
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
605
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
606
607
608
609
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
610
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
611
		)
Michael Yang's avatar
Michael Yang committed
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
638
639
	}

Michael Yang's avatar
Michael Yang committed
640
	return
Michael Yang's avatar
Michael Yang committed
641
}
642

643
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
644
645
646
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
647

Michael Yang's avatar
Michael Yang committed
648
	for name, layer := range llm.Tensors().GroupLayers() {
649
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
650
651
			for _, tensor := range layer {
				weights += tensor.Size()
652
653
			}
		}
Michael Yang's avatar
Michael Yang committed
654
	}
655

Michael Yang's avatar
Michael Yang committed
656
657
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
658
659
660
661
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
662

Michael Yang's avatar
Michael Yang committed
663
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
664

Michael Yang's avatar
Michael Yang committed
665
666
667
668
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
669

Michael Yang's avatar
Michael Yang committed
670
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
671
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
672

Michael Yang's avatar
Michael Yang committed
673
674
	switch llm.KV().Architecture() {
	case "mllama":
675
676
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
677
678
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

679
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
680
			imageSize*imageSize*numChannels*maxNumTiles +
681
682
683
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
684
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
685
686
687
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
Michael Yang's avatar
memory  
Michael Yang committed
688
689
690
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
691
	}
Michael Yang's avatar
Michael Yang committed
692

693
694
695
	return weights, graphSize
}

696
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
697
698
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
699
700
701
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
702
703
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
704
705
706
707
708
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
709
710
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
711
712
713
714
715
716
717
718
719
720
721
722
723
724
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement maps a KV cache type name to the bytes stored per
// element. Any name other than "q8_0" or "q4_0" is sized as f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		// half the size of f16
		return 1
	}

	if cacheType == "q4_0" {
		// a quarter of the size of f16
		return 0.5
	}

	// f16, and the fallback for every other name
	return 2
}