ggml.go 17.5 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3
4
5

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
Michael Yang's avatar
Michael Yang committed
8
	"log/slog"
9
	"slices"
Michael Yang's avatar
Michael Yang committed
10
	"strings"
11

Michael Yang's avatar
Michael Yang committed
12
	"github.com/ollama/ollama/fs/util/bufioutil"
13
14
)

Michael Yang's avatar
Michael Yang committed
15
16
17
18
// GGML bundles a decoded model with the container it was decoded from.
// Both fields are embedded, so their methods are promoted onto GGML.
type GGML struct {
	container
	model
}
19

Michael Yang's avatar
Michael Yang committed
20
type model interface {
Michael Yang's avatar
Michael Yang committed
21
	KV() KV
Michael Yang's avatar
Michael Yang committed
22
	Tensors() Tensors
23
24
}

25
26
// KV holds metadata as a map from key to an untyped value. The typed
// accessor methods on KV read it through keyValue.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
27
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
28
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
29
30
}

31
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
32
	return kv.String("general.type", "unknown")
33
34
}

Michael Yang's avatar
Michael Yang committed
35
func (kv KV) ParameterCount() uint64 {
Michael Yang's avatar
Michael Yang committed
36
	return keyValue(kv, "general.parameter_count", uint64(0))
Michael Yang's avatar
Michael Yang committed
37
38
}

39
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
40
	if t := kv.Uint("general.file_type"); t > 0 {
41
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
42
43
	}

44
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
45
46
47
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
48
49
50
51
52
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
53
54
55
}

func (kv KV) HeadCount() uint64 {
Michael Yang's avatar
Michael Yang committed
56
	return uint64(kv.Uint("attention.head_count"))
Michael Yang's avatar
Michael Yang committed
57
58
59
}

func (kv KV) HeadCountKV() uint64 {
Michael Yang's avatar
Michael Yang committed
60
	return uint64(kv.Uint("attention.head_count_kv", 1))
Michael Yang's avatar
Michael Yang committed
61
62
}

Michael Yang's avatar
Michael Yang committed
63
64
func (kv KV) EmbeddingHeadCount() uint64 {
	if heads := kv.HeadCount(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
65
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
66
67
68
69
70
71
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
Michael Yang's avatar
Michael Yang committed
72
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
Michael Yang's avatar
Michael Yang committed
73
74
75
}

func (kv KV) EmbeddingHeadCountV() uint64 {
Michael Yang's avatar
Michael Yang committed
76
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
Michael Yang's avatar
Michael Yang committed
77
78
}

Michael Yang's avatar
Michael Yang committed
79
func (kv KV) GQA() uint64 {
Michael Yang's avatar
Michael Yang committed
80
	return kv.HeadCount() / kv.HeadCountKV()
Michael Yang's avatar
Michael Yang committed
81
82
83
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
84
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
85
86
}

Michael Yang's avatar
Michael Yang committed
87
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
	return kv.String("tokenizer.chat_template")
}

// String looks up key as a string. The first element of defaultValue, if
// any, is returned when the key is absent; otherwise "" is returned.
func (kv KV) String(key string, defaultValue ...string) string {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, "")
	return keyValue(kv, key, fallbacks...)
}

// Uint looks up key as a uint32. The first element of defaultValue, if any,
// is returned when the key is absent; otherwise 0 is returned.
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, 0)
	return keyValue(kv, key, fallbacks...)
}

// Float looks up key as a float32. The first element of defaultValue, if
// any, is returned when the key is absent; otherwise 0 is returned.
func (kv KV) Float(key string, defaultValue ...float32) float32 {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, 0)
	return keyValue(kv, key, fallbacks...)
}

103
104
105
106
// Bool looks up key as a bool. The first element of defaultValue, if any,
// is returned when the key is absent; otherwise false is returned.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	// Appending the zero value guarantees keyValue always has a fallback.
	fallbacks := append(defaultValue, false)
	return keyValue(kv, key, fallbacks...)
}

Michael Yang's avatar
Michael Yang committed
107
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
Michael Yang's avatar
Michael Yang committed
108
	return keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}).values
Michael Yang's avatar
Michael Yang committed
109
110
}

Michael Yang's avatar
Michael Yang committed
111
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
Michael Yang's avatar
Michael Yang committed
112
	return keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}).values
Michael Yang's avatar
Michael Yang committed
113
114
}

Michael Yang's avatar
Michael Yang committed
115
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
Michael Yang's avatar
Michael Yang committed
116
	return keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}).values
Michael Yang's avatar
Michael Yang committed
117
118
}

Patrick Devine's avatar
Patrick Devine committed
119
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
Michael Yang's avatar
Michael Yang committed
120
	return keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}).values
Patrick Devine's avatar
Patrick Devine committed
121
122
}

123
func (kv KV) OllamaEngineRequired() bool {
124
125
126
	return slices.Contains([]string{
		"gemma3",
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
127
		"llama4",
128
		"mllama",
129
	}, kv.Architecture())
130
131
}

Michael Yang's avatar
Michael Yang committed
132
// valueTypes is the type constraint for scalar values that keyValue can
// return.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
Michael Yang's avatar
Michael Yang committed
142
143
}

Michael Yang's avatar
Michael Yang committed
144
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) T {
Michael Yang's avatar
Michael Yang committed
145
146
147
148
149
150
151
152
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

	if val, ok := kv[key]; ok {
		return val.(T)
	}

153
	slog.Debug("key not found", "key", key, "default", defaultValue[0])
Michael Yang's avatar
Michael Yang committed
154
155
156
	return defaultValue[0]
}

157
type Tensors struct {
Michael Yang's avatar
Michael Yang committed
158
	items  []*Tensor
159
	Offset uint64
Michael Yang's avatar
Michael Yang committed
160
}
Michael Yang's avatar
Michael Yang committed
161

Michael Yang's avatar
Michael Yang committed
162
163
164
165
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
166

Michael Yang's avatar
Michael Yang committed
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
187
			}
Michael Yang's avatar
Michael Yang committed
188
		}
189

Michael Yang's avatar
Michael Yang committed
190
191
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
Michael Yang's avatar
Michael Yang committed
192
193
		}

Michael Yang's avatar
Michael Yang committed
194
195
196
197
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
Michael Yang's avatar
Michael Yang committed
198
199
200
201
}

// Layer maps a tensor's name within a layer to the tensor. GroupLayers
// builds these from the full tensor names.
type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
202
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
203
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
204
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
205
206
207
208
209
	}

	return size
}

210
// Tensor describes a single tensor. Name, Kind and Shape are included in
// the JSON encoding; Offset and the embedded writer are excluded from it.
type Tensor struct {
	Name string `json:"name"`
	// Kind is the numeric tensor type; methods convert it to TensorType.
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

221
222
223
224
225
226
227
228
// block parses the block index out of a tensor name of the form
// "blk.<n>.…". It returns -1 when the name does not match that pattern.
func (t Tensor) block() int {
	var index int
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &index)
	if err != nil {
		return -1
	}

	return index
}

229
// blockSize returns TensorType.BlockSize for this tensor's Kind.
func (t Tensor) blockSize() uint64 {
	return (TensorType)(t.Kind).BlockSize()
}

func (t TensorType) BlockSize() uint64 {
	switch t {
Michael Yang's avatar
Michael Yang committed
235
236
237
238
239
240
241
242
243
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
244
		return 1
Michael Yang's avatar
Michael Yang committed
245
246
247
248
249
250
251
252
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
253
		return 32
Michael Yang's avatar
Michael Yang committed
254
	default:
255
256
257
258
259
		return 256
	}
}

// typeSize returns TensorType.TypeSize for this tensor's Kind.
func (t Tensor) typeSize() uint64 {
	return TensorType(t.Kind).TypeSize()
}

func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()
265

266
267
	switch t {
	case TensorTypeF32:
268
		return 4
269
	case TensorTypeF16:
270
		return 2
271
	case TensorTypeQ4_0:
272
		return 2 + blockSize/2
273
	case TensorTypeQ4_1:
274
		return 2 + 2 + blockSize/2
275
	case TensorTypeQ5_0:
276
		return 2 + 4 + blockSize/2
277
	case TensorTypeQ5_1:
278
		return 2 + 2 + 4 + blockSize/2
279
	case TensorTypeQ8_0:
280
		return 2 + blockSize
281
	case TensorTypeQ8_1:
Michael Yang's avatar
Michael Yang committed
282
		return 2 + 2 + blockSize
283
	case TensorTypeQ2_K:
284
		return blockSize/16 + blockSize/4 + 2 + 2
285
	case TensorTypeQ3_K:
286
		return blockSize/8 + blockSize/4 + 12 + 2
287
	case TensorTypeQ4_K:
288
		return 2 + 2 + 12 + blockSize/2
289
	case TensorTypeQ5_K:
290
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
291
	case TensorTypeQ6_K:
292
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
293
	case TensorTypeQ8_K:
Michael Yang's avatar
Michael Yang committed
294
		return 4 + blockSize + 2*blockSize/16
295
	case tensorTypeIQ2_XXS:
296
		return 2 + 2*blockSize/8
297
	case tensorTypeIQ2_XS:
298
		return 2 + 2*blockSize/8 + blockSize/32
299
	case tensorTypeIQ3_XXS:
300
		return 2 + blockSize/4 + blockSize/8
301
	case tensorTypeIQ1_S:
302
		return 2 + blockSize/8 + blockSize/16
303
	case tensorTypeIQ4_NL:
304
		return 2 + blockSize/2
305
	case tensorTypeIQ3_S:
306
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
307
	case tensorTypeIQ2_S:
308
		return 2 + blockSize/4 + blockSize/16
309
	case tensorTypeIQ4_XS:
310
		return 2 + 2 + blockSize/2 + blockSize/64
311
	case TensorTypeI8:
312
		return 1
313
	case TensorTypeI16:
314
		return 2
315
	case TensorTypeI32:
316
		return 4
317
	case TensorTypeI64:
318
		return 8
319
	case TensorTypeF64:
320
		return 8
321
	case tensorTypeIQ1_M:
322
		return blockSize/8 + blockSize/16 + blockSize/32
323
	case TensorTypeBF16:
Michael Yang's avatar
Michael Yang committed
324
		return 2
325
326
327
328
329
	default:
		return 0
	}
}

330
// Elements returns the product of all dimensions in Shape. An empty Shape
// yields 1.
func (t Tensor) Elements() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n
	}
	return count
}

Michael Yang's avatar
Michael Yang committed
338
// Size returns Elements() * typeSize() / blockSize(), using integer
// arithmetic evaluated left to right.
func (t Tensor) Size() uint64 {
	return t.Elements() * t.typeSize() / t.blockSize()
}

342
// Type returns the string form of the tensor's Kind, as produced by
// TensorType.String.
func (t Tensor) Type() string {
	return TensorType(t.Kind).String()
}

346
347
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
348
	Decode(io.ReadSeeker) (model, error)
349
350
351
}

const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constant for `gguf` files (versioned, gguf)
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

// ErrUnsupportedFormat is the sentinel error with the message
// "unsupported model format".
var ErrUnsupportedFormat = errors.New("unsupported model format")

// DetectContentType inspects the first four bytes of b, read as a
// little-endian uint32, and returns the matching format name: "ggml",
// "ggmf", "ggjt", "ggla" or "gguf". It returns "" when the magic is not
// recognised or when b holds fewer than four bytes.
func DetectContentType(b []byte) string {
	// Slicing b[:4] on a shorter input would panic; too little data simply
	// means the format cannot be identified.
	if len(b) < 4 {
		return ""
	}

	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
384
// Decode decodes a GGML model from the given reader.
385
386
//
// It collects array values for arrays with a size less than or equal to
Michael Yang's avatar
Michael Yang committed
387
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
Michael Yang's avatar
Michael Yang committed
388
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
389
390
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

391
	var magic uint32
Michael Yang's avatar
Michael Yang committed
392
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
Michael Yang's avatar
Michael Yang committed
393
		return nil, 0, err
394
395
396
	}

	var c container
397
398
	switch magic {
	case FILE_MAGIC_GGUF_LE:
399
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
400
	case FILE_MAGIC_GGUF_BE:
401
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
402
	default:
Michael Yang's avatar
Michael Yang committed
403
		return nil, 0, errors.New("invalid file magic")
404
405
	}

Michael Yang's avatar
Michael Yang committed
406
	model, err := c.Decode(rs)
407
	if err != nil {
Michael Yang's avatar
Michael Yang committed
408
		return nil, 0, err
409
410
	}

Michael Yang's avatar
Michael Yang committed
411
412
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
Michael Yang's avatar
Michael Yang committed
413
		return nil, 0, err
Michael Yang's avatar
Michael Yang committed
414
415
	}

416
	// final model type
417
418
419
	return &GGML{
		container: c,
		model:     model,
Michael Yang's avatar
Michael Yang committed
420
	}, offset, nil
421
}
Michael Yang's avatar
Michael Yang committed
422

423
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
Michael Yang's avatar
Michael Yang committed
424
425
426
	embedding := f.KV().EmbeddingLength()
	heads := f.KV().HeadCount()
	headsKV := f.KV().HeadCountKV()
Michael Yang's avatar
Michael Yang committed
427
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
Michael Yang's avatar
Michael Yang committed
428

Michael Yang's avatar
Michael Yang committed
429
430
431
	embeddingHeads := f.KV().EmbeddingHeadCount()
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
432

Michael Yang's avatar
Michael Yang committed
433
	layers := f.Tensors().GroupLayers()
Michael Yang's avatar
Michael Yang committed
434

435
	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
436
437
438
439
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	}
Michael Yang's avatar
Michael Yang committed
440

Michael Yang's avatar
Michael Yang committed
441
	switch f.KV().Architecture() {
Michael Yang's avatar
memory  
Michael Yang committed
442
	case "llama", "llama4":
Michael Yang's avatar
Michael Yang committed
443
444
445
446
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
447
448
449

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
450
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
451
452
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
453

Michael Yang's avatar
Michael Yang committed
454
455
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
Michael Yang's avatar
memory  
Michael Yang committed
456
			ff := uint64(f.KV().Uint("feed_forward_length"))
Michael Yang's avatar
Michael Yang committed
457
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
458
459
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
460
461
462
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
463
464
465
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
466
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
467
468
469
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
470
471
472
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
473
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
474
		for i := range kv {
Michael Yang's avatar
Michael Yang committed
475
			if slices.Contains(crossAttentionLayers, int32(i)) {
476
477
478
479
480
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
Michael Yang's avatar
Michael Yang committed
481
482
		}

Michael Yang's avatar
Michael Yang committed
483
484
485
486
487
488
489
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
Michael Yang's avatar
Michael Yang committed
490
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
Michael Yang's avatar
Michael Yang committed
491
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
492
				ropeFreqsCount = ropeFreqsWeights.Elements()
Michael Yang's avatar
Michael Yang committed
493
494
495
496
497
498
499
500
501
502
503
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Patrick Devine's avatar
Patrick Devine committed
504
	case "gemma", "gemma2", "gemma3":
Michael Yang's avatar
Michael Yang committed
505
506
507
508
509
510
511
512
513
514
515
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
516
517
518
519
520
521
522
523
524
525
526
527
528
529

		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
Michael Yang's avatar
Michael Yang committed
530
531
532
533
534
535
536
537
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
538
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
555

Michael Yang's avatar
Michael Yang committed
556
557
558
559
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
560
561
562
563
564
565
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
566
567
568
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
569
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
570
571
572
573
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
574
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
575
		)
Michael Yang's avatar
Michael Yang committed
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
602
603
	}

Michael Yang's avatar
Michael Yang committed
604
	return
Michael Yang's avatar
Michael Yang committed
605
}
606

607
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
Michael Yang's avatar
Michael Yang committed
608
609
610
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}
611

Michael Yang's avatar
Michael Yang committed
612
	for name, layer := range llm.Tensors().GroupLayers() {
613
		if name == "v" || strings.HasPrefix(name, "v.") {
Michael Yang's avatar
Michael Yang committed
614
615
			for _, tensor := range layer {
				weights += tensor.Size()
616
617
			}
		}
Michael Yang's avatar
Michael Yang committed
618
	}
619

Michael Yang's avatar
Michael Yang committed
620
621
	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
Michael Yang's avatar
Michael Yang committed
622
623
624
625
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}
626

Michael Yang's avatar
Michael Yang committed
627
	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
628

Michael Yang's avatar
Michael Yang committed
629
630
631
632
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}
633

Michael Yang's avatar
Michael Yang committed
634
	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
Michael Yang's avatar
Michael Yang committed
635
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
636

Michael Yang's avatar
Michael Yang committed
637
638
	switch llm.KV().Architecture() {
	case "mllama":
639
640
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

Michael Yang's avatar
Michael Yang committed
641
642
		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

643
		graphSize = 4 * (8 +
Michael Yang's avatar
Michael Yang committed
644
			imageSize*imageSize*numChannels*maxNumTiles +
645
646
647
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
648
	case "gemma3", "mistral3":
Michael Yang's avatar
Michael Yang committed
649
650
651
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
Michael Yang's avatar
memory  
Michael Yang committed
652
653
654
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
655
	}
Michael Yang's avatar
Michael Yang committed
656

657
658
659
	return weights, graphSize
}

660
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
661
662
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
663
664
665
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
666
667
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
668
669
670
671
672
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
673
674
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
675
676
677
678
679
680
681
682
683
684
685
686
687
688
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement maps a KV cache type name to the number of bytes
// used per cached element. Any name other than "q8_0" or "q4_0" is treated
// as f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	if cacheType == "q8_0" {
		// half the size of f16
		return 1
	}

	if cacheType == "q4_0" {
		// a quarter the size of f16
		return 0.5
	}

	// f16, which is also the fallback for unrecognised names
	return 2
}