ggml.go 18.6 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
package ggml
2
3
4
5

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
Michael Yang's avatar
Michael Yang committed
8
	"log/slog"
9
	"math"
10
	"slices"
Michael Yang's avatar
Michael Yang committed
11
	"strings"
12

Michael Yang's avatar
Michael Yang committed
13
	"github.com/ollama/ollama/fs/util/bufioutil"
14
15
)

Michael Yang's avatar
Michael Yang committed
16
17
18
19
// GGML couples a decoded model (its KV metadata and tensors) with the
// container format it was read from.
type GGML struct {
	container
	model
}
20

Michael Yang's avatar
Michael Yang committed
21
// model is the decoded content of a model file: its key-value metadata
// and its tensor listing.
type model interface {
	KV() KV
	Tensors() Tensors
}

26
27
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
28
func (kv KV) Architecture() string {
Michael Yang's avatar
Michael Yang committed
29
	return kv.String("general.architecture", "unknown")
Michael Yang's avatar
Michael Yang committed
30
31
}

32
func (kv KV) Kind() string {
Michael Yang's avatar
Michael Yang committed
33
	return kv.String("general.type", "unknown")
34
35
}

Michael Yang's avatar
Michael Yang committed
36
func (kv KV) ParameterCount() uint64 {
Michael Yang's avatar
Michael Yang committed
37
	return keyValue(kv, "general.parameter_count", uint64(0))
Michael Yang's avatar
Michael Yang committed
38
39
}

40
func (kv KV) FileType() FileType {
Michael Yang's avatar
Michael Yang committed
41
	if t := kv.Uint("general.file_type"); t > 0 {
42
		return FileType(t)
Michael Yang's avatar
Michael Yang committed
43
44
	}

45
	return FileTypeUnknown
Michael Yang's avatar
Michael Yang committed
46
47
48
}

func (kv KV) BlockCount() uint64 {
Michael Yang's avatar
Michael Yang committed
49
50
51
52
53
	return uint64(kv.Uint("block_count"))
}

func (kv KV) EmbeddingLength() uint64 {
	return uint64(kv.Uint("embedding_length"))
Michael Yang's avatar
Michael Yang committed
54
55
56
}

func (kv KV) HeadCount() uint64 {
Michael Yang's avatar
Michael Yang committed
57
	return uint64(kv.Uint("attention.head_count"))
Michael Yang's avatar
Michael Yang committed
58
59
60
}

func (kv KV) HeadCountKV() uint64 {
Michael Yang's avatar
Michael Yang committed
61
	return uint64(kv.Uint("attention.head_count_kv", 1))
Michael Yang's avatar
Michael Yang committed
62
63
}

Michael Yang's avatar
Michael Yang committed
64
65
func (kv KV) EmbeddingHeadCount() uint64 {
	if heads := kv.HeadCount(); heads > 0 {
Michael Yang's avatar
Michael Yang committed
66
		return kv.EmbeddingLength() / heads
Michael Yang's avatar
Michael Yang committed
67
68
69
70
71
72
	}

	return 0
}

func (kv KV) EmbeddingHeadCountK() uint64 {
Michael Yang's avatar
Michael Yang committed
73
	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
Michael Yang's avatar
Michael Yang committed
74
75
76
}

func (kv KV) EmbeddingHeadCountV() uint64 {
Michael Yang's avatar
Michael Yang committed
77
	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
Michael Yang's avatar
Michael Yang committed
78
79
}

Michael Yang's avatar
Michael Yang committed
80
func (kv KV) GQA() uint64 {
Michael Yang's avatar
Michael Yang committed
81
	return kv.HeadCount() / kv.HeadCountKV()
Michael Yang's avatar
Michael Yang committed
82
83
84
}

func (kv KV) ContextLength() uint64 {
Michael Yang's avatar
Michael Yang committed
85
	return uint64(kv.Uint("context_length"))
Michael Yang's avatar
Michael Yang committed
86
87
}

Michael Yang's avatar
Michael Yang committed
88
func (kv KV) ChatTemplate() string {
Michael Yang's avatar
Michael Yang committed
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
	return kv.String("tokenizer.chat_template")
}

// String returns the string stored at key, or the first defaultValue
// ("" when none is given) if the key is absent.
func (kv KV) String(key string, defaultValue ...string) string {
	defaultValue = append(defaultValue, "")
	return keyValue(kv, key, defaultValue...)
}

// Uint returns the uint32 stored at key, or the first defaultValue
// (0 when none is given) if the key is absent.
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
	defaultValue = append(defaultValue, 0)
	return keyValue(kv, key, defaultValue...)
}

// Float returns the float32 stored at key, or the first defaultValue
// (0 when none is given) if the key is absent.
func (kv KV) Float(key string, defaultValue ...float32) float32 {
	defaultValue = append(defaultValue, 0)
	return keyValue(kv, key, defaultValue...)
}

104
105
106
107
// Bool returns the bool stored at key, or the first defaultValue
// (false when none is given) if the key is absent.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	defaultValue = append(defaultValue, false)
	return keyValue(kv, key, defaultValue...)
}

Michael Yang's avatar
Michael Yang committed
108
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
Michael Yang's avatar
Michael Yang committed
109
	return keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}).values
Michael Yang's avatar
Michael Yang committed
110
111
}

Michael Yang's avatar
Michael Yang committed
112
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
Michael Yang's avatar
Michael Yang committed
113
	return keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}).values
Michael Yang's avatar
Michael Yang committed
114
115
}

Michael Yang's avatar
Michael Yang committed
116
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
Michael Yang's avatar
Michael Yang committed
117
	return keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}).values
Michael Yang's avatar
Michael Yang committed
118
119
}

Patrick Devine's avatar
Patrick Devine committed
120
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
Michael Yang's avatar
Michael Yang committed
121
	return keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}).values
Patrick Devine's avatar
Patrick Devine committed
122
123
}

124
func (kv KV) OllamaEngineRequired() bool {
125
126
127
	return slices.Contains([]string{
		"gemma3",
		"mistral3",
Michael Yang's avatar
llama4  
Michael Yang committed
128
		"llama4",
129
		"mllama",
130
		"qwen25vl",
131
	}, kv.Architecture())
132
133
}

Michael Yang's avatar
Michael Yang committed
134
// valueTypes enumerates the scalar types that may be stored in a KV map
// and retrieved through keyValue.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

// arrayValueTypes enumerates the array-wrapper pointer types that may be
// stored in a KV map, one per scalar type in valueTypes.
type arrayValueTypes interface {
	*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
		*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
		*array[string] | *array[float32] | *array[float64] | *array[bool]
}

Michael Yang's avatar
Michael Yang committed
146
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) T {
Michael Yang's avatar
Michael Yang committed
147
148
149
150
151
152
153
154
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

	if val, ok := kv[key]; ok {
		return val.(T)
	}

155
	slog.Debug("key not found", "key", key, "default", defaultValue[0])
Michael Yang's avatar
Michael Yang committed
156
157
158
	return defaultValue[0]
}

159
// Tensors is the tensor listing of a model file.
type Tensors struct {
	// items holds the tensors in file order.
	items  []*Tensor
	// Offset is a byte offset into the model file — presumably where the
	// tensor data section begins; confirm against the container decoder.
	Offset uint64
}
Michael Yang's avatar
Michael Yang committed
163

Michael Yang's avatar
Michael Yang committed
164
165
166
167
func (s Tensors) Items(prefix ...string) []*Tensor {
	if len(prefix) == 0 {
		return s.items
	}
168

Michael Yang's avatar
Michael Yang committed
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
	var items []*Tensor
	for _, t := range s.items {
		if strings.HasPrefix(t.Name, prefix[0]) {
			items = append(items, t)
		}
	}

	return items
}

// GroupLayers groups tensors by layer. The layer key is the first
// dot-separated component of the tensor name, except that "blk" and
// "mm" keep their following index (so "blk.0.attn_q.weight" groups
// under "blk.0" with suffix "attn_q.weight"). The remaining name
// components, re-joined with dots, key the tensor inside its Layer.
func (ts Tensors) GroupLayers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts.items {
		parts := strings.Split(t.Name, ".")
		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			if len(parts) > index+2 {
				// blk and mm should have a number after them, join it
				parts = append(
					[]string{strings.Join(parts[:index+2], ".")},
					parts[index+2:]...)
			}
		}

		// Lazily create the Layer map for this group on first use.
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
		}

		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
}

type Layer map[string]*Tensor

Michael Yang's avatar
Michael Yang committed
204
func (l Layer) Size() (size uint64) {
Michael Yang's avatar
Michael Yang committed
205
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
206
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
207
208
209
210
211
	}

	return size
}

212
// Tensor describes one tensor in a model file.
type Tensor struct {
	// Name is the dotted tensor name, e.g. "blk.0.attn_q.weight".
	Name   string `json:"name"`
	// Kind is the numeric GGML tensor type; convert with TensorType(Kind).
	Kind   uint32 `json:"kind"`
	// Offset is the tensor's byte offset — presumably relative to the
	// start of the tensor data section; confirm against the decoder.
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	// WriterTo streams the tensor's raw data.
	io.WriterTo `json:"-"`
}

223
224
225
226
227
228
229
230
// block returns the layer index parsed from a "blk.N." name prefix, or
// -1 when the tensor name does not have that form.
func (t Tensor) block() (n int) {
	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
		return -1
	}

	return
}

231
// blockSize returns the number of elements per block for this tensor's
// type.
func (t Tensor) blockSize() uint64 {
	return TensorType(t.Kind).BlockSize()
}

// BlockSize returns the number of elements stored per block for the
// tensor type: 1 for plain scalar types, 32 for the small quantization
// formats, and 256 for everything else (the K- and remaining
// IQ-quantizations).
func (t TensorType) BlockSize() uint64 {
	switch t {
	case
		0,  // F32
		1,  // F16
		24, // I8
		25, // I16
		26, // I32
		27, // I64
		28, // F64
		30: // BF16
		return 1
	case
		2,  // Q4_0
		3,  // Q4_1
		6,  // Q5_0
		7,  // Q5_1
		8,  // Q8_0
		9,  // Q8_1
		20: // IQ4_NL
		return 32
	default:
		return 256
	}
}

// typeSize returns the number of bytes per block for this tensor's type.
func (t Tensor) typeSize() uint64 {
	tt := TensorType(t.Kind)
	return tt.TypeSize()
}

// TypeSize returns the number of bytes used to store one block of the
// tensor type (so bytes-per-element is TypeSize()/BlockSize()). Each
// quantized case sums the format's per-block header fields (scales,
// mins, etc.) and packed element payload. Unknown types return 0.
func (t TensorType) TypeSize() uint64 {
	blockSize := t.BlockSize()

	switch t {
	case TensorTypeF32:
		return 4
	case TensorTypeF16:
		return 2
	case TensorTypeQ4_0:
		return 2 + blockSize/2
	case TensorTypeQ4_1:
		return 2 + 2 + blockSize/2
	case TensorTypeQ5_0:
		return 2 + 4 + blockSize/2
	case TensorTypeQ5_1:
		return 2 + 2 + 4 + blockSize/2
	case TensorTypeQ8_0:
		return 2 + blockSize
	case TensorTypeQ8_1:
		return 2 + 2 + blockSize
	case TensorTypeQ2_K:
		return blockSize/16 + blockSize/4 + 2 + 2
	case TensorTypeQ3_K:
		return blockSize/8 + blockSize/4 + 12 + 2
	case TensorTypeQ4_K:
		return 2 + 2 + 12 + blockSize/2
	case TensorTypeQ5_K:
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case TensorTypeQ6_K:
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case TensorTypeQ8_K:
		return 4 + blockSize + 2*blockSize/16
	case tensorTypeIQ2_XXS:
		return 2 + 2*blockSize/8
	case tensorTypeIQ2_XS:
		return 2 + 2*blockSize/8 + blockSize/32
	case tensorTypeIQ3_XXS:
		return 2 + blockSize/4 + blockSize/8
	case tensorTypeIQ1_S:
		return 2 + blockSize/8 + blockSize/16
	case tensorTypeIQ4_NL:
		return 2 + blockSize/2
	case tensorTypeIQ3_S:
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case tensorTypeIQ2_S:
		return 2 + blockSize/4 + blockSize/16
	case tensorTypeIQ4_XS:
		return 2 + 2 + blockSize/2 + blockSize/64
	case TensorTypeI8:
		return 1
	case TensorTypeI16:
		return 2
	case TensorTypeI32:
		return 4
	case TensorTypeI64:
		return 8
	case TensorTypeF64:
		return 8
	case tensorTypeIQ1_M:
		return blockSize/8 + blockSize/16 + blockSize/32
	case TensorTypeBF16:
		return 2
	default:
		// Unrecognized type; callers see a zero size.
		return 0
	}
}

332
// Elements returns the total number of elements in the tensor: the
// product of its dimensions.
func (t Tensor) Elements() uint64 {
	n := uint64(1)
	for i := range t.Shape {
		n *= t.Shape[i]
	}
	return n
}

Michael Yang's avatar
Michael Yang committed
340
// Size returns the tensor's storage size in bytes: elements scaled by
// bytes-per-block over elements-per-block.
func (t Tensor) Size() uint64 {
	bytesPerBlock := t.typeSize()
	return t.Elements() * bytesPerBlock / t.blockSize()
}

344
// Type returns the human-readable name of the tensor's type.
func (t Tensor) Type() string {
	kind := TensorType(t.Kind)
	return kind.String()
}

348
349
// container abstracts a model file container format: Name identifies
// the format and Decode parses a model from the reader.
type container interface {
	Name() string
	Decode(io.ReadSeeker) (model, error)
}

// Magic numbers identifying model container formats; the first four
// bytes of a model file select one of these (see DetectContentType).
const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constant for `gguf` files (versioned, gguf)
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
367
368
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
369
func DetectContentType(b []byte) string {
Michael Yang's avatar
Michael Yang committed
370
371
372
373
374
375
376
377
378
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
379
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
380
381
382
383
384
385
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
386
// Decode decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
//
// It returns the decoded model and the reader offset at which decoding
// stopped (i.e. where tensor data begins for GGUF files — TODO confirm).
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
	// Wrap in a buffered seeker to avoid many small reads during decoding.
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

	var magic uint32
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
		return nil, 0, err
	}

	// Select a container implementation from the magic number; only GGUF
	// (little- and big-endian) is accepted here.
	var c container
	switch magic {
	case FILE_MAGIC_GGUF_LE:
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
	case FILE_MAGIC_GGUF_BE:
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
	default:
		return nil, 0, errors.New("invalid file magic")
	}

	model, err := c.Decode(rs)
	if err != nil {
		return nil, 0, err
	}

	// Record how far decoding consumed the stream.
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
		return nil, 0, err
	}

	// final model type
	return &GGML{
		container: c,
		model:     model,
	}, offset, nil
}
Michael Yang's avatar
Michael Yang committed
424

425
// GraphSize estimates memory requirements for running the model: kv is
// the KV cache size in bytes per layer, and partialOffload/fullOffload
// are compute graph sizes (bytes) for partial and full GPU offload.
// The per-architecture expressions below are empirically derived
// estimates — NOTE(review): treat constants like 105/128 and 9/16 as
// tuned fudge factors, not exact formulas.
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
	embedding := f.KV().EmbeddingLength()
	heads := f.KV().HeadCount()
	headsKV := f.KV().HeadCountKV()
	// Vocabulary size comes from the token array's declared size.
	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)

	embeddingHeads := f.KV().EmbeddingHeadCount()
	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
	embeddingHeadsV := f.KV().EmbeddingHeadCountV()

	layers := f.Tensors().GroupLayers()

	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
	// Default per-layer KV cache: context * (K dim + V dim) * KV heads,
	// scaled by the cache element size. Architectures below may override
	// individual layers.
	kv = make([]uint64, f.KV().BlockCount())
	for i := range kv {
		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	}

	switch f.KV().Architecture() {
	case "llama", "llama4":
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)

		partialOffload = 4 * batch * embedding
		partialOffload += max(
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)

		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
			ff := uint64(f.KV().Uint("feed_forward_length"))
			partialOffload = max(
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

		// Cross-attention layers cache vision tokens instead of text
		// context, so their KV entries are sized differently.
		crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
		for i := range kv {
			if slices.Contains(crossAttentionLayers, int32(i)) {
				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
					4 * // sizeof(float32)
					visionTokens *
					tiles
			}
		}

		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
				ropeFreqsCount = ropeFreqsWeights.Elements()
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
	case "gemma", "gemma2", "gemma3":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)

		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
		// engine. Gemma3 always uses the Ollama engine.
		if f.KV().Architecture() == "gemma3" {
			const gemma3GlobalCacheCount = 6
			slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
			for i := range kv {
				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
				// layers are the smaller local (sliding) layers.
				if (i+1)%gemma3GlobalCacheCount != 0 {
					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
				}
			}
		}
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
		)
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
	}

	return
}
608

609
// VisionGraphSize estimates the weight and compute-graph memory, in
// bytes, used by the vision tower of a multimodal model. Both results
// are zero when the model declares no vision blocks.
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
	if llm.KV().Uint("vision.block_count") == 0 {
		return
	}

	// Vision weights live under the "v" layer group ("v" or "v.*").
	for name, layer := range llm.Tensors().GroupLayers() {
		if name == "v" || strings.HasPrefix(name, "v.") {
			for _, tensor := range layer {
				weights += tensor.Size()
			}
		}
	}

	imageSize := uint64(llm.KV().Uint("vision.image_size"))
	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
	if patchSize == 0 {
		slog.Warn("unknown patch size for vision model")
		return
	}

	numChannels := uint64(llm.KV().Uint("vision.num_channels"))

	// Patch grid plus an optional class-embedding token.
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
		numPatches++
	}

	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))

	switch llm.KV().Architecture() {
	case "mllama":
		// NOTE(review): this pads numPatches toward a multiple of 8 but
		// `(numPatches%8)%8` equals `numPatches%8`, so already-aligned
		// counts still gain a full extra 8 — confirm this is intended.
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))

		graphSize = 4 * (8 +
			imageSize*imageSize*numChannels*maxNumTiles +
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
	case "gemma3", "mistral3":
		graphSize = 4 * (imageSize*imageSize*numChannels +
			embeddingLength*patchSize +
			numPatches*numPatches*headCount)
	case "qwen25vl":
		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
		mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
		temporalPatchSize := uint64(2)

		// Calculate max possible patches based on max_pixels
		maxHeight := uint64(math.Sqrt(float64(maxPixels)))
		maxWidth := maxPixels / maxHeight
		maxGridHeight := maxHeight / patchSize
		maxGridWidth := maxWidth / patchSize
		// Account for merged patches (2x2 grid)
		numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)

		// Calculate graph size based on typical operations in ProcessImage and createPatches
		graphSize = 4 * (maxPixels*numChannels + // Original image storage
			// Normalized pixels
			maxPixels*numChannels +
			// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
			numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
			// Self-attention calculations (similar to other architectures)
			numPatches*numPatches*headCount +
			// Additional buffer for processing
			embeddingLength*numPatches)
	case "llama4":
		// vision graph is computed independently in the same schedule
		// and is negligible compared to the worst case text graph
	}

	return weights, graphSize
}

685
// SupportsKVCacheType checks if the requested cache type is supported
Michael Yang's avatar
Michael Yang committed
686
687
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
688
689
690
}

// SupportsFlashAttention checks if the model supports flash attention
Michael Yang's avatar
Michael Yang committed
691
692
func (f GGML) SupportsFlashAttention() bool {
	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
693
694
695
696
697
	if isEmbedding {
		return false
	}

	// Check head counts match and are non-zero
Michael Yang's avatar
Michael Yang committed
698
699
	headCountK := f.KV().EmbeddingHeadCountK()
	headCountV := f.KV().EmbeddingHeadCountV()
700
701
702
703
704
705
706
707
708
709
710
711
712
713
	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}

// kvCacheBytesPerElement returns the number of bytes one KV cache
// element occupies for the given cache type. Unrecognized types fall
// back to f16.
func kvCacheBytesPerElement(cacheType string) float64 {
	switch cacheType {
	case "q4_0":
		return 0.5 // 1/4 of fp16
	case "q8_0":
		return 1 // 1/2 of fp16
	}
	return 2 // f16 (default)
}