ggml.go 12.7 KB
Newer Older
1
2
3
4
5
package llm

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
8
	"slices"
Michael Yang's avatar
Michael Yang committed
9
	"strings"
10
	"sync"
11
12

	"github.com/ollama/ollama/util/bufioutil"
13
14
)

Michael Yang's avatar
Michael Yang committed
15
16
17
18
// GGML is a decoded model file: the container it was read from together with
// the model that container produced. Both are embedded, so their methods are
// promoted onto GGML.
type GGML struct {
	container
	model
}
19

Michael Yang's avatar
Michael Yang committed
20
type model interface {
Michael Yang's avatar
Michael Yang committed
21
	KV() KV
22
	Tensors() *Tensors
23
24
}

25
26
// KV is model metadata keyed by name. Values are stored untyped; the accessor
// methods on KV type-assert them to the expected Go type.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// u64 looks up key and widens the stored value to uint64.
//
// Values held as uint64, uint32 or float64 are converted. A missing key, or a
// value of any other type, yields 0.
func (kv KV) u64(key string) uint64 {
	raw := kv[key]

	if n, ok := raw.(uint64); ok {
		return n
	}

	if n, ok := raw.(uint32); ok {
		return uint64(n)
	}

	if f, ok := raw.(float64); ok {
		return uint64(f)
	}

	return 0
}

// Architecture returns the string stored under "general.architecture", or
// "unknown" when the key is absent or does not hold a string.
func (kv KV) Architecture() string {
	arch, ok := kv["general.architecture"].(string)
	if !ok {
		return "unknown"
	}

	return arch
}

48
49
50
51
52
53
54
55
// Kind returns the string stored under "general.type", or "unknown" when the
// key is absent or does not hold a string.
func (kv KV) Kind() string {
	kind, ok := kv["general.type"].(string)
	if !ok {
		return "unknown"
	}

	return kind
}

Michael Yang's avatar
Michael Yang committed
56
57
58
59
// ParameterCount returns the value stored under "general.parameter_count",
// or 0 when u64 cannot produce one.
func (kv KV) ParameterCount() uint64 {
	const key = "general.parameter_count"

	return kv.u64(key)
}

Michael Yang's avatar
Michael Yang committed
60
func (kv KV) FileType() fileType {
Michael Yang's avatar
Michael Yang committed
61
	if u64 := kv.u64("general.file_type"); u64 > 0 {
Michael Yang's avatar
Michael Yang committed
62
		return fileType(uint32(u64))
Michael Yang's avatar
Michael Yang committed
63
64
	}

Michael Yang's avatar
Michael Yang committed
65
	return fileTypeUnknown
Michael Yang's avatar
Michael Yang committed
66
67
68
69
70
71
72
73
74
75
76
}

// BlockCount returns the value stored under "<architecture>.block_count",
// or 0 when u64 cannot produce one.
func (kv KV) BlockCount() uint64 {
	key := fmt.Sprintf("%s.block_count", kv.Architecture())

	return kv.u64(key)
}

// HeadCount returns the value stored under
// "<architecture>.attention.head_count", or 0 when u64 cannot produce one.
func (kv KV) HeadCount() uint64 {
	key := fmt.Sprintf("%s.attention.head_count", kv.Architecture())

	return kv.u64(key)
}

func (kv KV) HeadCountKV() uint64 {
Michael Yang's avatar
Michael Yang committed
77
78
79
80
81
	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
		return headCountKV
	}

	return 1
Michael Yang's avatar
Michael Yang committed
82
83
}

Michael Yang's avatar
Michael Yang committed
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
// EmbeddingHeadCount returns EmbeddingLength divided by HeadCount (integer
// division), or 0 when HeadCount is 0.
func (kv KV) EmbeddingHeadCount() uint64 {
	heads := kv.HeadCount()
	if heads == 0 {
		return 0
	}

	return kv.EmbeddingLength() / heads
}

// EmbeddingHeadCountK returns the value stored under
// "<architecture>.attention.key_length" when it is non-zero, and falls back
// to EmbeddingHeadCount otherwise.
func (kv KV) EmbeddingHeadCountK() uint64 {
	key := fmt.Sprintf("%s.attention.key_length", kv.Architecture())

	keyLength := kv.u64(key)
	if keyLength == 0 {
		return kv.EmbeddingHeadCount()
	}

	return keyLength
}

// EmbeddingHeadCountV returns the value stored under
// "<architecture>.attention.value_length" when it is non-zero, and falls back
// to EmbeddingHeadCount otherwise.
func (kv KV) EmbeddingHeadCountV() uint64 {
	key := fmt.Sprintf("%s.attention.value_length", kv.Architecture())

	valueLength := kv.u64(key)
	if valueLength == 0 {
		return kv.EmbeddingHeadCount()
	}

	return valueLength
}

Michael Yang's avatar
Michael Yang committed
108
func (kv KV) GQA() uint64 {
Michael Yang's avatar
Michael Yang committed
109
	return kv.HeadCount() / kv.HeadCountKV()
Michael Yang's avatar
Michael Yang committed
110
111
112
113
114
115
116
117
118
119
}

// EmbeddingLength returns the value stored under
// "<architecture>.embedding_length", or 0 when u64 cannot produce one.
func (kv KV) EmbeddingLength() uint64 {
	key := fmt.Sprintf("%s.embedding_length", kv.Architecture())

	return kv.u64(key)
}

// ContextLength returns the value stored under
// "<architecture>.context_length", or 0 when u64 cannot produce one.
func (kv KV) ContextLength() uint64 {
	key := fmt.Sprintf("%s.context_length", kv.Architecture())

	return kv.u64(key)
}

Michael Yang's avatar
Michael Yang committed
120
121
122
123
124
// ChatTemplate returns the string stored under "tokenizer.chat_template", or
// the empty string when the key is absent or does not hold a string.
func (kv KV) ChatTemplate() string {
	if tmpl, ok := kv["tokenizer.chat_template"].(string); ok {
		return tmpl
	}

	return ""
}

125
126
127
type Tensors struct {
	Items  []*Tensor
	Offset uint64
Michael Yang's avatar
Michael Yang committed
128

129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
	layers     map[string]Layer
	layersOnce sync.Once
}

func (ts *Tensors) Layers() map[string]Layer {
	ts.layersOnce.Do(func() {
		ts.layers = make(map[string]Layer)
		for _, t := range ts.Items {
			parts := strings.Split(t.Name, ".")
			if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
				if len(parts) > index+2 {
					// blk and mm should have a number after them, join it
					parts = append(
						[]string{strings.Join(parts[:index+2], ".")},
						parts[index+2:]...)
				}
			}

			if _, ok := ts.layers[parts[0]]; !ok {
				ts.layers[parts[0]] = make(Layer)
			}

			ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
Michael Yang's avatar
Michael Yang committed
152
		}
153
	})
Michael Yang's avatar
Michael Yang committed
154

155
	return ts.layers
Michael Yang's avatar
Michael Yang committed
156
157
158
159
160
161
}

// Layer maps a tensor's name within a layer (the part of the full tensor name
// left after the layer prefix is removed) to its descriptor.
type Layer map[string]*Tensor

func (l Layer) size() (size uint64) {
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
162
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
163
164
165
166
167
	}

	return size
}

168
// Tensor describes a single tensor of a model.
type Tensor struct {
	// Name is the tensor's full dotted name, e.g. "blk.0.attn_q.weight".
	Name string `json:"name"`
	// Kind is the numeric type code that blockSize and typeSize switch on.
	Kind uint32 `json:"kind"`
	// Offset is excluded from JSON; presumably the tensor's position within
	// the tensor data — TODO confirm against the container decoders.
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	// WriterTo, when set, supplies the tensor's WriteTo method; it is
	// excluded from JSON.
	io.WriterTo `json:"-"`
}

179
180
181
182
183
184
185
186
// block extracts the block number from a name of the form "blk.<n>.…".
// It returns -1 when the name does not match that pattern.
func (t Tensor) block() (n int) {
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &n)
	if err != nil {
		return -1
	}

	return n
}

187
func (t Tensor) blockSize() uint64 {
188
	switch t.Kind {
Michael Yang's avatar
Michael Yang committed
189
	case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16
190
		return 1
Michael Yang's avatar
Michael Yang committed
191
	case 2, 3, 4, 5, 6, 7, 8, 9, 20: // Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, IQ4_NL
192
		return 32
193
	default: // All others
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
		return 256
	}
}

func (t Tensor) typeSize() uint64 {
	blockSize := t.blockSize()

	switch t.Kind {
	case 0: // FP32
		return 4
	case 1: // FP16
		return 2
	case 2: // Q4_0
		return 2 + blockSize/2
	case 3: // Q4_1
		return 2 + 2 + blockSize/2
	case 6: // Q5_0
		return 2 + 4 + blockSize/2
	case 7: // Q5_1
		return 2 + 2 + 4 + blockSize/2
	case 8: // Q8_0
		return 2 + blockSize
	case 9: // Q8_1
		return 4 + 4 + blockSize
	case 10: // Q2_K
		return blockSize/16 + blockSize/4 + 2 + 2
	case 11: // Q3_K
		return blockSize/8 + blockSize/4 + 12 + 2
	case 12: // Q4_K
		return 2 + 2 + 12 + blockSize/2
	case 13: // Q5_K
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case 14: // Q6_K
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case 15: // Q8_K
		return 2 + blockSize + 2*blockSize/16
	case 16: // IQ2_XXS
		return 2 + 2*blockSize/8
	case 17: // IQ2_XS
		return 2 + 2*blockSize/8 + blockSize/32
	case 18: // IQ3_XXS
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
		return 2 + blockSize/4 + blockSize/8
	case 19: // IQ1_S
		return 2 + blockSize/8 + blockSize/16
	case 20: // IQ4_NL
		return 2 + blockSize/2
	case 21: // IQ3_S
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case 22: // IQ2_S
		return 2 + blockSize/4 + blockSize/16
	case 23: // IQ4_XS
		return 2 + 2 + blockSize/2 + blockSize/64
	case 24: // I8
		return 1
	case 25: // I16
		return 2
	case 26: // I32
		return 4
	case 27: // I64
		return 8
	case 28: // F64
		return 8
	case 29: // IQ1_M
		return blockSize/8 + blockSize/16 + blockSize/32
frob's avatar
frob committed
258
259
	case 30: // BF16
		return 2
260
261
262
263
264
265
266
267
268
269
270
271
272
	default:
		return 0
	}
}

// parameters returns the product of every dimension in Shape, i.e. the
// tensor's element count. An empty Shape yields 1.
func (t Tensor) parameters() uint64 {
	count := uint64(1)
	for i := range t.Shape {
		count *= t.Shape[i]
	}

	return count
}

Michael Yang's avatar
Michael Yang committed
273
// Size returns the tensor's size in bytes: the element count multiplied by
// the bytes per block and divided by the elements per block. The
// multiplication is done first so that the integer division does not lose
// precision.
func (t Tensor) Size() uint64 {
	return t.parameters() * t.typeSize() / t.blockSize()
}

277
278
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
279
	Decode(io.ReadSeeker) (model, error)
280
281
282
}

// File magics, as the uint32 obtained by reading the first four bytes of a
// model file in little-endian order.
const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constants for `gguf` files (versioned, gguf). The bytes "GGUF"
	// read little-endian give the _LE value; the byte-swapped _BE value is
	// what the same read yields for a big-endian file.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
296
297
// ErrUnsupportedFormat is returned by DecodeGGML when the file magic
// identifies a ggml, ggmf or ggjt file, which it recognizes but does not
// decode.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
298
299
300
301
302
303
304
305
306
307
func DetectGGMLType(b []byte) string {
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
308
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
309
310
311
312
313
314
		return "gguf"
	default:
		return ""
	}
}

315
316
317
318
319
320
321
322
323
324
325
326
// DecodeGGML decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
	if maxArraySize == 0 {
		maxArraySize = 1024
	}

	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

327
	var magic uint32
Michael Yang's avatar
Michael Yang committed
328
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
Michael Yang's avatar
Michael Yang committed
329
		return nil, 0, err
330
331
332
	}

	var c container
333
	switch magic {
Bruce MacDonald's avatar
Bruce MacDonald committed
334
	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
Michael Yang's avatar
Michael Yang committed
335
		return nil, 0, ErrUnsupportedFormat
336
	case FILE_MAGIC_GGLA:
337
		c = &containerGGLA{}
338
	case FILE_MAGIC_GGUF_LE:
339
		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
340
	case FILE_MAGIC_GGUF_BE:
341
		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
342
	default:
Michael Yang's avatar
Michael Yang committed
343
		return nil, 0, errors.New("invalid file magic")
344
345
	}

Michael Yang's avatar
Michael Yang committed
346
	model, err := c.Decode(rs)
347
	if err != nil {
Michael Yang's avatar
Michael Yang committed
348
		return nil, 0, err
349
350
	}

Michael Yang's avatar
Michael Yang committed
351
352
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
Michael Yang's avatar
Michael Yang committed
353
		return nil, 0, err
Michael Yang's avatar
Michael Yang committed
354
355
	}

356
	// final model type
357
358
359
	return &GGML{
		container: c,
		model:     model,
Michael Yang's avatar
Michael Yang committed
360
	}, offset, nil
361
}
Michael Yang's avatar
Michael Yang committed
362

Michael Yang's avatar
Michael Yang committed
363
func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) {
Michael Yang's avatar
Michael Yang committed
364
365
366
	embedding := llm.KV().EmbeddingLength()
	heads := llm.KV().HeadCount()
	headsKV := llm.KV().HeadCountKV()
367
	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
Michael Yang's avatar
Michael Yang committed
368

Michael Yang's avatar
Michael Yang committed
369
370
	embeddingHeads := llm.KV().EmbeddingHeadCount()
	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
Michael Yang's avatar
Michael Yang committed
371
	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
Michael Yang's avatar
Michael Yang committed
372

Michael Yang's avatar
Michael Yang committed
373
374
	layers := llm.Tensors().Layers()

Michael Yang's avatar
Michael Yang committed
375
376
	kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV

Michael Yang's avatar
Michael Yang committed
377
378
	switch llm.KV().Architecture() {
	case "llama":
Michael Yang's avatar
Michael Yang committed
379
380
381
382
		fullOffload = max(
			4*batch*(1+4*embedding+context*(1+heads)),
			4*batch*(embedding+vocab),
		)
Michael Yang's avatar
Michael Yang committed
383
384
385

		partialOffload = 4 * batch * embedding
		partialOffload += max(
Michael Yang's avatar
Michael Yang committed
386
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
387
388
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
389

Michael Yang's avatar
Michael Yang committed
390
391
392
393
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
394
395
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
396
397
398
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
399
400
401
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
402
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
403
404
405
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
406
407
408
	case "mllama":
		var visionTokens, tiles uint64 = 1601, 4

Michael Yang's avatar
Michael Yang committed
409
410
411
412
413
414
415
416
417
418
419
420
		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
			kv = headsKV *
				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
				(2* // sizeof(float16)
					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
					context +
					4* // sizeof(float32)
						uint64(crossAttentionLayers.size)* // num cross attention layers
						visionTokens*
						tiles)
		}

Michael Yang's avatar
Michael Yang committed
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
		fullOffload = max(
			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
			// vocab graph
			4*batch*(embedding+vocab),
		)

		var ropeFreqsCount uint64
		if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
				ropeFreqsCount = ropeFreqsWeights.parameters()
			}
		}

		partialOffload = max(
			4*(batch*
				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
				ropeFreqsCount+
				embeddingHeadsK*context*headsKV),
			// vocab graph
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
442
443
444
445
446
447
448
449
450
451
452
453
	case "gemma", "gemma2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
		)

		partialOffload = max(
			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
				4*embeddingHeadsK*context*8+
				embedding*embeddingHeadsK*heads*9/16,
		)
Michael Yang's avatar
Michael Yang committed
454
455
456
457
458
459
460
461
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
462
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
479

Michael Yang's avatar
Michael Yang committed
480
481
482
483
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
484
485
486
487
488
489
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
490
491
492
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
493
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
494
495
496
497
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
498
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
499
		)
Michael Yang's avatar
Michael Yang committed
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
	case "chatglm":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
			fullOffload = max(
				fullOffload,
				4*batch*(2+
					2*embedding+
					context+
					context*heads+
					embeddingHeadsK*heads+
					qkvBias.Shape[0]),
			)

			partialOffload = max(
				partialOffload,
				4*batch*(1+
					2*embedding+
					embeddingHeadsK*heads+
					context+
					context*heads)+
					4*embeddingHeadsK*context+
					4*context*embeddingHeadsK+
					4*qkvBias.Shape[0],
			)
		}
Michael Yang's avatar
Michael Yang committed
526
527
	}

Michael Yang's avatar
Michael Yang committed
528
	return
Michael Yang's avatar
Michael Yang committed
529
}