ggml.go 9.65 KB
Newer Older
1
2
3
4
5
package llm

import (
	"encoding/binary"
	"errors"
Michael Yang's avatar
Michael Yang committed
6
	"fmt"
7
	"io"
Michael Yang's avatar
Michael Yang committed
8
	"strings"
9
10
)

Michael Yang's avatar
Michael Yang committed
11
12
13
14
// GGML pairs a decoded model with the container format it was read from.
// Both members are embedded, so the container's Name/Decode methods and the
// model's KV/Tensors methods are promoted onto GGML.
type GGML struct {
	container
	model
}
15

Michael Yang's avatar
Michael Yang committed
16
type model interface {
Michael Yang's avatar
Michael Yang committed
17
	KV() KV
Michael Yang's avatar
Michael Yang committed
18
	Tensors() Tensors
19
20
}

21
22
// KV holds model metadata as a map from key name to an untyped value.
type KV map[string]any

Michael Yang's avatar
Michael Yang committed
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// u64 looks up key and returns its value widened to uint64. Only uint64,
// uint32 and float64 values are recognised; a missing key or a value of any
// other type yields 0.
func (kv KV) u64(key string) uint64 {
	raw := kv[key]

	if n, ok := raw.(uint64); ok {
		return n
	}

	if n, ok := raw.(uint32); ok {
		return uint64(n)
	}

	if f, ok := raw.(float64); ok {
		return uint64(f)
	}

	return 0
}

// Architecture reports the string stored under "general.architecture", or
// "unknown" when the key is missing or its value is not a string.
func (kv KV) Architecture() string {
	arch, ok := kv["general.architecture"].(string)
	if !ok {
		return "unknown"
	}

	return arch
}

// ParameterCount returns the numeric value stored under
// "general.parameter_count", as read by u64.
func (kv KV) ParameterCount() uint64 {
	const key = "general.parameter_count"

	count := kv.u64(key)
	return count
}

Michael Yang's avatar
Michael Yang committed
48
func (kv KV) FileType() fileType {
Michael Yang's avatar
Michael Yang committed
49
	if u64 := kv.u64("general.file_type"); u64 > 0 {
Michael Yang's avatar
Michael Yang committed
50
		return fileType(uint32(u64))
Michael Yang's avatar
Michael Yang committed
51
52
	}

Michael Yang's avatar
Michael Yang committed
53
	return fileTypeUnknown
Michael Yang's avatar
Michael Yang committed
54
55
56
57
58
59
60
61
62
63
64
}

// BlockCount returns the numeric value stored under
// "<architecture>.block_count", where <architecture> is kv.Architecture().
func (kv KV) BlockCount() uint64 {
	key := fmt.Sprintf("%s.block_count", kv.Architecture())

	return kv.u64(key)
}

// HeadCount returns the numeric value stored under
// "<architecture>.attention.head_count", where <architecture> is
// kv.Architecture().
func (kv KV) HeadCount() uint64 {
	key := fmt.Sprintf("%s.attention.head_count", kv.Architecture())

	return kv.u64(key)
}

func (kv KV) HeadCountKV() uint64 {
Michael Yang's avatar
Michael Yang committed
65
66
67
68
69
	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
		return headCountKV
	}

	return 1
Michael Yang's avatar
Michael Yang committed
70
71
}

Michael Yang's avatar
Michael Yang committed
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// EmbeddingHeadCount returns EmbeddingLength divided by HeadCount (integer
// division), or 0 when HeadCount is zero.
func (kv KV) EmbeddingHeadCount() uint64 {
	heads := kv.HeadCount()
	if heads == 0 {
		return 0
	}

	return kv.EmbeddingLength() / heads
}

// EmbeddingHeadCountK returns the numeric value stored under
// "<architecture>.attention.key_length". When that value reads as zero it
// falls back to EmbeddingHeadCount.
func (kv KV) EmbeddingHeadCountK() uint64 {
	key := fmt.Sprintf("%s.attention.key_length", kv.Architecture())

	keyLength := kv.u64(key)
	if keyLength == 0 {
		return kv.EmbeddingHeadCount()
	}

	return keyLength
}

// EmbeddingHeadCountV returns the numeric value stored under
// "<architecture>.attention.value_length". When that value reads as zero it
// falls back to EmbeddingHeadCount.
func (kv KV) EmbeddingHeadCountV() uint64 {
	key := fmt.Sprintf("%s.attention.value_length", kv.Architecture())

	valueLength := kv.u64(key)
	if valueLength == 0 {
		return kv.EmbeddingHeadCount()
	}

	return valueLength
}

Michael Yang's avatar
Michael Yang committed
96
func (kv KV) GQA() uint64 {
Michael Yang's avatar
Michael Yang committed
97
	return kv.HeadCount() / kv.HeadCountKV()
Michael Yang's avatar
Michael Yang committed
98
99
100
101
102
103
104
105
106
107
}

// EmbeddingLength returns the numeric value stored under
// "<architecture>.embedding_length", where <architecture> is
// kv.Architecture().
func (kv KV) EmbeddingLength() uint64 {
	key := fmt.Sprintf("%s.embedding_length", kv.Architecture())

	return kv.u64(key)
}

// ContextLength returns the numeric value stored under
// "<architecture>.context_length", where <architecture> is
// kv.Architecture().
func (kv KV) ContextLength() uint64 {
	key := fmt.Sprintf("%s.context_length", kv.Architecture())

	return kv.u64(key)
}

Michael Yang's avatar
Michael Yang committed
108
109
110
111
112
// ChatTemplate returns the string stored under "tokenizer.chat_template", or
// the empty string when the key is missing or its value is not a string.
func (kv KV) ChatTemplate() string {
	if tmpl, ok := kv["tokenizer.chat_template"].(string); ok {
		return tmpl
	}

	return ""
}

Michael Yang's avatar
Michael Yang committed
113
114
115
116
117
118
119
// Tensors is a list of tensor descriptors.
type Tensors []*Tensor

func (ts Tensors) Layers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts {
		parts := strings.Split(t.Name, ".")
		if parts[0] == "blk" {
120
121
			// join first and second part, e.g. blk.%d
			parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
Michael Yang's avatar
Michael Yang committed
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
		}

		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
		}

		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
}

// Layer maps a tensor's name within a layer to its descriptor.
type Layer map[string]*Tensor

func (l Layer) size() (size uint64) {
	for _, t := range l {
Michael Yang's avatar
Michael Yang committed
138
		size += t.Size()
Michael Yang's avatar
Michael Yang committed
139
140
141
142
143
	}

	return size
}

144
// Tensor describes one tensor of a model: its name, its numeric type id
// (Kind, interpreted by blockSize and typeSize), its Offset and its Shape.
// Offset and the embedded io.WriterTo are excluded from JSON output.
type Tensor struct {
	Name   string `json:"name"`
	Kind   uint32 `json:"kind"`
	Offset uint64 `json:"-"`

	// Shape is the number of elements in each dimension
	Shape []uint64 `json:"shape"`

	io.WriterTo `json:"-"`
}

func (t Tensor) blockSize() uint64 {
156
	switch t.Kind {
Michael Yang's avatar
Michael Yang committed
157
	case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16
158
		return 1
Michael Yang's avatar
Michael Yang committed
159
	case 2, 3, 4, 5, 6, 7, 8, 9, 20: // Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, IQ4_NL
160
		return 32
161
	default: // All others
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
		return 256
	}
}

// typeSize returns the number of bytes occupied by one block of the tensor's
// Kind, where a block holds blockSize() elements. For unquantized kinds a
// block is a single element. Unrecognised kinds yield 0.
//
// The per-kind expressions mirror the layout of the corresponding ggml block
// structs (scale/min fields followed by the packed quants).
func (t Tensor) typeSize() uint64 {
	blockSize := t.blockSize()

	switch t.Kind {
	case 0: // FP32
		return 4
	case 1: // FP16
		return 2
	case 2: // Q4_0
		return 2 + blockSize/2
	case 3: // Q4_1
		return 2 + 2 + blockSize/2
	case 6: // Q5_0
		return 2 + 4 + blockSize/2
	case 7: // Q5_1
		return 2 + 2 + 4 + blockSize/2
	case 8: // Q8_0
		return 2 + blockSize
	case 9: // Q8_1
		// d and s are both half precision (2 bytes each)
		return 2 + 2 + blockSize
	case 10: // Q2_K
		return blockSize/16 + blockSize/4 + 2 + 2
	case 11: // Q3_K
		return blockSize/8 + blockSize/4 + 12 + 2
	case 12: // Q4_K
		return 2 + 2 + 12 + blockSize/2
	case 13: // Q5_K
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case 14: // Q6_K
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case 15: // Q8_K
		// the scale is a 4-byte float, followed by int8 quants and int16 sums
		return 4 + blockSize + 2*blockSize/16
	case 16: // IQ2_XXS
		return 2 + 2*blockSize/8
	case 17: // IQ2_XS
		return 2 + 2*blockSize/8 + blockSize/32
	case 18: // IQ3_XXS
		return 2 + blockSize/4 + blockSize/8
	case 19: // IQ1_S
		return 2 + blockSize/8 + blockSize/16
	case 20: // IQ4_NL
		return 2 + blockSize/2
	case 21: // IQ3_S
		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
	case 22: // IQ2_S
		return 2 + blockSize/4 + blockSize/16
	case 23: // IQ4_XS
		return 2 + 2 + blockSize/2 + blockSize/64
	case 24: // I8
		return 1
	case 25: // I16
		return 2
	case 26: // I32
		return 4
	case 27: // I64
		return 8
	case 28: // F64
		return 8
	case 29: // IQ1_M
		return blockSize/8 + blockSize/16 + blockSize/32
	case 30: // BF16
		return 2
	default:
		return 0
	}
}

// parameters returns the product of all entries in Shape, i.e. the tensor's
// total element count. An empty Shape yields 1.
func (t Tensor) parameters() uint64 {
	total := uint64(1)
	for i := range t.Shape {
		total *= t.Shape[i]
	}

	return total
}

Michael Yang's avatar
Michael Yang committed
239
// Size returns the tensor's storage size in bytes: the element count
// multiplied by the bytes per block and divided by the elements per block.
// The multiplication happens first, so the result is exact whenever the
// element count is a whole number of blocks.
func (t Tensor) Size() uint64 {
	return t.parameters() * t.typeSize() / t.blockSize()
}

243
244
type container interface {
	Name() string
Michael Yang's avatar
Michael Yang committed
245
	Decode(io.ReadSeeker) (model, error)
246
247
248
}

// File magic values, as obtained by reading the first four bytes of a file as
// a little-endian uint32.
const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c
	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66
	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74
	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61
	// Magic constants for `gguf` files (versioned, gguf). The two values are
	// byte-swapped forms of each other and distinguish little-endian from
	// big-endian files.
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)

Bruce MacDonald's avatar
Bruce MacDonald committed
262
263
// ErrUnsupportedFormat is returned by DecodeGGML when the file magic
// identifies one of the legacy ggml, ggmf or ggjt formats.
var ErrUnsupportedFormat = errors.New("unsupported model format")

Michael Yang's avatar
Michael Yang committed
264
265
266
267
268
269
270
271
272
273
func DetectGGMLType(b []byte) string {
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
	case FILE_MAGIC_GGMF:
		return "ggmf"
	case FILE_MAGIC_GGJT:
		return "ggjt"
	case FILE_MAGIC_GGLA:
		return "ggla"
274
	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
Michael Yang's avatar
Michael Yang committed
275
276
277
278
279
280
		return "gguf"
	default:
		return ""
	}
}

Michael Yang's avatar
Michael Yang committed
281
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
282
	var magic uint32
Michael Yang's avatar
Michael Yang committed
283
	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
Michael Yang's avatar
Michael Yang committed
284
		return nil, 0, err
285
286
287
	}

	var c container
288
	switch magic {
Bruce MacDonald's avatar
Bruce MacDonald committed
289
	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
Michael Yang's avatar
Michael Yang committed
290
		return nil, 0, ErrUnsupportedFormat
291
	case FILE_MAGIC_GGLA:
292
		c = &containerGGLA{}
293
	case FILE_MAGIC_GGUF_LE:
294
		c = &containerGGUF{ByteOrder: binary.LittleEndian}
295
296
	case FILE_MAGIC_GGUF_BE:
		c = &containerGGUF{ByteOrder: binary.BigEndian}
297
	default:
Michael Yang's avatar
Michael Yang committed
298
		return nil, 0, errors.New("invalid file magic")
299
300
	}

Michael Yang's avatar
Michael Yang committed
301
	model, err := c.Decode(rs)
Michael Yang's avatar
Michael Yang committed
302
303
304
	if errors.Is(err, io.EOF) {
		// noop
	} else if err != nil {
Michael Yang's avatar
Michael Yang committed
305
		return nil, 0, err
306
307
	}

Michael Yang's avatar
Michael Yang committed
308
309
	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
Michael Yang's avatar
Michael Yang committed
310
		return nil, 0, err
Michael Yang's avatar
Michael Yang committed
311
312
	}

313
	// final model type
314
315
316
	return &GGML{
		container: c,
		model:     model,
Michael Yang's avatar
Michael Yang committed
317
	}, offset, nil
318
}
Michael Yang's avatar
Michael Yang committed
319

Michael Yang's avatar
Michael Yang committed
320
321
322
323
324
func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
	embedding := llm.KV().EmbeddingLength()
	heads := llm.KV().HeadCount()
	headsKV := llm.KV().HeadCountKV()
	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
Michael Yang's avatar
Michael Yang committed
325

Michael Yang's avatar
Michael Yang committed
326
327
328
	embeddingHeads := llm.KV().EmbeddingHeadCount()
	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()

Michael Yang's avatar
Michael Yang committed
329
330
	layers := llm.Tensors().Layers()

Michael Yang's avatar
Michael Yang committed
331
332
	switch llm.KV().Architecture() {
	case "llama":
Michael Yang's avatar
Michael Yang committed
333
334
335
336
		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))

		partialOffload = 4 * batch * embedding
		partialOffload += max(
337
			// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
Michael Yang's avatar
Michael Yang committed
338
			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
Michael Yang's avatar
Michael Yang committed
339
340
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
		)
Michael Yang's avatar
Michael Yang committed
341

Michael Yang's avatar
Michael Yang committed
342
343
344
345
		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
			// mixtral 8x22b
			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
346
347
				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
Michael Yang's avatar
Michael Yang committed
348
349
350
			)
		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
			// mixtral 8x7b
Michael Yang's avatar
Michael Yang committed
351
352
353
			ffnGateWeight1 := ffnGateWeight.Shape[1]
			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
			partialOffload = max(
Michael Yang's avatar
Michael Yang committed
354
				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
Michael Yang's avatar
Michael Yang committed
355
356
357
				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
			)
		}
Michael Yang's avatar
Michael Yang committed
358
359
360
361
362
363
364
365
366
367
368
	case "gemma":
		fullOffload = 4 * batch * (embedding + vocab)
		partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
	case "command-r":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(2+4*embedding+context*(1+heads)),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
369
			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
Michael Yang's avatar
Michael Yang committed
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
		)
	case "qwen2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+2*embedding+context+context*heads),
		)

		partialOffload = max(
			4*batch*(embedding+vocab)+embedding*vocab*105/128,
			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
		)
	case "phi2":
		fullOffload = max(
			4*batch*(embedding+vocab),
			4*batch*(1+4*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
386

Michael Yang's avatar
Michael Yang committed
387
388
389
390
		partialOffload = max(
			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
			4*batch*(2+3*embedding+context+context*heads),
		)
Michael Yang's avatar
Michael Yang committed
391
392
393
394
395
396
	case "stablelm":
		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
		partialOffload = max(
			4*batch*(vocab+2*embedding),
			fullOffload,
		)
Michael Yang's avatar
Michael Yang committed
397
398
399
	case "deepseek2":
		fullOffload = max(
			4*batch*(3*embedding+vocab),
Michael Yang's avatar
Michael Yang committed
400
			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
Michael Yang's avatar
Michael Yang committed
401
402
403
404
		)

		partialOffload = max(
			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
Michael Yang's avatar
Michael Yang committed
405
			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
Michael Yang's avatar
Michael Yang committed
406
		)
Michael Yang's avatar
Michael Yang committed
407
408
	}

Michael Yang's avatar
Michael Yang committed
409
	return
Michael Yang's avatar
Michael Yang committed
410
}