ggml.go 20.2 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
package ggml

3
4
5
6
7
8
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
Michael Yang's avatar
Michael Yang committed
9
10
11
import "C"

import (
12
	"errors"
Michael Yang's avatar
Michael Yang committed
13
14
	"fmt"
	"io"
15
	"iter"
Michael Yang's avatar
Michael Yang committed
16
	"log/slog"
17
	"maps"
Michael Yang's avatar
Michael Yang committed
18
	"os"
19
20
21
22
	"slices"
	"strconv"
	"strings"
	"unicode"
Michael Yang's avatar
Michael Yang committed
23
24
25
26
27
	"unsafe"

	"github.com/ollama/ollama/format"
	fs "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"
28
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
Michael Yang's avatar
Michael Yang committed
29
30
31
	"golang.org/x/sync/errgroup"
)

32
33
func devices() iter.Seq[*C.struct_ggml_backend_device] {
	return func(yield func(*C.struct_ggml_backend_device) bool) {
34
		ggml.OnceLoad()
35
36
37
38
39
		for i := range C.ggml_backend_dev_count() {
			if !yield(C.ggml_backend_dev_get(i)) {
				return
			}
		}
Michael Yang's avatar
Michael Yang committed
40
	}
41
}
Michael Yang's avatar
Michael Yang committed
42
43

type Backend struct {
44
	meta *fs.GGML
45

46
	flashAttention bool
47
48

	sched *C.struct_ggml_backend_sched
49
50
51
52
53

	tensors  map[string]*C.struct_ggml_tensor
	ctxs     []*C.struct_ggml_context
	backends []*C.struct_ggml_backend
	bufts    []*C.struct_ggml_backend_buffer_type
Michael Yang's avatar
Michael Yang committed
54
55
}

56
func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
Michael Yang's avatar
Michael Yang committed
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
	meta, n, err := fs.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	slog.Info(
		"",
		"architecture", meta.KV().Architecture(),
		"file_type", meta.KV().FileType(),
		"name", meta.KV().String("general.name"),
		"description", meta.KV().String("general.description"),
		"num_tensors", len(meta.Tensors().Items()),
		"num_key_values", len(meta.KV()),
	)

72
73
74
75
76
77
78
79
80
81
82
83
	type dbt struct {
		d   *C.struct_ggml_backend_device
		bts []*C.struct_ggml_backend_buffer_type
	}

	var cpus, accels, gpus []*C.struct_ggml_backend_device
	for d := range devices() {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
			cpus = append(cpus, d)
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
Michael Yang's avatar
Michael Yang committed
84
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
85
			gpus = append(gpus, d)
Michael Yang's avatar
Michael Yang committed
86
87
88
		}
	}

89
90
91
92
93
94
	var cpuBufferTypes []*C.struct_ggml_backend_buffer_type
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			cpuBufferTypes = append(cpuBufferTypes, C.ggml_backend_dev_buffer_type(d))
Michael Yang's avatar
Michael Yang committed
95
		}
96
97
98
99
	}

	var sum uint64
	var cumsum []uint64
Michael Yang's avatar
Michael Yang committed
100

101
102
103
104
105
106
107
108
109
110
111
112
	var gpuBufferTypes []dbt
	for _, d := range gpus {
		var free, total C.size_t
		C.ggml_backend_dev_memory(d, &free, &total)
		sum += uint64(free)
		cumsum = append(cumsum, sum)

		bt := C.ggml_backend_dev_buffer_type(d)
		gpuBufferTypes = append(gpuBufferTypes, dbt{
			d:   d,
			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuBufferTypes...),
		})
Michael Yang's avatar
Michael Yang committed
113
114
	}

115
116
117
118
119
120
121
122
123
124
125
126
	splits := make([]float64, len(cumsum))
	for i := range splits {
		splits[i] = float64(cumsum[i]) / float64(sum)
	}

	input := dbt{C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU), cpuBufferTypes}
	slog.Info("input layer", "device", C.GoString(C.ggml_backend_dev_name(input.d)))

	var blocks int
	for key, value := range meta.KV() {
		if strings.HasSuffix(key, ".block_count") {
			blocks += int(value.(uint32))
Michael Yang's avatar
Michael Yang committed
127
		}
128
	}
Michael Yang's avatar
Michael Yang committed
129

130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
	indexFunc := func(i int) func(float64) bool {
		return func(f float64) bool {
			return float64(i)/float64(blocks+1) < f
		}
	}

	layers := make([]dbt, blocks)
	for i := range layers {
		layers[i] = gpuBufferTypes[slices.IndexFunc(splits, indexFunc(i))]
		slog.Info("layer", "i", i, "device", C.GoString(C.ggml_backend_dev_name(layers[i].d)))
	}

	output := gpuBufferTypes[slices.IndexFunc(splits, indexFunc(blocks))]
	slog.Info("output layer", "device", C.GoString(C.ggml_backend_dev_name(output.d)))

	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
	maxTensors += blocks * 2

	slog.Info("max tensors", "max_tensors", maxTensors)

151
152
153
154
155
156
157
	type tensor struct {
		source *fs.Tensor
		target string
	}

	targets := make(map[string][]string)

158
	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
159
	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
160
161
162
163
164
165
166
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}
Michael Yang's avatar
Michael Yang committed
167

168
169
170
171
172
173
174
175
			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
Michael Yang's avatar
Michael Yang committed
176
			defer C.free(unsafe.Pointer(cname))
177
178
179
180
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

181
			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
Michael Yang's avatar
Michael Yang committed
182
183
			C.ggml_set_name(tt, cname)

184
			slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
185
186
187
188
189
			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
Michael Yang's avatar
Michael Yang committed
190
191
	}

192
193
194
195
196
197
198
199
200
	hasPart := func(s string, parts ...string) bool {
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
Michael Yang's avatar
Michael Yang committed
201
202
	}

203
204
205
	for _, t := range meta.Tensors().Items() {
		switch {
		case hasPart(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
206
			createTensor(tensor{source: t}, input.bts)
207
		case hasPart(t.Name, "cls", "output", "output_norm"):
208
			createTensor(tensor{source: t}, output.bts)
209
210
211
212
213
214
215
216
217
218
		default:
			if i := func() int {
				if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
					if i, err := strconv.Atoi(fields[0]); err == nil {
						return i
					}
				}

				return -1
			}(); i >= 0 {
219
				createTensor(tensor{source: t}, layers[i].bts)
220
			} else {
221
222
223
224
225
				for i, layer := range layers {
					createTensor(tensor{
						source: t,
						target: "blk." + strconv.Itoa(i) + "." + t.Name,
					}, layer.bts)
226
227
228
229
				}
			}
		}
	}
Michael Yang's avatar
Michael Yang committed
230

231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
	bbs := make(map[*C.struct_ggml_context][]*C.struct_ggml_backend_buffer, len(ctxs))

	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
		bbs[c] = append(bbs[c], b)
	}

	for bs := range maps.Values(bbs) {
		for _, b := range bs {
			slog.Info("model", "buffer", C.GoString(C.ggml_backend_buffer_name(b)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(b))))
		}
	}

	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}

	sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset))
Michael Yang's avatar
Michael Yang committed
257
	var g errgroup.Group
258
	for _, t := range meta.Tensors().Items() {
259
260
261
262
263
		for _, target := range targets[t.Name] {
			g.Go(func() error {
				if target == "" {
					target = t.Name
				}
264

265
266
267
268
				tt, ok := tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}
Michael Yang's avatar
Michael Yang committed
269

270
271
272
273
274
				bts := make([]byte, t.Size())
				n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), bts)
				if err != nil {
					return err
				}
Michael Yang's avatar
Michael Yang committed
275

276
277
278
				if n != len(bts) {
					return errors.New("short read")
				}
Michael Yang's avatar
Michael Yang committed
279

280
281
282
283
284
285
286
				cname := C.CString(t.Name)
				C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), 0, C.size_t(t.Size()))
				C.free(unsafe.Pointer(cname))

				return nil
			})
		}
Michael Yang's avatar
Michael Yang committed
287
288
	}

289
	if g.Wait() != nil {
Michael Yang's avatar
Michael Yang committed
290
291
292
		return nil, err
	}

293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
	var backends []*C.struct_ggml_backend
	var bufts []*C.struct_ggml_backend_buffer_type
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := C.ggml_backend_dev_init(d, nil)
		backends = append(backends, b)

		bt := C.ggml_backend_get_default_buffer_type(b)
		if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
			if hbt := C.ggml_backend_dev_host_buffer_type(d); hbt != nil {
				bt = hbt
			}
		}

		bufts = append(bufts, bt)

		slog.Info("compute buffer", "backend", C.GoString(C.ggml_backend_name(b)), "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
309
310
	}

Michael Yang's avatar
Michael Yang committed
311
	return &Backend{
312
		flashAttention: params.FlashAttention,
313
314
		meta:              meta,
		tensors:           tensors,
315
316
317
318
319
320
321
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&backends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&bufts[0])),
			C.int(len(backends)),
			C.size_t(max(8192, len(meta.Tensors().Items())*5)),
			true,
		),
Michael Yang's avatar
Michael Yang committed
322
323
324
325
326
327
328
329
330
331
332
333
	}, nil
}

// init registers New with the ml package under the name "ggml".
func init() {
	ml.RegisterBackend("ggml", New)
}

// Config returns the key/value metadata of the loaded model.
func (b *Backend) Config() ml.Config {
	return b.meta.KV()
}

func (b *Backend) Get(name string) ml.Tensor {
334
335
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
Michael Yang's avatar
Michael Yang committed
336
337
338
339
340
341
	}

	return nil
}

func (b *Backend) NewContext() ml.Context {
342
	maxTensors := max(8192, len(b.meta.Tensors().Items())*5)
Michael Yang's avatar
Michael Yang committed
343
	return &Context{
344
345
346
347
348
349
		b:          b,
		maxTensors: maxTensors,
		ctx: C.ggml_init(C.struct_ggml_init_params{
			mem_size: C.size_t(maxTensors)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(maxTensors), false),
			no_alloc: true,
		}),
Michael Yang's avatar
Michael Yang committed
350
351
352
	}
}

353
// CacheConfig returns the cache layout this backend expects, which depends on
// whether flash attention is enabled.
func (b *Backend) CacheConfig() ml.CacheConfig {
	if !b.flashAttention {
		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
	}

	return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
}

Michael Yang's avatar
Michael Yang committed
361
type Context struct {
362
	b *Backend
Michael Yang's avatar
Michael Yang committed
363

364
	ctx   *C.struct_ggml_context
Michael Yang's avatar
Michael Yang committed
365
	graph *C.struct_ggml_cgraph
366
367

	maxTensors int
Michael Yang's avatar
Michael Yang committed
368
369
}

370
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
Michael Yang's avatar
Michael Yang committed
371
	if c.graph == nil {
372
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxTensors), false)
Michael Yang's avatar
Michael Yang committed
373
374
	}

375
376
377
378
379
	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
Michael Yang's avatar
Michael Yang committed
380
381
}

382
func (c *Context) Compute(tensors ...ml.Tensor) {
383
	C.ggml_backend_sched_reset(c.b.sched)
384
385
	C.ggml_backend_sched_alloc_graph(c.b.sched, c.graph)
	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
Michael Yang's avatar
Michael Yang committed
386

387
388
389
	needSync := true
	sync := func() {
		if needSync {
390
			C.ggml_backend_sched_synchronize(c.b.sched)
391
392
393
			needSync = false
		}
	}
Michael Yang's avatar
Michael Yang committed
394

395
396
397
	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
398
399
		}
	}
Michael Yang's avatar
Michael Yang committed
400
401
}

Jesse Gross's avatar
Jesse Gross committed
402
func (c *Context) MaxTensors() int {
403
	return c.maxTensors
Jesse Gross's avatar
Jesse Gross committed
404
405
}

406
407
408
// shapeToGGML converts shape to a freshly allocated array of C.int64_t and
// returns a pointer to its first element. It returns nil for an empty shape
// instead of panicking on an out-of-range index.
func shapeToGGML(shape []int) *C.int64_t {
	if len(shape) == 0 {
		return nil
	}

	sh := make([]C.int64_t, len(shape))
	for i, s := range shape {
		sh[i] = C.int64_t(s)
	}

	return &sh[0]
}

415
func newTensor(ctx Context, dtype ml.DType, shape []int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
416
417
418
419
420
421
422
423
424
425
426
427
428
	if len(shape) < 1 || len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

	var t *C.struct_ggml_tensor
	switch dtype {
	case ml.DTypeF32:
429
		t = C.ggml_new_tensor(ctx.ctx, C.GGML_TYPE_F32, C.int(len(shape)), shapeToGGML(shape))
Jesse Gross's avatar
Jesse Gross committed
430
	case ml.DTypeF16:
431
		t = C.ggml_new_tensor(ctx.ctx, C.GGML_TYPE_F16, C.int(len(shape)), shapeToGGML(shape))
Michael Yang's avatar
Michael Yang committed
432
	case ml.DTypeI32:
433
		t = C.ggml_new_tensor(ctx.ctx, C.GGML_TYPE_I32, C.int(len(shape)), shapeToGGML(shape))
Michael Yang's avatar
Michael Yang committed
434
435
436
437
	default:
		panic("unsupported dtype")
	}

438
	b := C.ggml_backend_alloc_buffer(C.ggml_backend_sched_get_backend(ctx.b.sched, 0), C.ggml_nbytes(t))
Michael Yang's avatar
Michael Yang committed
439
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
440
	C.ggml_set_input(t)
441
442
443
444
	return &Tensor{b: ctx.b, t: t}
}

// Empty allocates a tensor of the given dtype and shape without setting its
// contents. See newTensor for the accepted shapes and dtypes.
func (c Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	return newTensor(c, dtype, shape)
}

func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
449
450
451
	t := newTensor(c, dtype, shape)
	C.ggml_set_zero(t.(*Tensor).t)
	return t
Michael Yang's avatar
Michael Yang committed
452
453
454
455
}

func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
	n := len(s)
456
457
458
459

	if n == 0 {
		var shape C.int64_t = 0
		t := C.ggml_new_tensor(ctx.ctx, dtype, 1, &shape)
460
		return &Tensor{b: ctx.b, t: t}, nil
461
462
	}

Michael Yang's avatar
Michael Yang committed
463
464
465
466
467
468
469
470
	for _, v := range shape {
		n /= v
	}

	if n != 1 {
		return nil, fmt.Errorf("invalid shape %v for %d elements", shape, len(s))
	}

471
	t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), shapeToGGML(shape))
472
	b := C.ggml_backend_alloc_buffer(C.ggml_backend_sched_get_backend(ctx.b.sched, 0), C.ggml_nbytes(t))
Michael Yang's avatar
Michael Yang committed
473
474
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
	C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
475
	C.ggml_set_input(t)
476
	return &Tensor{b: ctx.b, t: t}, nil
Michael Yang's avatar
Michael Yang committed
477
478
479
480
481
482
483
484
485
486
}

// FromFloatSlice creates an F32 tensor with the given shape from s.
// See fromSlice for how shape is validated.
func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
	return fromSlice(c, s, shape, C.GGML_TYPE_F32)
}

// FromIntSlice creates an I32 tensor with the given shape from s.
// See fromSlice for how shape is validated.
func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
	return fromSlice(c, s, shape, C.GGML_TYPE_I32)
}

487
func (c *Context) Close() {
488
489
490
	if c != nil {
		C.ggml_free(c.ctx)
	}
Michael Yang's avatar
Michael Yang committed
491
492
493
}

type Tensor struct {
494
	b    *Backend
Michael Yang's avatar
Michael Yang committed
495
	t    *C.struct_ggml_tensor
496
	sync func()
Michael Yang's avatar
Michael Yang committed
497
498
499
500
501
502
503
504
505
506
}

// LogValue returns a slog group with the tensor's name, ggml type name and
// shape, for use in structured logging.
func (t *Tensor) LogValue() slog.Value {
	return slog.GroupValue(
		slog.String("name", C.GoString(C.ggml_get_name(t.t))),
		slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
		slog.Any("shape", t.Shape()),
	)
}

507
508
func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
Michael Yang's avatar
Michael Yang committed
509
510
}

511
512
func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
Michael Yang's avatar
Michael Yang committed
513
514
}

515
516
func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
Michael Yang's avatar
Michael Yang committed
517
518
519
520
521
522
523
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

524
525
526
527
528
529
530
531
532
func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
Michael Yang's avatar
Michael Yang committed
533
534
}

535
536
537
538
539
540
func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
Michael Yang's avatar
Michael Yang committed
541
542
543
544
545
546
547
548
549
	}

	return
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
Jesse Gross's avatar
Jesse Gross committed
550
551
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
Michael Yang's avatar
Michael Yang committed
552
553
554
555
556
557
558
559
560
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	default:
		return ml.DTypeOther
	}
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
561
		b: t.b,
Michael Yang's avatar
Michael Yang committed
562
563
564
565
566
567
568
569
570
571
572
573
574
575
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

// Stack concatenates t with the tensors in s along dim. The tail is built
// recursively: s[0] is stacked with s[1:] first and the result is
// concatenated onto t. With no extra tensors, t itself is returned.
func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) == 0 {
		return t
	}

	rest := s[0].Stack(ctx, dim, s[1:]...)
	return t.Concat(ctx, rest, dim)
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
576
		b: t.b,
Michael Yang's avatar
Michael Yang committed
577
578
579
580
581
582
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
	return &Tensor{
583
		b: t.b,
Michael Yang's avatar
Michael Yang committed
584
585
586
587
588
589
		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
590
		b: t.b,
Michael Yang's avatar
Michael Yang committed
591
592
593
594
595
596
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
597
		b: t.b,
Michael Yang's avatar
Michael Yang committed
598
599
600
601
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

602
603
604
605
606
func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
607
		b: t.b,
608
609
610
611
		t: mul,
	}
}

Michael Yang's avatar
Michael Yang committed
612
func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
613
	tt := (&Tensor{b: t.b, t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
Michael Yang's avatar
Michael Yang committed
614
615
616
617
618
619
620
621
	if b != nil {
		tt = tt.Add(ctx, b)
	}

	return tt
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
622
	return (&Tensor{b: t.b, t: C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
Michael Yang's avatar
Michael Yang committed
623
624
}

625
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
626
627
628
629
630
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
631
		b: t.b,
Michael Yang's avatar
Michael Yang committed
632
633
634
635
636
637
638
639
640
641
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
642
		b: t.b,
Michael Yang's avatar
Michael Yang committed
643
644
645
646
647
648
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
649
		b: t.b,
Michael Yang's avatar
Michael Yang committed
650
651
652
653
654
655
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
656
		b: t.b,
Michael Yang's avatar
Michael Yang committed
657
658
659
660
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

661
func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
662
663
664
	switch len(shape) {
	case 1:
		return &Tensor{
665
			b: t.b,
Michael Yang's avatar
Michael Yang committed
666
667
668
669
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
670
			b: t.b,
Michael Yang's avatar
Michael Yang committed
671
672
673
674
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
675
			b: t.b,
Michael Yang's avatar
Michael Yang committed
676
677
678
679
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
680
			b: t.b,
Michael Yang's avatar
Michael Yang committed
681
682
683
684
685
686
687
688
689
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
690
		b: t.b,
Michael Yang's avatar
Michael Yang committed
691
692
693
694
695
696
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
697
		b: t.b,
Michael Yang's avatar
Michael Yang committed
698
699
700
701
702
703
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
704
		b: t.b,
Michael Yang's avatar
Michael Yang committed
705
706
707
708
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

709
func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
710
711
712
713
714
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
715
		b: t.b,
Michael Yang's avatar
Michael Yang committed
716
717
718
719
720
721
722
723
		t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
724
			b: t.b,
Michael Yang's avatar
Michael Yang committed
725
726
727
728
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
729
			b: t.b,
Michael Yang's avatar
Michael Yang committed
730
731
732
733
734
735
736
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
737
			b: t.b,
Michael Yang's avatar
Michael Yang committed
738
739
740
741
742
743
744
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
745
			b: t.b,
Michael Yang's avatar
Michael Yang committed
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

const (
	// ropeTypeNorm is the rope mode RoPE passes to ggml_rope_ext; as the
	// first iota in this block its value is 0.
	ropeTypeNorm C.int = iota
)

func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
	if ropeFactors == nil {
762
		ropeFactors = &Tensor{b: t.b}
Michael Yang's avatar
Michael Yang committed
763
764
	}

Jesse Gross's avatar
Jesse Gross committed
765
766
767
768
769
	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

Michael Yang's avatar
Michael Yang committed
770
	return &Tensor{
771
		b: t.b,
Michael Yang's avatar
Michael Yang committed
772
		t: C.ggml_rope_ext(
Jesse Gross's avatar
Jesse Gross committed
773
			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
Michael Yang's avatar
Michael Yang committed
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
			C.int(ropeDim),
			131072,       // YaRN n_ctx_train
			ropeTypeNorm, // ROPE_TYPE_NORM
			C.float(ropeBase),
			C.float(ropeScale),
			0.,  // YaRN ext_factor
			1.,  // YaRN attn_factor
			32., // YaRN beta_fast
			1.,  // YaRN beta_slow
		),
	}
}

func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
789
		b: t.b,
Michael Yang's avatar
Michael Yang committed
790
791
792
793
794
795
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
796
		b: t.b,
Michael Yang's avatar
Michael Yang committed
797
798
799
800
801
802
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
803
		b: t.b,
Michael Yang's avatar
Michael Yang committed
804
805
806
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}
807

808
809
810
811
812
813
// ScaledDotProductAttention computes attention with t as the query.
//
// Query and key are permuted with (0, 2, 1, 3) first. With flash attention
// enabled on the backend, value is permuted the same way and
// ggml_flash_attn_ext is used with F32 precision. Otherwise the result is
// built from a full-precision key*query matmul, ggml_soft_max_ext with the
// mask and scale, and a matmul with value, then permuted back and made
// contiguous. mask may be nil.
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if !t.b.flashAttention {
		scores := key.MulmatFullPrec(ctx, query)
		probs := &Tensor{
			b: t.b,
			t: C.ggml_soft_max_ext(ctx.(*Context).ctx, scores.(*Tensor).t, kqMask, C.float(scale), 0),
		}

		attended := value.Mulmat(ctx, probs)
		return attended.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}

	value = value.Permute(ctx, 0, 2, 1, 3)

	attended := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
	C.ggml_flash_attn_ext_set_prec(attended, C.GGML_PREC_F32)

	return &Tensor{b: t.b, t: attended}
}