"helpers/model_init_scripts/init_model_300M.py" did not exist on "91542bfaa7895a8b07b238680cda7b0b633b74f6"
ggml.go 20.3 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
package ggml

3
4
5
6
7
8
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
Michael Yang's avatar
Michael Yang committed
9
10
11
import "C"

import (
12
	"errors"
Michael Yang's avatar
Michael Yang committed
13
14
	"fmt"
	"io"
15
	"iter"
Michael Yang's avatar
Michael Yang committed
16
	"log/slog"
17
	"maps"
Michael Yang's avatar
Michael Yang committed
18
	"os"
19
20
21
22
	"slices"
	"strconv"
	"strings"
	"unicode"
Michael Yang's avatar
Michael Yang committed
23
24
25
26
27
	"unsafe"

	"github.com/ollama/ollama/format"
	fs "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"
28
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
Michael Yang's avatar
Michael Yang committed
29
30
31
	"golang.org/x/sync/errgroup"
)

32
33
func devices() iter.Seq[*C.struct_ggml_backend_device] {
	return func(yield func(*C.struct_ggml_backend_device) bool) {
34
		ggml.OnceLoad()
35
36
37
38
39
		for i := range C.ggml_backend_dev_count() {
			if !yield(C.ggml_backend_dev_get(i)) {
				return
			}
		}
Michael Yang's avatar
Michael Yang committed
40
	}
41
}
Michael Yang's avatar
Michael Yang committed
42
43

type Backend struct {
44
	meta *fs.GGML
45

46
	flashAttention bool
47
48

	sched *C.struct_ggml_backend_sched
49
50
51
52
53

	tensors  map[string]*C.struct_ggml_tensor
	ctxs     []*C.struct_ggml_context
	backends []*C.struct_ggml_backend
	bufts    []*C.struct_ggml_backend_buffer_type
Michael Yang's avatar
Michael Yang committed
54
55
}

56
func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
Michael Yang's avatar
Michael Yang committed
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
	meta, n, err := fs.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	slog.Info(
		"",
		"architecture", meta.KV().Architecture(),
		"file_type", meta.KV().FileType(),
		"name", meta.KV().String("general.name"),
		"description", meta.KV().String("general.description"),
		"num_tensors", len(meta.Tensors().Items()),
		"num_key_values", len(meta.KV()),
	)

72
73
74
75
76
77
78
79
80
81
82
83
	type dbt struct {
		d   *C.struct_ggml_backend_device
		bts []*C.struct_ggml_backend_buffer_type
	}

	var cpus, accels, gpus []*C.struct_ggml_backend_device
	for d := range devices() {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
			cpus = append(cpus, d)
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
Michael Yang's avatar
Michael Yang committed
84
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
85
			gpus = append(gpus, d)
Michael Yang's avatar
Michael Yang committed
86
87
88
		}
	}

89
90
91
92
93
94
	var cpuBufferTypes []*C.struct_ggml_backend_buffer_type
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			cpuBufferTypes = append(cpuBufferTypes, C.ggml_backend_dev_buffer_type(d))
Michael Yang's avatar
Michael Yang committed
95
		}
96
97
98
99
	}

	var sum uint64
	var cumsum []uint64
Michael Yang's avatar
Michael Yang committed
100

101
102
103
104
105
106
107
108
109
110
111
112
	var gpuBufferTypes []dbt
	for _, d := range gpus {
		var free, total C.size_t
		C.ggml_backend_dev_memory(d, &free, &total)
		sum += uint64(free)
		cumsum = append(cumsum, sum)

		bt := C.ggml_backend_dev_buffer_type(d)
		gpuBufferTypes = append(gpuBufferTypes, dbt{
			d:   d,
			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuBufferTypes...),
		})
Michael Yang's avatar
Michael Yang committed
113
114
	}

115
116
117
118
119
120
121
122
123
124
125
126
	splits := make([]float64, len(cumsum))
	for i := range splits {
		splits[i] = float64(cumsum[i]) / float64(sum)
	}

	input := dbt{C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU), cpuBufferTypes}
	slog.Info("input layer", "device", C.GoString(C.ggml_backend_dev_name(input.d)))

	var blocks int
	for key, value := range meta.KV() {
		if strings.HasSuffix(key, ".block_count") {
			blocks += int(value.(uint32))
Michael Yang's avatar
Michael Yang committed
127
		}
128
	}
Michael Yang's avatar
Michael Yang committed
129

130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
	indexFunc := func(i int) func(float64) bool {
		return func(f float64) bool {
			return float64(i)/float64(blocks+1) < f
		}
	}

	layers := make([]dbt, blocks)
	for i := range layers {
		layers[i] = gpuBufferTypes[slices.IndexFunc(splits, indexFunc(i))]
		slog.Info("layer", "i", i, "device", C.GoString(C.ggml_backend_dev_name(layers[i].d)))
	}

	output := gpuBufferTypes[slices.IndexFunc(splits, indexFunc(blocks))]
	slog.Info("output layer", "device", C.GoString(C.ggml_backend_dev_name(output.d)))

	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
	maxTensors += blocks * 2

	slog.Info("max tensors", "max_tensors", maxTensors)

151
152
153
154
155
156
157
	type tensor struct {
		source *fs.Tensor
		target string
	}

	targets := make(map[string][]string)

158
	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
159
	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
160
161
162
163
164
165
166
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}
Michael Yang's avatar
Michael Yang committed
167

168
169
170
171
172
173
174
175
			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
Michael Yang's avatar
Michael Yang committed
176
			defer C.free(unsafe.Pointer(cname))
177
178
179
180
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

181
			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
Michael Yang's avatar
Michael Yang committed
182
183
			C.ggml_set_name(tt, cname)

184
			slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
185
186
187
188
189
			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
Michael Yang's avatar
Michael Yang committed
190
191
	}

192
193
194
195
196
197
198
199
200
	hasPart := func(s string, parts ...string) bool {
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
Michael Yang's avatar
Michael Yang committed
201
202
	}

203
204
205
	for _, t := range meta.Tensors().Items() {
		switch {
		case hasPart(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
206
			createTensor(tensor{source: t}, input.bts)
207
		case hasPart(t.Name, "cls", "output", "output_norm"):
208
			createTensor(tensor{source: t}, output.bts)
209
210
211
212
213
214
215
216
217
218
		default:
			if i := func() int {
				if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
					if i, err := strconv.Atoi(fields[0]); err == nil {
						return i
					}
				}

				return -1
			}(); i >= 0 {
219
				createTensor(tensor{source: t}, layers[i].bts)
220
			} else {
221
222
223
224
225
				for i, layer := range layers {
					createTensor(tensor{
						source: t,
						target: "blk." + strconv.Itoa(i) + "." + t.Name,
					}, layer.bts)
226
227
228
229
				}
			}
		}
	}
Michael Yang's avatar
Michael Yang committed
230

231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
	bbs := make(map[*C.struct_ggml_context][]*C.struct_ggml_backend_buffer, len(ctxs))

	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
		bbs[c] = append(bbs[c], b)
	}

	for bs := range maps.Values(bbs) {
		for _, b := range bs {
			slog.Info("model", "buffer", C.GoString(C.ggml_backend_buffer_name(b)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(b))))
		}
	}

	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}

	sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset))
Michael Yang's avatar
Michael Yang committed
257
	var g errgroup.Group
258
	for _, t := range meta.Tensors().Items() {
259
260
261
262
263
		for _, target := range targets[t.Name] {
			g.Go(func() error {
				if target == "" {
					target = t.Name
				}
264

265
266
267
268
				tt, ok := tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}
Michael Yang's avatar
Michael Yang committed
269

270
271
272
273
274
				bts := make([]byte, t.Size())
				n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), bts)
				if err != nil {
					return err
				}
Michael Yang's avatar
Michael Yang committed
275

276
277
278
				if n != len(bts) {
					return errors.New("short read")
				}
Michael Yang's avatar
Michael Yang committed
279

280
281
282
283
284
285
286
				cname := C.CString(t.Name)
				C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), 0, C.size_t(t.Size()))
				C.free(unsafe.Pointer(cname))

				return nil
			})
		}
Michael Yang's avatar
Michael Yang committed
287
288
	}

289
	if g.Wait() != nil {
Michael Yang's avatar
Michael Yang committed
290
291
292
		return nil, err
	}

293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
	var backends []*C.struct_ggml_backend
	var bufts []*C.struct_ggml_backend_buffer_type
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := C.ggml_backend_dev_init(d, nil)
		backends = append(backends, b)

		bt := C.ggml_backend_get_default_buffer_type(b)
		if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
			if hbt := C.ggml_backend_dev_host_buffer_type(d); hbt != nil {
				bt = hbt
			}
		}

		bufts = append(bufts, bt)

		slog.Info("compute buffer", "backend", C.GoString(C.ggml_backend_name(b)), "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
309
310
	}

Michael Yang's avatar
Michael Yang committed
311
	return &Backend{
312
		flashAttention: params.FlashAttention,
313
314
		meta:              meta,
		tensors:           tensors,
315
316
317
318
319
320
321
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&backends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&bufts[0])),
			C.int(len(backends)),
			C.size_t(max(8192, len(meta.Tensors().Items())*5)),
			true,
		),
Michael Yang's avatar
Michael Yang committed
322
323
324
325
326
327
328
329
330
331
332
333
	}, nil
}

// init registers New with the ml package under the backend name "ggml".
func init() {
	ml.RegisterBackend("ggml", New)
}

// Config exposes the model's key/value metadata as an ml.Config.
func (b *Backend) Config() ml.Config {
	kv := b.meta.KV()
	return kv
}

func (b *Backend) Get(name string) ml.Tensor {
334
335
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
Michael Yang's avatar
Michael Yang committed
336
337
338
339
340
341
	}

	return nil
}

func (b *Backend) NewContext() ml.Context {
342
	maxGraphNodes := max(8192, len(b.meta.Tensors().Items())*5)
Michael Yang's avatar
Michael Yang committed
343
	return &Context{
344
345
		b:          b,
		ctx: C.ggml_init(C.struct_ggml_init_params{
346
			mem_size: C.size_t(maxGraphNodes)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(maxGraphNodes), false),
347
348
			no_alloc: true,
		}),
349
350
		backend:       C.ggml_backend_sched_get_backend(b.sched, 0),
		maxGraphNodes: maxGraphNodes,
Michael Yang's avatar
Michael Yang committed
351
352
353
	}
}

354
// CacheConfig reports the cache layout this backend expects, which depends on
// whether flash attention is enabled.
func (b *Backend) CacheConfig() ml.CacheConfig {
	if !b.flashAttention {
		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
	}

	return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
}

Michael Yang's avatar
Michael Yang committed
362
type Context struct {
363
	b *Backend
Michael Yang's avatar
Michael Yang committed
364

365
	ctx   *C.struct_ggml_context
Michael Yang's avatar
Michael Yang committed
366
	graph *C.struct_ggml_cgraph
367
	backend *C.struct_ggml_backend
368

369
	maxGraphNodes int
Michael Yang's avatar
Michael Yang committed
370
371
}

372
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
Michael Yang's avatar
Michael Yang committed
373
	if c.graph == nil {
374
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
Michael Yang's avatar
Michael Yang committed
375
376
	}

377
378
379
380
381
	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
Michael Yang's avatar
Michael Yang committed
382
383
}

384
func (c *Context) Compute(tensors ...ml.Tensor) {
385
	C.ggml_backend_sched_reset(c.b.sched)
386
387
	C.ggml_backend_sched_alloc_graph(c.b.sched, c.graph)
	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
Michael Yang's avatar
Michael Yang committed
388

389
390
391
	needSync := true
	sync := func() {
		if needSync {
392
			C.ggml_backend_sched_synchronize(c.b.sched)
393
394
395
			needSync = false
		}
	}
Michael Yang's avatar
Michael Yang committed
396

397
398
399
	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
400
401
		}
	}
Michael Yang's avatar
Michael Yang committed
402
403
}

404
405
func (c *Context) MaxGraphNodes() int {
	return c.maxGraphNodes
Jesse Gross's avatar
Jesse Gross committed
406
407
}

408
409
410
// shapeToGGML copies shape into a new slice of C.int64_t and returns a pointer
// to its first element, in the form ggml expects for dimension arrays.
// shape must be non-empty; an empty shape panics.
func shapeToGGML(shape []int) *C.int64_t {
	dims := make([]C.int64_t, 0, len(shape))
	for _, extent := range shape {
		dims = append(dims, C.int64_t(extent))
	}

	return &dims[0]
}

417
func newTensor(ctx Context, dtype ml.DType, shape []int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
418
419
420
421
422
423
424
425
426
427
428
429
430
	if len(shape) < 1 || len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

	var t *C.struct_ggml_tensor
	switch dtype {
	case ml.DTypeF32:
431
		t = C.ggml_new_tensor(ctx.ctx, C.GGML_TYPE_F32, C.int(len(shape)), shapeToGGML(shape))
Jesse Gross's avatar
Jesse Gross committed
432
	case ml.DTypeF16:
433
		t = C.ggml_new_tensor(ctx.ctx, C.GGML_TYPE_F16, C.int(len(shape)), shapeToGGML(shape))
Michael Yang's avatar
Michael Yang committed
434
	case ml.DTypeI32:
435
		t = C.ggml_new_tensor(ctx.ctx, C.GGML_TYPE_I32, C.int(len(shape)), shapeToGGML(shape))
Michael Yang's avatar
Michael Yang committed
436
437
438
439
	default:
		panic("unsupported dtype")
	}

440
	b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
Michael Yang's avatar
Michael Yang committed
441
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
442
	C.ggml_set_input(t)
443
444
445
446
	return &Tensor{b: ctx.b, t: t}
}

// Empty allocates a tensor of the given dtype and shape without initializing
// its contents. See newTensor for the conditions under which it panics.
func (c Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	fresh := newTensor(c, dtype, shape)
	return fresh
}

func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
451
452
453
	t := newTensor(c, dtype, shape)
	C.ggml_set_zero(t.(*Tensor).t)
	return t
Michael Yang's avatar
Michael Yang committed
454
455
456
457
}

func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
	n := len(s)
458
459
460
461

	if n == 0 {
		var shape C.int64_t = 0
		t := C.ggml_new_tensor(ctx.ctx, dtype, 1, &shape)
462
		return &Tensor{b: ctx.b, t: t}, nil
463
464
	}

Michael Yang's avatar
Michael Yang committed
465
466
467
468
469
470
471
472
	for _, v := range shape {
		n /= v
	}

	if n != 1 {
		return nil, fmt.Errorf("invalid shape %v for %d elements", shape, len(s))
	}

473
	t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), shapeToGGML(shape))
474
	b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
Michael Yang's avatar
Michael Yang committed
475
476
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
	C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
477
	C.ggml_set_input(t)
478
	return &Tensor{b: ctx.b, t: t}, nil
Michael Yang's avatar
Michael Yang committed
479
480
481
482
483
484
485
486
487
488
}

// FromFloatSlice creates an F32 tensor with the given shape holding a copy of
// s. It returns an error when shape does not match the length of s.
func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
	t, err := fromSlice(c, s, shape, C.GGML_TYPE_F32)
	return t, err
}

// FromIntSlice creates an I32 tensor with the given shape holding a copy of
// s. It returns an error when shape does not match the length of s.
func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
	t, err := fromSlice(c, s, shape, C.GGML_TYPE_I32)
	return t, err
}

489
490
func (c Context) Close() {
	if c.ctx != nil {
491
492
		C.ggml_free(c.ctx)
	}
Michael Yang's avatar
Michael Yang committed
493
494
495
}

type Tensor struct {
496
	b    *Backend
Michael Yang's avatar
Michael Yang committed
497
	t    *C.struct_ggml_tensor
498
	sync func()
Michael Yang's avatar
Michael Yang committed
499
500
501
502
503
504
505
506
507
508
}

// LogValue implements slog.LogValuer, logging the tensor as a group with its
// name, ggml type name and shape.
func (t *Tensor) LogValue() slog.Value {
	name := C.GoString(C.ggml_get_name(t.t))
	kind := C.GoString(C.ggml_type_name(t.t._type))

	return slog.GroupValue(
		slog.String("name", name),
		slog.String("type", kind),
		slog.Any("shape", t.Shape()),
	)
}

509
510
func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
Michael Yang's avatar
Michael Yang committed
511
512
}

513
514
func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
Michael Yang's avatar
Michael Yang committed
515
516
}

517
518
func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
Michael Yang's avatar
Michael Yang committed
519
520
521
522
523
524
525
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

526
527
528
529
530
531
532
533
534
func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
Michael Yang's avatar
Michael Yang committed
535
536
}

537
538
539
540
541
542
func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
Michael Yang's avatar
Michael Yang committed
543
544
545
546
547
548
549
550
551
	}

	return
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
Jesse Gross's avatar
Jesse Gross committed
552
553
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
Michael Yang's avatar
Michael Yang committed
554
555
556
557
558
559
560
561
562
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	default:
		return ml.DTypeOther
	}
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
563
		b: t.b,
Michael Yang's avatar
Michael Yang committed
564
565
566
567
568
569
570
571
572
573
574
575
576
577
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

// Stack concatenates t with every tensor in s, in order, along dimension dim.
// With no extra tensors it returns t unchanged.
func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) == 0 {
		return t
	}

	// Stack the remaining tensors first, then join the result onto t.
	rest := s[0].Stack(ctx, dim, s[1:]...)
	return t.Concat(ctx, rest, dim)
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
578
		b: t.b,
Michael Yang's avatar
Michael Yang committed
579
580
581
582
583
584
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
	return &Tensor{
585
		b: t.b,
Michael Yang's avatar
Michael Yang committed
586
587
588
589
590
591
		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
592
		b: t.b,
Michael Yang's avatar
Michael Yang committed
593
594
595
596
597
598
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
599
		b: t.b,
Michael Yang's avatar
Michael Yang committed
600
601
602
603
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

604
605
606
607
608
func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
609
		b: t.b,
610
611
612
613
		t: mul,
	}
}

Michael Yang's avatar
Michael Yang committed
614
func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
615
	tt := (&Tensor{b: t.b, t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
Michael Yang's avatar
Michael Yang committed
616
617
618
619
620
621
622
623
	if b != nil {
		tt = tt.Add(ctx, b)
	}

	return tt
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
624
	return (&Tensor{b: t.b, t: C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
Michael Yang's avatar
Michael Yang committed
625
626
}

627
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
628
629
630
631
632
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
633
		b: t.b,
Michael Yang's avatar
Michael Yang committed
634
635
636
637
638
639
640
641
642
643
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
644
		b: t.b,
Michael Yang's avatar
Michael Yang committed
645
646
647
648
649
650
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
651
		b: t.b,
Michael Yang's avatar
Michael Yang committed
652
653
654
655
656
657
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
658
		b: t.b,
Michael Yang's avatar
Michael Yang committed
659
660
661
662
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

663
func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
664
665
666
	switch len(shape) {
	case 1:
		return &Tensor{
667
			b: t.b,
Michael Yang's avatar
Michael Yang committed
668
669
670
671
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
672
			b: t.b,
Michael Yang's avatar
Michael Yang committed
673
674
675
676
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
677
			b: t.b,
Michael Yang's avatar
Michael Yang committed
678
679
680
681
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
682
			b: t.b,
Michael Yang's avatar
Michael Yang committed
683
684
685
686
687
688
689
690
691
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
692
		b: t.b,
Michael Yang's avatar
Michael Yang committed
693
694
695
696
697
698
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
699
		b: t.b,
Michael Yang's avatar
Michael Yang committed
700
701
702
703
704
705
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
706
		b: t.b,
Michael Yang's avatar
Michael Yang committed
707
708
709
710
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

711
func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
712
713
714
715
716
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
717
		b: t.b,
Michael Yang's avatar
Michael Yang committed
718
719
720
721
722
723
724
725
		t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
726
			b: t.b,
Michael Yang's avatar
Michael Yang committed
727
728
729
730
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
731
			b: t.b,
Michael Yang's avatar
Michael Yang committed
732
733
734
735
736
737
738
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
739
			b: t.b,
Michael Yang's avatar
Michael Yang committed
740
741
742
743
744
745
746
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
747
			b: t.b,
Michael Yang's avatar
Michael Yang committed
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

// RoPE mode values passed to ggml_rope_ext.
const (
	// ropeTypeNorm has the value 0 and is the mode used by Tensor.RoPE.
	ropeTypeNorm C.int = iota
)

func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
	if ropeFactors == nil {
764
		ropeFactors = &Tensor{b: t.b}
Michael Yang's avatar
Michael Yang committed
765
766
	}

Jesse Gross's avatar
Jesse Gross committed
767
768
769
770
771
	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

Michael Yang's avatar
Michael Yang committed
772
	return &Tensor{
773
		b: t.b,
Michael Yang's avatar
Michael Yang committed
774
		t: C.ggml_rope_ext(
Jesse Gross's avatar
Jesse Gross committed
775
			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
Michael Yang's avatar
Michael Yang committed
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
			C.int(ropeDim),
			131072,       // YaRN n_ctx_train
			ropeTypeNorm, // ROPE_TYPE_NORM
			C.float(ropeBase),
			C.float(ropeScale),
			0.,  // YaRN ext_factor
			1.,  // YaRN attn_factor
			32., // YaRN beta_fast
			1.,  // YaRN beta_slow
		),
	}
}

func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
791
		b: t.b,
Michael Yang's avatar
Michael Yang committed
792
793
794
795
796
797
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
798
		b: t.b,
Michael Yang's avatar
Michael Yang committed
799
800
801
802
803
804
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
805
		b: t.b,
Michael Yang's avatar
Michael Yang committed
806
807
808
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}
809

810
811
812
813
814
815
// ScaledDotProductAttention computes attention with t as the query against
// key and value, applying the optional mask and the given scale.
//
// Query and key are permuted with (0, 2, 1, 3) first. With flash attention
// enabled on the backend, value is permuted the same way and
// ggml_flash_attn_ext is used with F32 precision. Otherwise the result is
// built from a full-precision key/query matmul, a masked scaled softmax and a
// matmul with value, then permuted and made contiguous.
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.Tensor, scale float64) ml.Tensor {
	var rawMask *C.struct_ggml_tensor
	if mask != nil {
		rawMask = mask.(*Tensor).t
	}

	q := t.Permute(ctx, 0, 2, 1, 3)
	k := key.Permute(ctx, 0, 2, 1, 3)

	if t.b.flashAttention {
		v := value.Permute(ctx, 0, 2, 1, 3)

		attn := C.ggml_flash_attn_ext(ctx.(*Context).ctx, q.(*Tensor).t, k.(*Tensor).t, v.(*Tensor).t, rawMask, C.float(scale), 0, 0)
		C.ggml_flash_attn_ext_set_prec(attn, C.GGML_PREC_F32)
		return &Tensor{b: t.b, t: attn}
	}

	scores := k.MulmatFullPrec(ctx, q)
	weights := &Tensor{
		b: t.b,
		t: C.ggml_soft_max_ext(ctx.(*Context).ctx, scores.(*Tensor).t, rawMask, C.float(scale), 0),
	}

	attended := value.Mulmat(ctx, weights)
	return attended.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
}