ggml.go 21.7 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
package ggml

3
4
5
6
7
8
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
Michael Yang's avatar
Michael Yang committed
9
10
11
import "C"

import (
12
	"errors"
Michael Yang's avatar
Michael Yang committed
13
14
	"fmt"
	"io"
15
	"iter"
Michael Yang's avatar
Michael Yang committed
16
	"log/slog"
17
	"maps"
Michael Yang's avatar
Michael Yang committed
18
	"os"
19
20
21
22
	"slices"
	"strconv"
	"strings"
	"unicode"
Michael Yang's avatar
Michael Yang committed
23
24
25
26
27
	"unsafe"

	"github.com/ollama/ollama/format"
	fs "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"
28
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
Michael Yang's avatar
Michael Yang committed
29
30
31
	"golang.org/x/sync/errgroup"
)

32
33
func devices() iter.Seq[*C.struct_ggml_backend_device] {
	return func(yield func(*C.struct_ggml_backend_device) bool) {
34
		ggml.OnceLoad()
35
36
37
38
39
		for i := range C.ggml_backend_dev_count() {
			if !yield(C.ggml_backend_dev_get(i)) {
				return
			}
		}
Michael Yang's avatar
Michael Yang committed
40
	}
41
}
Michael Yang's avatar
Michael Yang committed
42
43

type Backend struct {
44
45
46
47
48
49
	meta    *fs.GGML
	sched   *C.struct_ggml_backend_sched
	tensors map[string]*C.struct_ggml_tensor
	input   *C.struct_ggml_backend
	output  *C.struct_ggml_backend
	layers  map[int]*C.struct_ggml_backend
50

51
	flashAttention bool
Michael Yang's avatar
Michael Yang committed
52
53
}

54
func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
Michael Yang's avatar
Michael Yang committed
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
	meta, n, err := fs.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	slog.Info(
		"",
		"architecture", meta.KV().Architecture(),
		"file_type", meta.KV().FileType(),
		"name", meta.KV().String("general.name"),
		"description", meta.KV().String("general.description"),
		"num_tensors", len(meta.Tensors().Items()),
		"num_key_values", len(meta.KV()),
	)

70
	type deviceBufferType struct {
71
72
73
74
75
76
77
78
79
80
81
		d   *C.struct_ggml_backend_device
		bts []*C.struct_ggml_backend_buffer_type
	}

	var cpus, accels, gpus []*C.struct_ggml_backend_device
	for d := range devices() {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
			cpus = append(cpus, d)
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
Michael Yang's avatar
Michael Yang committed
82
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
83
			gpus = append(gpus, d)
Michael Yang's avatar
Michael Yang committed
84
85
86
		}
	}

87
88
89
90
91
92
	var cpuBufferTypes []*C.struct_ggml_backend_buffer_type
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			cpuBufferTypes = append(cpuBufferTypes, C.ggml_backend_dev_buffer_type(d))
Michael Yang's avatar
Michael Yang committed
93
		}
94
95
	}

96
	var gpuDeviceBufferTypes []deviceBufferType
97
98
	for _, d := range gpus {
		bt := C.ggml_backend_dev_buffer_type(d)
99
		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
100
101
102
			d:   d,
			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuBufferTypes...),
		})
Michael Yang's avatar
Michael Yang committed
103
104
	}

105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
	splits := make([]float32, len(gpus))
	if func() bool {
		for _, s := range params.TensorSplit {
			if s != 0 {
				return true
			}
		}

		return false
	}() {
		splits = params.TensorSplit
	} else {
		for i := range splits {
			var free, total C.size_t
			C.ggml_backend_dev_memory(gpus[i], &free, &total)
			splits[i] = float32(free)
		}
	}

	var sum float32
	for i := range splits {
		sum += splits[i]
		splits[i] = sum
	}

130
	for i := range splits {
131
		splits[i] /= sum
132
133
	}

134
135
	cpuDeviceBufferTypes := deviceBufferType{C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU), cpuBufferTypes}
	input := cpuDeviceBufferTypes
136

137
	blocks := int(meta.KV().BlockCount())
138
139
140
	assignLayer := func(i int) (temp deviceBufferType) {
		if i >= params.NumGPULayers {
			return cpuDeviceBufferTypes
141
		}
142

143
144
145
146
147
148
		index := slices.IndexFunc(splits, func(f float32) bool { return float32(i)/float32(blocks+1) < f })
		if index < 0 || index >= len(gpuDeviceBufferTypes) {
			return cpuDeviceBufferTypes
		}

		return gpuDeviceBufferTypes[index]
149
150
	}

151
	layers := make([]deviceBufferType, blocks)
152
	for i := range layers {
153
		layers[i] = assignLayer(i)
154
155
	}

156
	output := assignLayer(blocks)
157
158
159
160
161

	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
	maxTensors += blocks * 2

162
163
164
165
166
167
168
	type tensor struct {
		source *fs.Tensor
		target string
	}

	targets := make(map[string][]string)

169
	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
170
	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
171
172
173
174
175
176
177
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}
Michael Yang's avatar
Michael Yang committed
178

179
180
181
182
183
184
185
186
			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
Michael Yang's avatar
Michael Yang committed
187
			defer C.free(unsafe.Pointer(cname))
188
189
190
191
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

192
			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
Michael Yang's avatar
Michael Yang committed
193
194
			C.ggml_set_name(tt, cname)

195
			slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
196
197
198
199
200
			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
Michael Yang's avatar
Michael Yang committed
201
202
	}

203
	contains := func(s string, parts ...string) bool {
204
205
206
207
208
209
210
211
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
Michael Yang's avatar
Michael Yang committed
212
213
	}

214
215
	for _, t := range meta.Tensors().Items() {
		switch {
216
		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
217
			createTensor(tensor{source: t}, input.bts)
218
		case contains(t.Name, "cls", "output", "output_norm"):
219
			createTensor(tensor{source: t}, output.bts)
220
221
		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
			createTensor(tensor{source: t}, input.bts)
222
223
224
225
226
227
228
229
230
231
		default:
			if i := func() int {
				if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
					if i, err := strconv.Atoi(fields[0]); err == nil {
						return i
					}
				}

				return -1
			}(); i >= 0 {
232
				createTensor(tensor{source: t}, layers[i].bts)
233
			} else {
234
235
236
237
238
				for i, layer := range layers {
					createTensor(tensor{
						source: t,
						target: "blk." + strconv.Itoa(i) + "." + t.Name,
					}, layer.bts)
239
240
241
242
				}
			}
		}
	}
Michael Yang's avatar
Michael Yang committed
243

244
245
246
247
248
249
250
251
252
253
254
255
256
257
	bbs := make(map[*C.struct_ggml_context][]*C.struct_ggml_backend_buffer, len(ctxs))

	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
		bbs[c] = append(bbs[c], b)
	}

	for bs := range maps.Values(bbs) {
		for _, b := range bs {
258
			slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(b)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(b))))
259
260
261
262
263
264
265
266
267
268
269
		}
	}

	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}

	sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset))
Michael Yang's avatar
Michael Yang committed
270
	var g errgroup.Group
271
	for _, t := range meta.Tensors().Items() {
272
273
274
275
276
		for _, target := range targets[t.Name] {
			g.Go(func() error {
				if target == "" {
					target = t.Name
				}
277

278
279
280
281
				tt, ok := tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}
Michael Yang's avatar
Michael Yang committed
282

283
284
285
286
287
				bts := make([]byte, t.Size())
				n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), bts)
				if err != nil {
					return err
				}
Michael Yang's avatar
Michael Yang committed
288

289
290
291
				if n != len(bts) {
					return errors.New("short read")
				}
Michael Yang's avatar
Michael Yang committed
292

293
294
295
296
297
298
299
				cname := C.CString(t.Name)
				C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), 0, C.size_t(t.Size()))
				C.free(unsafe.Pointer(cname))

				return nil
			})
		}
Michael Yang's avatar
Michael Yang committed
300
301
	}

302
	if g.Wait() != nil {
Michael Yang's avatar
Michael Yang committed
303
304
305
		return nil, err
	}

306
	deviceBackends := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend)
307
308
309
310
311
	var backends []*C.struct_ggml_backend
	var bufts []*C.struct_ggml_backend_buffer_type
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := C.ggml_backend_dev_init(d, nil)
		backends = append(backends, b)
312
		deviceBackends[d] = b
313
314
315
316
317
318
319
320
321
322

		bt := C.ggml_backend_get_default_buffer_type(b)
		if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
			if hbt := C.ggml_backend_dev_host_buffer_type(d); hbt != nil {
				bt = hbt
			}
		}

		bufts = append(bufts, bt)

323
		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(b)), "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
324
325
326
327

		if C.ggml_backend_is_cpu(b) {
			C.ggml_backend_cpu_set_n_threads(b, C.int(params.NumThreads))
		}
328
329
	}

Michael Yang's avatar
Michael Yang committed
330
	return &Backend{
331
		flashAttention: params.FlashAttention,
332
333
		meta:           meta,
		tensors:        tensors,
334
335
336
337
338
339
340
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&backends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&bufts[0])),
			C.int(len(backends)),
			C.size_t(max(8192, len(meta.Tensors().Items())*5)),
			true,
		),
341
342
343
344
345
346
347
348
349
		input:  deviceBackends[input.d],
		output: deviceBackends[output.d],
		layers: func() map[int]*C.struct_ggml_backend {
			m := make(map[int]*C.struct_ggml_backend)
			for i, layer := range layers {
				m[i] = deviceBackends[layer.d]
			}
			return m
		}(),
Michael Yang's avatar
Michael Yang committed
350
351
352
353
354
355
356
357
358
359
360
361
	}, nil
}

// init registers New with the ml package under the name "ggml".
func init() {
	const name = "ggml"
	ml.RegisterBackend(name, New)
}

// Config returns the model's key/value metadata as an ml.Config.
func (b *Backend) Config() ml.Config {
	kv := b.meta.KV()
	return kv
}

func (b *Backend) Get(name string) ml.Tensor {
362
363
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
Michael Yang's avatar
Michael Yang committed
364
365
366
367
368
369
	}

	return nil
}

// NewContext returns a context sized for five nodes per model tensor, with a
// floor of 8192.
func (b *Backend) NewContext() ml.Context {
	nodes := max(8192, len(b.meta.Tensors().Items())*5)
	return b.NewContextSize(nodes)
}

func (b *Backend) NewContextSize(n int) ml.Context {
Michael Yang's avatar
Michael Yang committed
374
	return &Context{
375
		b: b,
376
		ctx: C.ggml_init(C.struct_ggml_init_params{
377
			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
378
379
			no_alloc: true,
		}),
380
		backend:       C.ggml_backend_sched_get_backend(b.sched, 0),
381
382
383
384
		maxGraphNodes: n,
		input:         b.input,
		output:        b.output,
		layers:        b.layers,
Michael Yang's avatar
Michael Yang committed
385
386
387
	}
}

388
// CacheConfig returns the cache layout this backend expects, which depends on
// whether flash attention is enabled.
func (b *Backend) CacheConfig() ml.CacheConfig {
	if !b.flashAttention {
		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
	}

	return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
}

Michael Yang's avatar
Michael Yang committed
396
type Context struct {
397
	b *Backend
Michael Yang's avatar
Michael Yang committed
398

399
	ctx   *C.struct_ggml_context
Michael Yang's avatar
Michael Yang committed
400
	graph *C.struct_ggml_cgraph
401
402

	// backend is the backend used for new tensors
403
	backend *C.struct_ggml_backend
404

405
406
407
408
409
410
411
412
413
	// input is the backend used for inputs
	input *C.struct_ggml_backend

	// output is the backend used for outputs
	output *C.struct_ggml_backend

	// output is the backend used for repeating layers
	layers map[int]*C.struct_ggml_backend

414
	maxGraphNodes int
Michael Yang's avatar
Michael Yang committed
415
416
}

417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
// Input returns a context sharing c's ggml context that allocates new tensors
// on the input backend. If no input backend is set, c itself is returned.
func (c *Context) Input() ml.Context {
	if c.input == nil {
		return c
	}

	return &Context{
		b:             c.b,
		ctx:           c.ctx,
		backend:       c.input,
		maxGraphNodes: c.maxGraphNodes,
	}
}

// Output returns a context sharing c's ggml context that allocates new
// tensors on the output backend. If no output backend is set, c itself is
// returned.
func (c *Context) Output() ml.Context {
	if c.output == nil {
		return c
	}

	return &Context{
		b:             c.b,
		ctx:           c.ctx,
		backend:       c.output,
		maxGraphNodes: c.maxGraphNodes,
	}
}

// Layer returns a context sharing c's ggml context that allocates new tensors
// on the backend assigned to layer i. If layer i has no entry, c itself is
// returned.
func (c *Context) Layer(i int) ml.Context {
	assigned, known := c.layers[i]
	if !known {
		return c
	}

	return &Context{
		b:             c.b,
		ctx:           c.ctx,
		backend:       assigned,
		maxGraphNodes: c.maxGraphNodes,
	}
}

456
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
Michael Yang's avatar
Michael Yang committed
457
	if c.graph == nil {
458
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
Michael Yang's avatar
Michael Yang committed
459
460
	}

461
462
463
464
465
	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
Michael Yang's avatar
Michael Yang committed
466
467
}

468
func (c *Context) Compute(tensors ...ml.Tensor) {
469
	C.ggml_backend_sched_reset(c.b.sched)
470
471
	C.ggml_backend_sched_alloc_graph(c.b.sched, c.graph)
	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
Michael Yang's avatar
Michael Yang committed
472

473
474
475
	needSync := true
	sync := func() {
		if needSync {
476
			C.ggml_backend_sched_synchronize(c.b.sched)
477
478
479
			needSync = false
		}
	}
Michael Yang's avatar
Michael Yang committed
480

481
482
483
	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
484
485
		}
	}
Michael Yang's avatar
Michael Yang committed
486
487
}

488
489
func (c *Context) MaxGraphNodes() int {
	return c.maxGraphNodes
Jesse Gross's avatar
Jesse Gross committed
490
491
}

492
493
494
// shapeToGGML converts shape to a freshly allocated array of C.int64_t and
// returns a pointer to its first element. shape must not be empty.
func shapeToGGML(shape []int) *C.int64_t {
	dims := make([]C.int64_t, 0, len(shape))
	for _, extent := range shape {
		dims = append(dims, C.int64_t(extent))
	}

	return &dims[0]
}

501
func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
502
503
504
505
506
507
508
509
510
511
512
513
514
	if len(shape) < 1 || len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

	var t *C.struct_ggml_tensor
	switch dtype {
	case ml.DTypeF32:
515
		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F32, C.int(len(shape)), shapeToGGML(shape))
Jesse Gross's avatar
Jesse Gross committed
516
	case ml.DTypeF16:
517
		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F16, C.int(len(shape)), shapeToGGML(shape))
Michael Yang's avatar
Michael Yang committed
518
	case ml.DTypeI32:
519
		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_I32, C.int(len(shape)), shapeToGGML(shape))
Michael Yang's avatar
Michael Yang committed
520
521
522
523
	default:
		panic("unsupported dtype")
	}

524
	b := C.ggml_backend_alloc_buffer(c.backend, C.ggml_nbytes(t))
Michael Yang's avatar
Michael Yang committed
525
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
526
	return &Tensor{b: c.b, t: t}
527
528
529
}

// Empty returns a new tensor of the given dtype and shape without
// initializing its contents.
func (c Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	created := c.newTensor(dtype, shape)
	return created
}

func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
534
	t := c.newTensor(dtype, shape)
535
536
	C.ggml_set_zero(t.(*Tensor).t)
	return t
Michael Yang's avatar
Michael Yang committed
537
538
}

539
// checkShape reports whether the product of the dimensions in shape equals
// len(s). It returns nil on a match and an "invalid shape" error otherwise.
//
// Each dimension must be at least 1 and must divide the remaining element
// count exactly. Requiring exact division means a length that merely
// truncates to the right quotient (for example 7 elements for a 2x3 shape) is
// rejected, and a zero dimension yields an error instead of a
// divide-by-zero panic.
func checkShape[S ~[]E, E any](s S, shape ...int) error {
	remaining := len(s)
	for _, dim := range shape {
		if dim < 1 || remaining%dim != 0 {
			return fmt.Errorf("invalid shape: %v", shape)
		}

		remaining /= dim
	}

	// After dividing out every dimension exactly one element must be left.
	if remaining != 1 {
		return fmt.Errorf("invalid shape: %v", shape)
	}

	return nil
}

func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
553
554
555
556
557
558
559
	if err := checkShape(s, shape...); err != nil {
		return nil, err
	}

	t := c.newTensor(ml.DTypeF32, shape)
	C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	return t, nil
Michael Yang's avatar
Michael Yang committed
560
561
562
}

func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
563
564
565
566
567
568
569
	if err := checkShape(s, shape...); err != nil {
		return nil, err
	}

	t := c.newTensor(ml.DTypeI32, shape)
	C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	return t, nil
Michael Yang's avatar
Michael Yang committed
570
571
}

572
573
func (c Context) Close() {
	if c.ctx != nil {
574
575
		C.ggml_free(c.ctx)
	}
Michael Yang's avatar
Michael Yang committed
576
577
578
}

type Tensor struct {
579
	b    *Backend
Michael Yang's avatar
Michael Yang committed
580
	t    *C.struct_ggml_tensor
581
	sync func()
Michael Yang's avatar
Michael Yang committed
582
583
584
585
586
587
588
589
590
591
}

// LogValue describes the tensor for structured logging as a group holding
// its name, ggml type name and shape.
func (t *Tensor) LogValue() slog.Value {
	name := slog.String("name", C.GoString(C.ggml_get_name(t.t)))
	kind := slog.String("type", C.GoString(C.ggml_type_name(t.t._type)))
	shape := slog.Any("shape", t.Shape())

	return slog.GroupValue(name, kind, shape)
}

592
593
func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
Michael Yang's avatar
Michael Yang committed
594
595
}

596
597
func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
Michael Yang's avatar
Michael Yang committed
598
599
}

600
601
func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
Michael Yang's avatar
Michael Yang committed
602
603
604
605
606
607
608
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

609
610
611
612
613
614
615
616
617
func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
Michael Yang's avatar
Michael Yang committed
618
619
}

620
621
622
623
624
625
func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
Michael Yang's avatar
Michael Yang committed
626
627
628
629
630
631
632
633
634
	}

	return
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
Jesse Gross's avatar
Jesse Gross committed
635
636
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
Michael Yang's avatar
Michael Yang committed
637
638
639
640
641
642
643
644
645
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	default:
		return ml.DTypeOther
	}
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
646
		b: t.b,
Michael Yang's avatar
Michael Yang committed
647
648
649
650
651
652
653
654
655
656
657
658
659
660
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

// Stack concatenates t with every tensor in s along dim, in order. With no
// extra tensors it returns t unchanged.
func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) == 0 {
		return t
	}

	// Stack the tail first, then prepend t.
	rest := s[0].Stack(ctx, dim, s[1:]...)
	return t.Concat(ctx, rest, dim)
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
661
		b: t.b,
Michael Yang's avatar
Michael Yang committed
662
663
664
665
666
667
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
	return &Tensor{
668
		b: t.b,
Michael Yang's avatar
Michael Yang committed
669
670
671
672
673
674
		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
675
		b: t.b,
Michael Yang's avatar
Michael Yang committed
676
677
678
679
680
681
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
682
		b: t.b,
Michael Yang's avatar
Michael Yang committed
683
684
685
686
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

687
688
689
690
691
func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
692
		b: t.b,
693
694
695
696
		t: mul,
	}
}

Michael Yang's avatar
Michael Yang committed
697
func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
698
	tt := (&Tensor{b: t.b, t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
Michael Yang's avatar
Michael Yang committed
699
700
701
702
703
704
705
706
	if b != nil {
		tt = tt.Add(ctx, b)
	}

	return tt
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
707
	return (&Tensor{b: t.b, t: C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
Michael Yang's avatar
Michael Yang committed
708
709
}

710
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
711
712
713
714
715
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
716
		b: t.b,
Michael Yang's avatar
Michael Yang committed
717
718
719
720
721
722
723
724
725
726
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
727
		b: t.b,
Michael Yang's avatar
Michael Yang committed
728
729
730
731
732
733
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
734
		b: t.b,
Michael Yang's avatar
Michael Yang committed
735
736
737
738
739
740
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
741
		b: t.b,
Michael Yang's avatar
Michael Yang committed
742
743
744
745
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

746
func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
747
748
749
	switch len(shape) {
	case 1:
		return &Tensor{
750
			b: t.b,
Michael Yang's avatar
Michael Yang committed
751
752
753
754
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
755
			b: t.b,
Michael Yang's avatar
Michael Yang committed
756
757
758
759
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
760
			b: t.b,
Michael Yang's avatar
Michael Yang committed
761
762
763
764
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
765
			b: t.b,
Michael Yang's avatar
Michael Yang committed
766
767
768
769
770
771
772
773
774
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
775
		b: t.b,
Michael Yang's avatar
Michael Yang committed
776
777
778
779
780
781
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
782
		b: t.b,
Michael Yang's avatar
Michael Yang committed
783
784
785
786
787
788
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
789
		b: t.b,
Michael Yang's avatar
Michael Yang committed
790
791
792
793
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

794
func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
795
796
797
798
799
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
800
		b: t.b,
Michael Yang's avatar
Michael Yang committed
801
802
803
804
805
806
807
808
		t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
809
			b: t.b,
Michael Yang's avatar
Michael Yang committed
810
811
812
813
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
814
			b: t.b,
Michael Yang's avatar
Michael Yang committed
815
816
817
818
819
820
821
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
822
			b: t.b,
Michael Yang's avatar
Michael Yang committed
823
824
825
826
827
828
829
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
830
			b: t.b,
Michael Yang's avatar
Michael Yang committed
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

// ropeTypeNorm is the rope mode value passed to ggml_rope_ext by RoPE.
const ropeTypeNorm C.int = 0

func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
	if ropeFactors == nil {
847
		ropeFactors = &Tensor{b: t.b}
Michael Yang's avatar
Michael Yang committed
848
849
	}

Jesse Gross's avatar
Jesse Gross committed
850
851
852
853
854
	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

Michael Yang's avatar
Michael Yang committed
855
	return &Tensor{
856
		b: t.b,
Michael Yang's avatar
Michael Yang committed
857
		t: C.ggml_rope_ext(
Jesse Gross's avatar
Jesse Gross committed
858
			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
Michael Yang's avatar
Michael Yang committed
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
			C.int(ropeDim),
			131072,       // YaRN n_ctx_train
			ropeTypeNorm, // ROPE_TYPE_NORM
			C.float(ropeBase),
			C.float(ropeScale),
			0.,  // YaRN ext_factor
			1.,  // YaRN attn_factor
			32., // YaRN beta_fast
			1.,  // YaRN beta_slow
		),
	}
}

func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
874
		b: t.b,
Michael Yang's avatar
Michael Yang committed
875
876
877
878
879
880
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
881
		b: t.b,
Michael Yang's avatar
Michael Yang committed
882
883
884
885
886
887
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
888
		b: t.b,
Michael Yang's avatar
Michael Yang committed
889
890
891
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}
892

893
894
895
896
897
898
// ScaledDotProductAttention computes attention with t as the query. The query
// and key are permuted (0, 2, 1, 3) first. With flash attention enabled the
// value is permuted the same way and ggml_flash_attn_ext is used with F32
// precision. Otherwise the result is built from a full-precision key/query
// matmul, a masked and scaled softmax, and a matmul with value, then permuted
// and made contiguous. A nil mask is passed to ggml as a nil tensor.
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if !t.b.flashAttention {
		scores := key.MulmatFullPrec(ctx, query)
		probs := &Tensor{
			b: t.b,
			t: C.ggml_soft_max_ext(ctx.(*Context).ctx, scores.(*Tensor).t, kqMask, C.float(scale), 0),
		}

		attended := value.Mulmat(ctx, probs)
		return attended.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}

	value = value.Permute(ctx, 0, 2, 1, 3)

	attended := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
	C.ggml_flash_attn_ext_set_prec(attended, C.GGML_PREC_F32)
	return &Tensor{b: t.b, t: attended}
}