ggml.go 21.7 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
package ggml

3
4
5
6
7
8
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
Michael Yang's avatar
Michael Yang committed
9
10
11
import "C"

import (
12
	"errors"
Michael Yang's avatar
Michael Yang committed
13
14
	"fmt"
	"io"
15
	"iter"
Michael Yang's avatar
Michael Yang committed
16
	"log/slog"
17
	"maps"
Michael Yang's avatar
Michael Yang committed
18
	"os"
19
20
21
22
	"slices"
	"strconv"
	"strings"
	"unicode"
Michael Yang's avatar
Michael Yang committed
23
24
25
26
27
	"unsafe"

	"github.com/ollama/ollama/format"
	fs "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"
28
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
Michael Yang's avatar
Michael Yang committed
29
30
31
	"golang.org/x/sync/errgroup"
)

32
33
func devices() iter.Seq[*C.struct_ggml_backend_device] {
	return func(yield func(*C.struct_ggml_backend_device) bool) {
34
		ggml.OnceLoad()
35
36
37
38
39
		for i := range C.ggml_backend_dev_count() {
			if !yield(C.ggml_backend_dev_get(i)) {
				return
			}
		}
Michael Yang's avatar
Michael Yang committed
40
	}
41
}
Michael Yang's avatar
Michael Yang committed
42
43

type Backend struct {
44
45
46
47
48
49
	meta    *fs.GGML
	sched   *C.struct_ggml_backend_sched
	tensors map[string]*C.struct_ggml_tensor
	input   *C.struct_ggml_backend
	output  *C.struct_ggml_backend
	layers  map[int]*C.struct_ggml_backend
50

51
	flashAttention bool
Michael Yang's avatar
Michael Yang committed
52
53
}

54
func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
Michael Yang's avatar
Michael Yang committed
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
	meta, n, err := fs.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	slog.Info(
		"",
		"architecture", meta.KV().Architecture(),
		"file_type", meta.KV().FileType(),
		"name", meta.KV().String("general.name"),
		"description", meta.KV().String("general.description"),
		"num_tensors", len(meta.Tensors().Items()),
		"num_key_values", len(meta.KV()),
	)

70
	type deviceBufferType struct {
71
72
73
74
75
76
77
78
79
80
81
		d   *C.struct_ggml_backend_device
		bts []*C.struct_ggml_backend_buffer_type
	}

	var cpus, accels, gpus []*C.struct_ggml_backend_device
	for d := range devices() {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
			cpus = append(cpus, d)
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
Michael Yang's avatar
Michael Yang committed
82
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
83
			gpus = append(gpus, d)
Michael Yang's avatar
Michael Yang committed
84
85
86
		}
	}

87
88
89
90
91
92
	var cpuBufferTypes []*C.struct_ggml_backend_buffer_type
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			cpuBufferTypes = append(cpuBufferTypes, C.ggml_backend_dev_buffer_type(d))
Michael Yang's avatar
Michael Yang committed
93
		}
94
95
	}

96
	var gpuDeviceBufferTypes []deviceBufferType
97
98
	for _, d := range gpus {
		bt := C.ggml_backend_dev_buffer_type(d)
99
		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
100
101
102
			d:   d,
			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuBufferTypes...),
		})
Michael Yang's avatar
Michael Yang committed
103
104
	}

105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
	splits := make([]float32, len(gpus))
	if func() bool {
		for _, s := range params.TensorSplit {
			if s != 0 {
				return true
			}
		}

		return false
	}() {
		splits = params.TensorSplit
	} else {
		for i := range splits {
			var free, total C.size_t
			C.ggml_backend_dev_memory(gpus[i], &free, &total)
			splits[i] = float32(free)
		}
	}

	var sum float32
	for i := range splits {
		sum += splits[i]
		splits[i] = sum
	}

130
	for i := range splits {
131
		splits[i] /= sum
132
133
	}

134
135
	cpuDeviceBufferTypes := deviceBufferType{C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU), cpuBufferTypes}
	input := cpuDeviceBufferTypes
136
137
138
139
140

	var blocks int
	for key, value := range meta.KV() {
		if strings.HasSuffix(key, ".block_count") {
			blocks += int(value.(uint32))
Michael Yang's avatar
Michael Yang committed
141
		}
142
	}
Michael Yang's avatar
Michael Yang committed
143

144
145
146
	assignLayer := func(i int) (temp deviceBufferType) {
		if i >= params.NumGPULayers {
			return cpuDeviceBufferTypes
147
		}
148

149
150
151
152
153
154
		index := slices.IndexFunc(splits, func(f float32) bool { return float32(i)/float32(blocks+1) < f })
		if index < 0 || index >= len(gpuDeviceBufferTypes) {
			return cpuDeviceBufferTypes
		}

		return gpuDeviceBufferTypes[index]
155
156
	}

157
	layers := make([]deviceBufferType, blocks)
158
	for i := range layers {
159
		layers[i] = assignLayer(i)
160
161
	}

162
	output := assignLayer(blocks)
163
164
165
166
167

	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
	maxTensors += blocks * 2

168
169
170
171
172
173
174
	type tensor struct {
		source *fs.Tensor
		target string
	}

	targets := make(map[string][]string)

175
	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
176
	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
177
178
179
180
181
182
183
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}
Michael Yang's avatar
Michael Yang committed
184

185
186
187
188
189
190
191
192
			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
Michael Yang's avatar
Michael Yang committed
193
			defer C.free(unsafe.Pointer(cname))
194
195
196
197
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

198
			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
Michael Yang's avatar
Michael Yang committed
199
200
			C.ggml_set_name(tt, cname)

201
			slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
202
203
204
205
206
			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
Michael Yang's avatar
Michael Yang committed
207
208
	}

209
210
211
212
213
214
215
216
217
	hasPart := func(s string, parts ...string) bool {
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
Michael Yang's avatar
Michael Yang committed
218
219
	}

220
221
222
	for _, t := range meta.Tensors().Items() {
		switch {
		case hasPart(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
223
			createTensor(tensor{source: t}, input.bts)
224
		case hasPart(t.Name, "cls", "output", "output_norm"):
225
			createTensor(tensor{source: t}, output.bts)
226
227
228
229
230
231
232
233
234
235
		default:
			if i := func() int {
				if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
					if i, err := strconv.Atoi(fields[0]); err == nil {
						return i
					}
				}

				return -1
			}(); i >= 0 {
236
				createTensor(tensor{source: t}, layers[i].bts)
237
			} else {
238
239
240
241
242
				for i, layer := range layers {
					createTensor(tensor{
						source: t,
						target: "blk." + strconv.Itoa(i) + "." + t.Name,
					}, layer.bts)
243
244
245
246
				}
			}
		}
	}
Michael Yang's avatar
Michael Yang committed
247

248
249
250
251
252
253
254
255
256
257
258
259
260
261
	bbs := make(map[*C.struct_ggml_context][]*C.struct_ggml_backend_buffer, len(ctxs))

	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
		bbs[c] = append(bbs[c], b)
	}

	for bs := range maps.Values(bbs) {
		for _, b := range bs {
262
			slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(b)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(b))))
263
264
265
266
267
268
269
270
271
272
273
		}
	}

	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}

	sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset))
Michael Yang's avatar
Michael Yang committed
274
	var g errgroup.Group
275
	for _, t := range meta.Tensors().Items() {
276
277
278
279
280
		for _, target := range targets[t.Name] {
			g.Go(func() error {
				if target == "" {
					target = t.Name
				}
281

282
283
284
285
				tt, ok := tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}
Michael Yang's avatar
Michael Yang committed
286

287
288
289
290
291
				bts := make([]byte, t.Size())
				n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), bts)
				if err != nil {
					return err
				}
Michael Yang's avatar
Michael Yang committed
292

293
294
295
				if n != len(bts) {
					return errors.New("short read")
				}
Michael Yang's avatar
Michael Yang committed
296

297
298
299
300
301
302
303
				cname := C.CString(t.Name)
				C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), 0, C.size_t(t.Size()))
				C.free(unsafe.Pointer(cname))

				return nil
			})
		}
Michael Yang's avatar
Michael Yang committed
304
305
	}

306
	if g.Wait() != nil {
Michael Yang's avatar
Michael Yang committed
307
308
309
		return nil, err
	}

310
	deviceBackends := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend)
311
312
313
314
315
	var backends []*C.struct_ggml_backend
	var bufts []*C.struct_ggml_backend_buffer_type
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := C.ggml_backend_dev_init(d, nil)
		backends = append(backends, b)
316
		deviceBackends[d] = b
317
318
319
320
321
322
323
324
325
326

		bt := C.ggml_backend_get_default_buffer_type(b)
		if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
			if hbt := C.ggml_backend_dev_host_buffer_type(d); hbt != nil {
				bt = hbt
			}
		}

		bufts = append(bufts, bt)

327
		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(b)), "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
328
329
330
331

		if C.ggml_backend_is_cpu(b) {
			C.ggml_backend_cpu_set_n_threads(b, C.int(params.NumThreads))
		}
332
333
	}

Michael Yang's avatar
Michael Yang committed
334
	return &Backend{
335
		flashAttention: params.FlashAttention,
336
337
		meta:           meta,
		tensors:        tensors,
338
339
340
341
342
343
344
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&backends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&bufts[0])),
			C.int(len(backends)),
			C.size_t(max(8192, len(meta.Tensors().Items())*5)),
			true,
		),
345
346
347
348
349
350
351
352
353
		input:  deviceBackends[input.d],
		output: deviceBackends[output.d],
		layers: func() map[int]*C.struct_ggml_backend {
			m := make(map[int]*C.struct_ggml_backend)
			for i, layer := range layers {
				m[i] = deviceBackends[layer.d]
			}
			return m
		}(),
Michael Yang's avatar
Michael Yang committed
354
355
356
357
358
359
360
361
362
363
364
365
	}, nil
}

// init registers New with the ml package under the backend name "ggml".
func init() {
	const name = "ggml"
	ml.RegisterBackend(name, New)
}

// Config exposes the model's key/value metadata as an ml.Config.
func (b *Backend) Config() ml.Config {
	kv := b.meta.KV()
	return kv
}

func (b *Backend) Get(name string) ml.Tensor {
366
367
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
Michael Yang's avatar
Michael Yang committed
368
369
370
371
372
373
	}

	return nil
}

// NewContext creates a context sized for five graph nodes per model tensor,
// with a floor of 8192 nodes.
func (b *Backend) NewContext() ml.Context {
	nodes := max(8192, len(b.meta.Tensors().Items())*5)
	return b.NewContextSize(nodes)
}

func (b *Backend) NewContextSize(n int) ml.Context {
Michael Yang's avatar
Michael Yang committed
378
	return &Context{
379
		b: b,
380
		ctx: C.ggml_init(C.struct_ggml_init_params{
381
			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
382
383
			no_alloc: true,
		}),
384
		backend:       C.ggml_backend_sched_get_backend(b.sched, 0),
385
386
387
388
		maxGraphNodes: n,
		input:         b.input,
		output:        b.output,
		layers:        b.layers,
Michael Yang's avatar
Michael Yang committed
389
390
391
	}
}

392
// CacheConfig returns the cache settings for this backend. The settings
// depend on whether flash attention is enabled.
func (b *Backend) CacheConfig() ml.CacheConfig {
	if !b.flashAttention {
		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
	}

	return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
}

Michael Yang's avatar
Michael Yang committed
400
type Context struct {
401
	b *Backend
Michael Yang's avatar
Michael Yang committed
402

403
	ctx   *C.struct_ggml_context
Michael Yang's avatar
Michael Yang committed
404
	graph *C.struct_ggml_cgraph
405
406

	// backend is the backend used for new tensors
407
	backend *C.struct_ggml_backend
408

409
410
411
412
413
414
415
416
417
	// input is the backend used for inputs
	input *C.struct_ggml_backend

	// output is the backend used for outputs
	output *C.struct_ggml_backend

	// output is the backend used for repeating layers
	layers map[int]*C.struct_ggml_backend

418
	maxGraphNodes int
Michael Yang's avatar
Michael Yang committed
419
420
}

421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
// Input returns a context that shares c's ggml context but allocates new
// tensors on the input backend. When no input backend is set, c itself is
// returned.
func (c *Context) Input() ml.Context {
	if c.input == nil {
		return c
	}

	derived := &Context{
		b:             c.b,
		ctx:           c.ctx,
		backend:       c.input,
		maxGraphNodes: c.maxGraphNodes,
	}
	return derived
}

// Output returns a context that shares c's ggml context but allocates new
// tensors on the output backend. When no output backend is set, c itself is
// returned.
func (c *Context) Output() ml.Context {
	if c.output == nil {
		return c
	}

	derived := &Context{
		b:             c.b,
		ctx:           c.ctx,
		backend:       c.output,
		maxGraphNodes: c.maxGraphNodes,
	}
	return derived
}

// Layer returns a context that shares c's ggml context but allocates new
// tensors on the backend assigned to layer i. When no backend is registered
// for i, c itself is returned.
func (c *Context) Layer(i int) ml.Context {
	assigned, found := c.layers[i]
	if !found {
		return c
	}

	derived := &Context{
		b:             c.b,
		ctx:           c.ctx,
		backend:       assigned,
		maxGraphNodes: c.maxGraphNodes,
	}
	return derived
}

460
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
Michael Yang's avatar
Michael Yang committed
461
	if c.graph == nil {
462
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
Michael Yang's avatar
Michael Yang committed
463
464
	}

465
466
467
468
469
	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
Michael Yang's avatar
Michael Yang committed
470
471
}

472
func (c *Context) Compute(tensors ...ml.Tensor) {
473
	C.ggml_backend_sched_reset(c.b.sched)
474
475
	C.ggml_backend_sched_alloc_graph(c.b.sched, c.graph)
	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
Michael Yang's avatar
Michael Yang committed
476

477
478
479
	needSync := true
	sync := func() {
		if needSync {
480
			C.ggml_backend_sched_synchronize(c.b.sched)
481
482
483
			needSync = false
		}
	}
Michael Yang's avatar
Michael Yang committed
484

485
486
487
	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
488
489
		}
	}
Michael Yang's avatar
Michael Yang committed
490
491
}

492
493
func (c *Context) MaxGraphNodes() int {
	return c.maxGraphNodes
Jesse Gross's avatar
Jesse Gross committed
494
495
}

496
497
498
// shapeToGGML converts shape to a freshly allocated array of C int64 values
// and returns a pointer to its first element. shape must not be empty.
func shapeToGGML(shape []int) *C.int64_t {
	dims := make([]C.int64_t, 0, len(shape))
	for _, extent := range shape {
		dims = append(dims, C.int64_t(extent))
	}

	return &dims[0]
}

505
func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
506
507
508
509
510
511
512
513
514
515
516
517
518
	if len(shape) < 1 || len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

	var t *C.struct_ggml_tensor
	switch dtype {
	case ml.DTypeF32:
519
		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F32, C.int(len(shape)), shapeToGGML(shape))
Jesse Gross's avatar
Jesse Gross committed
520
	case ml.DTypeF16:
521
		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F16, C.int(len(shape)), shapeToGGML(shape))
Michael Yang's avatar
Michael Yang committed
522
	case ml.DTypeI32:
523
		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_I32, C.int(len(shape)), shapeToGGML(shape))
Michael Yang's avatar
Michael Yang committed
524
525
526
527
	default:
		panic("unsupported dtype")
	}

528
	b := C.ggml_backend_alloc_buffer(c.backend, C.ggml_nbytes(t))
Michael Yang's avatar
Michael Yang committed
529
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
530
	return &Tensor{b: c.b, t: t}
531
532
533
}

// Empty allocates a tensor of the given dtype and shape without initialising
// its contents. It panics under the same conditions as newTensor.
func (c Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	allocated := c.newTensor(dtype, shape)
	return allocated
}

func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
538
	t := c.newTensor(dtype, shape)
539
540
	C.ggml_set_zero(t.(*Tensor).t)
	return t
Michael Yang's avatar
Michael Yang committed
541
542
}

543
// checkShape verifies that the dimensions in shape describe exactly len(s)
// elements, i.e. that their product equals the length of s.
//
// It returns an error when any dimension is smaller than one or when the
// dimensions do not multiply out to len(s). An empty shape is only valid for
// a single-element slice.
func checkShape[S ~[]E, E any](s S, shape ...int) error {
	remaining := len(s)
	for _, dim := range shape {
		// Reject non-positive dimensions (a zero would divide by zero) and
		// dimensions that do not divide evenly: truncating division would
		// otherwise accept e.g. 5 elements for a 2x2 shape.
		if dim < 1 || remaining%dim != 0 {
			return fmt.Errorf("invalid shape: %v", shape)
		}

		remaining /= dim
	}

	if remaining != 1 {
		return fmt.Errorf("invalid shape: %v", shape)
	}

	return nil
}

func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
557
558
559
560
561
562
563
	if err := checkShape(s, shape...); err != nil {
		return nil, err
	}

	t := c.newTensor(ml.DTypeF32, shape)
	C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	return t, nil
Michael Yang's avatar
Michael Yang committed
564
565
566
}

func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
567
568
569
570
571
572
573
	if err := checkShape(s, shape...); err != nil {
		return nil, err
	}

	t := c.newTensor(ml.DTypeI32, shape)
	C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	return t, nil
Michael Yang's avatar
Michael Yang committed
574
575
}

576
577
func (c Context) Close() {
	if c.ctx != nil {
578
579
		C.ggml_free(c.ctx)
	}
Michael Yang's avatar
Michael Yang committed
580
581
582
}

type Tensor struct {
583
	b    *Backend
Michael Yang's avatar
Michael Yang committed
584
	t    *C.struct_ggml_tensor
585
	sync func()
Michael Yang's avatar
Michael Yang committed
586
587
588
589
590
591
592
593
594
595
}

// LogValue renders the tensor for structured logging as a group holding its
// name, ggml type name and shape.
func (t *Tensor) LogValue() slog.Value {
	name := C.GoString(C.ggml_get_name(t.t))
	kind := C.GoString(C.ggml_type_name(t.t._type))

	return slog.GroupValue(
		slog.String("name", name),
		slog.String("type", kind),
		slog.Any("shape", t.Shape()),
	)
}

596
597
func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
Michael Yang's avatar
Michael Yang committed
598
599
}

600
601
func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
Michael Yang's avatar
Michael Yang committed
602
603
}

604
605
func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
Michael Yang's avatar
Michael Yang committed
606
607
608
609
610
611
612
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

613
614
615
616
617
618
619
620
621
func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
Michael Yang's avatar
Michael Yang committed
622
623
}

624
625
626
627
628
629
func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
Michael Yang's avatar
Michael Yang committed
630
631
632
633
634
635
636
637
638
	}

	return
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
Jesse Gross's avatar
Jesse Gross committed
639
640
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
Michael Yang's avatar
Michael Yang committed
641
642
643
644
645
646
647
648
649
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	default:
		return ml.DTypeOther
	}
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
650
		b: t.b,
Michael Yang's avatar
Michael Yang committed
651
652
653
654
655
656
657
658
659
660
661
662
663
664
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

// Stack concatenates t with every tensor in s along dim. The tensors in s
// are combined recursively from the right before being concatenated onto t.
// With no extra tensors, t is returned unchanged.
func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) == 0 {
		return t
	}

	head, tail := s[0], s[1:]
	rest := head.Stack(ctx, dim, tail...)

	return t.Concat(ctx, rest, dim)
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
665
		b: t.b,
Michael Yang's avatar
Michael Yang committed
666
667
668
669
670
671
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
	return &Tensor{
672
		b: t.b,
Michael Yang's avatar
Michael Yang committed
673
674
675
676
677
678
		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
679
		b: t.b,
Michael Yang's avatar
Michael Yang committed
680
681
682
683
684
685
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
686
		b: t.b,
Michael Yang's avatar
Michael Yang committed
687
688
689
690
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

691
692
693
694
695
func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
696
		b: t.b,
697
698
699
700
		t: mul,
	}
}

Michael Yang's avatar
Michael Yang committed
701
func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
702
	tt := (&Tensor{b: t.b, t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
Michael Yang's avatar
Michael Yang committed
703
704
705
706
707
708
709
710
	if b != nil {
		tt = tt.Add(ctx, b)
	}

	return tt
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
711
	return (&Tensor{b: t.b, t: C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
Michael Yang's avatar
Michael Yang committed
712
713
}

714
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
715
716
717
718
719
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
720
		b: t.b,
Michael Yang's avatar
Michael Yang committed
721
722
723
724
725
726
727
728
729
730
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
731
		b: t.b,
Michael Yang's avatar
Michael Yang committed
732
733
734
735
736
737
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
738
		b: t.b,
Michael Yang's avatar
Michael Yang committed
739
740
741
742
743
744
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
745
		b: t.b,
Michael Yang's avatar
Michael Yang committed
746
747
748
749
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

750
func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
751
752
753
	switch len(shape) {
	case 1:
		return &Tensor{
754
			b: t.b,
Michael Yang's avatar
Michael Yang committed
755
756
757
758
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
759
			b: t.b,
Michael Yang's avatar
Michael Yang committed
760
761
762
763
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
764
			b: t.b,
Michael Yang's avatar
Michael Yang committed
765
766
767
768
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
769
			b: t.b,
Michael Yang's avatar
Michael Yang committed
770
771
772
773
774
775
776
777
778
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
779
		b: t.b,
Michael Yang's avatar
Michael Yang committed
780
781
782
783
784
785
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
786
		b: t.b,
Michael Yang's avatar
Michael Yang committed
787
788
789
790
791
792
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
793
		b: t.b,
Michael Yang's avatar
Michael Yang committed
794
795
796
797
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

798
func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
799
800
801
802
803
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
804
		b: t.b,
Michael Yang's avatar
Michael Yang committed
805
806
807
808
809
810
811
812
		t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
813
			b: t.b,
Michael Yang's avatar
Michael Yang committed
814
815
816
817
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
818
			b: t.b,
Michael Yang's avatar
Michael Yang committed
819
820
821
822
823
824
825
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
826
			b: t.b,
Michael Yang's avatar
Michael Yang committed
827
828
829
830
831
832
833
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
834
			b: t.b,
Michael Yang's avatar
Michael Yang committed
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

// ropeTypeNorm is the rope mode value (0) passed to ggml_rope_ext by RoPE.
const ropeTypeNorm C.int = 0

func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
	if ropeFactors == nil {
851
		ropeFactors = &Tensor{b: t.b}
Michael Yang's avatar
Michael Yang committed
852
853
	}

Jesse Gross's avatar
Jesse Gross committed
854
855
856
857
858
	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

Michael Yang's avatar
Michael Yang committed
859
	return &Tensor{
860
		b: t.b,
Michael Yang's avatar
Michael Yang committed
861
		t: C.ggml_rope_ext(
Jesse Gross's avatar
Jesse Gross committed
862
			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
Michael Yang's avatar
Michael Yang committed
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
			C.int(ropeDim),
			131072,       // YaRN n_ctx_train
			ropeTypeNorm, // ROPE_TYPE_NORM
			C.float(ropeBase),
			C.float(ropeScale),
			0.,  // YaRN ext_factor
			1.,  // YaRN attn_factor
			32., // YaRN beta_fast
			1.,  // YaRN beta_slow
		),
	}
}

func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
878
		b: t.b,
Michael Yang's avatar
Michael Yang committed
879
880
881
882
883
884
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
885
		b: t.b,
Michael Yang's avatar
Michael Yang committed
886
887
888
889
890
891
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
892
		b: t.b,
Michael Yang's avatar
Michael Yang committed
893
894
895
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}
896

897
898
899
900
901
902
// ScaledDotProductAttention computes attention with t as the query.
//
// mask may be nil. Query and key are permuted with (0, 2, 1, 3) first. With
// flash attention enabled the value is permuted the same way and
// ggml_flash_attn_ext is used with F32 precision. Otherwise the scores are
// computed with a full-precision matmul, passed through ggml_soft_max_ext
// with the mask and scale, multiplied with value, then permuted back and made
// contiguous.
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if !t.b.flashAttention {
		scores := key.MulmatFullPrec(ctx, query)
		probs := &Tensor{
			b: t.b,
			t: C.ggml_soft_max_ext(ctx.(*Context).ctx, scores.(*Tensor).t, kqMask, C.float(scale), 0),
		}

		attended := value.Mulmat(ctx, probs)
		return attended.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}

	value = value.Permute(ctx, 0, 2, 1, 3)

	attended := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
	C.ggml_flash_attn_ext_set_prec(attended, C.GGML_PREC_F32)

	return &Tensor{b: t.b, t: attended}
}