ggml.go 29.5 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
package ggml

3
4
5
6
7
8
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
Michael Yang's avatar
Michael Yang committed
9
10
11
import "C"

import (
12
	"context"
13
	"errors"
Michael Yang's avatar
Michael Yang committed
14
15
16
	"fmt"
	"io"
	"log/slog"
17
	"maps"
Michael Yang's avatar
Michael Yang committed
18
	"os"
19
	"runtime"
20
21
22
	"slices"
	"strconv"
	"strings"
23
	"sync/atomic"
24
	"unicode"
Michael Yang's avatar
Michael Yang committed
25
26
27
	"unsafe"

	"github.com/ollama/ollama/format"
28
29
	"github.com/ollama/ollama/fs"
	fsggml "github.com/ollama/ollama/fs/ggml"
30
	"github.com/ollama/ollama/logutil"
Michael Yang's avatar
Michael Yang committed
31
	"github.com/ollama/ollama/ml"
32
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
Michael Yang's avatar
Michael Yang committed
33
34
35
	"golang.org/x/sync/errgroup"
)

Michael Yang's avatar
Michael Yang committed
36
37
38
39
40
func devices() []*C.struct_ggml_backend_device {
	ggml.OnceLoad()
	ds := make([]*C.struct_ggml_backend_device, C.ggml_backend_dev_count())
	for i := range ds {
		ds[i] = C.ggml_backend_dev_get(C.size_t(i))
Michael Yang's avatar
Michael Yang committed
41
	}
Michael Yang's avatar
Michael Yang committed
42
43

	return ds
44
}
Michael Yang's avatar
Michael Yang committed
45
46

type Backend struct {
47
48
49
50
51
52
	meta *fsggml.GGML

	sched         *C.struct_ggml_backend_sched
	schedBackends []*C.struct_ggml_backend
	schedBufts    []*C.struct_ggml_backend_buffer_type

53
	tensors map[string]*C.struct_ggml_tensor
Michael Yang's avatar
Michael Yang committed
54
55

	// input is the backend used for inputs
56
	input *C.struct_ggml_backend_buffer_type
Michael Yang's avatar
Michael Yang committed
57
58

	// layers is the backend used for repeating layers
59
	layers map[int]*C.struct_ggml_backend_buffer_type
60

61
	flashAttention bool
Michael Yang's avatar
Michael Yang committed
62
63
64

	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
	maxGraphNodes int
Michael Yang's avatar
Michael Yang committed
65
66
}

67
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
68
	meta, n, err := fsggml.Decode(r, -1)
Michael Yang's avatar
Michael Yang committed
69
70
71
72
73
74
75
76
77
78
79
80
81
82
	if err != nil {
		return nil, err
	}

	slog.Info(
		"",
		"architecture", meta.KV().Architecture(),
		"file_type", meta.KV().FileType(),
		"name", meta.KV().String("general.name"),
		"description", meta.KV().String("general.description"),
		"num_tensors", len(meta.Tensors().Items()),
		"num_key_values", len(meta.KV()),
	)

83
	type deviceBufferType struct {
84
85
86
87
88
		d   *C.struct_ggml_backend_device
		bts []*C.struct_ggml_backend_buffer_type
	}

	var cpus, accels, gpus []*C.struct_ggml_backend_device
Michael Yang's avatar
Michael Yang committed
89
	for _, d := range devices() {
90
91
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
92
93
94
95
			if len(cpus) == 0 {
				// only the first cpu device should be used
				cpus = append(cpus, d)
			}
96
97
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
Michael Yang's avatar
Michael Yang committed
98
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
99
			gpus = append(gpus, d)
Michael Yang's avatar
Michael Yang committed
100
101
102
		}
	}

Michael Yang's avatar
Michael Yang committed
103
	// create list of buffer types for the cpu
Michael Yang's avatar
Michael Yang committed
104
	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
105
106
107
108
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
Michael Yang's avatar
Michael Yang committed
109
			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
Michael Yang's avatar
Michael Yang committed
110
		}
111
112
	}

Michael Yang's avatar
Michael Yang committed
113
	// create list of buffer types for each gpu
114
	var gpuDeviceBufferTypes []deviceBufferType
115
116
	for _, d := range gpus {
		bt := C.ggml_backend_dev_buffer_type(d)
117
		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
118
			d:   d,
Michael Yang's avatar
Michael Yang committed
119
			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
120
		})
Michael Yang's avatar
Michael Yang committed
121
122
	}

Michael Yang's avatar
Michael Yang committed
123
124
125
126
127
	useDefaultSplit := true
	for _, s := range params.TensorSplit {
		if s != 0 {
			useDefaultSplit = false
			break
128
		}
Michael Yang's avatar
Michael Yang committed
129
	}
130

Michael Yang's avatar
Michael Yang committed
131
132
133
134
	// calculate splits
	splits := make([]float32, len(gpus))
	if useDefaultSplit {
		// default: split on free memory
135
136
137
138
139
		for i := range splits {
			var free, total C.size_t
			C.ggml_backend_dev_memory(gpus[i], &free, &total)
			splits[i] = float32(free)
		}
Michael Yang's avatar
Michael Yang committed
140
141
	} else {
		splits = params.TensorSplit
142
143
144
	}

	var sum float32
Michael Yang's avatar
Michael Yang committed
145
	// cumulative sum of all splits
146
147
148
149
150
	for i := range splits {
		sum += splits[i]
		splits[i] = sum
	}

Michael Yang's avatar
Michael Yang committed
151
	// normalize splits
152
	for i := range splits {
153
		splits[i] /= sum
154
155
	}

Michael Yang's avatar
Michael Yang committed
156
	// inputs always use cpu
Michael Yang's avatar
Michael Yang committed
157
	input := cpuDeviceBufferType
158

159
	blocks := int(meta.KV().BlockCount())
Michael Yang's avatar
Michael Yang committed
160
161
162
163

	// define a range of gpu layers. anything outside of this range is assigned to the cpu
	gpuRangeStart := max(0, blocks-params.NumGPULayers)
	gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
Michael Yang's avatar
Michael Yang committed
164
	assignLayer := func(i int) deviceBufferType {
Michael Yang's avatar
Michael Yang committed
165
		if i < gpuRangeStart || i >= gpuRangeStop {
Michael Yang's avatar
Michael Yang committed
166
			return cpuDeviceBufferType
167
		}
168

Michael Yang's avatar
Michael Yang committed
169
		index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
170
		if index < 0 || index >= len(gpuDeviceBufferTypes) {
Michael Yang's avatar
Michael Yang committed
171
			return cpuDeviceBufferType
172
173
174
		}

		return gpuDeviceBufferTypes[index]
175
176
	}

Michael Yang's avatar
Michael Yang committed
177
	// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
178
	layers := make([]deviceBufferType, blocks)
179
	for i := range layers {
180
		layers[i] = assignLayer(i)
181
182
	}

Michael Yang's avatar
Michael Yang committed
183
	// outputs are assigned iff allowed by splits and configured number of gpu layers
184
	output := assignLayer(blocks)
185
186
187

	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
Michael Yang's avatar
Michael Yang committed
188
	// each layer has at most 2 extra tensors for rope operations
189
190
	maxTensors += blocks * 2

191
	type tensor struct {
192
		source *fsggml.Tensor
193
194
195
		target string
	}

Michael Yang's avatar
Michael Yang committed
196
	// some tensors are mapped to different names so keep a list
197
198
	targets := make(map[string][]string)

Michael Yang's avatar
Michael Yang committed
199
	// contexts are shared by tensors of the same buffer type
200
	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
201
	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
202
203
204
205
206
207
208
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}
Michael Yang's avatar
Michael Yang committed
209

210
211
212
213
214
215
216
217
			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
Michael Yang's avatar
Michael Yang committed
218
			defer C.free(unsafe.Pointer(cname))
219
220
221
222
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

223
			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
Michael Yang's avatar
Michael Yang committed
224
225
			C.ggml_set_name(tt, cname)

226
			slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
227
228
229
230
231
			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
Michael Yang's avatar
Michael Yang committed
232
233
	}

234
	contains := func(s string, parts ...string) bool {
235
236
237
238
239
240
241
242
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
Michael Yang's avatar
Michael Yang committed
243
244
	}

245
246
	for _, t := range meta.Tensors().Items() {
		switch {
247
		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
248
			createTensor(tensor{source: t}, input.bts)
Michael Yang's avatar
Michael Yang committed
249
250
251
			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
				createTensor(tensor{source: t, target: "output.weight"}, output.bts)
			}
252
		case contains(t.Name, "cls", "output", "output_norm"):
253
			createTensor(tensor{source: t}, output.bts)
254
		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
Michael Yang's avatar
Michael Yang committed
255
			// TODO: assign vision tensors to the gpu if possible
Michael Yang's avatar
Michael Yang committed
256
			createTensor(tensor{source: t}, output.bts)
Michael Yang's avatar
Michael Yang committed
257
258
259
260
261
262
263
264
		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
			// these tensors should be repeated per layer
			for i, layer := range layers {
				createTensor(tensor{
					source: t,
					target: "blk." + strconv.Itoa(i) + "." + t.Name,
				}, layer.bts)
			}
265
		default:
Michael Yang's avatar
Michael Yang committed
266
267
268
269
			layerIndex := -1
			if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
				if i, err := strconv.Atoi(fields[0]); err == nil {
					layerIndex = i
270
				}
Michael Yang's avatar
Michael Yang committed
271
			}
272

Michael Yang's avatar
Michael Yang committed
273
274
			if layerIndex >= 0 {
				createTensor(tensor{source: t}, layers[layerIndex].bts)
275
			} else {
Michael Yang's avatar
Michael Yang committed
276
277
				// load all other tensors on the cpu
				createTensor(tensor{source: t}, input.bts)
278
279
280
			}
		}
	}
Michael Yang's avatar
Michael Yang committed
281

Michael Yang's avatar
Michael Yang committed
282
283
	// allocate buffers for each context
	bbs := make(map[*C.struct_ggml_context]*C.struct_ggml_backend_buffer, len(ctxs))
284
285
286
287
288
289
	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
290
291
292
293
		if b == nil {
			return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt)))
		}

294
		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
Michael Yang's avatar
Michael Yang committed
295
		bbs[c] = b
296
297
298
	}

	for bs := range maps.Values(bbs) {
Michael Yang's avatar
Michael Yang committed
299
		slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
300
301
	}

Michael Yang's avatar
Michael Yang committed
302
	// map tensor names to tensors for easy lookup later
303
304
305
306
307
308
309
	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}

310
311
312
313
314
	var doneBytes atomic.Uint64
	totalBytes := uint64(n) - meta.Tensors().Offset

	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
315
	for _, t := range meta.Tensors().Items() {
316
		t := t
317
318
319
320
		g.Go(func() error {
			tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
			for i := range tts {
				target := targets[t.Name][i]
321
322
323
				if target == "" {
					target = t.Name
				}
324

325
326
327
328
				tt, ok := tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}
Michael Yang's avatar
Michael Yang committed
329

330
331
332
				tts[i] = tt
			}

333
334
335
336
			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
			// seeking around within an FD shared between all goroutines.
			file, err := os.Open(r.Name())
			if err != nil {
Jesse Gross's avatar
Jesse Gross committed
337
				slog.Warn("file open error", "file", r.Name(), "error", err)
338
339
340
341
				return err
			}
			defer file.Close()
			sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
342
343
344
345
			bts := make([]byte, 128*format.KibiByte)

			var s uint64
			for s < t.Size() {
346
347
348
349
350
				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
				if err := ctx.Err(); err != nil {
					return err
				}

351
352
				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
Jesse Gross's avatar
Jesse Gross committed
353
					slog.Warn("file read error", "file", r.Name(), "error", err)
354
					return err
355
				}
Michael Yang's avatar
Michael Yang committed
356

357
358
				for _, tt := range tts {
					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
359
				}
Michael Yang's avatar
Michael Yang committed
360

361
362
363
364
365
366
367
368
369
370
				s += uint64(n)

				if params.Progress != nil {
					done := doneBytes.Add(uint64(n))
					params.Progress(float32(done) / float32(totalBytes))
				}
			}

			return nil
		})
Michael Yang's avatar
Michael Yang committed
371
372
	}

373
	if err := g.Wait(); err != nil {
Michael Yang's avatar
Michael Yang committed
374
375
376
		return nil, err
	}

377
378
	// map devices to backend buffer types so new tensors can be assigned to the correct device
	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
Michael Yang's avatar
Michael Yang committed
379
380
381
382

	// create backends and buffer types used for the compute graph scheduler
	var schedBackends []*C.struct_ggml_backend
	var schedBufts []*C.struct_ggml_backend_buffer_type
383
384
385
386
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := C.ggml_backend_dev_init(d, nil)
		bt := C.ggml_backend_get_default_buffer_type(b)

387
388
389
		deviceBufferTypes[d] = bt

		schedBackends = append(schedBackends, b)
Michael Yang's avatar
Michael Yang committed
390
		schedBufts = append(schedBufts, bt)
391

392
		if C.ggml_backend_is_cpu(b) {
Michael Yang's avatar
Michael Yang committed
393
			// set number of threads for cpu backend
Michael Yang's avatar
Michael Yang committed
394
			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
395
		}
396
397
	}

Michael Yang's avatar
Michael Yang committed
398
	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
Michael Yang's avatar
Michael Yang committed
399
	return &Backend{
400
		flashAttention: params.FlashAttention,
401
402
		meta:           meta,
		tensors:        tensors,
403
		sched: C.ggml_backend_sched_new(
Michael Yang's avatar
Michael Yang committed
404
405
406
407
			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
			C.int(len(schedBackends)),
			C.size_t(maxGraphNodes),
408
			C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
409
		),
410
411
412
		schedBackends: schedBackends,
		schedBufts:    schedBufts,
		input:         deviceBufferTypes[input.d],
413
414
		layers: func() map[int]*C.struct_ggml_backend_buffer_type {
			m := make(map[int]*C.struct_ggml_backend_buffer_type)
415
			for i, layer := range layers {
416
				m[i] = deviceBufferTypes[layer.d]
417
418
419
			}
			return m
		}(),
Michael Yang's avatar
Michael Yang committed
420
		maxGraphNodes: maxGraphNodes,
Michael Yang's avatar
Michael Yang committed
421
422
423
424
425
426
427
	}, nil
}

// init registers New with the ml package under the backend name "ggml".
func init() {
	ml.RegisterBackend("ggml", New)
}

428
func (b *Backend) Config() fs.Config {
Michael Yang's avatar
Michael Yang committed
429
430
431
432
	return b.meta.KV()
}

func (b *Backend) Get(name string) ml.Tensor {
433
434
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
Michael Yang's avatar
Michael Yang committed
435
436
437
438
439
440
	}

	return nil
}

func (b *Backend) NewContext() ml.Context {
Michael Yang's avatar
Michael Yang committed
441
	return b.NewContextSize(b.maxGraphNodes)
442
443
444
}

func (b *Backend) NewContextSize(n int) ml.Context {
Jesse Gross's avatar
Jesse Gross committed
445
446
447
448
	if n > b.maxGraphNodes {
		panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
	}

449
450
	var allocatedBuffers []*C.struct_ggml_backend_buffer

Michael Yang's avatar
Michael Yang committed
451
	return &Context{
452
453
		b:             b,
		maxGraphNodes: n,
454
		ctx: C.ggml_init(C.struct_ggml_init_params{
455
			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
456
457
			no_alloc: true,
		}),
458
		allocatedBuffers: &allocatedBuffers,
Michael Yang's avatar
Michael Yang committed
459
460
461
	}
}

462
// CacheConfig returns the cache layout this backend wants. The settings differ
// depending on whether flash attention is enabled.
func (b *Backend) CacheConfig() ml.CacheConfig {
	if !b.flashAttention {
		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
	}

	return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
}

Michael Yang's avatar
Michael Yang committed
470
type Context struct {
471
	b *Backend
Michael Yang's avatar
Michael Yang committed
472

473
	ctx   *C.struct_ggml_context
Michael Yang's avatar
Michael Yang committed
474
	graph *C.struct_ggml_cgraph
475

476
477
	// buft is the buffer type used for new tensors
	buft *C.struct_ggml_backend_buffer_type
478

479
480
481
482
	// allocatedBuffers are buffers for tensors that we have allocated in this context
	// so that we can free them when we close the context
	allocatedBuffers *[]*C.struct_ggml_backend_buffer

Michael Yang's avatar
Michael Yang committed
483
	// maxGraphNodes is the maximum allowed number of graph nodes in this context
484
	maxGraphNodes int
Michael Yang's avatar
Michael Yang committed
485
486
}

487
func (c *Context) Input() ml.Context {
Michael Yang's avatar
Michael Yang committed
488
	if c.b.input != nil {
489
		return &Context{
490
491
492
493
494
			b:                c.b,
			ctx:              c.ctx,
			buft:             c.b.input,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
495
496
497
		}
	}

498
	return c
499
500
}

501
func (c *Context) Layer(i int) ml.Context {
502
	if buft, ok := c.b.layers[i]; ok {
503
		return &Context{
504
505
506
507
508
			b:                c.b,
			ctx:              c.ctx,
			buft:             buft,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
509
510
511
		}
	}

512
	return c
513
514
}

515
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
Michael Yang's avatar
Michael Yang committed
516
	if c.graph == nil {
517
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
Michael Yang's avatar
Michael Yang committed
518
519
	}

520
521
522
523
524
	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
Michael Yang's avatar
Michael Yang committed
525
526
}

527
func (c *Context) Compute(tensors ...ml.Tensor) {
528
	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
Michael Yang's avatar
Michael Yang committed
529
	C.ggml_backend_sched_reset(c.b.sched)
Michael Yang's avatar
Michael Yang committed
530

531
532
533
	needSync := true
	sync := func() {
		if needSync {
534
			C.ggml_backend_sched_synchronize(c.b.sched)
535
536
537
			needSync = false
		}
	}
Michael Yang's avatar
Michael Yang committed
538

539
540
541
	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
542
543
		}
	}
Michael Yang's avatar
Michael Yang committed
544
545
}

546
// Reserve asks the scheduler to reserve buffers for the context's graph and
// logs the resulting buffer size of each backend. The scheduler is reset
// before returning, whether or not the reservation succeeded.
func (c *Context) Reserve() error {
	sched := c.b.sched

	if !C.ggml_backend_sched_reserve(sched, c.graph) {
		C.ggml_backend_sched_reset(sched)
		return errors.New("failed to reserve graph")
	}

	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(sched))
	for i, backend := range c.b.schedBackends {
		bytes := C.ggml_backend_sched_get_buffer_size(sched, backend)
		slog.Info("compute graph",
			"backend", C.GoString(C.ggml_backend_name(backend)),
			"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
			"size", format.HumanBytes2(uint64(bytes)))
	}

	C.ggml_backend_sched_reset(sched)

	return nil
}

564
func (c *Context) MaxGraphNodes() int {
565
	return c.maxGraphNodes
Jesse Gross's avatar
Jesse Gross committed
566
567
}

568
569
570
// shapeToGGML copies shape into a new slice of C.int64_t and returns a pointer
// to its first element. shape must not be empty.
func shapeToGGML(shape []int) *C.int64_t {
	dims := make([]C.int64_t, 0, len(shape))
	for _, extent := range shape {
		dims = append(dims, C.int64_t(extent))
	}

	return &dims[0]
}

577
578
579
580
// pad rounds length up to the next multiple of pad.
func pad(length, pad C.size_t) C.size_t {
	units := (length + pad - 1) / pad
	return units * pad
}

581
func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
582
	if c.buft == nil {
583
		panic("set Input or Layer before creating tensors")
584
585
	}

Michael Yang's avatar
Michael Yang committed
586
587
588
589
590
591
	var cdtype uint32
	switch dtype {
	case ml.DTypeF32:
		cdtype = C.GGML_TYPE_F32
	case ml.DTypeF16:
		cdtype = C.GGML_TYPE_F16
592
593
594
595
	case ml.DTypeQ80:
		cdtype = C.GGML_TYPE_Q8_0
	case ml.DTypeQ40:
		cdtype = C.GGML_TYPE_Q4_0
Michael Yang's avatar
Michael Yang committed
596
597
598
599
600
601
	case ml.DTypeI32:
		cdtype = C.GGML_TYPE_I32
	default:
		panic("unsupported dtype")
	}

Jesse Gross's avatar
Jesse Gross committed
602
	if len(shape) < 1 || shape[0] == 0 {
Michael Yang's avatar
Michael Yang committed
603
		var shape C.int64_t = 0
604
		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
Michael Yang's avatar
Michael Yang committed
605
	} else if len(shape) > 4 {
Michael Yang's avatar
Michael Yang committed
606
607
608
609
610
611
612
613
614
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

Michael Yang's avatar
Michael Yang committed
615
	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
616
617
	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
618
619
620
	if b == nil {
		return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
	}
621
	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
622

Michael Yang's avatar
Michael Yang committed
623
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
624
	return &Tensor{b: c.b, t: t}, nil
625
626
}

627
// Empty creates a new tensor of the given dtype and shape without
// initializing its contents. It panics when the tensor cannot be created.
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	created, err := c.newTensor(dtype, shape)
	if err != nil {
		panic(err)
	}

	return created
}

636
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
637
638
639
640
641
	t, err := c.newTensor(dtype, shape)
	if err != nil {
		panic(err)
	}

642
643
	C.ggml_set_zero(t.(*Tensor).t)
	return t
Michael Yang's avatar
Michael Yang committed
644
645
}

646
// checkShape verifies that the number of elements in s equals the product of
// the dimensions in shape.
//
// An empty s is accepted for any shape. Otherwise every dimension must be
// positive and must divide the remaining element count exactly; checking the
// remainder at each step rejects lengths that plain integer division would
// silently truncate (for example 7 elements with shape 2x3) and avoids a
// divide-by-zero panic on a zero dimension.
func checkShape[S ~[]E, E any](s S, shape ...int) error {
	n := len(s)

	if n == 0 {
		return nil
	}

	for _, v := range shape {
		if v <= 0 || n%v != 0 {
			return fmt.Errorf("invalid shape: %v", shape)
		}

		n /= v
	}

	if n != 1 {
		return fmt.Errorf("invalid shape: %v", shape)
	}

	return nil
}

664
func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
Jesse Gross's avatar
Jesse Gross committed
665
	if err := checkShape(s, shape...); err != nil {
666
667
668
		return nil, err
	}

669
670
671
672
673
	t, err := c.newTensor(ml.DTypeF32, shape)
	if err != nil {
		return nil, err
	}

Jesse Gross's avatar
Jesse Gross committed
674
675
676
677
	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

678
	return t, nil
Michael Yang's avatar
Michael Yang committed
679
680
}

681
func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
Jesse Gross's avatar
Jesse Gross committed
682
	if err := checkShape(s, shape...); err != nil {
683
684
685
		return nil, err
	}

686
687
688
689
690
	t, err := c.newTensor(ml.DTypeI32, shape)
	if err != nil {
		return nil, err
	}

Jesse Gross's avatar
Jesse Gross committed
691
692
693
694
	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

695
	return t, nil
Michael Yang's avatar
Michael Yang committed
696
697
}

Michael Yang's avatar
arange  
Michael Yang committed
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
// Arange returns a one-dimensional tensor holding the values start,
// start+step, ... that are strictly less than stop.
//
// F32 tensors are produced by ggml_arange. I32 tensors are built on the Go
// side and uploaded through the input context. It panics for any other dtype,
// when the I32 tensor cannot be created, or when an I32 range with
// start < stop is requested with a step that is not positive (such a range
// would never terminate).
func (c *Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
	switch dtype {
	case ml.DTypeF32:
		// ggml_arange creates a float32 tensor
		return &Tensor{
			b: c.b,
			t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
		}
	case ml.DTypeI32:
		// ggml_cast does not support float32 to int32 conversion
		var arange []int32
		if start < stop {
			// written as !(step > 0) so that a NaN step is rejected as well
			if !(step > 0) {
				panic("invalid step for arange")
			}

			arange = make([]int32, 0, int((stop-start)/step))
			for i := start; i < stop; i += step {
				arange = append(arange, int32(i))
			}
		}

		t, err := c.Input().FromIntSlice(arange, len(arange))
		if err != nil {
			panic(err)
		}

		return t
	default:
		panic("unsupported dtype for arange")
	}
}

Michael Yang's avatar
Michael Yang committed
724
725
func (c *Context) Close() {
	if c != nil {
726
727
728
729
730
		for _, b := range *c.allocatedBuffers {
			C.ggml_backend_buffer_free(b)
		}
		*c.allocatedBuffers = nil

731
732
		C.ggml_free(c.ctx)
	}
Michael Yang's avatar
Michael Yang committed
733
734
735
}

type Tensor struct {
736
	b    *Backend
Michael Yang's avatar
Michael Yang committed
737
	t    *C.struct_ggml_tensor
738
	sync func()
Michael Yang's avatar
Michael Yang committed
739
740
741
742
743
744
745
746
747
748
}

// LogValue describes the tensor for structured logging as a group holding its
// name, ggml type name and shape.
func (t *Tensor) LogValue() slog.Value {
	name := C.GoString(C.ggml_get_name(t.t))
	kind := C.GoString(C.ggml_type_name(t.t._type))

	return slog.GroupValue(
		slog.String("name", name),
		slog.String("type", kind),
		slog.Any("shape", t.Shape()),
	)
}

749
750
func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
Michael Yang's avatar
Michael Yang committed
751
752
}

753
754
func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
Michael Yang's avatar
Michael Yang committed
755
756
}

757
758
func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
Michael Yang's avatar
Michael Yang committed
759
760
761
762
763
764
765
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

766
767
768
769
770
771
772
773
774
func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
Michael Yang's avatar
Michael Yang committed
775
776
}

777
778
779
780
781
782
func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
Michael Yang's avatar
Michael Yang committed
783
784
785
786
787
788
789
790
791
	}

	return
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
Jesse Gross's avatar
Jesse Gross committed
792
793
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
794
795
796
797
	case C.GGML_TYPE_Q8_0:
		return ml.DTypeQ80
	case C.GGML_TYPE_Q4_0:
		return ml.DTypeQ40
Michael Yang's avatar
Michael Yang committed
798
799
800
801
802
803
804
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	default:
		return ml.DTypeOther
	}
}

805
806
807
808
809
810
811
// Neg returns a new tensor produced by applying ggml_neg to t.
func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
	gctx := ctx.(*Context).ctx
	out := C.ggml_neg(gctx, t.t)

	return &Tensor{b: t.b, t: out}
}

Michael Yang's avatar
Michael Yang committed
812
813
func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
814
		b: t.b,
Michael Yang's avatar
Michael Yang committed
815
816
817
818
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
// Repeat returns t repeated n times along dimension dim, using ggml_repeat
// against a template tensor whose size along dim is n times that of t. It
// panics when dim is outside [0, GGML_MAX_DIMS).
func (t *Tensor) Repeat(ctx ml.Context, dim, n int) ml.Tensor {
	if dim < 0 || dim >= C.GGML_MAX_DIMS {
		panic("invalid dimension")
	}

	// start from t's own shape, then scale only the repeated dimension
	shape := make([]C.int64_t, C.GGML_MAX_DIMS)
	for axis := range shape {
		shape[axis] = C.int64_t(t.Dim(axis))
	}
	shape[dim] = C.int64_t(t.Dim(dim) * n)

	gctx := ctx.(*Context).ctx
	tmpl := C.ggml_new_tensor(gctx, t.t._type, C.int(len(shape)), unsafe.SliceData(shape))

	return &Tensor{b: t.b, t: C.ggml_repeat(gctx, t.t, tmpl)}
}

Michael Yang's avatar
Michael Yang committed
840
841
842
843
844
845
846
847
848
849
// Stack concatenates t with the tensors in s along dimension dim. The tail is
// combined recursively first, so the result is t followed by s[0], s[1], ...
// With no extra tensors, t itself is returned.
func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) == 0 {
		return t
	}

	rest := s[0].Stack(ctx, dim, s[1:]...)
	return t.Concat(ctx, rest, dim)
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
850
		b: t.b,
Michael Yang's avatar
Michael Yang committed
851
852
853
854
855
856
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
	return &Tensor{
857
		b: t.b,
Michael Yang's avatar
Michael Yang committed
858
859
860
861
862
863
		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
864
		b: t.b,
Michael Yang's avatar
Michael Yang committed
865
866
867
868
869
870
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
871
		b: t.b,
Michael Yang's avatar
Michael Yang committed
872
873
874
875
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

876
877
878
879
880
func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
881
		b: t.b,
882
883
884
885
		t: mul,
	}
}

Michael Yang's avatar
llama4  
Michael Yang committed
886
887
888
889
890
891
892
// MulmatID wraps C.ggml_mul_mat_id, passing t, t2 and ids in that order.
// The result keeps t's backend reference. ctx must be a *Context, and t2 and
// ids must be *Tensor values from this package.
func (t *Tensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
	c := ctx.(*Context).ctx
	out := C.ggml_mul_mat_id(c, t.t, t2.(*Tensor).t, ids.(*Tensor).t)
	return &Tensor{b: t.b, t: out}
}

Michael Yang's avatar
Michael Yang committed
893
func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
Michael Yang's avatar
llama4  
Michael Yang committed
894
895
896
897
898
899
	tt := C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
		if b != nil {
			tt = C.ggml_add(ctx.(*Context).ctx, tt, b.(*Tensor).t)
		}
Michael Yang's avatar
Michael Yang committed
900
901
	}

Michael Yang's avatar
llama4  
Michael Yang committed
902
	return &Tensor{b: t.b, t: tt}
Michael Yang's avatar
Michael Yang committed
903
904
905
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
Michael Yang's avatar
llama4  
Michael Yang committed
906
907
908
909
910
911
	tt := C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
	}

	return &Tensor{b: t.b, t: tt}
Michael Yang's avatar
Michael Yang committed
912
913
}

914
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
915
916
917
918
919
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
920
		b: t.b,
Michael Yang's avatar
Michael Yang committed
921
922
923
924
925
926
927
928
929
930
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
931
		b: t.b,
Michael Yang's avatar
Michael Yang committed
932
933
934
935
936
937
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
938
		b: t.b,
Michael Yang's avatar
Michael Yang committed
939
940
941
942
943
944
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
945
		b: t.b,
Michael Yang's avatar
Michael Yang committed
946
947
948
949
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

950
func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
951
952
953
	switch len(shape) {
	case 1:
		return &Tensor{
954
			b: t.b,
Michael Yang's avatar
Michael Yang committed
955
956
957
958
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
959
			b: t.b,
Michael Yang's avatar
Michael Yang committed
960
961
962
963
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
964
			b: t.b,
Michael Yang's avatar
Michael Yang committed
965
966
967
968
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
969
			b: t.b,
Michael Yang's avatar
Michael Yang committed
970
971
972
973
974
975
976
977
978
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
979
		b: t.b,
Michael Yang's avatar
Michael Yang committed
980
981
982
983
984
985
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
986
		b: t.b,
Michael Yang's avatar
Michael Yang committed
987
988
989
990
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
// Sin wraps C.ggml_sin applied to t; the result keeps t's backend reference.
// ctx must be a *Context from this package.
func (t *Tensor) Sin(ctx ml.Context) ml.Tensor {
	out := C.ggml_sin(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: out}
}

// Cos wraps C.ggml_cos applied to t; the result keeps t's backend reference.
// ctx must be a *Context from this package.
func (t *Tensor) Cos(ctx ml.Context) ml.Tensor {
	out := C.ggml_cos(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: out}
}

Michael Yang's avatar
Michael Yang committed
1005
1006
func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
1007
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1008
1009
1010
1011
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
llama4  
Michael Yang committed
1012
1013
1014
1015
1016
1017
1018
// Sigmoid wraps C.ggml_sigmoid_inplace applied to t; the result keeps t's
// backend reference.
//
// NOTE(review): the _inplace ggml variant presumably reuses t's storage for
// the result — confirm callers do not read t afterwards.
func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
	out := C.ggml_sigmoid_inplace(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: out}
}

1019
func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
1020
1021
1022
1023
1024
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
1025
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1026
1027
1028
1029
1030
1031
1032
1033
		t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
1034
			b: t.b,
Michael Yang's avatar
Michael Yang committed
1035
1036
1037
1038
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
1039
			b: t.b,
Michael Yang's avatar
Michael Yang committed
1040
1041
1042
1043
1044
1045
1046
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
1047
			b: t.b,
Michael Yang's avatar
Michael Yang committed
1048
1049
1050
1051
1052
1053
1054
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
1055
			b: t.b,
Michael Yang's avatar
Michael Yang committed
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

const (
Patrick Devine's avatar
Patrick Devine committed
1067
1068
1069
1070
	ropeTypeNorm   C.int = 0
	ropeTypeNeox   C.int = 2
	ropeTypeMrope  C.int = 8
	ropeTypeVision C.int = 24
Michael Yang's avatar
Michael Yang committed
1071
1072
)

Patrick Devine's avatar
Patrick Devine committed
1073
func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
1074
	if ropeFactors == nil {
1075
		ropeFactors = &Tensor{b: t.b}
Michael Yang's avatar
Michael Yang committed
1076
1077
	}

Jesse Gross's avatar
Jesse Gross committed
1078
1079
1080
1081
1082
	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

Michael Yang's avatar
Michael Yang committed
1083
	return &Tensor{
1084
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1085
		t: C.ggml_rope_ext(
Jesse Gross's avatar
Jesse Gross committed
1086
			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
Michael Yang's avatar
Michael Yang committed
1087
			C.int(ropeDim),
Patrick Devine's avatar
Patrick Devine committed
1088
1089
			C.int(ropeType),
			131072, // YaRN n_ctx_train
Michael Yang's avatar
Michael Yang committed
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
			C.float(ropeBase),
			C.float(ropeScale),
			0.,  // YaRN ext_factor
			1.,  // YaRN attn_factor
			32., // YaRN beta_fast
			1.,  // YaRN beta_slow
		),
	}
}

1100
1101
1102
1103
1104
1105
1106
// IM2Col wraps C.ggml_im2col with t and t2 as its tensor arguments. s0/s1,
// p0/p1 and d0/d1 are forwarded as C ints in that order; the final two
// arguments are fixed to true and GGML_TYPE_F32.
func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	c := ctx.(*Context).ctx
	out := C.ggml_im2col(
		c, t.t, t2.(*Tensor).t,
		C.int(s0), C.int(s1),
		C.int(p0), C.int(p1),
		C.int(d0), C.int(d1),
		true, C.GGML_TYPE_F32,
	)
	return &Tensor{b: t.b, t: out}
}

Michael Yang's avatar
Michael Yang committed
1107
1108
func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
1109
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1110
1111
1112
1113
1114
1115
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
1116
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1117
1118
1119
1120
1121
1122
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
1123
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1124
1125
1126
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}
1127

Michael Yang's avatar
Michael Yang committed
1128
func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
1129
1130
	return &Tensor{
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1131
		t: C.ggml_pool_2d(ctx.(*Context).ctx, t.t, C.GGML_OP_POOL_AVG, C.int(k), C.int(k), C.int(s), C.int(s), C.float(p), C.float(p)),
Michael Yang's avatar
Michael Yang committed
1132
1133
1134
	}
}

Michael Yang's avatar
Michael Yang committed
1135
1136
1137
1138
func (t *Tensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
	var tt *C.struct_ggml_tensor
	switch len(strides) {
	case 0:
Michael Yang's avatar
Michael Yang committed
1139
		tt = C.ggml_set_1d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset))
Michael Yang's avatar
Michael Yang committed
1140
	case 1:
Michael Yang's avatar
Michael Yang committed
1141
		tt = C.ggml_set_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset), C.size_t(strides[0]))
Michael Yang's avatar
Michael Yang committed
1142
1143
1144
1145
1146
1147
1148
	default:
		panic("unsupported number of dimensions")
	}

	return &Tensor{b: t.b, t: tt}
}

1149
1150
1151
1152
1153
1154
// ScaledDotProductAttention computes attention with t as the query.
//
// mask is optional; when nil, a nil mask pointer is handed to ggml. The query
// and key are permuted with (0, 2, 1, 3) before use.
//
// When the backend has flashAttention enabled, value is permuted the same way
// and C.ggml_flash_attn_ext is called with precision forced to GGML_PREC_F32.
// Otherwise the result is built step by step: key x query in full precision,
// C.ggml_soft_max_ext with the mask and scale, value x that result, then a
// (0, 2, 1, 3) permute made contiguous.
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if t.b.flashAttention {
		value = value.Permute(ctx, 0, 2, 1, 3)

		kqv := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
		C.ggml_flash_attn_ext_set_prec(kqv, C.GGML_PREC_F32)
		return &Tensor{b: t.b, t: kqv}
	}

	kq := key.MulmatFullPrec(ctx, query)
	kq = &Tensor{
		b: t.b,
		t: C.ggml_soft_max_ext(ctx.(*Context).ctx, kq.(*Tensor).t, kqMask, C.float(scale), 0),
	}

	kqv := value.Mulmat(ctx, kq)
	return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
}
1175
1176
1177
1178
1179
1180
1181

// Duplicate wraps C.ggml_dup applied to t; the result keeps t's backend
// reference. ctx must be a *Context from this package.
func (t *Tensor) Duplicate(ctx ml.Context) ml.Tensor {
	out := C.ggml_dup(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: out}
}
Michael Yang's avatar
llama4  
Michael Yang committed
1182
1183
1184
1185
1186
1187
1188

// TopK wraps C.ggml_top_k applied to t with k converted to a C int; the
// result keeps t's backend reference. ctx must be a *Context from this
// package.
func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
	out := C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k))
	return &Tensor{b: t.b, t: out}
}