ggml.go 30.3 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
package ggml

3
4
5
6
7
8
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
Michael Yang's avatar
Michael Yang committed
9
10
11
import "C"

import (
12
	"context"
13
	"errors"
Michael Yang's avatar
Michael Yang committed
14
15
16
	"fmt"
	"io"
	"log/slog"
17
	"maps"
Michael Yang's avatar
Michael Yang committed
18
	"os"
19
	"runtime"
20
21
22
	"slices"
	"strconv"
	"strings"
23
	"sync/atomic"
24
	"unicode"
Michael Yang's avatar
Michael Yang committed
25
26
27
	"unsafe"

	"github.com/ollama/ollama/format"
28
29
	"github.com/ollama/ollama/fs"
	fsggml "github.com/ollama/ollama/fs/ggml"
30
	"github.com/ollama/ollama/logutil"
Michael Yang's avatar
Michael Yang committed
31
	"github.com/ollama/ollama/ml"
32
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
33
	"github.com/ollama/ollama/ml/nn/rope"
Michael Yang's avatar
Michael Yang committed
34
35
36
	"golang.org/x/sync/errgroup"
)

Michael Yang's avatar
Michael Yang committed
37
38
39
40
41
func devices() []*C.struct_ggml_backend_device {
	ggml.OnceLoad()
	ds := make([]*C.struct_ggml_backend_device, C.ggml_backend_dev_count())
	for i := range ds {
		ds[i] = C.ggml_backend_dev_get(C.size_t(i))
Michael Yang's avatar
Michael Yang committed
42
	}
Michael Yang's avatar
Michael Yang committed
43
44

	return ds
45
}
Michael Yang's avatar
Michael Yang committed
46
47

type Backend struct {
48
49
50
	// modelPath is the location of the model data
	modelPath string

51
52
	meta *fsggml.GGML

53
54
55
56
	// tensorLoadTargets maps from the name of the tensor in the file
	// to the name that is used by the model definition
	tensorLoadTargets map[string][]string

57
58
59
60
	sched         *C.struct_ggml_backend_sched
	schedBackends []*C.struct_ggml_backend
	schedBufts    []*C.struct_ggml_backend_buffer_type

61
	tensors map[string]*C.struct_ggml_tensor
Michael Yang's avatar
Michael Yang committed
62
63

	// input is the backend used for inputs
64
	input *C.struct_ggml_backend_buffer_type
Michael Yang's avatar
Michael Yang committed
65
66

	// layers is the backend used for repeating layers
67
	layers map[int]*C.struct_ggml_backend_buffer_type
68

69
	flashAttention bool
Michael Yang's avatar
Michael Yang committed
70
71
72

	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
	maxGraphNodes int
Michael Yang's avatar
Michael Yang committed
73
74
}

75
76
77
78
79
80
81
82
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
	r, err := os.Open(modelPath)
	if err != nil {
		return nil, err
	}
	defer r.Close()

	meta, err := fsggml.Decode(r, -1)
Michael Yang's avatar
Michael Yang committed
83
84
85
86
87
88
89
90
91
92
93
94
95
96
	if err != nil {
		return nil, err
	}

	slog.Info(
		"",
		"architecture", meta.KV().Architecture(),
		"file_type", meta.KV().FileType(),
		"name", meta.KV().String("general.name"),
		"description", meta.KV().String("general.description"),
		"num_tensors", len(meta.Tensors().Items()),
		"num_key_values", len(meta.KV()),
	)

97
	type deviceBufferType struct {
98
99
100
101
102
		d   *C.struct_ggml_backend_device
		bts []*C.struct_ggml_backend_buffer_type
	}

	var cpus, accels, gpus []*C.struct_ggml_backend_device
Michael Yang's avatar
Michael Yang committed
103
	for _, d := range devices() {
104
105
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
106
107
108
109
			if len(cpus) == 0 {
				// only the first cpu device should be used
				cpus = append(cpus, d)
			}
110
111
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
Michael Yang's avatar
Michael Yang committed
112
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
113
			gpus = append(gpus, d)
Michael Yang's avatar
Michael Yang committed
114
115
116
		}
	}

Michael Yang's avatar
Michael Yang committed
117
	// create list of buffer types for the cpu
Michael Yang's avatar
Michael Yang committed
118
	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
119
120
121
122
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
Michael Yang's avatar
Michael Yang committed
123
			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
Michael Yang's avatar
Michael Yang committed
124
		}
125
126
	}

Michael Yang's avatar
Michael Yang committed
127
	// create list of buffer types for each gpu
128
	var gpuDeviceBufferTypes []deviceBufferType
129
130
	for _, d := range gpus {
		bt := C.ggml_backend_dev_buffer_type(d)
131
		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
132
			d:   d,
Michael Yang's avatar
Michael Yang committed
133
			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
134
		})
Michael Yang's avatar
Michael Yang committed
135
136
	}

Michael Yang's avatar
Michael Yang committed
137
138
139
140
141
	useDefaultSplit := true
	for _, s := range params.TensorSplit {
		if s != 0 {
			useDefaultSplit = false
			break
142
		}
Michael Yang's avatar
Michael Yang committed
143
	}
144

Michael Yang's avatar
Michael Yang committed
145
146
147
148
	// calculate splits
	splits := make([]float32, len(gpus))
	if useDefaultSplit {
		// default: split on free memory
149
150
151
152
153
		for i := range splits {
			var free, total C.size_t
			C.ggml_backend_dev_memory(gpus[i], &free, &total)
			splits[i] = float32(free)
		}
Michael Yang's avatar
Michael Yang committed
154
155
	} else {
		splits = params.TensorSplit
156
157
158
	}

	var sum float32
Michael Yang's avatar
Michael Yang committed
159
	// cumulative sum of all splits
160
161
162
163
164
	for i := range splits {
		sum += splits[i]
		splits[i] = sum
	}

Michael Yang's avatar
Michael Yang committed
165
	// normalize splits
166
	for i := range splits {
167
		splits[i] /= sum
168
169
	}

Michael Yang's avatar
Michael Yang committed
170
	// inputs always use cpu
Michael Yang's avatar
Michael Yang committed
171
	input := cpuDeviceBufferType
172

173
	blocks := int(meta.KV().BlockCount())
Michael Yang's avatar
Michael Yang committed
174
175
176
177

	// define a range of gpu layers. anything outside of this range is assigned to the cpu
	gpuRangeStart := max(0, blocks-params.NumGPULayers)
	gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
Michael Yang's avatar
Michael Yang committed
178
	assignLayer := func(i int) deviceBufferType {
Michael Yang's avatar
Michael Yang committed
179
		if i < gpuRangeStart || i >= gpuRangeStop {
Michael Yang's avatar
Michael Yang committed
180
			return cpuDeviceBufferType
181
		}
182

Michael Yang's avatar
Michael Yang committed
183
		index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
184
		if index < 0 || index >= len(gpuDeviceBufferTypes) {
Michael Yang's avatar
Michael Yang committed
185
			return cpuDeviceBufferType
186
187
188
		}

		return gpuDeviceBufferTypes[index]
189
190
	}

Michael Yang's avatar
Michael Yang committed
191
	// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
192
	layers := make([]deviceBufferType, blocks)
193
	for i := range layers {
194
		layers[i] = assignLayer(i)
195
196
	}

Michael Yang's avatar
Michael Yang committed
197
	// outputs are assigned iff allowed by splits and configured number of gpu layers
198
	output := assignLayer(blocks)
199
200
201

	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
Michael Yang's avatar
Michael Yang committed
202
	// each layer has at most 2 extra tensors for rope operations
203
204
	maxTensors += blocks * 2

205
	type tensor struct {
206
		source *fsggml.Tensor
207
208
209
		target string
	}

Michael Yang's avatar
Michael Yang committed
210
	// some tensors are mapped to different names so keep a list
211
212
	targets := make(map[string][]string)

Michael Yang's avatar
Michael Yang committed
213
	// contexts are shared by tensors of the same buffer type
214
	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
215
	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
216
217
218
219
220
221
222
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}
Michael Yang's avatar
Michael Yang committed
223

224
225
226
227
228
229
230
231
			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
Michael Yang's avatar
Michael Yang committed
232
			defer C.free(unsafe.Pointer(cname))
233
234
235
236
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

237
			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
Michael Yang's avatar
Michael Yang committed
238
239
			C.ggml_set_name(tt, cname)

240
			slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
241
242
243
244
245
			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
Michael Yang's avatar
Michael Yang committed
246
247
	}

248
	contains := func(s string, parts ...string) bool {
249
250
251
252
253
254
255
256
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
Michael Yang's avatar
Michael Yang committed
257
258
	}

259
260
	for _, t := range meta.Tensors().Items() {
		switch {
261
		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
262
			createTensor(tensor{source: t}, input.bts)
Michael Yang's avatar
Michael Yang committed
263
264
265
			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
				createTensor(tensor{source: t, target: "output.weight"}, output.bts)
			}
266
		case contains(t.Name, "cls", "output", "output_norm"):
267
			createTensor(tensor{source: t}, output.bts)
268
		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
Michael Yang's avatar
Michael Yang committed
269
			// TODO: assign vision tensors to the gpu if possible
Michael Yang's avatar
Michael Yang committed
270
			createTensor(tensor{source: t}, output.bts)
Michael Yang's avatar
Michael Yang committed
271
272
273
274
275
276
277
278
		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
			// these tensors should be repeated per layer
			for i, layer := range layers {
				createTensor(tensor{
					source: t,
					target: "blk." + strconv.Itoa(i) + "." + t.Name,
				}, layer.bts)
			}
279
		default:
Michael Yang's avatar
Michael Yang committed
280
281
282
283
			layerIndex := -1
			if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
				if i, err := strconv.Atoi(fields[0]); err == nil {
					layerIndex = i
284
				}
Michael Yang's avatar
Michael Yang committed
285
			}
286

Michael Yang's avatar
Michael Yang committed
287
288
			if layerIndex >= 0 {
				createTensor(tensor{source: t}, layers[layerIndex].bts)
289
			} else {
Michael Yang's avatar
Michael Yang committed
290
291
				// load all other tensors on the cpu
				createTensor(tensor{source: t}, input.bts)
292
293
294
			}
		}
	}
Michael Yang's avatar
Michael Yang committed
295

Michael Yang's avatar
Michael Yang committed
296
297
	// allocate buffers for each context
	bbs := make(map[*C.struct_ggml_context]*C.struct_ggml_backend_buffer, len(ctxs))
298
299
300
301
302
303
	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
304
305
306
307
		if b == nil {
			return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt)))
		}

308
		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
Michael Yang's avatar
Michael Yang committed
309
		bbs[c] = b
310
311
312
	}

	for bs := range maps.Values(bbs) {
Michael Yang's avatar
Michael Yang committed
313
		slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
314
315
	}

Michael Yang's avatar
Michael Yang committed
316
	// map tensor names to tensors for easy lookup later
317
318
319
320
321
322
323
	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}

324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
	// map devices to backend buffer types so new tensors can be assigned to the correct device
	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)

	// create backends and buffer types used for the compute graph scheduler
	var schedBackends []*C.struct_ggml_backend
	var schedBufts []*C.struct_ggml_backend_buffer_type
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := C.ggml_backend_dev_init(d, nil)
		bt := C.ggml_backend_get_default_buffer_type(b)

		deviceBufferTypes[d] = bt

		schedBackends = append(schedBackends, b)
		schedBufts = append(schedBufts, bt)

		if C.ggml_backend_is_cpu(b) {
			// set number of threads for cpu backend
			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
		}
	}

	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
	return &Backend{
		modelPath:         modelPath,
		flashAttention:    params.FlashAttention,
		meta:              meta,
		tensorLoadTargets: targets,
		tensors:           tensors,
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
			C.int(len(schedBackends)),
			C.size_t(maxGraphNodes),
			C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
			C._Bool(false),
		),
		schedBackends: schedBackends,
		schedBufts:    schedBufts,
		input:         deviceBufferTypes[input.d],
		layers: func() map[int]*C.struct_ggml_backend_buffer_type {
			m := make(map[int]*C.struct_ggml_backend_buffer_type)
			for i, layer := range layers {
				m[i] = deviceBufferTypes[layer.d]
			}
			return m
		}(),
		maxGraphNodes: maxGraphNodes,
	}, nil
}

// init registers New with the ml package under the backend name "ggml".
func init() {
	ml.RegisterBackend("ggml", New)
}

func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
379
	var doneBytes atomic.Uint64
380
	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
381
382
383

	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
384
	for _, t := range b.meta.Tensors().Items() {
385
		t := t
386
		g.Go(func() error {
387
			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
388
			for i := range tts {
389
				target := b.tensorLoadTargets[t.Name][i]
390
391
392
				if target == "" {
					target = t.Name
				}
393

394
				tt, ok := b.tensors[target]
395
396
397
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}
Michael Yang's avatar
Michael Yang committed
398

399
400
401
				tts[i] = tt
			}

402
403
			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
			// seeking around within an FD shared between all goroutines.
404
			file, err := os.Open(b.modelPath)
405
			if err != nil {
406
				slog.Warn("file open error", "file", b.modelPath, "error", err)
407
408
409
				return err
			}
			defer file.Close()
410
			sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
411
412
413
414
			bts := make([]byte, 128*format.KibiByte)

			var s uint64
			for s < t.Size() {
415
416
417
418
419
				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
				if err := ctx.Err(); err != nil {
					return err
				}

420
421
				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
422
					slog.Warn("file read error", "file", b.modelPath, "error", err)
423
					return err
424
				}
Michael Yang's avatar
Michael Yang committed
425

426
427
				for _, tt := range tts {
					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
428
				}
Michael Yang's avatar
Michael Yang committed
429

430
431
				s += uint64(n)

432
				if progress != nil {
433
					done := doneBytes.Add(uint64(n))
434
					progress(float32(done) / float32(totalBytes))
435
436
437
438
439
				}
			}

			return nil
		})
Michael Yang's avatar
Michael Yang committed
440
441
	}

442
	if err := g.Wait(); err != nil {
443
		return err
444
445
	}

446
	return nil
Michael Yang's avatar
Michael Yang committed
447
448
}

449
func (b *Backend) Config() fs.Config {
Michael Yang's avatar
Michael Yang committed
450
451
452
453
	return b.meta.KV()
}

func (b *Backend) Get(name string) ml.Tensor {
454
455
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
Michael Yang's avatar
Michael Yang committed
456
457
458
459
460
461
	}

	return nil
}

func (b *Backend) NewContext() ml.Context {
Michael Yang's avatar
Michael Yang committed
462
	return b.NewContextSize(b.maxGraphNodes)
463
464
465
}

func (b *Backend) NewContextSize(n int) ml.Context {
Jesse Gross's avatar
Jesse Gross committed
466
467
468
469
	if n > b.maxGraphNodes {
		panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
	}

470
471
	var allocatedBuffers []*C.struct_ggml_backend_buffer

Michael Yang's avatar
Michael Yang committed
472
	return &Context{
473
474
		b:             b,
		maxGraphNodes: n,
475
		ctx: C.ggml_init(C.struct_ggml_init_params{
476
			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
477
478
			no_alloc: true,
		}),
479
		allocatedBuffers: &allocatedBuffers,
Michael Yang's avatar
Michael Yang committed
480
481
482
	}
}

483
// CacheConfig returns the cache settings for this backend. With flash
// attention enabled it requests a cache padding of 256, an F16 mask and a mask
// batch padding of GGML_KQ_MASK_PAD; otherwise a cache padding of 32 and a
// permuted V.
func (b *Backend) CacheConfig() ml.CacheConfig {
	if !b.flashAttention {
		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
	}

	return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
}

Michael Yang's avatar
Michael Yang committed
491
type Context struct {
492
	b *Backend
Michael Yang's avatar
Michael Yang committed
493

494
	ctx   *C.struct_ggml_context
Michael Yang's avatar
Michael Yang committed
495
	graph *C.struct_ggml_cgraph
496

497
498
	// buft is the buffer type used for new tensors
	buft *C.struct_ggml_backend_buffer_type
499

500
501
502
503
	// allocatedBuffers are buffers for tensors that we have allocated in this context
	// so that we can free them when we close the context
	allocatedBuffers *[]*C.struct_ggml_backend_buffer

Michael Yang's avatar
Michael Yang committed
504
	// maxGraphNodes is the maximum allowed number of graph nodes in this context
505
	maxGraphNodes int
Michael Yang's avatar
Michael Yang committed
506
507
}

508
func (c *Context) Input() ml.Context {
Michael Yang's avatar
Michael Yang committed
509
	if c.b.input != nil {
510
		return &Context{
511
512
513
514
515
			b:                c.b,
			ctx:              c.ctx,
			buft:             c.b.input,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
516
517
518
		}
	}

519
	return c
520
521
}

522
func (c *Context) Layer(i int) ml.Context {
523
	if buft, ok := c.b.layers[i]; ok {
524
		return &Context{
525
526
527
528
529
			b:                c.b,
			ctx:              c.ctx,
			buft:             buft,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
530
531
532
		}
	}

533
	return c
534
535
}

536
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
Michael Yang's avatar
Michael Yang committed
537
	if c.graph == nil {
538
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
Michael Yang's avatar
Michael Yang committed
539
540
	}

541
542
543
544
545
	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
Michael Yang's avatar
Michael Yang committed
546
547
}

548
func (c *Context) Compute(tensors ...ml.Tensor) {
549
	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
Michael Yang's avatar
Michael Yang committed
550
	C.ggml_backend_sched_reset(c.b.sched)
Michael Yang's avatar
Michael Yang committed
551

552
553
554
	needSync := true
	sync := func() {
		if needSync {
555
			C.ggml_backend_sched_synchronize(c.b.sched)
556
557
558
			needSync = false
		}
	}
Michael Yang's avatar
Michael Yang committed
559

560
561
562
	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
563
564
		}
	}
Michael Yang's avatar
Michael Yang committed
565
566
}

567
// Reserve asks the scheduler to reserve buffers for the context's graph and
// logs the resulting buffer size for each scheduler backend. The scheduler is
// reset before returning, whether or not the reservation succeeded.
func (c *Context) Reserve() error {
	sched := c.b.sched

	reserved := C.ggml_backend_sched_reserve(sched, c.graph)
	// the scheduler is reset on both the failure and the success path
	defer C.ggml_backend_sched_reset(sched)

	if !reserved {
		return errors.New("failed to reserve graph")
	}

	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(sched))
	for i, backend := range c.b.schedBackends {
		bufferSize := C.ggml_backend_sched_get_buffer_size(sched, backend)
		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(backend)), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
			"size", format.HumanBytes2(uint64(bufferSize)))
	}

	return nil
}

585
func (c *Context) MaxGraphNodes() int {
586
	return c.maxGraphNodes
Jesse Gross's avatar
Jesse Gross committed
587
588
}

589
590
591
// shapeToGGML converts shape into a freshly allocated array of C int64 values
// and returns a pointer to its first element, in the form expected by
// ggml_new_tensor. An empty shape yields a pointer that must not be
// dereferenced instead of panicking.
func shapeToGGML(shape []int) *C.int64_t {
	sh := make([]C.int64_t, len(shape))
	for i, s := range shape {
		sh[i] = C.int64_t(s)
	}

	// unsafe.SliceData, unlike &sh[0], is safe to call on an empty slice
	return unsafe.SliceData(sh)
}

598
599
600
601
// pad rounds length up to the next multiple of pad.
func pad(length, pad C.size_t) C.size_t {
	chunks := (length + pad - 1) / pad
	return chunks * pad
}

602
func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
603
	if c.buft == nil {
604
		panic("set Input or Layer before creating tensors")
605
606
	}

Michael Yang's avatar
Michael Yang committed
607
608
609
610
611
612
	var cdtype uint32
	switch dtype {
	case ml.DTypeF32:
		cdtype = C.GGML_TYPE_F32
	case ml.DTypeF16:
		cdtype = C.GGML_TYPE_F16
613
614
615
616
	case ml.DTypeQ80:
		cdtype = C.GGML_TYPE_Q8_0
	case ml.DTypeQ40:
		cdtype = C.GGML_TYPE_Q4_0
Michael Yang's avatar
Michael Yang committed
617
618
619
620
621
622
	case ml.DTypeI32:
		cdtype = C.GGML_TYPE_I32
	default:
		panic("unsupported dtype")
	}

Jesse Gross's avatar
Jesse Gross committed
623
	if len(shape) < 1 || shape[0] == 0 {
Michael Yang's avatar
Michael Yang committed
624
		var shape C.int64_t = 0
625
		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
Michael Yang's avatar
Michael Yang committed
626
	} else if len(shape) > 4 {
Michael Yang's avatar
Michael Yang committed
627
628
629
630
631
632
633
634
635
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

Michael Yang's avatar
Michael Yang committed
636
	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
637
638
	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
639
640
641
	if b == nil {
		return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
	}
642
	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
643

Michael Yang's avatar
Michael Yang committed
644
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
645
	return &Tensor{b: c.b, t: t}, nil
646
647
}

648
// Empty creates a tensor of the given dtype and shape without writing any
// data to it. It panics if the tensor cannot be created.
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	created, err := c.newTensor(dtype, shape)
	if err == nil {
		return created
	}

	panic(err)
}

657
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
658
659
660
661
662
	t, err := c.newTensor(dtype, shape)
	if err != nil {
		panic(err)
	}

663
664
	C.ggml_set_zero(t.(*Tensor).t)
	return t
Michael Yang's avatar
Michael Yang committed
665
666
}

667
// checkShape verifies that the elements of s exactly fill a tensor of the
// given shape, i.e. that the product of all dimensions equals len(s).
//
// An empty s is always accepted. Otherwise every dimension must be at least 1
// and the dimensions must multiply to len(s); an empty shape therefore only
// matches a single element. A mismatch is reported as an error; checkShape
// never panics.
func checkShape[S ~[]E, E any](s S, shape ...int) error {
	n := len(s)
	if n == 0 {
		return nil
	}

	elems := 1
	for _, v := range shape {
		// Non-positive dimensions are invalid, and elems > n/v means
		// elems*v would exceed n; testing it this way cannot overflow.
		if v < 1 || elems > n/v {
			return fmt.Errorf("invalid shape: %v", shape)
		}

		elems *= v
	}

	if elems != n {
		return fmt.Errorf("invalid shape: %v", shape)
	}

	return nil
}

685
func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
Jesse Gross's avatar
Jesse Gross committed
686
	if err := checkShape(s, shape...); err != nil {
687
688
689
		return nil, err
	}

690
691
692
693
694
	t, err := c.newTensor(ml.DTypeF32, shape)
	if err != nil {
		return nil, err
	}

Jesse Gross's avatar
Jesse Gross committed
695
696
697
698
	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

699
	return t, nil
Michael Yang's avatar
Michael Yang committed
700
701
}

702
func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
Jesse Gross's avatar
Jesse Gross committed
703
	if err := checkShape(s, shape...); err != nil {
704
705
706
		return nil, err
	}

707
708
709
710
711
	t, err := c.newTensor(ml.DTypeI32, shape)
	if err != nil {
		return nil, err
	}

Jesse Gross's avatar
Jesse Gross committed
712
713
714
715
	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

716
	return t, nil
Michael Yang's avatar
Michael Yang committed
717
718
}

Michael Yang's avatar
arange  
Michael Yang committed
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
// Arange returns a one-dimensional tensor holding start, start+step, ... for
// all values less than stop.
//
// For ml.DTypeF32 the tensor is produced by ggml_arange. For ml.DTypeI32 the
// values are generated in Go (each truncated to int32) and uploaded through
// the input context. Arange panics on any other dtype, on a non-positive step
// with stop > start (which could never terminate), or when the I32 tensor
// cannot be created.
func (c *Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
	switch dtype {
	case ml.DTypeF32:
		// ggml_arange creates a float32 tensor
		return &Tensor{
			b: c.b,
			t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
		}
	case ml.DTypeI32:
		// ggml_cast does not support float32 to int32 conversion
		var arange []int32
		if stop > start {
			if step <= 0 {
				panic("arange step must be positive")
			}

			arange = make([]int32, 0, int((stop-start)/step))
		}

		for i := start; i < stop; i += step {
			arange = append(arange, int32(i))
		}

		t, err := c.Input().FromIntSlice(arange, len(arange))
		if err != nil {
			panic(err)
		}

		return t
	default:
		panic("unsupported dtype for arange")
	}
}

Michael Yang's avatar
Michael Yang committed
745
746
func (c *Context) Close() {
	if c != nil {
747
748
749
750
751
		for _, b := range *c.allocatedBuffers {
			C.ggml_backend_buffer_free(b)
		}
		*c.allocatedBuffers = nil

752
753
		C.ggml_free(c.ctx)
	}
Michael Yang's avatar
Michael Yang committed
754
755
756
}

type Tensor struct {
757
	b    *Backend
Michael Yang's avatar
Michael Yang committed
758
	t    *C.struct_ggml_tensor
759
	sync func()
Michael Yang's avatar
Michael Yang committed
760
761
762
763
764
765
766
767
768
769
}

// LogValue renders the tensor for structured logging as a group holding its
// name, ggml type name and shape.
func (t *Tensor) LogValue() slog.Value {
	name := C.GoString(C.ggml_get_name(t.t))
	typeName := C.GoString(C.ggml_type_name(t.t._type))

	return slog.GroupValue(
		slog.String("name", name),
		slog.String("type", typeName),
		slog.Any("shape", t.Shape()),
	)
}

770
771
func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
Michael Yang's avatar
Michael Yang committed
772
773
}

774
775
func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
Michael Yang's avatar
Michael Yang committed
776
777
}

778
779
func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
Michael Yang's avatar
Michael Yang committed
780
781
782
783
784
785
786
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

787
788
789
790
791
792
793
794
795
func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
Michael Yang's avatar
Michael Yang committed
796
797
}

798
799
800
801
802
803
func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
Michael Yang's avatar
Michael Yang committed
804
805
806
807
808
809
810
811
812
	}

	return
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
Jesse Gross's avatar
Jesse Gross committed
813
814
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
815
816
817
818
	case C.GGML_TYPE_Q8_0:
		return ml.DTypeQ80
	case C.GGML_TYPE_Q4_0:
		return ml.DTypeQ40
Michael Yang's avatar
Michael Yang committed
819
820
821
822
823
824
825
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	default:
		return ml.DTypeOther
	}
}

826
827
828
829
830
831
832
// Neg adds a ggml_neg operation on t to ctx and returns the resulting tensor.
func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
	gctx := ctx.(*Context).ctx
	negated := C.ggml_neg(gctx, t.t)

	return &Tensor{b: t.b, t: negated}
}

Michael Yang's avatar
Michael Yang committed
833
834
func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
835
		b: t.b,
Michael Yang's avatar
Michael Yang committed
836
837
838
839
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
// Repeat returns t repeated n times along dimension dim. It builds a template
// tensor with t's shape, except that dimension dim is multiplied by n, and
// passes it to ggml_repeat. It panics when dim is outside [0, GGML_MAX_DIMS).
func (t *Tensor) Repeat(ctx ml.Context, dim, n int) ml.Tensor {
	if dim < 0 || dim >= C.GGML_MAX_DIMS {
		panic("invalid dimension")
	}

	// start from t's own extents, then scale the repeated dimension
	target := make([]C.int64_t, C.GGML_MAX_DIMS)
	for axis := range target {
		target[axis] = C.int64_t(t.Dim(axis))
	}
	target[dim] = C.int64_t(t.Dim(dim) * n)

	gctx := ctx.(*Context).ctx
	template := C.ggml_new_tensor(gctx, t.t._type, C.int(len(target)), unsafe.SliceData(target))

	return &Tensor{b: t.b, t: C.ggml_repeat(ctx.(*Context).ctx, t.t, template)}
}

Michael Yang's avatar
Michael Yang committed
861
862
863
864
865
866
867
868
869
870
// Stack concatenates t with the tensors in s along dim. The tensors in s are
// combined recursively first, and the result is concatenated onto t. With no
// extra tensors, t is returned unchanged.
func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) == 0 {
		return t
	}

	head, rest := s[0], s[1:]
	tail := head.Stack(ctx, dim, rest...)

	return t.Concat(ctx, tail, dim)
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
871
		b: t.b,
Michael Yang's avatar
Michael Yang committed
872
873
874
875
876
877
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
	return &Tensor{
878
		b: t.b,
Michael Yang's avatar
Michael Yang committed
879
880
881
882
883
884
		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
885
		b: t.b,
Michael Yang's avatar
Michael Yang committed
886
887
888
889
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

890
891
892
893
894
895
896
// Div adds a ggml_div operation of t and t2 to ctx and returns the resulting
// tensor.
func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	gctx := ctx.(*Context).ctx
	rhs := t2.(*Tensor).t

	return &Tensor{b: t.b, t: C.ggml_div(gctx, t.t, rhs)}
}

Michael Yang's avatar
Michael Yang committed
897
898
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
899
		b: t.b,
Michael Yang's avatar
Michael Yang committed
900
901
902
903
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

904
905
906
907
908
func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
909
		b: t.b,
910
911
912
913
		t: mul,
	}
}

Michael Yang's avatar
llama4  
Michael Yang committed
914
915
916
917
918
919
920
// MulmatID calls ggml_mul_mat_id with t, t2 and ids (in that order) within ctx
// and wraps the resulting tensor, keeping t's backend.
func (t *Tensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
	product := C.ggml_mul_mat_id(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, ids.(*Tensor).t)
	return &Tensor{b: t.b, t: product}
}

Michael Yang's avatar
Michael Yang committed
921
func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
Michael Yang's avatar
llama4  
Michael Yang committed
922
923
924
925
926
927
	tt := C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
		if b != nil {
			tt = C.ggml_add(ctx.(*Context).ctx, tt, b.(*Tensor).t)
		}
Michael Yang's avatar
Michael Yang committed
928
929
	}

Michael Yang's avatar
llama4  
Michael Yang committed
930
	return &Tensor{b: t.b, t: tt}
Michael Yang's avatar
Michael Yang committed
931
932
933
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
Michael Yang's avatar
llama4  
Michael Yang committed
934
935
936
937
938
939
	tt := C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
	}

	return &Tensor{b: t.b, t: tt}
Michael Yang's avatar
Michael Yang committed
940
941
}

942
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
943
944
	if len(shape) != 4 {
		panic("expected 4 dimensions")
945
946
	} else if shape[3] != 0 {
		panic("cuda does not support 4d tensors")
Michael Yang's avatar
Michael Yang committed
947
948
949
	}

	return &Tensor{
950
		b: t.b,
Michael Yang's avatar
Michael Yang committed
951
952
953
954
955
956
957
958
959
960
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
961
		b: t.b,
Michael Yang's avatar
Michael Yang committed
962
963
964
965
966
967
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
968
		b: t.b,
Michael Yang's avatar
Michael Yang committed
969
970
971
972
973
974
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
975
		b: t.b,
Michael Yang's avatar
Michael Yang committed
976
977
978
979
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

980
func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
981
982
983
	switch len(shape) {
	case 1:
		return &Tensor{
984
			b: t.b,
Michael Yang's avatar
Michael Yang committed
985
986
987
988
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
989
			b: t.b,
Michael Yang's avatar
Michael Yang committed
990
991
992
993
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
994
			b: t.b,
Michael Yang's avatar
Michael Yang committed
995
996
997
998
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
999
			b: t.b,
Michael Yang's avatar
Michael Yang committed
1000
1001
1002
1003
1004
1005
1006
1007
1008
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
1009
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1010
1011
1012
1013
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

1014
1015
1016
1017
1018
1019
1020
// SumRows calls ggml_sum_rows on t within ctx and wraps the resulting tensor,
// keeping t's backend.
func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
	sums := C.ggml_sum_rows(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: sums}
}

Michael Yang's avatar
Michael Yang committed
1021
1022
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
1023
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1024
1025
1026
1027
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
// Sin calls ggml_sin on t within ctx and wraps the resulting tensor, keeping
// t's backend.
func (t *Tensor) Sin(ctx ml.Context) ml.Tensor {
	out := C.ggml_sin(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: out}
}

// Cos calls ggml_cos on t within ctx and wraps the resulting tensor, keeping
// t's backend.
func (t *Tensor) Cos(ctx ml.Context) ml.Tensor {
	out := C.ggml_cos(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: out}
}

Michael Yang's avatar
Michael Yang committed
1042
1043
func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
1044
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1045
1046
1047
1048
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
llama4  
Michael Yang committed
1049
1050
1051
1052
1053
1054
1055
// Sigmoid calls ggml_sigmoid_inplace on t within ctx and wraps the resulting
// tensor, keeping t's backend.
//
// NOTE(review): this uses ggml's _inplace variant, so the result presumably
// aliases t's data — confirm callers do not rely on t being left unchanged.
func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
	out := C.ggml_sigmoid_inplace(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: out}
}

Michael Yang's avatar
Michael Yang committed
1056
1057
1058
1059
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
1060
			b: t.b,
Michael Yang's avatar
Michael Yang committed
1061
1062
1063
1064
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
1065
			b: t.b,
Michael Yang's avatar
Michael Yang committed
1066
1067
1068
1069
1070
1071
1072
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
1073
			b: t.b,
Michael Yang's avatar
Michael Yang committed
1074
1075
1076
1077
1078
1079
1080
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
1081
			b: t.b,
Michael Yang's avatar
Michael Yang committed
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

1092
func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
1093
	// Default options
1094
	opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}
1095
1096
1097
1098
1099
1100

	// Apply any provided options
	for _, option := range options {
		option(opts)
	}

Jesse Gross's avatar
Jesse Gross committed
1101
1102
1103
1104
1105
	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

Michael Yang's avatar
Michael Yang committed
1106
	return &Tensor{
1107
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1108
		t: C.ggml_rope_ext(
1109
1110
			ctx.(*Context).ctx,
			dequant,
1111
1112
			positions.(*Tensor).t,
			opts.Factors.(*Tensor).t,
Michael Yang's avatar
Michael Yang committed
1113
			C.int(ropeDim),
1114
1115
			C.int(opts.Type),
			C.int(opts.OriginalContextLength),
Michael Yang's avatar
Michael Yang committed
1116
1117
			C.float(ropeBase),
			C.float(ropeScale),
1118
1119
1120
1121
			C.float(0.0),
			C.float(1.0),
			C.float(32.0),
			C.float(1.0),
Michael Yang's avatar
Michael Yang committed
1122
1123
1124
1125
		),
	}
}

1126
1127
1128
1129
1130
1131
1132
// IM2Col calls ggml_im2col with t and t2, forwarding s0, s1, p0, p1, d0 and d1
// as C ints. The final two arguments are fixed to true and GGML_TYPE_F32.
func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	cols := C.ggml_im2col(
		ctx.(*Context).ctx, t.t, t2.(*Tensor).t,
		C.int(s0), C.int(s1),
		C.int(p0), C.int(p1),
		C.int(d0), C.int(d1),
		true, C.GGML_TYPE_F32,
	)
	return &Tensor{b: t.b, t: cols}
}

Michael Yang's avatar
Michael Yang committed
1133
1134
func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
1135
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1136
1137
1138
1139
1140
1141
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
1142
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1143
1144
1145
1146
1147
1148
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
1149
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1150
1151
1152
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}
1153

Michael Yang's avatar
Michael Yang committed
1154
func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
1155
1156
	return &Tensor{
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1157
		t: C.ggml_pool_2d(ctx.(*Context).ctx, t.t, C.GGML_OP_POOL_AVG, C.int(k), C.int(k), C.int(s), C.int(s), C.float(p), C.float(p)),
Michael Yang's avatar
Michael Yang committed
1158
1159
1160
	}
}

Michael Yang's avatar
Michael Yang committed
1161
1162
1163
1164
func (t *Tensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
	var tt *C.struct_ggml_tensor
	switch len(strides) {
	case 0:
Michael Yang's avatar
Michael Yang committed
1165
		tt = C.ggml_set_1d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset))
Michael Yang's avatar
Michael Yang committed
1166
	case 1:
Michael Yang's avatar
Michael Yang committed
1167
		tt = C.ggml_set_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset), C.size_t(strides[0]))
Michael Yang's avatar
Michael Yang committed
1168
1169
1170
1171
1172
1173
1174
	default:
		panic("unsupported number of dimensions")
	}

	return &Tensor{b: t.b, t: tt}
}

1175
1176
1177
1178
1179
1180
// ScaledDotProductAttention computes attention with t as the query against
// key and value, using the optional mask and the given scale.
//
// The query and key are permuted with (0, 2, 1, 3) first. If the backend has
// flashAttention enabled, value is permuted the same way and the result comes
// from ggml_flash_attn_ext with precision set to GGML_PREC_F32. Otherwise the
// result is value.Mulmat of ggml_soft_max_ext applied to
// key.MulmatFullPrec(query), permuted with (0, 2, 1, 3) and made contiguous.
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.Tensor, scale float64) ml.Tensor {
	// A nil mask is forwarded to ggml as a nil tensor pointer.
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if !t.b.flashAttention {
		scores := key.MulmatFullPrec(ctx, query)
		probs := &Tensor{
			b: t.b,
			t: C.ggml_soft_max_ext(ctx.(*Context).ctx, scores.(*Tensor).t, kqMask, C.float(scale), 0),
		}

		attended := value.Mulmat(ctx, probs)
		return attended.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}

	value = value.Permute(ctx, 0, 2, 1, 3)

	attended := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
	C.ggml_flash_attn_ext_set_prec(attended, C.GGML_PREC_F32)
	return &Tensor{b: t.b, t: attended}
}
1201
1202
1203
1204
1205
1206
1207

// Duplicate calls ggml_dup on t within ctx and wraps the resulting tensor,
// keeping t's backend.
func (t *Tensor) Duplicate(ctx ml.Context) ml.Tensor {
	dup := C.ggml_dup(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: dup}
}
Michael Yang's avatar
llama4  
Michael Yang committed
1208
1209
1210
1211
1212
1213
1214

// TopK calls ggml_top_k on t with k converted to a C int and wraps the
// resulting tensor, keeping t's backend.
func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
	top := C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k))
	return &Tensor{b: t.b, t: top}
}
1215
1216
1217
1218
1219
1220
1221

// Argsort calls ggml_argsort on t with GGML_SORT_ORDER_ASC and wraps the
// resulting tensor, keeping t's backend.
func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
	order := C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC)
	return &Tensor{b: t.b, t: order}
}