ggml.go 40 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
package ggml

3
4
5
6
7
8
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
Michael Yang's avatar
Michael Yang committed
9
10
11
import "C"

import (
12
	"context"
Jesse Gross's avatar
Jesse Gross committed
13
	"errors"
Michael Yang's avatar
Michael Yang committed
14
15
16
	"fmt"
	"io"
	"log/slog"
17
	"maps"
Michael Yang's avatar
Michael Yang committed
18
	"os"
19
	"runtime"
20
21
22
	"slices"
	"strconv"
	"strings"
Jesse Gross's avatar
Jesse Gross committed
23
	"sync"
24
	"sync/atomic"
25
	"unicode"
Michael Yang's avatar
Michael Yang committed
26
27
28
	"unsafe"

	"github.com/ollama/ollama/format"
29
30
	"github.com/ollama/ollama/fs"
	fsggml "github.com/ollama/ollama/fs/ggml"
31
	"github.com/ollama/ollama/logutil"
Michael Yang's avatar
Michael Yang committed
32
	"github.com/ollama/ollama/ml"
33
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
34
	"github.com/ollama/ollama/ml/nn/rope"
Michael Yang's avatar
Michael Yang committed
35
36
37
	"golang.org/x/sync/errgroup"
)

Jesse Gross's avatar
Jesse Gross committed
38
39
40
41
42
43
// Device handles discovered by initDevices, partitioned by device type,
// plus one initialized ggml backend per device. Populated exactly once;
// read-only afterwards.
var (
	cpus, accels, gpus []C.ggml_backend_dev_t
	backends           map[C.ggml_backend_dev_t]C.ggml_backend_t
)

// initDevices enumerates every ggml backend device exactly once, buckets the
// devices into cpus, accels, and gpus, and initializes a backend for each.
// Must be called before the package-level device slices are used.
var initDevices = sync.OnceFunc(func() {
	// load the native ggml libraries (itself a once-only operation)
	ggml.OnceLoad()

	backends = make(map[C.ggml_backend_dev_t]C.ggml_backend_t)
	for i := range C.ggml_backend_dev_count() {
		d := C.ggml_backend_dev_get(i)

		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
			if len(cpus) == 0 {
				// only the first cpu device should be used
				cpus = append(cpus, d)
			}
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
			gpus = append(gpus, d)
		}

		// initialize a backend even for the skipped extra cpu devices so the
		// map covers every enumerated device
		backends[d] = C.ggml_backend_dev_init(d, nil)
	}
})
Michael Yang's avatar
Michael Yang committed
65

Jesse Gross's avatar
Jesse Gross committed
66
67
68
69
70
// layerDevice pairs a device with the buffer type used to allocate tensors
// for a model layer placed on that device.
type layerDevice struct {
	d  C.ggml_backend_dev_t
	bt C.ggml_backend_buffer_type_t
}

Michael Yang's avatar
Michael Yang committed
71
// Backend is an ml.Backend implemented on top of the ggml C library. It owns
// the compute scheduler, the loaded weight tensors, and the per-device memory
// accounting for a single model.
type Backend struct {
	// modelPath is the location of the model data
	modelPath string

	// meta is the decoded GGUF metadata of the model file
	meta *fsggml.GGML

	// allocMemory means that memory should be allocated for tensors and not
	// just a dry run
	allocMemory bool

	// tensorLoadTargets maps from the name of the tensor in the file
	// to the name that is used by the model definition
	tensorLoadTargets map[string][]string

	schedMu       sync.Mutex // Only one Compute can run at a time
	sched         C.ggml_backend_sched_t
	schedBackends []C.ggml_backend_t
	schedBufts    []C.ggml_backend_buffer_type_t

	// tensors maps tensor names to the weight tensors created by New
	tensors map[string]*C.struct_ggml_tensor

	// input is the backend buffer type used for inputs
	input C.ggml_backend_buffer_type_t

	// output is the backend device used for outputs
	output C.ggml_backend_dev_t

	// layers is the backend used for repeating layers
	layers map[int]layerDevice

	// requiredMemory is the cumulative memory allocations needed by the backend
	requiredMemory *ml.BackendMemory

	// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
	btDeviceMemory map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory

	// flashAttention selects the flash-attention cache configuration
	flashAttention bool

	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
	maxGraphNodes int

	// weightBuffers are the GGML contexts and buffers for allocating weights
	weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
}

Jesse Gross's avatar
Jesse Gross committed
116
117
// once guards the one-time model summary log emitted by New.
var once sync.Once

118
119
120
121
122
123
124
125
// New decodes the GGUF file at modelPath, assigns each tensor to a device
// (CPU, accelerator, or GPU) according to params.GPULayers, creates the ggml
// graph scheduler, and allocates (or, when params.AllocMemory is false, only
// measures) the weight buffers. The returned backend still requires Load to
// read the actual tensor data from disk.
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
	r, err := os.Open(modelPath)
	if err != nil {
		return nil, err
	}
	defer r.Close()

	meta, err := fsggml.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	// log a model summary only on the first successful decode in this process
	once.Do(func() {
		slog.Info(
			"",
			"architecture", meta.KV().Architecture(),
			"file_type", meta.KV().FileType(),
			"name", meta.KV().String("general.name"),
			"description", meta.KV().String("general.description"),
			"num_tensors", len(meta.Tensors().Items()),
			"num_key_values", len(meta.KV()),
		)
	})

	initDevices()

	var requiredMemory ml.BackendMemory
	btDeviceMemory := make(map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory)

	// deviceBufferType is a device plus its ordered list of candidate buffer
	// types (preferred first, CPU fallbacks last)
	type deviceBufferType struct {
		d   C.ggml_backend_dev_t
		bts []C.ggml_backend_buffer_type_t
	}

	blocks := int(meta.KV().BlockCount())

	// create list of buffer types for the cpu
	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			bt := C.ggml_backend_dev_buffer_type(d)
			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, bt)

			// accelerator allocations are accounted against the CPU
			btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
		}
	}

	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
	var props C.struct_ggml_backend_dev_props
	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
	requiredMemory.CPU.ID = C.GoString(props.id)
	// one slot per repeating block plus one for the output layer
	requiredMemory.CPU.Weights = make([]uint64, blocks+1)
	requiredMemory.CPU.Cache = make([]uint64, blocks+1)

	// create list of buffer types for each gpu
	var gpuDeviceBufferTypes []deviceBufferType
	requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
	for i, d := range gpus {
		bt := C.ggml_backend_dev_buffer_type(d)
		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
			d: d,
			// the gpu's own buffer type first, then the cpu types as fallback
			bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
		})

		btDeviceMemory[bt] = &requiredMemory.GPUs[i]
		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
		var props C.struct_ggml_backend_dev_props
		C.ggml_backend_dev_get_props(d, &props)
		requiredMemory.GPUs[i].ID = C.GoString(props.id)
		requiredMemory.GPUs[i].Weights = make([]uint64, blocks+1)
		requiredMemory.GPUs[i].Cache = make([]uint64, blocks+1)
	}

	// inputs always use cpu
	input := cpuDeviceBufferType

	// assignLayer resolves which device a given layer index should live on:
	// the GPU named by params.GPULayers if it exists, otherwise the cpu
	assignLayer := func(layer int) deviceBufferType {
		for _, p := range params.GPULayers {
			for _, l := range p.Layers {
				if l == layer {
					for i := range requiredMemory.GPUs {
						if requiredMemory.GPUs[i].ID == p.ID {
							return gpuDeviceBufferTypes[i]
						}
					}

					return cpuDeviceBufferType
				}
			}
		}

		return cpuDeviceBufferType
	}

	// assign each repeating layer to a device according to params.GPULayers
	layers := make([]deviceBufferType, blocks)
	for i := range layers {
		layers[i] = assignLayer(i)
	}

	// the output layer uses the pseudo-index `blocks`
	output := assignLayer(blocks)

	maxTensors := len(meta.Tensors().Items())
	// one extra for a possible duplicated output.weight (see token_embd below)
	maxTensors += 1
	// each layer has at most 2 extra tensors for rope operations
	maxTensors += blocks * 2

	// tensor pairs a source tensor from the file with an optional renamed
	// target used by the model definition
	type tensor struct {
		source *fsggml.Tensor
		target string
	}

	// some tensors are mapped to different names so keep a list
	targets := make(map[string][]string)

	// contexts are shared by tensors of the same buffer type
	ctxs := make(map[C.ggml_backend_buffer_type_t]*C.struct_ggml_context)
	// createTensor materializes t in the first buffer type of bts, creating
	// the shared context on demand and recording the padded allocation size
	// against the given layer (-1 means input weights)
	createTensor := func(t tensor, bts []C.ggml_backend_buffer_type_t, layer int) *C.struct_ggml_tensor {
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}

			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
			defer C.free(unsafe.Pointer(cname))
			// already created under this buffer type; reuse it
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

			kind := t.source.Kind
			if t.source.Kind == 4 {
				// transform raw mxfp4 stream to ggml mxfp4 format
				kind = 39
			} else if t.source.Kind == uint32(fsggml.TensorTypeBF16) && strings.HasSuffix(t.source.Name, "_exps.bias") {
				// transform "_exps.bias" from bf16 to fp32; add_ids only supports fp32 tensors
				kind = uint32(fsggml.TensorTypeF32)
			}

			tt := C.ggml_new_tensor(ctxs[bt], kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
			C.ggml_set_name(tt, cname)

			logutil.Trace("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))

			size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
			if layer == -1 {
				requiredMemory.InputWeights += uint64(size)
			} else {
				btDeviceMemory[bt].Weights[layer] += uint64(size)
			}

			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
	}

	// contains reports whether any of parts appears as a dot-separated
	// component of s
	contains := func(s string, parts ...string) bool {
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
	}

	for _, t := range meta.Tensors().Items() {
		switch {
		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
			createTensor(tensor{source: t}, input.bts, -1)
			// models without a dedicated output tensor reuse the token
			// embedding as the output projection
			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
				createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
			}
		case contains(t.Name, "cls", "output", "output_norm",
			"altup_proj", "altup_unembd_proj",
			"per_layer_token_embd", "per_layer_model_proj", "per_layer_proj_norm"):
			createTensor(tensor{source: t}, output.bts, blocks)
		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
			// TODO: assign vision tensors to the gpu if possible
			createTensor(tensor{source: t}, output.bts, blocks)
		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
			// these tensors should be repeated per layer
			for i, layer := range layers {
				createTensor(tensor{
					source: t,
					target: "blk." + strconv.Itoa(i) + "." + t.Name,
				}, layer.bts, i)
			}
		default:
			// derive the layer index from the first number in the tensor name
			layerIndex := -1
			if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
				if i, err := strconv.Atoi(fields[0]); err == nil {
					layerIndex = i
				}
			}

			if layerIndex >= 0 {
				createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
			} else {
				// load all other tensors on the cpu
				createTensor(tensor{source: t}, input.bts, -1)
			}
		}
	}

	// map tensor names to tensors for easy lookup later
	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}

	// map devices to backend buffer types so new tensors can be assigned to the correct device
	deviceBufferTypes := make(map[C.ggml_backend_dev_t]C.ggml_backend_buffer_type_t)

	// create backends and buffer types used for the compute graph scheduler
	var schedBackends []C.ggml_backend_t
	var schedBufts []C.ggml_backend_buffer_type_t
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := backends[d]
		bt := C.ggml_backend_get_default_buffer_type(b)

		// Always include CPU as a fallback but otherwise, just use the devices where we assigned layers
		if !slices.Contains(cpuDeviceBufferType.bts, bt) {
			if c, ok := ctxs[bt]; !ok || C.ggml_get_first_tensor(c) == nil {
				continue
			}
		}

		deviceBufferTypes[d] = bt

		schedBackends = append(schedBackends, b)
		schedBufts = append(schedBufts, bt)

		if C.ggml_backend_is_cpu(b) {
			// set number of threads for cpu backend
			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
		}
	}

	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)

	sched := C.ggml_backend_sched_new_ext(
		(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
		(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
		C.int(len(schedBackends)),
		C.size_t(maxGraphNodes),
		C._Bool(false),
		C._Bool(false),
		C._Bool(params.AllocMemory),
	)

	// allocate buffers for each context
	bbs := make(map[*C.struct_ggml_context]C.ggml_backend_buffer_t, len(ctxs))
	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
		if b == nil {
			// allocation failed: unwind everything allocated so far before
			// reporting the memory requirements
			for _, b := range bbs {
				C.ggml_backend_buffer_free(b)
			}

			for _, ctx := range ctxs {
				C.ggml_free(ctx)
			}

			panic(ml.ErrNoMem{BackendMemory: requiredMemory})
		}

		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
		bbs[c] = b
	}

	for bs := range maps.Values(bbs) {
		logutil.Trace("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
			"size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
	}

	return &Backend{
		modelPath:         modelPath,
		allocMemory:       params.AllocMemory,
		flashAttention:    params.FlashAttention,
		meta:              meta,
		tensorLoadTargets: targets,
		tensors:           tensors,
		sched:             sched,
		schedBackends:     schedBackends,
		schedBufts:        schedBufts,
		input:             deviceBufferTypes[input.d],
		output:            output.d,
		layers: func() map[int]layerDevice {
			m := make(map[int]layerDevice)
			for i, layer := range layers {
				m[i] = layerDevice{
					d:  layer.d,
					bt: deviceBufferTypes[layer.d],
				}
			}
			return m
		}(),
		requiredMemory: &requiredMemory,
		btDeviceMemory: btDeviceMemory,
		maxGraphNodes:  maxGraphNodes,
		weightBuffers:  bbs,
	}, nil
}

// init registers this implementation as the "ggml" ml backend.
func init() {
	ml.RegisterBackend("ggml", New)
}

Jesse Gross's avatar
Jesse Gross committed
448
449
450
451
452
453
454
455
456
457
458
459
460
// Close releases every resource owned by the backend: each weight buffer and
// the ggml context holding its tensors, followed by the graph scheduler.
// It is safe to call on a nil receiver.
func (b *Backend) Close() {
	if b == nil {
		return
	}

	for ggctx, buf := range b.weightBuffers {
		C.ggml_backend_buffer_free(buf)
		C.ggml_free(ggctx)
	}

	C.ggml_backend_sched_free(b.sched)
}

461
// Load reads the tensor data from the model file into the buffers created by
// New, fanning reads out across up to GOMAXPROCS goroutines (one file handle
// each, so every handle reads sequentially). progress, if non-nil, is called
// with the overall completed fraction as bytes are copied. It returns an
// error if memory was not allocated (dry run), a tensor has no destination,
// or any read fails or the context is canceled.
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
	if !b.allocMemory {
		return errors.New("cannot load model without memory allocation")
	}

	// Mimic llama runner logs summarizing layers and memory
	gpuLayers := 0
	for layer := range maps.Values(b.layers) {
		if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
			gpuLayers++
		}
	}
	slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))

	switch C.ggml_backend_dev_type(b.output) {
	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
		slog.Info("offloading output layer to CPU")
	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
		slog.Info("offloading output layer to GPU")
		gpuLayers++
	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
		slog.Info("offloading output layer to ACCEL")
	}
	slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))

	var doneBytes atomic.Uint64
	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset

	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
	for _, t := range b.meta.Tensors().Items() {
		t := t
		g.Go(func() error {
			// resolve every destination tensor for this source tensor (a
			// source may fan out to several renamed targets)
			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
			for i := range tts {
				target := b.tensorLoadTargets[t.Name][i]
				if target == "" {
					target = t.Name
				}

				tt, ok := b.tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}

				tts[i] = tt
			}

			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
			// seeking around within an FD shared between all goroutines.
			file, err := os.Open(b.modelPath)
			if err != nil {
				slog.Warn("file open error", "file", b.modelPath, "error", err)
				return err
			}
			defer file.Close()
			sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))

			if t.Kind == 4 && tts[0]._type == 39 {
				// source is mxfp4, target is ggml mxfp4

				const BS = 17                             // MXFP4 block size
				bts := make([]byte, 8*BS*format.KibiByte) // ~128k block aligned
				var s uint64
				var tmp [16]byte
				for s < t.Size() {
					// Stop if either the parent context has been canceled or if any of the other tensors returned an error
					if err := ctx.Err(); err != nil {
						return err
					}
					n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
					if err != nil {
						slog.Warn("file read error", "file", b.modelPath, "error", err)
						return err
					}
					// reorder the 16 packed nibble bytes of each block in place
					for j := range n / BS {
						for i := 1; i < 9; i++ {
							// transform a1b2c3 ... x7y8z9 -> 71xa82yb93zc
							a, b := bts[j*BS+i], bts[j*BS+i+8]
							tmp[2*(i-1)] = (a & 0x0F) | (b << 4)
							tmp[2*(i-1)+1] = (a >> 4) | (b & 0xF0)
						}
						copy(bts[j*BS+1:j*BS+17], tmp[:])
					}

					for _, tt := range tts {
						C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
					}

					s += uint64(n)

					if progress != nil {
						done := doneBytes.Add(uint64(n))
						progress(float32(done) / float32(totalBytes))
					}
				}
				return nil
			} else if strings.HasSuffix(t.Name, "_exps.bias") && t.Kind == 30 && tts[0]._type == 0 {
				// source is bf16, target is ggml fp32

				// data is bf16 but we need to convert to fp32
				bts := make([]byte, 128*format.KibiByte)
				var e uint64
				for e < t.Elements() {
					// Stop if either the parent context has been canceled or if any of the other tensors returned an error
					if err := ctx.Err(); err != nil {
						return err
					}
					// e counts elements; each bf16 element is 2 bytes on disk
					n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Elements()-e)*2)])
					if err != nil {
						slog.Warn("file read error", "file", b.modelPath, "error", err)
						return err
					}
					fp32 := ConvertToF32(bts, uint32(fsggml.TensorTypeBF16), uint64(n/2))

					for _, tt := range tts {
						// destination offsets/sizes are in fp32 bytes (4 per element)
						C.ggml_backend_tensor_set(tt, unsafe.Pointer(&fp32[0]), C.size_t(e*4), C.size_t(n*2))
					}
					e += uint64(n / 2)
					if progress != nil {
						done := doneBytes.Add(uint64(n))
						progress(float32(done) / float32(totalBytes))
					}
				}
				return nil
			}

			// default path: byte-for-byte copy in ~128KiB chunks
			bts := make([]byte, 128*format.KibiByte)

			var s uint64
			for s < t.Size() {
				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
				if err := ctx.Err(); err != nil {
					return err
				}

				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
					slog.Warn("file read error", "file", b.modelPath, "error", err)
					return err
				}

				for _, tt := range tts {
					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
				}

				s += uint64(n)

				if progress != nil {
					done := doneBytes.Add(uint64(n))
					progress(float32(done) / float32(totalBytes))
				}
			}

			return nil
		})
	}

	// Cleanup any backend state from devices that we didn't end up using
nextDevice:
	for _, d := range append(gpus, append(accels, cpus...)...) {
		for _, backend := range b.schedBackends {
			if d == C.ggml_backend_get_device(backend) {
				continue nextDevice
			}
		}

		C.ggml_backend_dev_reset(d)
	}

	if err := g.Wait(); err != nil {
		return err
	}

	return nil
}

638
639
640
641
// BackendMemory returns a snapshot of the memory accounting accumulated by
// this backend (weights, cache, and graph buffers per device).
func (b *Backend) BackendMemory() ml.BackendMemory {
	return *b.requiredMemory
}

642
// Config exposes the model's GGUF key-value metadata.
func (b *Backend) Config() fs.Config {
	return b.meta.KV()
}

func (b *Backend) Get(name string) ml.Tensor {
647
648
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
Michael Yang's avatar
Michael Yang committed
649
650
651
652
653
654
	}

	return nil
}

// NewContext creates a context sized for the backend's maximum graph size.
func (b *Backend) NewContext() ml.Context {
	return b.NewContextSize(b.maxGraphNodes)
}

// NewContextSize creates a context capable of holding up to n graph nodes,
// with its own ggml context sized accordingly (tensors are not allocated
// eagerly; no_alloc is set). It panics if n exceeds the backend-wide maximum
// established in New. The context starts with no buffer type; callers must
// use Input or Layer before creating tensors.
func (b *Backend) NewContextSize(n int) ml.Context {
	if n > b.maxGraphNodes {
		panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
	}

	var allocatedBuffers []C.ggml_backend_buffer_t

	return &Context{
		b:             b,
		maxGraphNodes: n,
		ctx: C.ggml_init(C.struct_ggml_init_params{
			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
			no_alloc: true,
		}),
		allocatedBuffers: &allocatedBuffers,
		// -1 marks "no layer": allocations are not charged to any cache slot
		layer: -1,
	}
}

677
// CacheConfig reports the KV-cache layout this backend requires. With flash
// attention enabled the cache uses a larger padding and an f16 mask padded to
// GGML_KQ_MASK_PAD; otherwise a smaller padding with a permuted V tensor.
func (b *Backend) CacheConfig() ml.CacheConfig {
	if b.flashAttention {
		return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
	}

	return ml.CacheConfig{CachePadding: 32, PermutedV: true}
}

Michael Yang's avatar
Michael Yang committed
685
// Context is a scratch area for building and running one compute graph
// against a Backend. Contexts derived via Input/Layer share the same
// underlying ggml context and allocated-buffer list.
type Context struct {
	b *Backend

	// ctx is the ggml context holding this graph's tensors
	ctx *C.struct_ggml_context
	// graph is created lazily on the first Forward call
	graph *C.struct_ggml_cgraph

	// buft is the buffer type used for new tensors
	buft C.ggml_backend_buffer_type_t

	// allocatedBuffers are buffers for tensors that we have allocated in this context
	// so that we can free them when we close the context
	allocatedBuffers *[]C.ggml_backend_buffer_t

	// maxGraphNodes is the maximum allowed number of graph nodes in this context
	maxGraphNodes int

	// layer is the graph layer that this context is allocating for - assumed to be cache
	layer int
}

705
// Input returns a derived context whose new tensors are placed on the
// backend's input (CPU) buffer type; allocations are not charged to any
// layer cache slot. If no input buffer type is configured, c itself is
// returned unchanged.
func (c *Context) Input() ml.Context {
	if c.b.input != nil {
		return &Context{
			b:                c.b,
			ctx:              c.ctx,
			buft:             c.b.input,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
			layer:            -1,
		}
	}

	return c
}

720
// Layer returns a derived context that allocates new tensors on the device
// assigned to layer i, charging them to that layer's cache accounting. If
// layer i has no device assignment, c itself is returned unchanged.
func (c *Context) Layer(i int) ml.Context {
	if layer, ok := c.b.layers[i]; ok {
		return &Context{
			b:                c.b,
			ctx:              c.ctx,
			buft:             layer.bt,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
			layer:            i,
		}
	}

	return c
}

735
// Forward adds the given tensors (and their dependency subgraphs) to this
// context's compute graph, creating the graph lazily on first use. It
// returns c so calls can be chained.
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
	if c.graph == nil {
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
	}

	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
}

747
// Compute executes the built graph; it is ComputeWithNotify with no
// notification callback.
func (c *Context) Compute(tensors ...ml.Tensor) {
	c.ComputeWithNotify(nil, tensors...)
}

// ComputeWithNotify submits the built graph to the scheduler asynchronously,
// serialized by the backend-wide scheduler lock. cb, if non-nil, runs on its
// own goroutine as soon as the lock has been acquired. Rather than blocking
// here, each non-empty output tensor is given a shared, run-once sync func so
// the first access to any result blocks until the computation finishes.
func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) {
	c.b.schedMu.Lock()
	defer c.b.schedMu.Unlock()
	if cb != nil {
		go cb()
	}
	if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
		panic(fmt.Errorf("error computing ggml graph: %v", status))
	}
	C.ggml_backend_sched_reset(c.b.sched)

	// needSync ensures the scheduler is synchronized at most once no matter
	// how many tensors share this closure
	needSync := true
	sync := func() {
		if needSync {
			C.ggml_backend_sched_synchronize(c.b.sched)
			needSync = false
		}
	}

	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
		}
	}
}

777
778
// Reserve performs a scheduler dry run over the current graph to measure the
// per-backend compute buffer sizes, recording them into btDeviceMemory. It
// panics with ml.ErrNoMem if the reservation could not be satisfied.
func (c *Context) Reserve() {
	reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)

	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))

	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
	for _, bt := range c.b.schedBufts {
		c.b.btDeviceMemory[bt].Graph = 0
	}

	for i := range c.b.schedBackends {
		bufferSize := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
		c.b.btDeviceMemory[c.b.schedBufts[i]].Graph += uint64(bufferSize)

		logutil.Trace("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
			"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferSize)))
	}

	// check the result only after the sizes above have been recorded, so
	// the panic payload reflects the attempted allocations
	if !reserved {
		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
	}
}

800
// MaxGraphNodes reports the maximum number of graph nodes this context can hold.
func (c *Context) MaxGraphNodes() int {
	return c.maxGraphNodes
}

804
805
806
// shapeToGGML converts a Go shape slice to the C int64 array form that ggml
// expects, returning a pointer to the first element. Callers must pass a
// non-empty shape.
func shapeToGGML(shape []int) *C.int64_t {
	sh := make([]C.int64_t, len(shape))
	for i, s := range shape {
		sh[i] = C.int64_t(s)
	}

	return &sh[0]
}

813
814
815
816
// pad rounds length up to the next multiple of pad (byte alignment).
func pad(length, pad C.size_t) C.size_t {
	if remainder := length % pad; remainder != 0 {
		return length + (pad - remainder)
	}

	return length
}

817
// newTensor creates a tensor of the given dtype/shape on this context's
// buffer type, allocating a dedicated backend buffer for it and charging the
// padded size to the layer's cache accounting when a layer is set. It panics
// if no buffer type is configured, the shape is invalid (more than 4 dims or
// a dim < 1), or allocation fails (ml.ErrNoMem).
func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
	if c.buft == nil {
		panic("set Input or Layer before creating tensors")
	}

	cdtype := ggmlDType(dtype)

	// a missing or zero-length leading dim produces an empty 1-D tensor
	if len(shape) < 1 || shape[0] == 0 {
		var shape C.int64_t = 0
		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
	} else if len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
	// record the attempted size even on failure so the ErrNoMem payload
	// below includes this allocation
	if c.layer >= 0 {
		c.b.btDeviceMemory[c.buft].Cache[c.layer] += uint64(size)
	}

	if b == nil {
		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
	}

	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
	return &Tensor{b: c.b, t: t}
}

// Empty allocates an uninitialized tensor with the given type and shape.
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	return c.newTensor(dtype, shape)
}

858
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
859
	t := c.newTensor(dtype, shape)
Jesse Gross's avatar
Jesse Gross committed
860
861
862
	if c.b.allocMemory {
		C.ggml_set_zero(t.(*Tensor).t)
	}
863
	return t
Michael Yang's avatar
Michael Yang committed
864
865
}

// checkShape panics unless the number of elements in s exactly matches the
// product of shape. An empty s is always accepted.
//
// The previous implementation divided len(s) by each dimension in turn and
// compared the truncated result to 1, which wrongly accepted mismatches
// (e.g. 5 elements against shape [4], since 5/4 == 1) and panicked with a
// raw divide-by-zero for a zero-sized dimension. Comparing against the
// dimension product avoids both problems.
func checkShape[S ~[]E, E any](s S, shape ...int) {
	n := len(s)

	if n == 0 {
		return
	}

	want := 1
	for _, v := range shape {
		want *= v
	}

	if n != want {
		panic(fmt.Errorf("invalid shape: %v", shape))
	}
}

882
883
func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
	checkShape(s, shape...)
884

885
	t := c.newTensor(ml.DTypeF32, shape)
886

Jesse Gross's avatar
Jesse Gross committed
887
	if c.b.allocMemory && len(s) > 0 {
Jesse Gross's avatar
Jesse Gross committed
888
889
890
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

891
	return t
Michael Yang's avatar
Michael Yang committed
892
893
}

894
895
func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
	checkShape(s, shape...)
896

897
	t := c.newTensor(ml.DTypeI32, shape)
898

Jesse Gross's avatar
Jesse Gross committed
899
	if c.b.allocMemory && len(s) > 0 {
Jesse Gross's avatar
Jesse Gross committed
900
901
902
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

903
	return t
Michael Yang's avatar
Michael Yang committed
904
905
}

// Arange returns a 1-D tensor of values in [start, stop) advancing by step.
// Only f32 and i32 outputs are supported.
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
	switch dtype {
	case ml.DTypeF32:
		// ggml_arange creates a float32 tensor
		return &Tensor{
			b: c.b,
			t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
		}
	case ml.DTypeI32:
		// ggml_cast does not support float32 to int32 conversion
		arange := make([]int32, 0, int((stop-start)/step))
		for i := start; i < stop; i += step {
			arange = append(arange, int32(i))
		}

		// Build the sequence on the host and upload it as an input tensor.
		return c.Input().FromIntSlice(arange, len(arange))
	default:
		panic("unsupported dtype for arange")
	}
}

// Close releases every backend buffer allocated through this context and
// then frees the underlying ggml context. It is safe to call on a nil
// receiver.
func (c *Context) Close() {
	if c != nil {
		for _, b := range *c.allocatedBuffers {
			C.ggml_backend_buffer_free(b)
		}
		*c.allocatedBuffers = nil

		C.ggml_free(c.ctx)
	}
}

// Tensor wraps a ggml tensor together with the backend that owns it.
type Tensor struct {
	// b is the backend this tensor belongs to.
	b    *Backend
	// t is the underlying ggml tensor.
	t    *C.struct_ggml_tensor
	// sync, when non-nil, blocks until the computation producing this
	// tensor has finished; it gates host-side reads (Bytes/Floats).
	sync func()
}

// LogValue renders the tensor's name, element type, and shape for slog.
func (t *Tensor) LogValue() slog.Value {
	name := C.GoString(C.ggml_get_name(t.t))
	typeName := C.GoString(C.ggml_type_name(t.t._type))

	return slog.GroupValue(
		slog.String("name", name),
		slog.String("type", typeName),
		slog.Any("shape", t.Shape()),
	)
}

// Dim returns the number of elements along dimension n.
func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
}

// Stride returns the stride, in bytes, of dimension n.
func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
}

960
961
func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
Michael Yang's avatar
Michael Yang committed
962
963
964
965
966
967
968
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

969
970
971
972
973
974
975
976
977
func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
Michael Yang's avatar
Michael Yang committed
978
979
}

980
981
982
983
984
985
func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
Michael Yang's avatar
Michael Yang committed
986
987
988
989
990
	}

	return
}

991
992
993
994
995
996
func (t *Tensor) SetValueFromIntSlice(s []int32) {
	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.t))
	}
}

// DType maps the underlying ggml element type to the ml.DType enum,
// returning ml.DTypeOther for any type without a direct mapping.
func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
	case C.GGML_TYPE_Q8_0:
		return ml.DTypeQ80
	case C.GGML_TYPE_Q4_0:
		return ml.DTypeQ40
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	case C.GGML_TYPE_MXFP4:
		return ml.DTypeMXFP4
	default:
		return ml.DTypeOther
	}
}

// ggmlDType maps an ml.DType to the corresponding ggml type constant.
// It panics for dtypes with no ggml equivalent (the inverse of
// Tensor.DType's ml.DTypeOther case).
func ggmlDType(dtype ml.DType) uint32 {
	switch dtype {
	case ml.DTypeF32:
		return C.GGML_TYPE_F32
	case ml.DTypeF16:
		return C.GGML_TYPE_F16
	case ml.DTypeQ80:
		return C.GGML_TYPE_Q8_0
	case ml.DTypeQ40:
		return C.GGML_TYPE_Q4_0
	case ml.DTypeI32:
		return C.GGML_TYPE_I32
	case ml.DTypeMXFP4:
		return C.GGML_TYPE_MXFP4
	default:
		panic("unsupported dtype")
	}
}

// Cast converts the tensor to the given dtype.
func (t *Tensor) Cast(ctx ml.Context, dtype ml.DType) ml.Tensor {
	out := C.ggml_cast(ctx.(*Context).ctx, t.t, ggmlDType(dtype))
	return &Tensor{b: t.b, t: out}
}

1042
1043
1044
1045
1046
1047
1048
func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_neg(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
Michael Yang committed
1049
1050
func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
1051
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1052
1053
1054
1055
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

Michael Yang's avatar
Michael Yang committed
1056
1057
1058
1059
1060
1061
1062
func (t *Tensor) Sub(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sub(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

// Repeat tiles the tensor n times along dimension dim by building a target
// tensor with the enlarged shape and broadcasting t into it via ggml_repeat.
func (t *Tensor) Repeat(ctx ml.Context, dim, n int) ml.Tensor {
	if dim < 0 || dim >= C.GGML_MAX_DIMS {
		panic("invalid dimension")
	}

	// Target shape: identical to t except dimension dim is scaled by n.
	shape := make([]C.int64_t, C.GGML_MAX_DIMS)
	for i := range C.GGML_MAX_DIMS {
		if i == dim {
			shape[i] = C.int64_t(t.Dim(i) * n)
		} else {
			shape[i] = C.int64_t(t.Dim(i))
		}
	}

	// tmpl only supplies the output shape for ggml_repeat; its contents
	// are never read.
	tmpl := C.ggml_new_tensor(ctx.(*Context).ctx, t.t._type, C.int(len(shape)), unsafe.SliceData(shape))
	return &Tensor{
		b: t.b,
		t: C.ggml_repeat(ctx.(*Context).ctx, t.t, tmpl),
	}
}

Michael Yang's avatar
Michael Yang committed
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) > 0 {
		return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
	}

	return t
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
1094
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1095
1096
1097
1098
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

// Contiguous returns a contiguous copy of the tensor. With no shape
// arguments the layout is preserved; with 1-4 arguments the result is
// simultaneously reshaped to those dimensions.
func (t *Tensor) Contiguous(ctx ml.Context, shape ...int) ml.Tensor {
	switch len(shape) {
	case 0:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont(ctx.(*Context).ctx, t.t),
		}
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
1133
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1134
1135
1136
1137
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

1138
1139
1140
1141
1142
1143
1144
func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

Michael Yang's avatar
Michael Yang committed
1145
1146
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
1147
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1148
1149
1150
1151
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

1152
1153
1154
1155
1156
func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
1157
		b: t.b,
1158
1159
1160
1161
		t: mul,
	}
}

Michael Yang's avatar
llama4  
Michael Yang committed
1162
1163
1164
1165
1166
1167
1168
func (t *Tensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul_mat_id(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, ids.(*Tensor).t),
	}
}

1169
1170
1171
1172
1173
1174
1175
func (t *Tensor) AddID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_add_id(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, ids.(*Tensor).t),
	}
}

1176
1177
1178
1179
1180
1181
1182
func (t *Tensor) L2Norm(ctx ml.Context, eps float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_l2_norm(ctx.(*Context).ctx, t.t, C.float(eps)),
	}
}

Michael Yang's avatar
Michael Yang committed
1183
func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
Michael Yang's avatar
llama4  
Michael Yang committed
1184
1185
1186
1187
1188
1189
	tt := C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
		if b != nil {
			tt = C.ggml_add(ctx.(*Context).ctx, tt, b.(*Tensor).t)
		}
Michael Yang's avatar
Michael Yang committed
1190
1191
	}

Michael Yang's avatar
llama4  
Michael Yang committed
1192
	return &Tensor{b: t.b, t: tt}
Michael Yang's avatar
Michael Yang committed
1193
1194
1195
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
Michael Yang's avatar
llama4  
Michael Yang committed
1196
1197
1198
1199
1200
1201
	tt := C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
	}

	return &Tensor{b: t.b, t: tt}
Michael Yang's avatar
Michael Yang committed
1202
1203
}

1204
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
1205
1206
	if len(shape) != 4 {
		panic("expected 4 dimensions")
1207
1208
	} else if shape[3] != 0 {
		panic("cuda does not support 4d tensors")
Michael Yang's avatar
Michael Yang committed
1209
1210
1211
	}

	return &Tensor{
1212
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
1223
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1224
1225
1226
1227
1228
1229
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
1230
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1231
1232
1233
1234
1235
1236
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
1237
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1238
1239
1240
1241
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

// Reshape returns a view of the tensor with 1-4 new dimensions; the total
// element count must be preserved (enforced by ggml).
func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
1271
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1272
1273
1274
1275
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

1276
1277
1278
1279
1280
1281
1282
func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
Michael Yang committed
1283
1284
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
1285
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1286
1287
1288
1289
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
func (t *Tensor) Sin(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sin(ctx.(*Context).ctx, t.t),
	}
}

// Cos computes the elementwise cosine of t.
func (t *Tensor) Cos(ctx ml.Context) ml.Tensor {
	out := C.ggml_cos(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: out}
}

Michael Yang's avatar
Michael Yang committed
1304
1305
func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
1306
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1307
1308
1309
1310
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
llama4  
Michael Yang committed
1311
1312
1313
1314
1315
1316
1317
func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sigmoid_inplace(ctx.(*Context).ctx, t.t),
	}
}

// View creates a strided view starting at byte offset. The variadic
// arguments interleave sizes and byte strides: for an N-dimensional view
// pass 2N-1 values — size0 [, stride0, size1 [, stride1, size2 [, stride2,
// size3]]] — which is why the switch matches 1, 3, 5, and 7 arguments.
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

// RoPE applies rotary position embedding to t using the given positions,
// rotation dimension, frequency base, and scale. Behavior can be tuned via
// rope.Options functional options (YaRN-style extrapolation parameters,
// frequency factors, rope type, original context length).
func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
	// Default options
	opts := rope.Options{
		Factors:               &Tensor{},
		OriginalContextLength: 131072,
		ExtrapolationFactor:   0.,
		AttentionFactor:       1.,
		BetaFast:              32.,
		BetaSlow:              1.,
	}

	// Apply any provided options
	for _, option := range options {
		option(&opts)
	}

	// ggml_rope_ext does not operate on quantized tensors directly, so
	// dequantize to f32 first when needed.
	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_rope_ext(
			ctx.(*Context).ctx,
			dequant,
			positions.(*Tensor).t,
			opts.Factors.(*Tensor).t,
			C.int(ropeDim),
			C.int(opts.Type),
			C.int(opts.OriginalContextLength),
			C.float(ropeBase),
			C.float(ropeScale),
			C.float(opts.ExtrapolationFactor),
			C.float(opts.AttentionFactor),
			C.float(opts.BetaFast),
			C.float(opts.BetaSlow),
		),
	}
}

1395
1396
1397
1398
1399
1400
1401
func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_im2col(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
	}
}

1402
1403
1404
1405
1406
1407
1408
func (t *Tensor) GELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
	if len(t2) > 0 {
		return &Tensor{
			b: t.b,
			t: C.ggml_geglu_split(ctx.(*Context).ctx, t.t, t2[0].(*Tensor).t),
		}
	}
Michael Yang's avatar
Michael Yang committed
1409
	return &Tensor{
1410
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1411
1412
1413
1414
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

1415
1416
1417
1418
1419
1420
func (t *Tensor) SILU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
	if len(t2) > 0 {
		return &Tensor{
			b: t.b,
			t: C.ggml_swiglu_split(ctx.(*Context).ctx, t.t, t2[0].(*Tensor).t),
		}
Michael Yang's avatar
Michael Yang committed
1421
	}
Michael Yang's avatar
Michael Yang committed
1422
	return &Tensor{
1423
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1424
1425
1426
1427
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

1428
1429
1430
1431
1432
1433
1434
func (t *Tensor) RELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
	if len(t2) > 0 {
		return &Tensor{
			b: t.b,
			t: C.ggml_reglu_split(ctx.(*Context).ctx, t.t, t2[0].(*Tensor).t),
		}
	}
Michael Yang's avatar
Michael Yang committed
1435
1436
1437
1438
1439
1440
	return &Tensor{
		b: t.b,
		t: C.ggml_relu_inplace(ctx.(*Context).ctx, t.t),
	}
}

1441
func (t *Tensor) SILUAlphaLimit(ctx ml.Context, up ml.Tensor, alpha, limit float32) ml.Tensor {
1442
1443
1444
1445
1446
1447
	return &Tensor{
		b: t.b,
		t: C.ggml_swiglu_oai(ctx.(*Context).ctx, t.t, up.(*Tensor).t, C.float(alpha), C.float(limit)),
	}
}

Michael Yang's avatar
Michael Yang committed
1448
1449
func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
1450
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1451
1452
1453
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}
1454

Michael Yang's avatar
Michael Yang committed
1455
func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
1456
1457
	return &Tensor{
		b: t.b,
Michael Yang's avatar
Michael Yang committed
1458
		t: C.ggml_pool_2d(ctx.(*Context).ctx, t.t, C.GGML_OP_POOL_AVG, C.int(k), C.int(k), C.int(s), C.int(s), C.float(p), C.float(p)),
Michael Yang's avatar
Michael Yang committed
1459
1460
1461
	}
}

// Set writes t2 into t at the given element offset. With no strides the
// write is 1-D; with one stride it is 2-D. More strides are unsupported.
func (t *Tensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
	var tt *C.struct_ggml_tensor
	switch len(strides) {
	case 0:
		tt = C.ggml_set_1d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset))
	case 1:
		tt = C.ggml_set_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset), C.size_t(strides[0]))
	default:
		panic("unsupported number of dimensions")
	}

	return &Tensor{b: t.b, t: tt}
}

// ScaledDotProductAttention computes attention over t (the query) with the
// given key, value, optional mask, and optional attention sinks, scaled by
// scale. When the backend enables flash attention it uses the fused
// ggml_flash_attn_ext kernel; otherwise it falls back to an explicit
// QK^T -> softmax -> V pipeline. Both paths force f32 precision for the
// numerically sensitive step.
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask, sinks ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	// Move the head dimension into position: (d, seq, heads, batch).
	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if t.b.flashAttention {
		value = value.Permute(ctx, 0, 2, 1, 3)

		kqv := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
		if sinks != nil {
			C.ggml_flash_attn_ext_add_sinks(kqv, sinks.(*Tensor).t)
		}
		C.ggml_flash_attn_ext_set_prec(kqv, C.GGML_PREC_F32)
		return &Tensor{b: t.b, t: kqv}
	} else {
		// Unfused path: scores = softmax(scale * K^T Q + mask), out = V * scores.
		kq := key.MulmatFullPrec(ctx, query)
		kq = &Tensor{
			b: t.b,
			t: C.ggml_soft_max_ext(ctx.(*Context).ctx, kq.(*Tensor).t, kqMask, C.float(scale), 0),
		}
		if sinks != nil {
			C.ggml_soft_max_add_sinks(kq.(*Tensor).t, sinks.(*Tensor).t)
		}

		kqv := value.Mulmat(ctx, kq)
		return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}
}

// Duplicate returns a copy of the tensor.
func (t *Tensor) Duplicate(ctx ml.Context) ml.Tensor {
	out := C.ggml_dup(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: out}
}
Michael Yang's avatar
llama4  
Michael Yang committed
1515
1516
1517
1518
1519
1520
1521

func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
	}
}
1522
1523
1524
1525
1526
1527
1528

func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
	}
}
Michael Yang's avatar
Michael Yang committed
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567

func (t *Tensor) Mean(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mean(ctx.(*Context).ctx, t.t),
	}
}

// Variance computes the population variance along the first dimension:
// mean((t - mean(t))^2).
func (t *Tensor) Variance(ctx ml.Context) ml.Tensor {
	negMean := t.Mean(ctx).Scale(ctx, -1)
	centered := t.Add(ctx, negMean)
	return centered.Sqr(ctx).SumRows(ctx).Scale(ctx, 1/float64(t.Dim(0)))
}

// Stddev computes the population standard deviation along dim 0.
func (t *Tensor) Stddev(ctx ml.Context) ml.Tensor {
	variance := t.Variance(ctx)
	return variance.Sqrt(ctx)
}

// Sqr squares every element of t.
func (t *Tensor) Sqr(ctx ml.Context) ml.Tensor {
	out := C.ggml_sqr(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: out}
}

// Sqrt takes the elementwise square root of t.
func (t *Tensor) Sqrt(ctx ml.Context) ml.Tensor {
	out := C.ggml_sqrt(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: out}
}

// Clamp limits every element of t to the range [min, max].
func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
	out := C.ggml_clamp(ctx.(*Context).ctx, t.t, C.float(min), C.float(max))
	return &Tensor{b: t.b, t: out}
}
Michael Yang's avatar
Michael Yang committed
1568
1569
1570
1571

func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
	// Unchecked to handle quantized types
	t := c.newTensor(dtype, shape)
Jesse Gross's avatar
Jesse Gross committed
1572
	if c.b.allocMemory && len(s) > 0 {
Michael Yang's avatar
Michael Yang committed
1573
1574
1575
1576
1577
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t
}