package ggml

// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
import "C"

import (
	"context"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"maps"
	"os"
	"runtime"
	"slices"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"unicode"
	"unsafe"

	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs"
	fsggml "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/logutil"
	"github.com/ollama/ollama/ml"
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
	"github.com/ollama/ollama/ml/nn/rope"

	"golang.org/x/sync/errgroup"
)

var (
	cpus, accels, gpus []C.ggml_backend_dev_t
	backends           map[C.ggml_backend_dev_t]C.ggml_backend_t
)

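// initDevices runs once per process: it loads the ggml backend libraries,
// sorts the available devices into cpus, accels, and gpus, and initializes a
// backend for each device.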
var initDevices = sync.OnceFunc(func() {
	ggml.OnceLoad()

	backends = make(map[C.ggml_backend_dev_t]C.ggml_backend_t)
	for i := range C.ggml_backend_dev_count() {
		d := C.ggml_backend_dev_get(i)

		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
			if len(cpus) == 0 {
				// only the first cpu device should be used
				cpus = append(cpus, d)
			}
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
			gpus = append(gpus, d)
		}

		backends[d] = C.ggml_backend_dev_init(d, nil)
	}
})

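// layerDevice pairs a backend device with the buffer type used to allocate
// tensors on that device.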
type layerDevice struct {
	d  C.ggml_backend_dev_t
	bt C.ggml_backend_buffer_type_t
}

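// Backend is the ggml-backed implementation of ml.Backend. It tracks the
// model metadata, the per-device tensor assignments, and the scheduler used
// to run compute graphs.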
type Backend struct {
	// modelPath is the location of the model data
	modelPath string

	meta *fsggml.GGML

	// allocMemory means that memory should be allocated for tensors and not
	// just a dry run
	allocMemory bool

	// tensorLoadTargets maps from the name of the tensor in the file
	// to the name that is used by the model definition
	tensorLoadTargets map[string][]string

	schedMu       sync.Mutex // Only one Compute can run at a time
	sched         C.ggml_backend_sched_t
	schedBackends []C.ggml_backend_t
	schedBufts    []C.ggml_backend_buffer_type_t

	tensors map[string]*C.struct_ggml_tensor

	// input is the backend buffer type used for inputs
	input C.ggml_backend_buffer_type_t

	// output is the backend device used for outputs
	output C.ggml_backend_dev_t

	// layers is the backend used for repeating layers
	layers map[int]layerDevice

	// requiredMemory is the cumulative memory allocations needed by the backend
	requiredMemory *ml.BackendMemory

	// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
	btDeviceMemory map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory

	flashAttention bool

	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
	maxGraphNodes int

	// weightBuffers are the GGML contexts and buffers for allocating weights
	weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
}

var once sync.Once

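// New decodes the GGUF metadata at modelPath, assigns each tensor to a device
// buffer type, allocates (or, when params.AllocMemory is false, dry-runs) the
// weight buffers, and constructs the graph scheduler. It is registered as the
// "ggml" backend in init below.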
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
	r, err := os.Open(modelPath)
	if err != nil {
		return nil, err
	}
	defer r.Close()

	meta, err := fsggml.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	once.Do(func() {
		slog.Info(
			"",
			"architecture", meta.KV().Architecture(),
			"file_type", meta.KV().FileType(),
			"name", meta.KV().String("general.name"),
			"description", meta.KV().String("general.description"),
			"num_tensors", len(meta.Tensors().Items()),
			"num_key_values", len(meta.KV()),
		)
	})

	initDevices()

	var requiredMemory ml.BackendMemory
	btDeviceMemory := make(map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory)

	type deviceBufferType struct {
		d   C.ggml_backend_dev_t
		bts []C.ggml_backend_buffer_type_t
	}

	blocks := int(meta.KV().BlockCount())

	// create list of buffer types for the cpu
	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			bt := C.ggml_backend_dev_buffer_type(d)
			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, bt)
			C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))

			btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
		}
	}

	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
	var props C.struct_ggml_backend_dev_props
	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
	requiredMemory.CPU.ID = C.GoString(props.id)
	requiredMemory.CPU.Weights = make([]uint64, blocks+1)
	requiredMemory.CPU.Cache = make([]uint64, blocks+1)

	// create list of buffer types for each gpu
	var gpuDeviceBufferTypes []deviceBufferType
	requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
	for i, d := range gpus {
		bt := C.ggml_backend_dev_buffer_type(d)
		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
			d:   d,
			bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
		})
		C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))

		btDeviceMemory[bt] = &requiredMemory.GPUs[i]
		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
		var props C.struct_ggml_backend_dev_props
		C.ggml_backend_dev_get_props(d, &props)
		requiredMemory.GPUs[i].ID = C.GoString(props.id)
		requiredMemory.GPUs[i].Weights = make([]uint64, blocks+1)
		requiredMemory.GPUs[i].Cache = make([]uint64, blocks+1)
	}

	// inputs always use cpu
	input := cpuDeviceBufferType

	assignLayer := func(layer int) deviceBufferType {
		for _, p := range params.GPULayers {
			for _, l := range p.Layers {
				if l == layer {
					for i := range requiredMemory.GPUs {
						if requiredMemory.GPUs[i].ID == p.ID {
							return gpuDeviceBufferTypes[i]
						}
					}

					return cpuDeviceBufferType
				}
			}
		}

		return cpuDeviceBufferType
	}

	// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
	layers := make([]deviceBufferType, blocks)
	for i := range layers {
		layers[i] = assignLayer(i)
	}

	// outputs are assigned iff allowed by splits and configured number of gpu layers
	output := assignLayer(blocks)

	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
	// each layer has at most 2 extra tensors for rope operations
	maxTensors += blocks * 2

	type tensor struct {
		source *fsggml.Tensor
		target string
	}

	// some tensors are mapped to different names so keep a list
	targets := make(map[string][]string)

	// contexts are shared by tensors of the same buffer type
	ctxs := make(map[C.ggml_backend_buffer_type_t]*C.struct_ggml_context)
	createTensor := func(t tensor, bts []C.ggml_backend_buffer_type_t, layer int) *C.struct_ggml_tensor {
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}

			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
			defer C.free(unsafe.Pointer(cname))
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

			kind := t.source.Kind
			if t.source.Kind == 4 {
				// transform raw mxfp4 stream to ggml mxfp4 format
				kind = 39
			} else if t.source.Kind == uint32(fsggml.TensorTypeBF16) && strings.HasSuffix(t.source.Name, "_exps.bias") {
				// transform "_exps.bias" from bf16 to fp32; add_ids only supports fp32 tensors
				kind = uint32(fsggml.TensorTypeF32)
			}

			tt := C.ggml_new_tensor(ctxs[bt], kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
			C.ggml_set_name(tt, cname)

			logutil.Trace("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))

			size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
			if layer == -1 {
				requiredMemory.InputWeights += uint64(size)
			} else {
				btDeviceMemory[bt].Weights[layer] += uint64(size)
			}

			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
	}

	contains := func(s string, parts ...string) bool {
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
	}

	for _, t := range meta.Tensors().Items() {
		switch {
		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
			createTensor(tensor{source: t}, input.bts, -1)
			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
				createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
			}
		case contains(t.Name, "cls", "output", "output_norm",
			"altup_proj", "altup_unembd_proj",
			"per_layer_token_embd", "per_layer_model_proj", "per_layer_proj_norm"):
			createTensor(tensor{source: t}, output.bts, blocks)
		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
			// TODO: assign vision tensors to the gpu if possible
			createTensor(tensor{source: t}, output.bts, blocks)
		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
			// these tensors should be repeated per layer
			for i, layer := range layers {
				createTensor(tensor{
					source: t,
					target: "blk." + strconv.Itoa(i) + "." + t.Name,
				}, layer.bts, i)
			}
		default:
			layerIndex := -1
			if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
				if i, err := strconv.Atoi(fields[0]); err == nil {
					layerIndex = i
				}
			}

			if layerIndex >= 0 {
				createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
			} else {
				// load all other tensors on the cpu
				createTensor(tensor{source: t}, input.bts, -1)
			}
		}
	}

	// allocate buffers for each context
	bbs := make(map[*C.struct_ggml_context]C.ggml_backend_buffer_t, len(ctxs))
	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
		if b == nil {
			for _, b := range bbs {
				C.ggml_backend_buffer_free(b)
			}

			for _, ctx := range ctxs {
				C.ggml_free(ctx)
			}

			panic(ml.ErrNoMem{BackendMemory: requiredMemory})
		}

		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
		bbs[c] = b
	}

	for bs := range maps.Values(bbs) {
		logutil.Trace("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
			"size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
	}

	// map tensor names to tensors for easy lookup later
	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}

	// map devices to backend buffer types so new tensors can be assigned to the correct device
	deviceBufferTypes := make(map[C.ggml_backend_dev_t]C.ggml_backend_buffer_type_t)

	// create backends and buffer types used for the compute graph scheduler
	var schedBackends []C.ggml_backend_t
	var schedBufts []C.ggml_backend_buffer_type_t
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := backends[d]
		bt := C.ggml_backend_get_default_buffer_type(b)

		// Always include CPU as a fallback but otherwise, just use the devices where we assigned layers
		if !slices.Contains(cpuDeviceBufferType.bts, bt) {
			if c, ok := ctxs[bt]; !ok || C.ggml_get_first_tensor(c) == nil {
				continue
			}
		}

		deviceBufferTypes[d] = bt

		schedBackends = append(schedBackends, b)
		schedBufts = append(schedBufts, bt)

		if C.ggml_backend_is_cpu(b) {
			// set number of threads for cpu backend
			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
		}
	}

	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
	return &Backend{
		modelPath:         modelPath,
		allocMemory:       params.AllocMemory,
		flashAttention:    params.FlashAttention,
		meta:              meta,
		tensorLoadTargets: targets,
		tensors:           tensors,
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
			C.int(len(schedBackends)),
			C.size_t(maxGraphNodes),
			C._Bool(false),
			C._Bool(false),
		),
		schedBackends: schedBackends,
		schedBufts:    schedBufts,
		input:         deviceBufferTypes[input.d],
		output:        output.d,
		layers: func() map[int]layerDevice {
			m := make(map[int]layerDevice)
			for i, layer := range layers {
				m[i] = layerDevice{
					d:  layer.d,
					bt: deviceBufferTypes[layer.d],
				}
			}
			return m
		}(),
		requiredMemory: &requiredMemory,
		btDeviceMemory: btDeviceMemory,
		maxGraphNodes:  maxGraphNodes,
		weightBuffers:  bbs,
	}, nil
}

func init() {
	ml.RegisterBackend("ggml", New)
}

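// Close frees the weight buffers, their ggml contexts, and the scheduler.
// It is safe to call on a nil receiver.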
func (b *Backend) Close() {
	if b == nil {
		return
	}

	for ctx, b := range b.weightBuffers {
		C.ggml_backend_buffer_free(b)
		C.ggml_free(ctx)
	}

	C.ggml_backend_sched_free(b.sched)
}

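// Load reads tensor data from the model file and uploads it into the backend
// buffers allocated by New, calling progress with the fraction of bytes
// copied so far.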
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
	if !b.allocMemory {
		return errors.New("cannot load model without memory allocation")
	}

	// Mimic llama runner logs summarizing layers and memory
	gpuLayers := 0
	for layer := range maps.Values(b.layers) {
		if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
			gpuLayers++
		}
	}
	slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))

	switch C.ggml_backend_dev_type(b.output) {
	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
		slog.Info("offloading output layer to CPU")
	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
		slog.Info("offloading output layer to GPU")
		gpuLayers++
	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
		slog.Info("offloading output layer to ACCEL")
	}
	slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))

	var doneBytes atomic.Uint64
	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset

	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
	for _, t := range b.meta.Tensors().Items() {
		t := t
		g.Go(func() error {
			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
			for i := range tts {
				target := b.tensorLoadTargets[t.Name][i]
				if target == "" {
					target = t.Name
				}

				tt, ok := b.tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}

				tts[i] = tt
			}

			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
			// seeking around within an FD shared between all goroutines.
			file, err := os.Open(b.modelPath)
			if err != nil {
				slog.Warn("file open error", "file", b.modelPath, "error", err)
				return err
			}
			defer file.Close()
			sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))

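			// Three load paths follow: raw MXFP4 streams are repacked in place
			// into ggml's interleaved MXFP4 block layout (each 17-byte block is
			// one scale byte followed by 16 bytes of packed 4-bit values),
			// "_exps.bias" tensors are converted from bf16 to fp32, and all
			// other tensors are copied through unchanged.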
			if t.Kind == 4 && tts[0]._type == 39 {
				// source is mxfp4, target is ggml mxfp4

				const BS = 17                             // MXFP4 block size
				bts := make([]byte, 8*BS*format.KibiByte) // ~128k block aligned
				var s uint64
				var tmp [16]byte
				for s < t.Size() {
					// Stop if either the parent context has been canceled or if any of the other tensors returned an error
					if err := ctx.Err(); err != nil {
						return err
					}
					n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
					if err != nil {
						slog.Warn("file read error", "file", b.modelPath, "error", err)
						return err
					}
					for j := range n / BS {
						for i := 1; i < 9; i++ {
							// transform a1b2c3 ... x7y8z9 -> 71xa82yb93zc
							a, b := bts[j*BS+i], bts[j*BS+i+8]
							tmp[2*(i-1)] = (a & 0x0F) | (b << 4)
							tmp[2*(i-1)+1] = (a >> 4) | (b & 0xF0)
						}
						copy(bts[j*BS+1:j*BS+17], tmp[:])
					}

					for _, tt := range tts {
						C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
					}

					s += uint64(n)

					if progress != nil {
						done := doneBytes.Add(uint64(n))
						progress(float32(done) / float32(totalBytes))
					}
				}
				return nil
			} else if strings.HasSuffix(t.Name, "_exps.bias") && t.Kind == 30 && tts[0]._type == 0 {
				// source is bf16, target is ggml fp32

				// data is bf16 but we need to convert to fp32
				bts := make([]byte, 128*format.KibiByte)
				var e uint64
				for e < t.Elements() {
					// Stop if either the parent context has been canceled or if any of the other tensors returned an error
					if err := ctx.Err(); err != nil {
						return err
					}
					n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Elements()-e)*2)])
					if err != nil {
						slog.Warn("file read error", "file", b.modelPath, "error", err)
						return err
					}
					fp32 := ConvertToF32(bts, uint32(fsggml.TensorTypeBF16), uint64(n/2))

					for _, tt := range tts {
						C.ggml_backend_tensor_set(tt, unsafe.Pointer(&fp32[0]), C.size_t(e*4), C.size_t(n*2))
					}
					e += uint64(n / 2)
					if progress != nil {
						done := doneBytes.Add(uint64(n))
						progress(float32(done) / float32(totalBytes))
					}
				}
				return nil
			}

			bts := make([]byte, 128*format.KibiByte)

			var s uint64
			for s < t.Size() {
				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
				if err := ctx.Err(); err != nil {
					return err
				}

				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
					slog.Warn("file read error", "file", b.modelPath, "error", err)
					return err
				}

				for _, tt := range tts {
					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
				}

				s += uint64(n)

				if progress != nil {
					done := doneBytes.Add(uint64(n))
					progress(float32(done) / float32(totalBytes))
				}
			}

			return nil
		})
	}

	// Cleanup any backend state from devices that we didn't end up using
nextDevice:
	for _, d := range append(gpus, append(accels, cpus...)...) {
		for _, backend := range b.schedBackends {
			if d == C.ggml_backend_get_device(backend) {
				continue nextDevice
			}
		}

		C.ggml_backend_dev_reset(d)
	}

	if err := g.Wait(); err != nil {
		return err
	}

	return nil
}

func (b *Backend) BackendMemory() ml.BackendMemory {
	return *b.requiredMemory
}

func (b *Backend) Config() fs.Config {
	return b.meta.KV()
}

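// Get returns the weight tensor with the given name, or nil if no such tensor
// was loaded.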
func (b *Backend) Get(name string) ml.Tensor {
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
	}

	return nil
}

func (b *Backend) NewContext() ml.Context {
	return b.NewContextSize(b.maxGraphNodes)
}

func (b *Backend) NewContextSize(n int) ml.Context {
	if n > b.maxGraphNodes {
		panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
	}

	var allocatedBuffers []C.ggml_backend_buffer_t

	return &Context{
		b:             b,
		maxGraphNodes: n,
		ctx: C.ggml_init(C.struct_ggml_init_params{
			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
			no_alloc: true,
		}),
		allocatedBuffers: &allocatedBuffers,
		layer:            -1,
	}
}

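// CacheConfig reports the cache padding and mask layout the backend expects,
// which differ depending on whether flash attention is enabled.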
func (b *Backend) CacheConfig() ml.CacheConfig {
	if b.flashAttention {
		return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
	} else {
		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
	}
}

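// Context implements ml.Context on top of a ggml context and compute graph.
// Tensors created through it are allocated from buft, and cache allocations
// are attributed to layer.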
type Context struct {
	b *Backend

	ctx   *C.struct_ggml_context
	graph *C.struct_ggml_cgraph

	// buft is the buffer type used for new tensors
	buft C.ggml_backend_buffer_type_t

	// allocatedBuffers are buffers for tensors that we have allocated in this context
	// so that we can free them when we close the context
	allocatedBuffers *[]C.ggml_backend_buffer_t

	// maxGraphNodes is the maximum allowed number of graph nodes in this context
	maxGraphNodes int

	// layer is the graph layer that this context is allocating for - assumed to be cache
	layer int
}

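// Input returns a context that allocates new tensors using the input (cpu)
// buffer type.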
func (c *Context) Input() ml.Context {
	if c.b.input != nil {
		return &Context{
			b:                c.b,
			ctx:              c.ctx,
			buft:             c.b.input,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
			layer:            -1,
		}
	}

	return c
}

func (c *Context) Layer(i int) ml.Context {
	if layer, ok := c.b.layers[i]; ok {
		return &Context{
			b:                c.b,
			ctx:              c.ctx,
			buft:             layer.bt,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
			layer:            i,
		}
	}

	return c
}

func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
	if c.graph == nil {
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
	}

	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
}

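// Compute runs the built graph on the scheduler. Output tensors are
// synchronized lazily: the first read of a tensor's data blocks until the
// scheduler has finished.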
func (c *Context) Compute(tensors ...ml.Tensor) {
	c.ComputeWithNotify(nil, tensors...)
}

func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) {
	c.b.schedMu.Lock()
	defer c.b.schedMu.Unlock()
	if cb != nil {
		go cb()
	}
	if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
		panic(fmt.Errorf("error computing ggml graph: %v", status))
	}
	C.ggml_backend_sched_reset(c.b.sched)

	needSync := true
	sync := func() {
		if needSync {
			C.ggml_backend_sched_synchronize(c.b.sched)
			needSync = false
		}
	}

	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
		}
	}
}

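// Reserve performs a dry-run allocation of the current graph so the
// worst-case graph memory can be recorded per buffer type, panicking with
// ml.ErrNoMem if reservation fails.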
func (c *Context) Reserve() {
	reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)

	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))

	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
	for _, bt := range c.b.schedBufts {
		c.b.btDeviceMemory[bt].Graph = 0
	}

	for i := range c.b.schedBackends {
		bufferSize := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
		c.b.btDeviceMemory[c.b.schedBufts[i]].Graph += uint64(bufferSize)

		logutil.Trace("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
			"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferSize)))
	}

	if !reserved {
		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
	}
}

func (c *Context) MaxGraphNodes() int {
	return c.maxGraphNodes
}

func shapeToGGML(shape []int) *C.int64_t {
	sh := make([]C.int64_t, len(shape))
	for i, s := range shape {
		sh[i] = C.int64_t(s)
	}

	return &sh[0]
}
func pad(length, pad C.size_t) C.size_t {
	return ((length + pad - 1) / pad) * pad
}

func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
	if c.buft == nil {
		panic("set Input or Layer before creating tensors")
	}

	cdtype := ggmlDType(dtype)

	if len(shape) < 1 || shape[0] == 0 {
		var shape C.int64_t = 0
		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
	} else if len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))

	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
	if c.layer >= 0 {
		c.b.btDeviceMemory[c.buft].Cache[c.layer] += uint64(size)
	}

	if b == nil {
		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
	}

	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
	return &Tensor{b: c.b, t: t}
}

func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	return c.newTensor(dtype, shape)
}

func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
	t := c.newTensor(dtype, shape)
	if c.b.allocMemory {
		C.ggml_set_zero(t.(*Tensor).t)
	}
	return t
}

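// checkShape panics unless the element count of s equals the product of
// shape; empty slices are always accepted.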
func checkShape[S ~[]E, E any](s S, shape ...int) {
	n := len(s)

	if n == 0 {
		return
	}

	for _, v := range shape {
		n /= v
	}

	if n != 1 {
		panic(fmt.Errorf("invalid shape: %v", shape))
	}
}

func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
	checkShape(s, shape...)

	t := c.newTensor(ml.DTypeF32, shape)

	if c.b.allocMemory && len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t
}

func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
	checkShape(s, shape...)

	t := c.newTensor(ml.DTypeI32, shape)

	if c.b.allocMemory && len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t
}

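// Arange returns a tensor holding the half-open range [start, stop) advanced
// by step. Only float32 and int32 results are supported.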
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
	switch dtype {
	case ml.DTypeF32:
		// ggml_arange creates a float32 tensor
		return &Tensor{
			b: c.b,
			t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
		}
	case ml.DTypeI32:
		// ggml_cast does not support float32 to int32 conversion
		arange := make([]int32, 0, int((stop-start)/step))
		for i := start; i < stop; i += step {
			arange = append(arange, int32(i))
		}

		return c.Input().FromIntSlice(arange, len(arange))
	default:
		panic("unsupported dtype for arange")
	}
}

func (c *Context) Close() {
	if c != nil {
		for _, b := range *c.allocatedBuffers {
			C.ggml_backend_buffer_free(b)
		}
		*c.allocatedBuffers = nil

		C.ggml_free(c.ctx)
	}
}

type Tensor struct {
	b    *Backend
	t    *C.struct_ggml_tensor
	sync func()
}

func (t *Tensor) LogValue() slog.Value {
	return slog.GroupValue(
		slog.String("name", C.GoString(C.ggml_get_name(t.t))),
		slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
		slog.Any("shape", t.Shape()),
	)
}

func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
}

func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
}

func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

func (t *Tensor) SetValueFromIntSlice(s []int32) {
	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.t))
	}
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
	case C.GGML_TYPE_Q8_0:
		return ml.DTypeQ80
	case C.GGML_TYPE_Q4_0:
		return ml.DTypeQ40
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	case C.GGML_TYPE_MXFP4:
		return ml.DTypeMXFP4
	default:
		return ml.DTypeOther
	}
}

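// ggmlDType maps an ml.DType to the corresponding ggml type constant,
// panicking on unsupported types.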
func ggmlDType(dtype ml.DType) uint32 {
	switch dtype {
	case ml.DTypeF32:
		return C.GGML_TYPE_F32
	case ml.DTypeF16:
		return C.GGML_TYPE_F16
	case ml.DTypeQ80:
		return C.GGML_TYPE_Q8_0
	case ml.DTypeQ40:
		return C.GGML_TYPE_Q4_0
	case ml.DTypeI32:
		return C.GGML_TYPE_I32
	case ml.DTypeMXFP4:
		return C.GGML_TYPE_MXFP4
	default:
		panic("unsupported dtype")
	}
}

func (t *Tensor) Cast(ctx ml.Context, dtype ml.DType) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cast(ctx.(*Context).ctx, t.t, ggmlDType(dtype)),
	}
}

func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_neg(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Sub(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sub(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

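// Repeat tiles t n times along dim by building a template tensor of the
// target shape and broadcasting into it with ggml_repeat.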
func (t *Tensor) Repeat(ctx ml.Context, dim, n int) ml.Tensor {
	if dim < 0 || dim >= C.GGML_MAX_DIMS {
		panic("invalid dimension")
	}

	shape := make([]C.int64_t, C.GGML_MAX_DIMS)
	for i := range C.GGML_MAX_DIMS {
		if i == dim {
			shape[i] = C.int64_t(t.Dim(i) * n)
		} else {
			shape[i] = C.int64_t(t.Dim(i))
		}
	}

	tmpl := C.ggml_new_tensor(ctx.(*Context).ctx, t.t._type, C.int(len(shape)), unsafe.SliceData(shape))
	return &Tensor{
		b: t.b,
		t: C.ggml_repeat(ctx.(*Context).ctx, t.t, tmpl),
	}
}

func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) > 0 {
		return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
	}

	return t
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context, shape ...int) ml.Tensor {
	switch len(shape) {
	case 0:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont(ctx.(*Context).ctx, t.t),
		}
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
		b: t.b,
		t: mul,
	}
}

func (t *Tensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul_mat_id(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, ids.(*Tensor).t),
	}
}

func (t *Tensor) AddID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_add_id(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, ids.(*Tensor).t),
	}
}

func (t *Tensor) L2Norm(ctx ml.Context, eps float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_l2_norm(ctx.(*Context).ctx, t.t, C.float(eps)),
	}
}

func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
	tt := C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
		if b != nil {
			tt = C.ggml_add(ctx.(*Context).ctx, tt, b.(*Tensor).t)
		}
	}

	return &Tensor{b: t.b, t: tt}
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
	tt := C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
	}

	return &Tensor{b: t.b, t: tt}
}

func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	} else if shape[3] != 0 {
		panic("cuda does not support 4d tensors")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Sin(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sin(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Cos(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cos(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sigmoid_inplace(ctx.(*Context).ctx, t.t),
	}
}

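// View returns a strided view of t starting at the given byte offset. The
// variadic arguments interleave dimension sizes and byte strides:
// dim0[, stride0, dim1[, stride1, dim2[, stride2, dim3]]].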
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
	// Default options
	opts := rope.Options{
		Factors:               &Tensor{},
		OriginalContextLength: 131072,
		ExtrapolationFactor:   0.,
		AttentionFactor:       1.,
		BetaFast:              32.,
		BetaSlow:              1.,
	}

	// Apply any provided options
	for _, option := range options {
		option(&opts)
	}

	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_rope_ext(
			ctx.(*Context).ctx,
			dequant,
			positions.(*Tensor).t,
			opts.Factors.(*Tensor).t,
			C.int(ropeDim),
			C.int(opts.Type),
			C.int(opts.OriginalContextLength),
			C.float(ropeBase),
			C.float(ropeScale),
			C.float(opts.ExtrapolationFactor),
			C.float(opts.AttentionFactor),
			C.float(opts.BetaFast),
			C.float(opts.BetaSlow),
		),
	}
}

func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_im2col(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
	}
}

func (t *Tensor) GELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
	if len(t2) > 0 {
		return &Tensor{
			b: t.b,
			t: C.ggml_geglu_split(ctx.(*Context).ctx, t.t, t2[0].(*Tensor).t),
		}
	}
	return &Tensor{
		b: t.b,
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
	if len(t2) > 0 {
		return &Tensor{
			b: t.b,
			t: C.ggml_swiglu_split(ctx.(*Context).ctx, t.t, t2[0].(*Tensor).t),
		}
	}
	return &Tensor{
		b: t.b,
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) RELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
	if len(t2) > 0 {
		return &Tensor{
			b: t.b,
			t: C.ggml_reglu_split(ctx.(*Context).ctx, t.t, t2[0].(*Tensor).t),
		}
	}
	return &Tensor{
		b: t.b,
		t: C.ggml_relu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILUAlphaLimit(ctx ml.Context, up ml.Tensor, alpha, limit float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_swiglu_oai(ctx.(*Context).ctx, t.t, up.(*Tensor).t, C.float(alpha), C.float(limit)),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}

func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_pool_2d(ctx.(*Context).ctx, t.t, C.GGML_OP_POOL_AVG, C.int(k), C.int(k), C.int(s), C.int(s), C.float(p), C.float(p)),
	}
}

func (t *Tensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
	var tt *C.struct_ggml_tensor
	switch len(strides) {
	case 0:
		tt = C.ggml_set_1d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset))
	case 1:
		tt = C.ggml_set_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset), C.size_t(strides[0]))
	default:
		panic("unsupported number of dimensions")
	}

	return &Tensor{b: t.b, t: tt}
}

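// ScaledDotProductAttention computes attention between t (the query), key,
// and value with an optional mask and attention sinks. It uses ggml's fused
// flash attention kernel when enabled and falls back to an explicit
// mulmat/softmax pipeline otherwise.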
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask, sinks ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if t.b.flashAttention {
		value = value.Permute(ctx, 0, 2, 1, 3)

		kqv := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
		if sinks != nil {
			C.ggml_flash_attn_ext_add_sinks(kqv, sinks.(*Tensor).t)
		}
		C.ggml_flash_attn_ext_set_prec(kqv, C.GGML_PREC_F32)
		return &Tensor{b: t.b, t: kqv}
	} else {
		kq := key.MulmatFullPrec(ctx, query)
		kq = &Tensor{
			b: t.b,
			t: C.ggml_soft_max_ext(ctx.(*Context).ctx, kq.(*Tensor).t, kqMask, C.float(scale), 0),
		}
		if sinks != nil {
			C.ggml_soft_max_add_sinks(kq.(*Tensor).t, sinks.(*Tensor).t)
		}

		kqv := value.Mulmat(ctx, kq)
		return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}
}

func (t *Tensor) Duplicate(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_dup(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
	}
}

func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
	}
}

func (t *Tensor) Mean(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mean(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Variance(ctx ml.Context) ml.Tensor {
	return t.Add(ctx, t.Mean(ctx).Scale(ctx, -1)).
		Sqr(ctx).
		SumRows(ctx).
		Scale(ctx, 1/float64(t.Dim(0)))
}

func (t *Tensor) Stddev(ctx ml.Context) ml.Tensor {
	return t.Variance(ctx).Sqrt(ctx)
}

func (t *Tensor) Sqr(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sqr(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Sqrt(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sqrt(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_clamp(ctx.(*Context).ctx, t.t, C.float(min), C.float(max)),
	}
}

func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
	// Unchecked to handle quantized types
	t := c.newTensor(dtype, shape)
	if c.b.allocMemory && len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t
}