ggml.go 40.6 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
package ggml

3
4
5
6
7
8
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
Michael Yang's avatar
Michael Yang committed
9
10
11
import "C"

import (
12
	"context"
Jesse Gross's avatar
Jesse Gross committed
13
	"errors"
Michael Yang's avatar
Michael Yang committed
14
15
16
	"fmt"
	"io"
	"log/slog"
17
	"maps"
Michael Yang's avatar
Michael Yang committed
18
	"os"
19
	"runtime"
20
21
22
	"slices"
	"strconv"
	"strings"
Jesse Gross's avatar
Jesse Gross committed
23
	"sync"
24
	"sync/atomic"
25
	"unicode"
Michael Yang's avatar
Michael Yang committed
26
27
28
	"unsafe"

	"github.com/ollama/ollama/format"
29
30
	"github.com/ollama/ollama/fs"
	fsggml "github.com/ollama/ollama/fs/ggml"
31
	"github.com/ollama/ollama/logutil"
Michael Yang's avatar
Michael Yang committed
32
	"github.com/ollama/ollama/ml"
33
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
34
	"github.com/ollama/ollama/ml/nn/rope"
Michael Yang's avatar
Michael Yang committed
35
36
37
	"golang.org/x/sync/errgroup"
)

Jesse Gross's avatar
Jesse Gross committed
38
39
40
41
42
43
// Device handles discovered by initDevices, partitioned by kind. cpus holds
// at most one entry (only the first CPU device is used); backends maps every
// discovered device to its initialized backend instance.
var (
	cpus, accels, gpus []C.ggml_backend_dev_t
	backends           map[C.ggml_backend_dev_t]C.ggml_backend_t
)

// initDevices enumerates all ggml backend devices exactly once, sorts them
// into the package-level cpus/accels/gpus slices, and initializes a backend
// for each device in the backends map.
var initDevices = sync.OnceFunc(func() {
	ggml.OnceLoad()

	backends = make(map[C.ggml_backend_dev_t]C.ggml_backend_t)
	for i := range C.ggml_backend_dev_count() {
		d := C.ggml_backend_dev_get(i)

		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
			if len(cpus) == 0 {
				// only the first cpu device should be used
				cpus = append(cpus, d)
			}
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
			gpus = append(gpus, d)
		}

		// note: a backend is initialized even for CPU devices beyond the
		// first, which are otherwise ignored
		backends[d] = C.ggml_backend_dev_init(d, nil)
	}
})
Michael Yang's avatar
Michael Yang committed
65

Jesse Gross's avatar
Jesse Gross committed
66
67
68
69
70
// layerDevice pairs the device a model layer was assigned to with the
// buffer type used to allocate tensors on that device.
type layerDevice struct {
	d  C.ggml_backend_dev_t
	bt C.ggml_backend_buffer_type_t
}

Michael Yang's avatar
Michael Yang committed
71
// Backend implements ml.Backend on top of the ggml C library. It owns the
// compute-graph scheduler, the weight buffers for one loaded model, and the
// per-device memory accounting gathered while building the model.
type Backend struct {
	// modelPath is the location of the model data
	modelPath string

	meta *fsggml.GGML

	// allocMemory means that memory should be allocated for tensors and not
	// just a dry run
	allocMemory bool

	// tensorLoadTargets maps from the name of the tensor in the file
	// to the name that is used by the model definition
	tensorLoadTargets map[string][]string

	schedMu       sync.Mutex // Only one Compute can run at a time
	sched         C.ggml_backend_sched_t
	schedBackends []C.ggml_backend_t
	schedBufts    []C.ggml_backend_buffer_type_t

	tensors map[string]*C.struct_ggml_tensor

	// input is the backend buffer type used for inputs
	input C.ggml_backend_buffer_type_t

	// output is the backend device used for outputs
	output C.ggml_backend_dev_t

	// layers is the backend used for repeating layers
	layers map[int]layerDevice

	// requiredMemory is the cumulative memory allocations needed by the backend
	requiredMemory *ml.BackendMemory

	// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
	btDeviceMemory map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory

	flashAttention bool

	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
	maxGraphNodes int

	// weightBuffers are the GGML contexts and buffers for allocating weights
	weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
}

Jesse Gross's avatar
Jesse Gross committed
116
117
// once guards the one-time model summary log emitted by New.
var once sync.Once

118
119
120
121
122
123
124
125
// New constructs a ggml-backed ml.Backend from the GGUF model at modelPath.
// It decodes the model metadata, assigns every layer to a device according to
// params.GPULayers (falling back to CPU), creates weight tensors in per
// buffer-type ggml contexts — allocating real memory only when
// params.AllocMemory is set — and builds the graph scheduler over the devices
// that received layers. All memory is tracked in requiredMemory. It panics
// with ml.ErrNoMem if a weight buffer cannot be allocated.
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
	r, err := os.Open(modelPath)
	if err != nil {
		return nil, err
	}
	defer r.Close()

	meta, err := fsggml.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	// log a one-time summary of the model being loaded
	once.Do(func() {
		slog.Info(
			"",
			"architecture", meta.KV().Architecture(),
			"file_type", meta.KV().FileType(),
			"name", meta.KV().String("general.name"),
			"description", meta.KV().String("general.description"),
			"num_tensors", len(meta.Tensors().Items()),
			"num_key_values", len(meta.KV()),
		)
	})

	initDevices()

	var requiredMemory ml.BackendMemory
	btDeviceMemory := make(map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory)

	// deviceBufferType is a device plus the ordered list of buffer types to
	// try when placing a tensor on it (device-preferred first, CPU fallback last)
	type deviceBufferType struct {
		d   C.ggml_backend_dev_t
		bts []C.ggml_backend_buffer_type_t
	}

	blocks := int(meta.KV().BlockCount())

	// create list of buffer types for the cpu
	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			bt := C.ggml_backend_dev_buffer_type(d)
			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, bt)
			C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))

			// accel buffer types are accounted against CPU memory
			btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
		}
	}

	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
	var props C.struct_ggml_backend_dev_props
	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
	requiredMemory.CPU.ID = C.GoString(props.id)
	// blocks+1: one slot per repeating layer plus one for the output layer
	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)

	// create list of buffer types for each gpu
	var gpuDeviceBufferTypes []deviceBufferType
	requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
	for i, d := range gpus {
		bt := C.ggml_backend_dev_buffer_type(d)
		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
			d:   d,
			bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
		})
		C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))

		btDeviceMemory[bt] = &requiredMemory.GPUs[i]
		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
		var props C.struct_ggml_backend_dev_props
		C.ggml_backend_dev_get_props(d, &props)
		requiredMemory.GPUs[i].ID = C.GoString(props.id)
		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
	}

	// inputs always use cpu
	input := cpuDeviceBufferType

	// assignLayer returns the device (and fallback buffer types) for a layer
	// index, consulting params.GPULayers; anything unassigned goes to CPU
	assignLayer := func(layer int) deviceBufferType {
		for _, p := range params.GPULayers {
			for _, l := range p.Layers {
				if l == layer {
					for i := range requiredMemory.GPUs {
						if requiredMemory.GPUs[i].ID == p.ID {
							return gpuDeviceBufferTypes[i]
						}
					}

					return cpuDeviceBufferType
				}
			}
		}

		return cpuDeviceBufferType
	}

	// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
	layers := make([]deviceBufferType, blocks)
	for i := range layers {
		layers[i] = assignLayer(i)
	}

	// outputs are assigned iff allowed by splits and configured number of gpu layers
	output := assignLayer(blocks)

	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
	// each layer has at most 2 extra tensors for rope operations
	maxTensors += blocks * 2

	type tensor struct {
		source *fsggml.Tensor
		target string
	}

	// some tensors are mapped to different names so keep a list
	targets := make(map[string][]string)

	// contexts are shared by tensors of the same buffer type
	ctxs := make(map[C.ggml_backend_buffer_type_t]*C.struct_ggml_context)
	// createTensor creates (or reuses) the ggml tensor for t on the first
	// buffer type in bts, recording its padded size against layer's memory
	// accounting (layer == -1 means input weights)
	createTensor := func(t tensor, bts []C.ggml_backend_buffer_type_t, layer int) *C.struct_ggml_tensor {
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}

			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
			defer C.free(unsafe.Pointer(cname))
			// a tensor with this name already exists in the context: reuse it
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

			kind := t.source.Kind
			if t.source.Kind == 4 {
				// transform raw mxfp4 stream to ggml mxfp4 format
				kind = 39
			} else if t.source.Kind == uint32(fsggml.TensorTypeBF16) && strings.HasSuffix(t.source.Name, "_exps.bias") {
				// transform "_exps.bias" from bf16 to fp32; add_ids only supports fp32 tensors
				kind = uint32(fsggml.TensorTypeF32)
			}

			tt := C.ggml_new_tensor(ctxs[bt], kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
			C.ggml_set_name(tt, cname)

			slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))

			size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
			if layer == -1 {
				// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
				if params.AllocMemory {
					requiredMemory.InputWeights.Status = ml.Allocated
				}
				requiredMemory.InputWeights.Size += uint64(size)
			} else {
				btDeviceMemory[bt].Weights[layer].Size += uint64(size)
			}

			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
	}

	// contains reports whether any of parts appears as a dot-separated
	// component of s
	contains := func(s string, parts ...string) bool {
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
	}

	for _, t := range meta.Tensors().Items() {
		switch {
		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
			createTensor(tensor{source: t}, input.bts, -1)
			// models without a dedicated output layer reuse the token
			// embedding weights as the output projection
			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
				createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
			}
		case contains(t.Name, "cls", "output", "output_norm",
			"altup_proj", "altup_unembd_proj",
			"per_layer_token_embd", "per_layer_model_proj", "per_layer_proj_norm"):
			createTensor(tensor{source: t}, output.bts, blocks)
		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
			// TODO: assign vision tensors to the gpu if possible
			createTensor(tensor{source: t}, output.bts, blocks)
		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
			// these tensors should be repeated per layer
			for i, layer := range layers {
				createTensor(tensor{
					source: t,
					target: "blk." + strconv.Itoa(i) + "." + t.Name,
				}, layer.bts, i)
			}
		default:
			// derive the layer index from the first number embedded in the
			// tensor name, e.g. "blk.12.attn_q.weight" -> 12
			layerIndex := -1
			if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
				if i, err := strconv.Atoi(fields[0]); err == nil {
					layerIndex = i
				}
			}

			if layerIndex >= 0 {
				createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
			} else {
				// load all other tensors on the cpu
				createTensor(tensor{source: t}, input.bts, -1)
			}
		}
	}

	// allocate buffers for each context
	bbs := make(map[*C.struct_ggml_context]C.ggml_backend_buffer_t, len(ctxs))
	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
		if params.AllocMemory {
			// mark every non-empty weight slot on this buffer type as
			// allocated or failed based on whether the buffer was created
			for i := range btDeviceMemory[bt].Weights {
				if btDeviceMemory[bt].Weights[i].Size != 0 {
					if b != nil {
						btDeviceMemory[bt].Weights[i].Status = ml.Allocated
					} else {
						btDeviceMemory[bt].Weights[i].Status = ml.Failed
					}
				}
			}
		}

		if b == nil {
			// allocation failed: release everything created so far before
			// reporting the memory requirements via panic
			for _, b := range bbs {
				C.ggml_backend_buffer_free(b)
			}

			for _, ctx := range ctxs {
				C.ggml_free(ctx)
			}

			panic(ml.ErrNoMem{BackendMemory: requiredMemory})
		}

		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
		bbs[c] = b
	}

	for bs := range maps.Values(bbs) {
		slog.Log(context.TODO(), logutil.LevelTrace, "model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
			"size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
	}

	// map tensor names to tensors for easy lookup later
	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}

	// map devices to backend buffer types so new tensors can be assigned to the correct device
	deviceBufferTypes := make(map[C.ggml_backend_dev_t]C.ggml_backend_buffer_type_t)

	// create backends and buffer types used for the compute graph scheduler
	var schedBackends []C.ggml_backend_t
	var schedBufts []C.ggml_backend_buffer_type_t
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := backends[d]
		bt := C.ggml_backend_get_default_buffer_type(b)

		// Always include CPU as a fallback but otherwise, just use the devices where we assigned layers
		if !slices.Contains(cpuDeviceBufferType.bts, bt) {
			if c, ok := ctxs[bt]; !ok || C.ggml_get_first_tensor(c) == nil {
				continue
			}
		}

		deviceBufferTypes[d] = bt

		schedBackends = append(schedBackends, b)
		schedBufts = append(schedBufts, bt)

		if C.ggml_backend_is_cpu(b) {
			// set number of threads for cpu backend
			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
		}
	}

	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
	return &Backend{
		modelPath:         modelPath,
		allocMemory:       params.AllocMemory,
		flashAttention:    params.FlashAttention,
		meta:              meta,
		tensorLoadTargets: targets,
		tensors:           tensors,
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
			C.int(len(schedBackends)),
			C.size_t(maxGraphNodes),
			C._Bool(false),
			C._Bool(false),
		),
		schedBackends: schedBackends,
		schedBufts:    schedBufts,
		input:         deviceBufferTypes[input.d],
		output:        output.d,
		layers: func() map[int]layerDevice {
			m := make(map[int]layerDevice)
			for i, layer := range layers {
				m[i] = layerDevice{
					d:  layer.d,
					bt: deviceBufferTypes[layer.d],
				}
			}
			return m
		}(),
		requiredMemory: &requiredMemory,
		btDeviceMemory: btDeviceMemory,
		maxGraphNodes:  maxGraphNodes,
		weightBuffers:  bbs,
	}, nil
}

// init registers this package as the "ggml" backend implementation.
func init() {
	ml.RegisterBackend("ggml", New)
}

Jesse Gross's avatar
Jesse Gross committed
462
463
464
465
466
467
468
469
470
471
472
473
474
// Close releases all native resources held by the backend: every weight
// buffer and its owning ggml context, then the graph scheduler. It is safe to
// call on a nil receiver.
func (b *Backend) Close() {
	if b == nil {
		return
	}

	for weightCtx, buf := range b.weightBuffers {
		C.ggml_backend_buffer_free(buf)
		C.ggml_free(weightCtx)
	}

	C.ggml_backend_sched_free(b.sched)
}

475
// Load reads tensor data from the model file and uploads it into the backend
// buffers allocated by New, reporting fractional progress (0..1) via the
// progress callback when non-nil. Tensors are loaded concurrently with one
// file handle per goroutine; mxfp4 and "_exps.bias" bf16 tensors are
// transcoded in flight to the formats their target tensors were created with.
// It returns an error if the backend was created without memory allocation or
// if any read or upload fails.
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
	if !b.allocMemory {
		return errors.New("cannot load model without memory allocation")
	}

	// Mimic llama runner logs summarizing layers and memory
	gpuLayers := 0
	for layer := range maps.Values(b.layers) {
		if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
			gpuLayers++
		}
	}
	slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))

	switch C.ggml_backend_dev_type(b.output) {
	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
		slog.Info("offloading output layer to CPU")
	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
		slog.Info("offloading output layer to GPU")
		gpuLayers++
	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
		slog.Info("offloading output layer to ACCEL")
	}
	slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))

	var doneBytes atomic.Uint64
	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset

	g, ctx := errgroup.WithContext(ctx)
	// bound concurrency to the number of usable CPUs
	g.SetLimit(runtime.GOMAXPROCS(0))
	for _, t := range b.meta.Tensors().Items() {
		t := t // capture per-iteration (pre-Go 1.22 loop semantics)
		g.Go(func() error {
			// resolve every target tensor this source tensor feeds
			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
			for i := range tts {
				target := b.tensorLoadTargets[t.Name][i]
				if target == "" {
					target = t.Name
				}

				tt, ok := b.tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}

				tts[i] = tt
			}

			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
			// seeking around within an FD shared between all goroutines.
			file, err := os.Open(b.modelPath)
			if err != nil {
				slog.Warn("file open error", "file", b.modelPath, "error", err)
				return err
			}
			defer file.Close()
			sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))

			if t.Kind == 4 && tts[0]._type == 39 {
				// source is mxfp4, target is ggml mxfp4

				const BS = 17                             // MXFP4 block size
				bts := make([]byte, 8*BS*format.KibiByte) // ~128k block aligned
				var s uint64
				var tmp [16]byte
				for s < t.Size() {
					// Stop if either the parent context has been canceled or if any of the other tensors returned an error
					if err := ctx.Err(); err != nil {
						return err
					}
					n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
					if err != nil {
						slog.Warn("file read error", "file", b.modelPath, "error", err)
						return err
					}
					// reorder the 16 packed nibble bytes of each block in place
					for j := range n / BS {
						for i := 1; i < 9; i++ {
							// transform a1b2c3 ... x7y8z9 -> 71xa82yb93zc
							a, b := bts[j*BS+i], bts[j*BS+i+8]
							tmp[2*(i-1)] = (a & 0x0F) | (b << 4)
							tmp[2*(i-1)+1] = (a >> 4) | (b & 0xF0)
						}
						copy(bts[j*BS+1:j*BS+17], tmp[:])
					}

					for _, tt := range tts {
						C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
					}

					s += uint64(n)

					if progress != nil {
						done := doneBytes.Add(uint64(n))
						progress(float32(done) / float32(totalBytes))
					}
				}
				return nil
			} else if strings.HasSuffix(t.Name, "_exps.bias") && t.Kind == 30 && tts[0]._type == 0 {
				// source is bf16, target is ggml fp32

				// data is bf16 but we need to convert to fp32
				bts := make([]byte, 128*format.KibiByte)
				var e uint64 // elements converted so far (2 bytes in, 4 bytes out)
				for e < t.Elements() {
					// Stop if either the parent context has been canceled or if any of the other tensors returned an error
					if err := ctx.Err(); err != nil {
						return err
					}
					n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Elements()-e)*2)])
					if err != nil {
						slog.Warn("file read error", "file", b.modelPath, "error", err)
						return err
					}
					fp32 := ConvertToF32(bts, uint32(fsggml.TensorTypeBF16), uint64(n/2))

					for _, tt := range tts {
						C.ggml_backend_tensor_set(tt, unsafe.Pointer(&fp32[0]), C.size_t(e*4), C.size_t(n*2))
					}
					e += uint64(n / 2)
					if progress != nil {
						done := doneBytes.Add(uint64(n))
						progress(float32(done) / float32(totalBytes))
					}
				}
				return nil
			}

			// default path: raw copy in ~128KiB chunks
			bts := make([]byte, 128*format.KibiByte)

			var s uint64
			for s < t.Size() {
				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
				if err := ctx.Err(); err != nil {
					return err
				}

				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
					slog.Warn("file read error", "file", b.modelPath, "error", err)
					return err
				}

				for _, tt := range tts {
					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
				}

				s += uint64(n)

				if progress != nil {
					done := doneBytes.Add(uint64(n))
					progress(float32(done) / float32(totalBytes))
				}
			}

			return nil
		})
	}

	// Cleanup any backend state from devices that we didn't end up using
nextDevice:
	for _, d := range append(gpus, append(accels, cpus...)...) {
		for _, backend := range b.schedBackends {
			if d == C.ggml_backend_get_device(backend) {
				continue nextDevice
			}
		}

		C.ggml_backend_dev_reset(d)
	}

	if err := g.Wait(); err != nil {
		return err
	}

	return nil
}

652
653
654
655
// BackendMemory returns a copy of the memory requirements and allocation
// status accumulated while building this backend.
func (b *Backend) BackendMemory() ml.BackendMemory {
	return *b.requiredMemory
}

656
// Config returns the model's key-value metadata.
func (b *Backend) Config() fs.Config {
	return b.meta.KV()
}

func (b *Backend) Get(name string) ml.Tensor {
661
662
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
Michael Yang's avatar
Michael Yang committed
663
664
665
666
667
668
	}

	return nil
}

// NewContext returns a context sized for the backend's maximum graph node count.
func (b *Backend) NewContext() ml.Context {
	return b.NewContextSize(b.maxGraphNodes)
}

// NewContextSize returns a context that can hold up to n graph nodes,
// panicking if n exceeds the limit the scheduler was created with. The
// returned context has no buffer type selected; call Input or Layer before
// creating tensors in it.
func (b *Backend) NewContextSize(n int) ml.Context {
	if n > b.maxGraphNodes {
		panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
	}

	var allocatedBuffers []C.ggml_backend_buffer_t

	return &Context{
		b:             b,
		maxGraphNodes: n,
		ctx: C.ggml_init(C.struct_ggml_init_params{
			// room for n tensors plus the graph bookkeeping itself
			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
			no_alloc: true,
		}),
		allocatedBuffers: &allocatedBuffers,
		layer:            -1,
	}
}

691
// CacheConfig returns the KV-cache layout parameters required by this
// backend: flash attention needs a larger cache padding, an f16 mask, and
// batch padding, while the default path uses a smaller padding with a
// permuted V tensor.
func (b *Backend) CacheConfig() ml.CacheConfig {
	// idiom: early return instead of else-after-return (staticcheck/indent-error-flow)
	if b.flashAttention {
		return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
	}
	return ml.CacheConfig{CachePadding: 32, PermutedV: true}
}

Michael Yang's avatar
Michael Yang committed
699
// Context is a scratch workspace for building and computing a single ggml
// compute graph against a Backend.
type Context struct {
	b *Backend

	ctx   *C.struct_ggml_context
	graph *C.struct_ggml_cgraph

	// buft is the buffer type used for new tensors
	buft C.ggml_backend_buffer_type_t

	// allocatedBuffers are buffers for tensors that we have allocated in this context
	// so that we can free them when we close the context
	allocatedBuffers *[]C.ggml_backend_buffer_t

	// maxGraphNodes is the maximum allowed number of graph nodes in this context
	maxGraphNodes int

	// layer is the graph layer that this context is allocating for - assumed to be cache
	layer int
}

719
func (c *Context) Input() ml.Context {
Michael Yang's avatar
Michael Yang committed
720
	if c.b.input != nil {
721
		return &Context{
722
723
724
725
726
			b:                c.b,
			ctx:              c.ctx,
			buft:             c.b.input,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
727
			layer:            -1,
728
729
730
		}
	}

731
	return c
732
733
}

734
func (c *Context) Layer(i int) ml.Context {
Jesse Gross's avatar
Jesse Gross committed
735
	if layer, ok := c.b.layers[i]; ok {
736
		return &Context{
737
738
			b:                c.b,
			ctx:              c.ctx,
Jesse Gross's avatar
Jesse Gross committed
739
			buft:             layer.bt,
740
741
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
742
			layer:            i,
743
744
745
		}
	}

746
	return c
747
748
}

749
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
Michael Yang's avatar
Michael Yang committed
750
	if c.graph == nil {
751
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
Michael Yang's avatar
Michael Yang committed
752
753
	}

754
755
756
757
758
	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
Michael Yang's avatar
Michael Yang committed
759
760
}

761
// Compute runs the built graph and marks the given tensors as readable. It is
// equivalent to ComputeWithNotify with no callback.
func (c *Context) Compute(tensors ...ml.Tensor) {
	c.ComputeWithNotify(nil, tensors...)
}

// ComputeWithNotify submits the graph for asynchronous computation while
// holding the backend's scheduler lock (only one Compute may run at a time).
// If cb is non-nil it is invoked in a new goroutine once the lock is held.
// Rather than blocking for results, each non-empty output tensor is given a
// shared sync function that synchronizes the scheduler at most once, on first
// read. Panics if graph computation does not start successfully.
func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) {
	c.b.schedMu.Lock()
	defer c.b.schedMu.Unlock()
	if cb != nil {
		go cb()
	}
	if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
		panic(fmt.Errorf("error computing ggml graph: %v", status))
	}
	C.ggml_backend_sched_reset(c.b.sched)

	// lazily synchronize exactly once, shared across all output tensors
	needSync := true
	sync := func() {
		if needSync {
			C.ggml_backend_sched_synchronize(c.b.sched)
			needSync = false
		}
	}

	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
		}
	}
}

791
792
// Reserve performs a dry-run allocation of the current graph across the
// scheduler's backends, recording per-buffer-type graph memory requirements
// in btDeviceMemory. It panics with ml.ErrNoMem if the reservation fails.
func (c *Context) Reserve() {
	reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)

	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))

	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
	for _, bt := range c.b.schedBufts {
		c.b.btDeviceMemory[bt].Graph = ml.Memory{}
	}

	for i := range c.b.schedBackends {
		bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])

		graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
		graph.Size += uint64(bufferStatus.size)
		if c.b.allocMemory {
			// once a buffer type is marked Failed it stays Failed even if a
			// later backend sharing it allocates successfully
			if bufferStatus.allocated && graph.Status != ml.Failed {
				graph.Status = ml.Allocated
			} else {
				graph.Status = ml.Failed
			}
		}

		slog.Log(context.TODO(), logutil.LevelTrace, "compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
			"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size)))
	}

	if !reserved {
		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
	}
}

823
// MaxGraphNodes returns the maximum number of graph nodes this context supports.
func (c *Context) MaxGraphNodes() int {
	return c.maxGraphNodes
}

827
828
829
// shapeToGGML converts a Go int shape into a C int64 array suitable for
// passing to ggml tensor constructors, returning a pointer to its first
// element. The caller must ensure shape is non-empty.
func shapeToGGML(shape []int) *C.int64_t {
	dims := make([]C.int64_t, 0, len(shape))
	for _, dim := range shape {
		dims = append(dims, C.int64_t(dim))
	}

	return &dims[0]
}

836
837
838
839
// pad rounds length up to the nearest multiple of align.
func pad(length, align C.size_t) C.size_t {
	return ((length + align - 1) / align) * align
}

840
func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
841
	if c.buft == nil {
842
		panic("set Input or Layer before creating tensors")
843
844
	}

845
	cdtype := ggmlDType(dtype)
Michael Yang's avatar
Michael Yang committed
846

Jesse Gross's avatar
Jesse Gross committed
847
	if len(shape) < 1 || shape[0] == 0 {
Michael Yang's avatar
Michael Yang committed
848
		var shape C.int64_t = 0
849
		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
Michael Yang's avatar
Michael Yang committed
850
	} else if len(shape) > 4 {
Michael Yang's avatar
Michael Yang committed
851
852
853
854
855
856
857
858
859
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

Michael Yang's avatar
Michael Yang committed
860
	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
861
	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
862

863
	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
864
865
866
867
	if c.layer >= 0 {
		cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]

		cache.Size += uint64(size)
Jesse Gross's avatar
Jesse Gross committed
868
869
870
871
872
873
		if c.b.allocMemory {
			if b != nil {
				cache.Status = ml.Allocated
			} else {
				cache.Status = ml.Failed
			}
874
875
876
		}
	}

877
	if b == nil {
878
		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
879
880
	}

881
	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
Michael Yang's avatar
Michael Yang committed
882
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
883
	return &Tensor{b: c.b, t: t}
884
885
}

886
// Empty creates an uninitialized tensor with the given dtype and shape.
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	return c.newTensor(dtype, shape)
}

890
// Zeros creates a zero-initialized tensor. The data is only written when the
// backend actually allocates memory (allocMemory); otherwise the tensor is
// purely for size accounting.
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
	t := c.newTensor(dtype, shape)
	if c.b.allocMemory {
		C.ggml_set_zero(t.(*Tensor).t)
	}
	return t
}

898
// checkShape panics unless len(s) is exactly the product of the given
// dimensions. An empty slice is accepted regardless of shape.
func checkShape[S ~[]E, E any](s S, shape ...int) {
	remaining := len(s)
	if remaining == 0 {
		return
	}

	// Successively divide out each dimension; a consistent shape leaves 1.
	for _, dim := range shape {
		remaining /= dim
	}

	if remaining != 1 {
		panic(fmt.Errorf("invalid shape: %v", shape))
	}
}

914
915
// FromFloatSlice creates an F32 tensor with the given shape and, when memory
// is actually allocated, copies s into it. Panics if len(s) does not match shape.
func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
	checkShape(s, shape...)

	t := c.newTensor(ml.DTypeF32, shape)

	if c.b.allocMemory && len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t
}

926
927
// FromIntSlice creates an I32 tensor with the given shape and, when memory
// is actually allocated, copies s into it. Panics if len(s) does not match shape.
func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
	checkShape(s, shape...)

	t := c.newTensor(ml.DTypeI32, shape)

	if c.b.allocMemory && len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t
}

Michael Yang's avatar
arange  
Michael Yang committed
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
// Arange returns a 1D tensor of values in [start, stop) spaced by step.
// F32 uses ggml's native arange; I32 is built host-side because ggml_cast
// cannot convert float32 to int32.
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
	switch dtype {
	case ml.DTypeF32:
		// ggml_arange creates a float32 tensor
		return &Tensor{
			b: c.b,
			t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
		}
	case ml.DTypeI32:
		// ggml_cast does not support float32 to int32 conversion
		arange := make([]int32, 0, int((stop-start)/step))
		for i := start; i < stop; i += step {
			arange = append(arange, int32(i))
		}

		return c.Input().FromIntSlice(arange, len(arange))
	default:
		panic("unsupported dtype for arange")
	}
}

Michael Yang's avatar
Michael Yang committed
959
960
// Close frees every backend buffer allocated through this context and the
// underlying ggml context. Safe to call on a nil receiver.
func (c *Context) Close() {
	if c != nil {
		for _, b := range *c.allocatedBuffers {
			C.ggml_backend_buffer_free(b)
		}
		*c.allocatedBuffers = nil

		C.ggml_free(c.ctx)
	}
}

// Tensor wraps a ggml tensor together with the backend that owns it.
type Tensor struct {
	b *Backend
	t *C.struct_ggml_tensor
	// sync, when non-nil, flushes pending computation so the tensor's data
	// can be read back (see Bytes/Floats).
	sync func()
}

// LogValue implements slog.LogValuer, exposing the tensor's name, type, and
// shape for structured logging.
func (t *Tensor) LogValue() slog.Value {
	return slog.GroupValue(
		slog.String("name", C.GoString(C.ggml_get_name(t.t))),
		slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
		slog.Any("shape", t.Shape()),
	)
}

984
985
// Dim returns the number of elements in dimension n.
func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
}

988
989
// Stride returns the stride of dimension n in bytes.
func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
}

992
993
// Shape returns the tensor's dimensions, one entry per ggml dimension.
func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

1001
1002
1003
1004
1005
1006
1007
1008
1009
// Bytes returns the tensor's raw data as bytes, synchronizing pending
// computation first. Returns nil if the tensor has no sync hook (no readable data).
func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

1012
1013
1014
1015
1016
1017
// Floats returns the tensor's data as float32 values, synchronizing pending
// computation first. Returns nil if the tensor has no sync hook.
func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

1023
1024
1025
1026
1027
1028
// SetValueFromIntSlice overwrites the tensor's data with s. The caller is
// responsible for s matching the tensor's size; no-op for an empty slice.
func (t *Tensor) SetValueFromIntSlice(s []int32) {
	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.t))
	}
}

Michael Yang's avatar
Michael Yang committed
1029
1030
1031
1032
// DType maps the underlying ggml type to the ml package's DType enum,
// returning DTypeOther for unmapped types. Inverse of ggmlDType for the
// supported set.
func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
	case C.GGML_TYPE_Q8_0:
		return ml.DTypeQ80
	case C.GGML_TYPE_Q4_0:
		return ml.DTypeQ40
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	case C.GGML_TYPE_MXFP4:
		return ml.DTypeMXFP4
	default:
		return ml.DTypeOther
	}
}

1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
// ggmlDType maps an ml.DType to the corresponding ggml type constant,
// panicking for types without a ggml equivalent. Inverse of Tensor.DType.
func ggmlDType(dtype ml.DType) uint32 {
	switch dtype {
	case ml.DTypeF32:
		return C.GGML_TYPE_F32
	case ml.DTypeF16:
		return C.GGML_TYPE_F16
	case ml.DTypeQ80:
		return C.GGML_TYPE_Q8_0
	case ml.DTypeQ40:
		return C.GGML_TYPE_Q4_0
	case ml.DTypeI32:
		return C.GGML_TYPE_I32
	case ml.DTypeMXFP4:
		return C.GGML_TYPE_MXFP4
	default:
		panic("unsupported dtype")
	}
}

// Cast returns t converted to the given dtype.
func (t *Tensor) Cast(ctx ml.Context, dtype ml.DType) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cast(ctx.(*Context).ctx, t.t, ggmlDType(dtype)),
	}
}

1074
1075
1076
1077
1078
1079
1080
// Neg returns the elementwise negation of t.
func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_neg(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
Michael Yang committed
1081
1082
// Add returns the elementwise sum t + t2.
func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

Michael Yang's avatar
Michael Yang committed
1088
1089
1090
1091
1092
1093
1094
// Sub returns the elementwise difference t - t2.
func (t *Tensor) Sub(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sub(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
// Repeat tiles t n times along the given dimension by building a template
// tensor with the enlarged shape and repeating t into it.
func (t *Tensor) Repeat(ctx ml.Context, dim, n int) ml.Tensor {
	if dim < 0 || dim >= C.GGML_MAX_DIMS {
		panic("invalid dimension")
	}

	shape := make([]C.int64_t, C.GGML_MAX_DIMS)
	for i := range C.GGML_MAX_DIMS {
		if i == dim {
			shape[i] = C.int64_t(t.Dim(i) * n)
		} else {
			shape[i] = C.int64_t(t.Dim(i))
		}
	}

	// tmpl only provides the target shape; ggml_repeat fills the result.
	tmpl := C.ggml_new_tensor(ctx.(*Context).ctx, t.t._type, C.int(len(shape)), unsafe.SliceData(shape))
	return &Tensor{
		b: t.b,
		t: C.ggml_repeat(ctx.(*Context).ctx, t.t, tmpl),
	}
}

Michael Yang's avatar
Michael Yang committed
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
// Stack concatenates t with s recursively along dim; with no extra tensors
// it returns t unchanged.
func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) > 0 {
		return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
	}

	return t
}

// Concat concatenates t and t2 along the given dimension.
func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

Michael Yang's avatar
Michael Yang committed
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
// Contiguous returns a contiguous copy of t, optionally reshaped to the
// given 1-4 dimensions; with no shape, the current shape is kept.
func (t *Tensor) Contiguous(ctx ml.Context, shape ...int) ml.Tensor {
	switch len(shape) {
	case 0:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont(ctx.(*Context).ctx, t.t),
		}
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

// Mul returns the elementwise product t * t2.
func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

1170
1171
1172
1173
1174
1175
1176
// Div returns the elementwise quotient t / t2.
func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

Michael Yang's avatar
Michael Yang committed
1177
1178
// Mulmat returns the matrix product of t and t2 (ggml_mul_mat semantics).
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

1184
1185
1186
1187
1188
// MulmatFullPrec is Mulmat with the operation forced to F32 precision.
func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
		b: t.b,
		t: mul,
	}
}

Michael Yang's avatar
llama4  
Michael Yang committed
1194
1195
1196
1197
1198
1199
1200
// MulmatID performs an indirect (expert-selected) matrix multiply, using ids
// to pick slices of t (ggml_mul_mat_id; used by mixture-of-experts layers).
func (t *Tensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul_mat_id(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, ids.(*Tensor).t),
	}
}

1201
1202
1203
1204
1205
1206
1207
// AddID performs an indirect add, selecting rows of t2 by ids (ggml_add_id).
func (t *Tensor) AddID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_add_id(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, ids.(*Tensor).t),
	}
}

Michael Yang's avatar
Michael Yang committed
1208
// LayerNorm normalizes t with epsilon eps, then optionally applies a scale w
// and bias b (bias only when a scale is given).
func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
	tt := C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
		if b != nil {
			tt = C.ggml_add(ctx.(*Context).ctx, tt, b.(*Tensor).t)
		}
	}

	return &Tensor{b: t.b, t: tt}
}

// RMSNorm applies RMS normalization with epsilon eps and an optional scale w.
func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
	tt := C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
	}

	return &Tensor{b: t.b, t: tt}
}

1229
// Pad grows each dimension of t by the corresponding amount in shape
// (4 values required; the 4th must be 0 — see the CUDA note below).
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	} else if shape[3] != 0 {
		panic("cuda does not support 4d tensors")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

// Permute reorders t's axes according to the given permutation (4 indices).
func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

// Rows gathers rows of t indexed by t2 (ggml_get_rows).
func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

// Copy copies t's data into t2, returning the copy node (ggml_cpy).
func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

1267
// Reshape returns a view of t with the given 1-4 dimensions; the total
// element count must be unchanged (enforced by ggml).
func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

// Scale multiplies every element of t by s.
func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

1301
1302
1303
1304
1305
1306
1307
// SumRows sums t along its first dimension (ggml_sum_rows).
func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
Michael Yang committed
1308
1309
// Softmax applies a row-wise softmax to t.
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
// Sin applies elementwise sine.
func (t *Tensor) Sin(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sin(ctx.(*Context).ctx, t.t),
	}
}

// Cos applies elementwise cosine.
func (t *Tensor) Cos(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cos(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
Michael Yang committed
1329
1330
// Tanh applies elementwise tanh. Note: uses the in-place ggml variant, so the
// result shares t's data.
func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
llama4  
Michael Yang committed
1336
1337
1338
1339
1340
1341
1342
// Sigmoid applies the logistic function elementwise (in-place ggml variant).
func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sigmoid_inplace(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
Michael Yang committed
1343
1344
1345
1346
// View returns a strided view into t starting at byte offset. The variadic
// arguments interleave sizes and byte strides: [ne0], [ne0, nb1, ne1],
// [ne0, nb1, ne1, nb2, ne2], or [ne0, nb1, ne1, nb2, ne2, nb3, ne3] —
// hence the 1/3/5/7 arities below.
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

1379
// RoPE applies rotary position embeddings to t at the given positions.
// ropeDim is the number of dimensions to rotate; ropeBase/ropeScale set the
// frequency base and scaling. Behavior is tuned via rope.Options functional
// options (type, original context length, YaRN-style factors).
func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
	// Default options
	opts := rope.Options{
		Factors:               &Tensor{},
		OriginalContextLength: 131072,
		ExtrapolationFactor:   0.,
		AttentionFactor:       1.,
		BetaFast:              32.,
		BetaSlow:              1.,
	}

	// Apply any provided options
	for _, option := range options {
		option(&opts)
	}

	// ggml_rope_ext does not operate on quantized data; dequantize to F32 first.
	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_rope_ext(
			ctx.(*Context).ctx,
			dequant,
			positions.(*Tensor).t,
			opts.Factors.(*Tensor).t,
			C.int(ropeDim),
			C.int(opts.Type),
			C.int(opts.OriginalContextLength),
			C.float(ropeBase),
			C.float(ropeScale),
			C.float(opts.ExtrapolationFactor),
			C.float(opts.AttentionFactor),
			C.float(opts.BetaFast),
			C.float(opts.BetaSlow),
		),
	}
}

1420
1421
1422
1423
1424
1425
1426
// IM2Col unfolds t2 into convolution columns for kernel t with the given
// strides (s), padding (p), and dilation (d); output is F32.
func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_im2col(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
	}
}

Michael Yang's avatar
Michael Yang committed
1427
1428
// GELU applies the GELU activation (in-place ggml variant).
func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
Michael Yang committed
1434
1435
1436
1437
1438
1439
1440
// QuickGELU applies ggml's fast GELU approximation (in-place variant).
func (t *Tensor) QuickGELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_gelu_quick_inplace(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
Michael Yang committed
1441
1442
// SILU applies the SiLU/swish activation (in-place ggml variant).
func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

Michael Yang's avatar
Michael Yang committed
1448
1449
1450
1451
1452
1453
1454
// RELU applies the ReLU activation (in-place ggml variant).
func (t *Tensor) RELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_relu_inplace(ctx.(*Context).ctx, t.t),
	}
}

1455
1456
1457
1458
1459
1460
1461
// SwiGLU applies the gated SwiGLU activation combining t (gate) with up,
// using ggml's OAI variant with the given alpha and clamp limit.
func (t *Tensor) SwiGLU(ctx ml.Context, up ml.Tensor, alpha, limit float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_swiglu_oai(ctx.(*Context).ctx, t.t, up.(*Tensor).t, C.float(alpha), C.float(limit)),
	}
}

Michael Yang's avatar
Michael Yang committed
1462
1463
// Conv2D performs a 2D convolution of input t2 with kernel t using strides
// (s0, s1), padding (p0, p1), and dilation (d0, d1).
func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}
1468

Michael Yang's avatar
Michael Yang committed
1469
// AvgPool2D applies square average pooling with kernel size k, stride s,
// and padding p (same values used for both spatial dimensions).
func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_pool_2d(ctx.(*Context).ctx, t.t, C.GGML_OP_POOL_AVG, C.int(k), C.int(k), C.int(s), C.int(s), C.float(p), C.float(p)),
	}
}

Michael Yang's avatar
Michael Yang committed
1476
1477
1478
1479
// Set writes t2 into t at the given byte offset; zero strides means a 1D set,
// one stride a 2D set. Other arities are unsupported.
func (t *Tensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
	var tt *C.struct_ggml_tensor
	switch len(strides) {
	case 0:
		tt = C.ggml_set_1d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset))
	case 1:
		tt = C.ggml_set_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset), C.size_t(strides[0]))
	default:
		panic("unsupported number of dimensions")
	}

	return &Tensor{b: t.b, t: tt}
}

1490
// ScaledDotProductAttention computes attention over t (query), key, and
// value with optional mask and attention sinks, using flash attention when
// the backend enables it and a masked-softmax matmul path otherwise.
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask, sinks ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	// Move the head dimension ahead of the sequence dimension for ggml.
	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if t.b.flashAttention {
		value = value.Permute(ctx, 0, 2, 1, 3)

		kqv := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
		if sinks != nil {
			C.ggml_flash_attn_ext_add_sinks(kqv, sinks.(*Tensor).t)
		}
		C.ggml_flash_attn_ext_set_prec(kqv, C.GGML_PREC_F32)
		return &Tensor{b: t.b, t: kqv}
	} else {
		// Fallback: explicit K^T Q, masked/scaled softmax, then V multiply.
		kq := key.MulmatFullPrec(ctx, query)
		kq = &Tensor{
			b: t.b,
			t: C.ggml_soft_max_ext(ctx.(*Context).ctx, kq.(*Tensor).t, kqMask, C.float(scale), 0),
		}
		if sinks != nil {
			C.ggml_soft_max_add_sinks(kq.(*Tensor).t, sinks.(*Tensor).t)
		}

		kqv := value.Mulmat(ctx, kq)
		return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}
}
1522
1523
1524
1525
1526
1527
1528

// Duplicate returns a copy of t with its own data (ggml_dup).
func (t *Tensor) Duplicate(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_dup(ctx.(*Context).ctx, t.t),
	}
}
Michael Yang's avatar
llama4  
Michael Yang committed
1529
1530
1531
1532
1533
1534
1535

// TopK returns the indices of the k largest values per row (ggml_top_k).
func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
	}
}
1536
1537
1538
1539
1540
1541
1542

// Argsort returns the indices that sort each row of t in ascending order.
func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
	}
}
Michael Yang's avatar
Michael Yang committed
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581

// Mean computes the row-wise mean of t (ggml_mean).
func (t *Tensor) Mean(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mean(ctx.(*Context).ctx, t.t),
	}
}

// Variance computes the population variance over the first dimension:
// mean((x - mean(x))^2).
func (t *Tensor) Variance(ctx ml.Context) ml.Tensor {
	return t.Add(ctx, t.Mean(ctx).Scale(ctx, -1)).
		Sqr(ctx).
		SumRows(ctx).
		Scale(ctx, 1/float64(t.Dim(0)))
}

// Stddev computes the standard deviation as the square root of Variance.
func (t *Tensor) Stddev(ctx ml.Context) ml.Tensor {
	return t.Variance(ctx).Sqrt(ctx)
}

// Sqr squares each element of t.
func (t *Tensor) Sqr(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sqr(ctx.(*Context).ctx, t.t),
	}
}

// Sqrt takes the elementwise square root of t.
func (t *Tensor) Sqrt(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sqrt(ctx.(*Context).ctx, t.t),
	}
}

// Clamp limits each element of t to the range [min, max].
func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_clamp(ctx.(*Context).ctx, t.t, C.float(min), C.float(max)),
	}
}
Michael Yang's avatar
Michael Yang committed
1582
1583
1584
1585

// FromBytes creates a tensor of the given dtype/shape from raw bytes,
// copying s in when memory is allocated. Unlike FromFloatSlice/FromIntSlice,
// the length is not validated against shape so quantized types can be loaded.
func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
	// Unchecked to handle quantized types
	t := c.newTensor(dtype, shape)
	if c.b.allocMemory && len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t
}