package ggml

// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
import "C"

import (
	"context"
	"fmt"
	"io"
	"log/slog"
	"maps"
	"os"
	"runtime"
	"slices"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"unicode"
	"unsafe"

	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs"
	fsggml "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/logutil"
	"github.com/ollama/ollama/ml"
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
	"github.com/ollama/ollama/ml/nn/rope"
	"golang.org/x/sync/errgroup"
)

var (
	cpus, accels, gpus []C.ggml_backend_dev_t
	backends           map[C.ggml_backend_dev_t]C.ggml_backend_t
)

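// initDevices enumerates the available ggml backend devices once, groups them
// into cpus, accels, and gpus, and initializes a backend for each device.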
var initDevices = sync.OnceFunc(func() {
	ggml.OnceLoad()

	backends = make(map[C.ggml_backend_dev_t]C.ggml_backend_t)
	for i := range C.ggml_backend_dev_count() {
		d := C.ggml_backend_dev_get(i)

		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
			if len(cpus) == 0 {
				// only the first cpu device should be used
				cpus = append(cpus, d)
			}
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
			gpus = append(gpus, d)
		}

		backends[d] = C.ggml_backend_dev_init(d, nil)
	}
})

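// Backend holds the decoded model metadata, the tensors loaded from the model
// file, the buffer types used for inputs and repeating layers, and the ggml
// scheduler that executes compute graphs.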
type Backend struct {
	// modelPath is the location of the model data
	modelPath string

	meta *fsggml.GGML

	// tensorLoadTargets maps from the name of the tensor in the file
	// to the name that is used by the model definition
	tensorLoadTargets map[string][]string

	sched         C.ggml_backend_sched_t
	schedBackends []C.ggml_backend_t
	schedBufts    []C.ggml_backend_buffer_type_t

	tensors map[string]*C.struct_ggml_tensor

	// input is the backend used for inputs
	input C.ggml_backend_buffer_type_t

	// layers is the backend used for repeating layers
	layers map[int]C.ggml_backend_buffer_type_t

	// requiredMemory is the cumulative memory allocations needed by the backend
	requiredMemory *ml.BackendMemory

	// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
	btDeviceMemory map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory

	flashAttention bool

	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
	maxGraphNodes int

	// weightBuffers are the GGML contexts and buffers for allocating weights
	weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
}

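// New decodes the GGUF metadata at modelPath, assigns every tensor to a CPU or
// GPU buffer type based on the requested number of GPU layers and tensor split,
// allocates the weight buffers, and builds the scheduler used for compute graphs.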
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
	r, err := os.Open(modelPath)
	if err != nil {
		return nil, err
	}
	defer r.Close()

	meta, err := fsggml.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	slog.Info(
		"",
		"architecture", meta.KV().Architecture(),
		"file_type", meta.KV().FileType(),
		"name", meta.KV().String("general.name"),
		"description", meta.KV().String("general.description"),
		"num_tensors", len(meta.Tensors().Items()),
		"num_key_values", len(meta.KV()),
	)

	initDevices()

	var requiredMemory ml.BackendMemory
	btDeviceMemory := make(map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory)

	type deviceBufferType struct {
		d   C.ggml_backend_dev_t
		bts []C.ggml_backend_buffer_type_t
	}

	blocks := int(meta.KV().BlockCount())

	// create list of buffer types for the cpu
	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
			btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
		}
	}

	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
	var props C.struct_ggml_backend_dev_props
	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
	requiredMemory.CPU.ID = C.GoString(props.id)
	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)

	// create list of buffer types for each gpu
	var gpuDeviceBufferTypes []deviceBufferType
	requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
	for i, d := range gpus {
		bt := C.ggml_backend_dev_buffer_type(d)
		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
			d:   d,
			bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
		})
		btDeviceMemory[bt] = &requiredMemory.GPUs[i]
		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
		var props C.struct_ggml_backend_dev_props
		C.ggml_backend_dev_get_props(d, &props)
		requiredMemory.GPUs[i].ID = C.GoString(props.id)
		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
	}

	useDefaultSplit := true
	for _, s := range params.TensorSplit {
		if s != 0 {
			useDefaultSplit = false
			break
		}
	}

	// calculate splits
	splits := make([]float32, len(gpus))
	if useDefaultSplit {
		// default: split on free memory
		for i := range splits {
			var free, total C.size_t
			C.ggml_backend_dev_memory(gpus[i], &free, &total)
			splits[i] = float32(free)
		}
	} else {
		splits = params.TensorSplit
	}

	var sum float32
	// cumulative sum of all splits
	for i := range splits {
		sum += splits[i]
		splits[i] = sum
	}

	// normalize splits
	for i := range splits {
		splits[i] /= sum
	}
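	// at this point splits holds cumulative fractions per gpu; e.g. (illustrative)
	// free memory of 8 GiB and 24 GiB yields [0.25, 1.0], so the first quarter of
	// the gpu layer range goes to the first device and the rest to the second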

	// inputs always use cpu
	input := cpuDeviceBufferType

	// define a range of gpu layers. anything outside of this range is assigned to the cpu
	gpuRangeStart := max(0, blocks-params.NumGPULayers)
	gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
	assignLayer := func(i int) deviceBufferType {
		if i < gpuRangeStart || i >= gpuRangeStop {
			return cpuDeviceBufferType
		}

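		// map the layer's relative position within the gpu range onto the
		// cumulative splits to pick a device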
		index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
		if index < 0 || index >= len(gpuDeviceBufferTypes) {
			return cpuDeviceBufferType
		}

		return gpuDeviceBufferTypes[index]
	}

	// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
	layers := make([]deviceBufferType, blocks)
	for i := range layers {
		layers[i] = assignLayer(i)
	}

	// outputs are assigned iff allowed by splits and configured number of gpu layers
	output := assignLayer(blocks)

	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
	// each layer has at most 2 extra tensors for rope operations
	maxTensors += blocks * 2

	type tensor struct {
		source *fsggml.Tensor
		target string
	}

	// some tensors are mapped to different names so keep a list
	targets := make(map[string][]string)

	// contexts are shared by tensors of the same buffer type
	ctxs := make(map[C.ggml_backend_buffer_type_t]*C.struct_ggml_context)
	createTensor := func(t tensor, bts []C.ggml_backend_buffer_type_t, layer int) *C.struct_ggml_tensor {
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}

			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
			defer C.free(unsafe.Pointer(cname))
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
			C.ggml_set_name(tt, cname)

			slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))

			size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
			if layer == -1 {
				// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
				requiredMemory.InputWeights.Status = ml.Allocated
				requiredMemory.InputWeights.Size += uint64(size)
			} else {
				btDeviceMemory[bt].Weights[layer].Size += uint64(size)
			}

			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
	}

	contains := func(s string, parts ...string) bool {
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
	}

	for _, t := range meta.Tensors().Items() {
		switch {
		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
			createTensor(tensor{source: t}, input.bts, -1)
			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
				createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
			}
		case contains(t.Name, "cls", "output", "output_norm",
			"altup_proj", "altup_unembd_proj",
			"per_layer_token_embd", "per_layer_model_proj", "per_layer_proj_norm"):
			createTensor(tensor{source: t}, output.bts, blocks)
		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
			// TODO: assign vision tensors to the gpu if possible
			createTensor(tensor{source: t}, output.bts, blocks)
		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
			// these tensors should be repeated per layer
			for i, layer := range layers {
				createTensor(tensor{
					source: t,
					target: "blk." + strconv.Itoa(i) + "." + t.Name,
				}, layer.bts, i)
			}
		default:
			layerIndex := -1
			if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
				if i, err := strconv.Atoi(fields[0]); err == nil {
					layerIndex = i
				}
			}

			if layerIndex >= 0 {
				createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
			} else {
				// load all other tensors on the cpu
				createTensor(tensor{source: t}, input.bts, -1)
			}
		}
	}

	// allocate buffers for each context
	bbs := make(map[*C.struct_ggml_context]C.ggml_backend_buffer_t, len(ctxs))
	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
		for i := range btDeviceMemory[bt].Weights {
			if btDeviceMemory[bt].Weights[i].Size != 0 {
				if b != nil {
					btDeviceMemory[bt].Weights[i].Status = ml.Allocated
				} else {
					btDeviceMemory[bt].Weights[i].Status = ml.Failed
				}
			}
		}

		if b == nil {
			for _, b := range bbs {
				C.ggml_backend_buffer_free(b)
			}

			for _, ctx := range ctxs {
				C.ggml_free(ctx)
			}

			panic(ml.ErrNoMem{BackendMemory: requiredMemory})
		}

		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
		bbs[c] = b
	}

	// Mimic llama runner logs summarizing layers and memory
	gpuLayers := 0
	for _, layer := range layers {
		if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
			gpuLayers++
		}
	}
	slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))

	switch C.ggml_backend_dev_type(output.d) {
	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
		slog.Info("offloading output layer to CPU")
	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
		slog.Info("offloading output layer to GPU")
		gpuLayers++
	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
		slog.Info("offloading output layer to ACCEL")
	}
	slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))

	for bs := range maps.Values(bbs) {
		slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
	}

	// map tensor names to tensors for easy lookup later
	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}

	// map devices to backend buffer types so new tensors can be assigned to the correct device
	deviceBufferTypes := make(map[C.ggml_backend_dev_t]C.ggml_backend_buffer_type_t)

	// create backends and buffer types used for the compute graph scheduler
	var schedBackends []C.ggml_backend_t
	var schedBufts []C.ggml_backend_buffer_type_t
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := backends[d]
		bt := C.ggml_backend_get_default_buffer_type(b)

		deviceBufferTypes[d] = bt

		schedBackends = append(schedBackends, b)
		schedBufts = append(schedBufts, bt)

		if C.ggml_backend_is_cpu(b) {
			// set number of threads for cpu backend
			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
		}
	}

	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
	return &Backend{
		modelPath:         modelPath,
		flashAttention:    params.FlashAttention,
		meta:              meta,
		tensorLoadTargets: targets,
		tensors:           tensors,
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
			C.int(len(schedBackends)),
			C.size_t(maxGraphNodes),
			C._Bool(false),
			C._Bool(false),
		),
		schedBackends: schedBackends,
		schedBufts:    schedBufts,
		input:         deviceBufferTypes[input.d],
		layers: func() map[int]C.ggml_backend_buffer_type_t {
			m := make(map[int]C.ggml_backend_buffer_type_t)
			for i, layer := range layers {
				m[i] = deviceBufferTypes[layer.d]
			}
			return m
		}(),
		requiredMemory: &requiredMemory,
		btDeviceMemory: btDeviceMemory,
		maxGraphNodes:  maxGraphNodes,
		weightBuffers:  bbs,
	}, nil
}

func init() {
	ml.RegisterBackend("ggml", New)
}

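// Close frees the weight buffers, their contexts, and the scheduler.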
func (b *Backend) Close() {
	if b == nil {
		return
	}

	for ctx, b := range b.weightBuffers {
		C.ggml_backend_buffer_free(b)
		C.ggml_free(ctx)
	}

	C.ggml_backend_sched_free(b.sched)
}

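// Load copies tensor data from the model file into the allocated backend
// buffers, reading tensors concurrently and reporting progress as a fraction
// of the total tensor bytes.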
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
	var doneBytes atomic.Uint64
	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset

	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
	for _, t := range b.meta.Tensors().Items() {
		t := t
		g.Go(func() error {
			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
			for i := range tts {
				target := b.tensorLoadTargets[t.Name][i]
				if target == "" {
					target = t.Name
				}

				tt, ok := b.tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}

				tts[i] = tt
			}

			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
			// seeking around within an FD shared between all goroutines.
			file, err := os.Open(b.modelPath)
			if err != nil {
				slog.Warn("file open error", "file", b.modelPath, "error", err)
				return err
			}
			defer file.Close()
			sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
			bts := make([]byte, 128*format.KibiByte)

			var s uint64
			for s < t.Size() {
				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
				if err := ctx.Err(); err != nil {
					return err
				}

				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
					slog.Warn("file read error", "file", b.modelPath, "error", err)
					return err
				}

				for _, tt := range tts {
					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
				}

				s += uint64(n)

				if progress != nil {
					done := doneBytes.Add(uint64(n))
					progress(float32(done) / float32(totalBytes))
				}
			}

			return nil
		})
	}

	if err := g.Wait(); err != nil {
		return err
	}

	return nil
}

func (b *Backend) BackendMemory() ml.BackendMemory {
	return *b.requiredMemory
}

func (b *Backend) Config() fs.Config {
	return b.meta.KV()
}

func (b *Backend) Get(name string) ml.Tensor {
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
	}

	return nil
}

func (b *Backend) NewContext() ml.Context {
	return b.NewContextSize(b.maxGraphNodes)
}

func (b *Backend) NewContextSize(n int) ml.Context {
	if n > b.maxGraphNodes {
		panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
	}

	var allocatedBuffers []C.ggml_backend_buffer_t

	return &Context{
		b:             b,
		maxGraphNodes: n,
		ctx: C.ggml_init(C.struct_ggml_init_params{
			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
			no_alloc: true,
		}),
		allocatedBuffers: &allocatedBuffers,
		layer:            -1,
	}
}

func (b *Backend) CacheConfig() ml.CacheConfig {
	if b.flashAttention {
		return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
	} else {
		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
	}
}

type Context struct {
	b *Backend

	ctx   *C.struct_ggml_context
	graph *C.struct_ggml_cgraph

	// buft is the buffer type used for new tensors
	buft C.ggml_backend_buffer_type_t

	// allocatedBuffers are buffers for tensors that we have allocated in this context
	// so that we can free them when we close the context
	allocatedBuffers *[]C.ggml_backend_buffer_t

	// maxGraphNodes is the maximum allowed number of graph nodes in this context
	maxGraphNodes int

	// layer is the graph layer that this context is allocating for - assumed to be cache
	layer int
}

func (c *Context) Input() ml.Context {
	if c.b.input != nil {
		return &Context{
			b:                c.b,
			ctx:              c.ctx,
			buft:             c.b.input,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
			layer:            -1,
		}
	}

	return c
}

func (c *Context) Layer(i int) ml.Context {
	if buft, ok := c.b.layers[i]; ok {
		return &Context{
			b:                c.b,
			ctx:              c.ctx,
			buft:             buft,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
			layer:            i,
		}
	}

	return c
}

func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
	if c.graph == nil {
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
	}

	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
}

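// Compute runs the built graph asynchronously. Synchronization with the
// backends is deferred until a tensor's data is read (see Bytes and Floats),
// so callers that never read results do not block.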
func (c *Context) Compute(tensors ...ml.Tensor) {
	if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
		panic(fmt.Errorf("error computing ggml graph: %v", status))
	}
	C.ggml_backend_sched_reset(c.b.sched)

	needSync := true
	sync := func() {
		if needSync {
			C.ggml_backend_sched_synchronize(c.b.sched)
			needSync = false
		}
	}

	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
		}
	}
}

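// Reserve performs a dry-run allocation of the current graph so worst-case
// compute buffer sizes are recorded per backend; it panics with ml.ErrNoMem
// if the reservation fails.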
func (c *Context) Reserve() {
	reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)

	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))

	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
	for _, bt := range c.b.schedBufts {
		c.b.btDeviceMemory[bt].Graph = ml.Memory{}
	}

	for i := range c.b.schedBackends {
		bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])

		graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
		graph.Size += uint64(bufferStatus.size)
		if bufferStatus.allocated && graph.Status != ml.Failed {
			graph.Status = ml.Allocated
		} else {
			graph.Status = ml.Failed
		}

		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
			"size", format.HumanBytes2(uint64(bufferStatus.size)))
	}

	if !reserved {
		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
	}
}

func (c *Context) MaxGraphNodes() int {
	return c.maxGraphNodes
}

func shapeToGGML(shape []int) *C.int64_t {
	sh := make([]C.int64_t, len(shape))
	for i, s := range shape {
		sh[i] = C.int64_t(s)
	}

	return &sh[0]
}

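// pad rounds length up to a multiple of pad (the buffer alignment),
// e.g. pad(10, 8) == 16.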
func pad(length, pad C.size_t) C.size_t {
	return ((length + pad - 1) / pad) * pad
}

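// newTensor allocates a tensor of the given dtype and shape in the context's
// current buffer type, accounting for it against the layer's cache memory
// when the context targets a layer.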
func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
	if c.buft == nil {
		panic("set Input or Layer before creating tensors")
	}

	var cdtype uint32
	switch dtype {
	case ml.DTypeF32:
		cdtype = C.GGML_TYPE_F32
	case ml.DTypeF16:
		cdtype = C.GGML_TYPE_F16
	case ml.DTypeQ80:
		cdtype = C.GGML_TYPE_Q8_0
	case ml.DTypeQ40:
		cdtype = C.GGML_TYPE_Q4_0
	case ml.DTypeI32:
		cdtype = C.GGML_TYPE_I32
	case ml.DTypeMXFP4:
		cdtype = C.GGML_TYPE_MXFP4
	default:
		panic("unsupported dtype")
	}

	if len(shape) < 1 || shape[0] == 0 {
		var shape C.int64_t = 0
		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
	} else if len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))

	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
	if c.layer >= 0 {
		cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]

		cache.Size += uint64(size)
		if b != nil {
			cache.Status = ml.Allocated
		} else {
			cache.Status = ml.Failed
		}
	}

	if b == nil {
		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
	}

	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
	return &Tensor{b: c.b, t: t}
}

func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	return c.newTensor(dtype, shape)
}

func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
	t := c.newTensor(dtype, shape)
	C.ggml_set_zero(t.(*Tensor).t)
	return t
}

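// checkShape divides len(s) by each dimension of shape and panics unless the
// result is exactly 1; empty slices are accepted as-is.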
func checkShape[S ~[]E, E any](s S, shape ...int) {
	n := len(s)

	if n == 0 {
		return
	}

	for _, v := range shape {
		n /= v
	}

	if n != 1 {
		panic(fmt.Errorf("invalid shape: %v", shape))
	}
}

func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
	checkShape(s, shape...)

	t := c.newTensor(ml.DTypeF32, shape)

	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t
}

func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
	checkShape(s, shape...)

	t := c.newTensor(ml.DTypeI32, shape)

	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t
}

func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
	switch dtype {
	case ml.DTypeF32:
		// ggml_arange creates a float32 tensor
		return &Tensor{
			b: c.b,
			t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
		}
	case ml.DTypeI32:
		// ggml_cast does not support float32 to int32 conversion
		arange := make([]int32, 0, int((stop-start)/step))
		for i := start; i < stop; i += step {
			arange = append(arange, int32(i))
		}

		return c.Input().FromIntSlice(arange, len(arange))
	default:
		panic("unsupported dtype for arange")
	}
}

func (c *Context) Close() {
	if c != nil {
		for _, b := range *c.allocatedBuffers {
			C.ggml_backend_buffer_free(b)
		}
		*c.allocatedBuffers = nil

		C.ggml_free(c.ctx)
	}
}

type Tensor struct {
	b    *Backend
	t    *C.struct_ggml_tensor
	sync func()
}

func (t *Tensor) LogValue() slog.Value {
	return slog.GroupValue(
		slog.String("name", C.GoString(C.ggml_get_name(t.t))),
		slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
		slog.Any("shape", t.Shape()),
	)
}

func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
}

func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
}

func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
	case C.GGML_TYPE_Q8_0:
		return ml.DTypeQ80
	case C.GGML_TYPE_Q4_0:
		return ml.DTypeQ40
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	case C.GGML_TYPE_MXFP4:
		return ml.DTypeMXFP4
	default:
		return ml.DTypeOther
	}
}

func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_neg(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Sub(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sub(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Repeat(ctx ml.Context, dim, n int) ml.Tensor {
	if dim < 0 || dim >= C.GGML_MAX_DIMS {
		panic("invalid dimension")
	}

	shape := make([]C.int64_t, C.GGML_MAX_DIMS)
	for i := range C.GGML_MAX_DIMS {
		if i == dim {
			shape[i] = C.int64_t(t.Dim(i) * n)
		} else {
			shape[i] = C.int64_t(t.Dim(i))
		}
	}

	tmpl := C.ggml_new_tensor(ctx.(*Context).ctx, t.t._type, C.int(len(shape)), unsafe.SliceData(shape))
	return &Tensor{
		b: t.b,
		t: C.ggml_repeat(ctx.(*Context).ctx, t.t, tmpl),
	}
}

func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) > 0 {
		return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
	}

	return t
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context, shape ...int) ml.Tensor {
	switch len(shape) {
	case 0:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont(ctx.(*Context).ctx, t.t),
		}
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			b: t.b,
			t: C.ggml_cont_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
		b: t.b,
		t: mul,
	}
}

func (t *Tensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul_mat_id(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, ids.(*Tensor).t),
	}
}

func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
	tt := C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
		if b != nil {
			tt = C.ggml_add(ctx.(*Context).ctx, tt, b.(*Tensor).t)
		}
	}

	return &Tensor{b: t.b, t: tt}
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
	tt := C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
	}

	return &Tensor{b: t.b, t: tt}
}

func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	} else if shape[3] != 0 {
		panic("cuda does not support 4d tensors")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Sin(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sin(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Cos(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cos(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sigmoid_inplace(ctx.(*Context).ctx, t.t),
	}
}

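// View expects its variadic arguments to interleave sizes and byte strides:
// size0[, stride1, size1[, stride2, size2[, stride3, size3]]].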
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
	// Default options
	opts := rope.Options{
		Factors:               &Tensor{},
		OriginalContextLength: 131072,
		ExtrapolationFactor:   0.,
		AttentionFactor:       1.,
		BetaFast:              32.,
		BetaSlow:              1.,
	}

	// Apply any provided options
	for _, option := range options {
		option(&opts)
	}

	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_rope_ext(
			ctx.(*Context).ctx,
			dequant,
			positions.(*Tensor).t,
			opts.Factors.(*Tensor).t,
			C.int(ropeDim),
			C.int(opts.Type),
			C.int(opts.OriginalContextLength),
			C.float(ropeBase),
			C.float(ropeScale),
			C.float(opts.ExtrapolationFactor),
			C.float(opts.AttentionFactor),
			C.float(opts.BetaFast),
			C.float(opts.BetaSlow),
		),
	}
}

func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_im2col(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
	}
}

func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) QuickGELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_gelu_quick_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) RELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_relu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}

func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_pool_2d(ctx.(*Context).ctx, t.t, C.GGML_OP_POOL_AVG, C.int(k), C.int(k), C.int(s), C.int(s), C.float(p), C.float(p)),
	}
}

func (t *Tensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
	var tt *C.struct_ggml_tensor
	switch len(strides) {
	case 0:
		tt = C.ggml_set_1d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset))
	case 1:
		tt = C.ggml_set_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset), C.size_t(strides[0]))
	default:
		panic("unsupported number of dimensions")
	}

	return &Tensor{b: t.b, t: tt}
}

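// ScaledDotProductAttention computes attention either with ggml's fused flash
// attention kernel (when enabled) or with an explicit QK^T, softmax, and V
// multiplication.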
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if t.b.flashAttention {
		value = value.Permute(ctx, 0, 2, 1, 3)

		kqv := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
		C.ggml_flash_attn_ext_set_prec(kqv, C.GGML_PREC_F32)
		return &Tensor{b: t.b, t: kqv}
	} else {
		kq := key.MulmatFullPrec(ctx, query)
		kq = &Tensor{
			b: t.b,
			t: C.ggml_soft_max_ext(ctx.(*Context).ctx, kq.(*Tensor).t, kqMask, C.float(scale), 0),
		}

		kqv := value.Mulmat(ctx, kq)
		return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}
}

func (t *Tensor) Duplicate(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_dup(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
	}
}

func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
	}
}

func (t *Tensor) Mean(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mean(ctx.(*Context).ctx, t.t),
	}
}

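// Variance computes the variance over the first dimension as
// mean((x - mean(x))^2), composed from Mean, Scale, Sqr, and SumRows.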
func (t *Tensor) Variance(ctx ml.Context) ml.Tensor {
	return t.Add(ctx, t.Mean(ctx).Scale(ctx, -1)).
		Sqr(ctx).
		SumRows(ctx).
		Scale(ctx, 1/float64(t.Dim(0)))
}

func (t *Tensor) Stddev(ctx ml.Context) ml.Tensor {
	return t.Variance(ctx).Sqrt(ctx)
}

func (t *Tensor) Sqr(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sqr(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Sqrt(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sqrt(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_clamp(ctx.(*Context).ctx, t.t, C.float(min), C.float(max)),
	}
}

func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
	// Unchecked to handle quantized types
	t := c.newTensor(dtype, shape)
	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t
}