Commit b2b270ad authored by Devon Rifkin

Merge branch 'main' into drifkin/array-head-count-simple

parents 20c5fd39 2bb69b40
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 18 Apr 2025 15:58:19 -0700
Subject: [PATCH] graph memory reporting on failure
---
ggml/include/ggml-alloc.h | 6 ++++++
ggml/include/ggml-backend.h | 6 ++++++
ggml/src/ggml-alloc.c | 38 +++++++++++++++++++++++++++++++++----
ggml/src/ggml-backend.cpp | 10 ++++++++++
4 files changed, 56 insertions(+), 4 deletions(-)
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 2cb150fd..781b1e10 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+struct ggml_allocr_buffer_status {
+ size_t size;
+ bool allocated;
+};
+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 778927f6..74e46716 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -304,6 +304,12 @@ extern "C" {
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ struct ggml_backend_buffer_status {
+ size_t size;
+ bool allocated;
+ };
+ GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 5fd379f6..04812990 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -364,6 +364,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers]
+ size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
GGML_ASSERT(galloc->buffers != NULL);
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
+ GGML_ASSERT(galloc->buffer_sizes != NULL);
+
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);
@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
+ free(galloc->buffer_sizes);
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
+ bool success = true;
+
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
- if (galloc->buffers[i] == NULL) {
+ if (galloc->buffers[i]) {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ } else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
- return false;
+ galloc->buffer_sizes[i] = new_size;
+ success = false;
}
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ } else {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
}
}
- return true;
+ return success;
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}
+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+ for (int i = 0; i < buffer_id; i++) {
+ if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
+ // This buffer is the same as a previous one due to the same buffer type being used multiple times
+ // (See above.) However, we need a different check because multiple buffers might be NULL in our
+ // case and we still want to know the attempted size.
+
+ struct ggml_allocr_buffer_status status = {0, true};
+ return status;
+ }
+ }
+
+ struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
+ return status;
+}
+
// utils
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 0ce73a99..be335e8c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}
+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+ struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+ struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
+
+ return status;
+}
+
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)
The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
pointer
- Build and allocate a new graph where that data is NULL
Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed
This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
ggml/src/ggml-alloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+ // If we previously had data but don't now then reallocate
+ if (talloc->buffer_id < 0) {
+ return false;
+ }
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Thu, 24 Apr 2025 14:48:51 -0700
Subject: [PATCH] ggml: Export GPU UUIDs
This enables matching up devices and information reported by the backend
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++++++++++++++++++++++++++++
ggml/src/ggml-metal/ggml-metal.m | 1 +
3 files changed, 35 insertions(+)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 74e46716..a880df33 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -152,6 +152,7 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
const char * description;
+ const char * uuid;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cb0d8528..4c829153 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
+ std::string uuid;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+ return ctx->uuid.c_str();
+}
+
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
+ props->uuid = ggml_backend_cuda_device_get_uuid(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
+ #if !defined(GGML_USE_HIP)
+ char uuid[64];
+ snprintf(uuid, sizeof(uuid),
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+ (unsigned char)prop.uuid.bytes[0],
+ (unsigned char)prop.uuid.bytes[1],
+ (unsigned char)prop.uuid.bytes[2],
+ (unsigned char)prop.uuid.bytes[3],
+ (unsigned char)prop.uuid.bytes[4],
+ (unsigned char)prop.uuid.bytes[5],
+ (unsigned char)prop.uuid.bytes[6],
+ (unsigned char)prop.uuid.bytes[7],
+ (unsigned char)prop.uuid.bytes[8],
+ (unsigned char)prop.uuid.bytes[9],
+ (unsigned char)prop.uuid.bytes[10],
+ (unsigned char)prop.uuid.bytes[11],
+ (unsigned char)prop.uuid.bytes[12],
+ (unsigned char)prop.uuid.bytes[13],
+ (unsigned char)prop.uuid.bytes[14],
+ (unsigned char)prop.uuid.bytes[15]
+ );
+ dev_ctx->uuid = uuid;
+ #else
+ dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
+ #endif
+
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 1b56f858..ee4f2dcb 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
+ props->uuid = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (struct ggml_backend_dev_caps) {
@@ -114,6 +114,9 @@ void grammar_free(struct llama_grammar *g) {
    if (g->vocab != nullptr) {
        delete g->vocab;
    }
+   if (g->o_vocab != nullptr) {
+       delete g->o_vocab;
+   }
    llama_grammar_free_impl(g);
    }
}
...@@ -82,8 +82,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -82,8 +82,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
var graphOffload uint64 var graphOffload uint64
// Projectors loaded into GPU0 only // Projectors loaded into GPU0 only
var projectorWeights uint64 var llamaEngineProjectorWeights uint64
var projectorGraph uint64
// Projectors loaded with output layer
var ollamaEngineProjectorWeights uint64
var ollamaEngineProjectorGraph uint64
// Conditional output size on GPU 0 // Conditional output size on GPU 0
var memoryLayerOutput uint64 var memoryLayerOutput uint64
...@@ -108,15 +111,14 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -108,15 +111,14 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList) slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors { for _, projector := range projectors {
weight, graph := projectorMemoryRequirements(projector) llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
projectorWeights += weight
projectorGraph += graph
// multimodal models require at least 2048 context // multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048) opts.NumCtx = max(opts.NumCtx, 2048)
} }
if projectorWeights == 0 && projectorGraph == 0 { if llamaEngineProjectorWeights == 0 {
projectorWeights, projectorGraph = f.VisionGraphSize() ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
opts.NumCtx = max(opts.NumCtx, 2048)
} }
layers := f.Tensors().GroupLayers() layers := f.Tensors().GroupLayers()
...@@ -168,6 +170,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -168,6 +170,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
graphFullOffload = graphPartialOffload graphFullOffload = graphPartialOffload
} }
// Output layer handled at the end if we have space
if layer, ok := layers["output_norm"]; ok { if layer, ok := layers["output_norm"]; ok {
memoryLayerOutput += layer.Size() memoryLayerOutput += layer.Size()
} }
...@@ -177,8 +180,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -177,8 +180,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
memoryLayerOutput += layer.Size() memoryLayerOutput += layer.Size()
} }
// Output layer handled at the end if we have space gpuZeroOverhead := llamaEngineProjectorWeights
gpuZeroOverhead := projectorWeights + projectorGraph
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int var layerCount int
...@@ -221,10 +223,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -221,10 +223,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
if len(gpusWithSpace) > 0 { if len(gpusWithSpace) > 0 {
gpuZeroID = gpusWithSpace[0].i gpuZeroID = gpusWithSpace[0].i
gpuAllocations[gpuZeroID] += gpuZeroOverhead gpuAllocations[gpuZeroID] += gpuZeroOverhead
} else {
overflow += gpuZeroOverhead
} }
// For all the layers, find where they can fit on the GPU(s) // For all the layers, find where they can fit on the GPU(s)
for i := range int(f.KV().BlockCount()) { for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
// Some models have inconsistent layer sizes // Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.Size() layerSize = blk.Size()
...@@ -234,6 +238,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -234,6 +238,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
// Stop allocating on GPU(s) once we hit the users target NumGPU // Stop allocating on GPU(s) once we hit the users target NumGPU
overflow += layerSize
continue continue
} }
...@@ -250,31 +255,34 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -250,31 +255,34 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
} }
} }
if len(gpusWithSpace) == 0 {
overflow += layerSize
}
} }
if layerCount >= int(f.KV().BlockCount()) { if layerCount >= int(f.KV().BlockCount()) {
fullyLoaded = true fullyLoaded = true
} else {
for i := layerCount; i < int(f.KV().BlockCount()); i++ {
overflow += layerSize
}
} }
// Determine if we need to consider output then find where it fits // Determine if we need to consider output then find where it fits
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) { memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
for j := len(gpusWithSpace); j > 0; j-- { if memoryLastLayer > 0 {
g := gpusWithSpace[layerCount%j] if opts.NumGPU < 0 || layerCount < opts.NumGPU {
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) for j := len(gpusWithSpace); j > 0; j-- {
if g.g.FreeMemory > overhead+used+memoryLayerOutput { g := gpusWithSpace[layerCount%j]
gpuAllocations[g.i] += memoryLayerOutput used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
layerCounts[g.i]++ if g.g.FreeMemory > overhead+used+memoryLastLayer {
layerCount++ gpuAllocations[g.i] += memoryLastLayer
break layerCounts[g.i]++
layerCount++
break
}
} }
} }
if layerCount < int(f.KV().BlockCount())+1 { if layerCount < int(f.KV().BlockCount())+1 {
fullyLoaded = false fullyLoaded = false
overflow += memoryLayerOutput overflow += memoryLastLayer
} }
} }
...@@ -332,8 +340,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -332,8 +340,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
memoryLayerOutput: memoryLayerOutput, memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload, graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload, graphPartialOffload: graphPartialOffload,
projectorWeights: projectorWeights, projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
projectorGraph: projectorGraph, projectorGraph: ollamaEngineProjectorGraph,
} }
if gpus[0].Library == "cpu" { if gpus[0].Library == "cpu" {
...@@ -412,51 +420,21 @@ func (m MemoryEstimate) LogValue() slog.Value { ...@@ -412,51 +420,21 @@ func (m MemoryEstimate) LogValue() slog.Value {
return slog.GroupValue(attrs...) return slog.GroupValue(attrs...)
} }
func projectorMemoryRequirements(filename string) (weights, graphSize uint64) { func projectorMemoryRequirements(filename string) (weights uint64) {
file, err := os.Open(filename) file, err := os.Open(filename)
if err != nil { if err != nil {
return 0, 0 return 0
} }
defer file.Close() defer file.Close()
ggml, _, err := ggml.Decode(file, 1024) ggml, err := ggml.Decode(file, 1024)
if err != nil { if err != nil {
return 0, 0 return 0
} }
for _, layer := range ggml.Tensors().GroupLayers() { for _, layer := range ggml.Tensors().GroupLayers() {
weights += layer.Size() weights += layer.Size()
} }
switch arch := ggml.KV().Architecture(); arch { return weights
case "mllama":
kv := func(n string) uint64 {
if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
return uint64(v)
}
return 0
}
imageSize := kv("image_size")
maxNumTiles := kv("max_num_tiles")
embeddingLength := kv("embedding_length")
headCount := kv("attention.head_count")
numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
numPatches++
}
numPaddedPatches := numPatches + 8 - (numPatches%8)%8
graphSize = 4 * (8 +
imageSize*imageSize*kv("num_channels")*maxNumTiles +
embeddingLength*numPatches*maxNumTiles +
9*embeddingLength*numPaddedPatches*maxNumTiles +
numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
}
return weights, graphSize
} }
...@@ -17,6 +17,7 @@ import ( ...@@ -17,6 +17,7 @@ import (
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"runtime" "runtime"
"slices"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
...@@ -30,9 +31,37 @@ import ( ...@@ -30,9 +31,37 @@ import (
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llama" "github.com/ollama/ollama/llama"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/model" "github.com/ollama/ollama/model"
) )
type filteredEnv []string
func (e filteredEnv) LogValue() slog.Value {
var attrs []slog.Attr
for _, env := range e {
if key, value, ok := strings.Cut(env, "="); ok {
switch {
case strings.HasPrefix(key, "OLLAMA_"),
strings.HasPrefix(key, "CUDA_"),
strings.HasPrefix(key, "ROCR_"),
strings.HasPrefix(key, "ROCM_"),
strings.HasPrefix(key, "HIP_"),
strings.HasPrefix(key, "GPU_"),
strings.HasPrefix(key, "HSA_"),
strings.HasPrefix(key, "GGML_"),
slices.Contains([]string{
"PATH",
"LD_LIBRARY_PATH",
"DYLD_LIBRARY_PATH",
}, key):
attrs = append(attrs, slog.String(key, value))
}
}
}
return slog.GroupValue(attrs...)
}
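A brief usage sketch (a hypothetical helper, not part of this change): filteredEnv implements slog.LogValuer, so passing it as an attribute defers the allow-list filtering until the record is actually emitted.

// Hypothetical helper for illustration only.
func logSubprocessEnv(cmd *exec.Cmd) {
	// Only the allow-listed variables (OLLAMA_*, CUDA_*, PATH, ...) are
	// included, and only when debug logging is enabled.
	slog.Debug("subprocess", "environment", filteredEnv(cmd.Env))
}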
type LlamaServer interface { type LlamaServer interface {
Ping(ctx context.Context) error Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error WaitUntilRunning(ctx context.Context) error
...@@ -92,7 +121,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { ...@@ -92,7 +121,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
} }
defer f.Close() defer f.Close()
ggml, _, err := ggml.Decode(f, maxArraySize) ggml, err := ggml.Decode(f, maxArraySize)
return ggml, err return ggml, err
} }
...@@ -148,10 +177,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a ...@@ -148,10 +177,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU)) params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU))
} }
if envconfig.Debug() {
params = append(params, "--verbose")
}
if opts.MainGPU > 0 { if opts.MainGPU > 0 {
params = append(params, "--main-gpu", strconv.Itoa(opts.MainGPU)) params = append(params, "--main-gpu", strconv.Itoa(opts.MainGPU))
} }
...@@ -286,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a ...@@ -286,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--mmproj", projectors[0]) params = append(params, "--mmproj", projectors[0])
} }
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc. // iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running // adding each library's respective path to the LD_LIBRARY_PATH, until finally running
// without any LD_LIBRARY_PATH flags // without any LD_LIBRARY_PATH flags
for { for {
...@@ -404,26 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a ...@@ -404,26 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
} }
slog.Info("starting llama server", "cmd", s.cmd) slog.Info("starting llama server", "cmd", s.cmd)
if envconfig.Debug() { slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "OLLAMA_") ||
strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCR_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||
strings.HasPrefix(ev, "GPU_") ||
strings.HasPrefix(ev, "HSA_") ||
strings.HasPrefix(ev, "GGML_") ||
strings.HasPrefix(ev, "PATH=") ||
strings.HasPrefix(ev, "LD_LIBRARY_PATH=") ||
strings.HasPrefix(ev, "DYLD_LIBRARY_PATH=") {
filteredEnv = append(filteredEnv, ev)
}
}
// Log at debug as the environment is inherited and might contain sensitive information
slog.Debug("subprocess", "environment", filteredEnv)
}
if err = s.cmd.Start(); err != nil { if err = s.cmd.Start(); err != nil {
var msg string var msg string
...@@ -673,9 +679,8 @@ ws ::= ([ \t\n] ws)? ...@@ -673,9 +679,8 @@ ws ::= ([ \t\n] ws)?
const maxBufferSize = 512 * format.KiloByte const maxBufferSize = 512 * format.KiloByte
type ImageData struct { type ImageData struct {
Data []byte `json:"data"` Data []byte `json:"data"`
ID int `json:"id"` ID int `json:"id"`
AspectRatioID int `json:"aspect_ratio_id"`
} }
type CompletionRequest struct { type CompletionRequest struct {
...@@ -721,6 +726,9 @@ type CompletionResponse struct { ...@@ -721,6 +726,9 @@ type CompletionResponse struct {
} }
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error { func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
if len(req.Format) > 0 { if len(req.Format) > 0 {
switch string(req.Format) { switch string(req.Format) {
case `null`, `""`: case `null`, `""`:
...@@ -789,7 +797,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu ...@@ -789,7 +797,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
res, err := http.DefaultClient.Do(serverReq) res, err := http.DefaultClient.Do(serverReq)
if err != nil { if err != nil {
return fmt.Errorf("POST predict: %v", err) slog.Error("post predict", "error", err)
return errors.New("model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details")
} }
defer res.Body.Close() defer res.Body.Close()
...@@ -884,6 +893,8 @@ type EmbeddingResponse struct { ...@@ -884,6 +893,8 @@ type EmbeddingResponse struct {
} }
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) { func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
if err := s.sem.Acquire(ctx, 1); err != nil { if err := s.sem.Acquire(ctx, 1); err != nil {
if errors.Is(err, context.Canceled) { if errors.Is(err, context.Canceled) {
slog.Info("aborting embedding request due to client closing the connection") slog.Info("aborting embedding request due to client closing the connection")
package logutil
import (
"io"
"log/slog"
"path/filepath"
)
const LevelTrace slog.Level = -8
func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
return slog.New(slog.NewTextHandler(w, &slog.HandlerOptions{
Level: level,
AddSource: true,
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
switch attr.Key {
case slog.LevelKey:
switch attr.Value.Any().(slog.Level) {
case LevelTrace:
attr.Value = slog.StringValue("TRACE")
}
case slog.SourceKey:
source := attr.Value.Any().(*slog.Source)
source.File = filepath.Base(source.File)
}
return attr
},
}))
}
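A minimal sketch of wiring up the new logger and TRACE level (the main function below is an assumption for illustration, not part of this change):

package main

import (
	"context"
	"log/slog"
	"os"

	"github.com/ollama/ollama/logutil"
)

func main() {
	// Install the handler so TRACE records print as "TRACE" and source
	// file paths are shortened to their base name.
	slog.SetDefault(logutil.NewLogger(os.Stderr, logutil.LevelTrace))

	slog.Debug("normal debug message")
	slog.Log(context.TODO(), logutil.LevelTrace, "very verbose message", "detail", 42)
}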
...@@ -5,7 +5,8 @@ import ( ...@@ -5,7 +5,8 @@ import (
"context" "context"
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"os" "log/slog"
"math"
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
...@@ -14,6 +15,11 @@ import ( ...@@ -14,6 +15,11 @@ import (
) )
type Backend interface { type Backend interface {
Load(ctx context.Context, progress func(float32)) error
// BackendMemory returns the memory allocations that were made for this model
BackendMemory() BackendMemory
Config() fs.Config Config() fs.Config
Get(name string) Tensor Get(name string) Tensor
NewContext() Context NewContext() Context
...@@ -51,10 +57,6 @@ type CacheConfig struct { ...@@ -51,10 +57,6 @@ type CacheConfig struct {
// BackendParams controls how the backend loads and executes models // BackendParams controls how the backend loads and executes models
type BackendParams struct { type BackendParams struct {
// Progress is a callback function that allows reporting percentage completion
// of model loading
Progress func(float32)
// NumThreads sets the number of threads to use if running on the CPU // NumThreads sets the number of threads to use if running on the CPU
NumThreads int NumThreads int
...@@ -71,9 +73,130 @@ type BackendParams struct { ...@@ -71,9 +73,130 @@ type BackendParams struct {
FlashAttention bool FlashAttention bool
} }
var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error)) // ErrNoMem is returned when panicking due to insufficient memory. It includes
// the attempted memory allocation.
type ErrNoMem struct {
BackendMemory
}
func (e ErrNoMem) Error() string {
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
}
type AllocationStatus int
const (
// Unallocated memory - have not yet attempted to allocate
Unallocated AllocationStatus = iota
// Failed memory - tried to allocate the memory and did not succeed
Failed
// Allocated memory - tried and succeeded to allocate memory
Allocated
)
// Memory is the size of an allocation and whether it was successful.
type Memory struct {
Size uint64
Status AllocationStatus
}
func (m Memory) String() string {
s := fmt.Sprint(m.Size)
func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) { switch m.Status {
case Unallocated:
s += "U"
case Failed:
s += "F"
case Allocated:
s += "A"
}
return s
}
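For example (an illustrative sketch, not part of this change), a successful 512 MiB allocation renders as the raw byte count followed by the one-letter status suffix:

// Illustrative only: returns "536870912A" (U = unallocated, F = failed,
// A = allocated).
func describeAllocation() string {
	m := Memory{Size: 512 * 1024 * 1024, Status: Allocated}
	return m.String()
}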
// DeviceMemory provides a breakdown of the memory needed
// per device, such as a CPU or GPU.
type DeviceMemory struct {
// Name is the name of the device as labeled by the backend. It
// may not be persistent across instances of the runner.
Name string
// UUID is a unique persistent identifier for the device for matching
// with system management libraries
UUID string
// Weights is the per-layer memory needed for the model weights.
Weights []Memory
// Cache is the per-layer memory needed for the KV cache.
Cache []Memory
// Graph is the size of the compute graph. It is not per-layer.
Graph Memory
}
func memoryPresent(mem []Memory) bool {
return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
}
func (m DeviceMemory) LogValue() slog.Value {
var attrs []slog.Attr
if memoryPresent(m.Weights) {
attrs = append(attrs, slog.Any("Weights", m.Weights))
}
if memoryPresent(m.Cache) {
attrs = append(attrs, slog.Any("Cache", m.Cache))
}
if m.Graph.Size != 0 {
attrs = append(attrs, slog.Any("Graph", m.Graph))
}
if len(attrs) > 0 && m.UUID != "" {
attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
}
return slog.GroupValue(attrs...)
}
// BackendMemory provides the amount of memory required to load the model
// per device based on the BackendParams. In some cases, not all required
// allocations will be known at this point. However, the size of the most recent
// allocation is guaranteed to be provided so that if it failed, the caller can
// accommodate that to make forward progress.
type BackendMemory struct {
// InputWeights are always located on the CPU and cannot be moved
InputWeights Memory
// CPU model components are located in system memory. This does not
// include unified memory allocated through the GPU.
CPU DeviceMemory
// GPU model components are located on one or more GPUs.
GPUs []DeviceMemory
}
func (m BackendMemory) LogValue() slog.Value {
var attrs []slog.Attr
if m.InputWeights.Size != 0 {
attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
}
attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
for _, g := range m.GPUs {
attrs = append(attrs, slog.Any(g.Name, g))
}
return slog.GroupValue(attrs...)
}
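A hedged sketch of how a caller might consume ErrNoMem (the recover wrapper below is an illustration, not the code from this change):

// Illustrative only: recover an ErrNoMem panic raised during model load and
// surface the per-device breakdown, so the caller can retry with a smaller
// GPU layer split.
func loadGuarded(load func()) (err error) {
	defer func() {
		if r := recover(); r != nil {
			noMem, ok := r.(ErrNoMem)
			if !ok {
				panic(r) // not a memory error, re-raise
			}
			slog.Warn("insufficient memory", "memory", noMem.BackendMemory)
			err = noMem
		}
	}()
	load()
	return nil
}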
var backends = make(map[string]func(string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
if _, ok := backends[name]; ok { if _, ok := backends[name]; ok {
panic("backend: backend already registered") panic("backend: backend already registered")
} }
...@@ -81,9 +204,9 @@ func RegisterBackend(name string, f func(context.Context, *os.File, BackendParam ...@@ -81,9 +204,9 @@ func RegisterBackend(name string, f func(context.Context, *os.File, BackendParam
backends[name] = f backends[name] = f
} }
func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) { func NewBackend(modelPath string, params BackendParams) (Backend, error) {
if backend, ok := backends["ggml"]; ok { if backend, ok := backends["ggml"]; ok {
return backend(ctx, f, params) return backend(modelPath, params)
} }
return nil, fmt.Errorf("unsupported backend") return nil, fmt.Errorf("unsupported backend")
...@@ -92,8 +215,8 @@ func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, ...@@ -92,8 +215,8 @@ func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend,
type Context interface { type Context interface {
Empty(dtype DType, shape ...int) Tensor Empty(dtype DType, shape ...int) Tensor
Zeros(dtype DType, shape ...int) Tensor Zeros(dtype DType, shape ...int) Tensor
FromFloatSlice(s []float32, shape ...int) (Tensor, error) FromFloatSlice(s []float32, shape ...int) Tensor
FromIntSlice(s []int32, shape ...int) (Tensor, error) FromIntSlice(s []int32, shape ...int) Tensor
// Arange creates a 1D tensor with values within an interval (start, stop] increased by step. // Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
Arange(start, stop, step float32, dtype DType) Tensor Arange(start, stop, step float32, dtype DType) Tensor
...@@ -105,7 +228,7 @@ type Context interface { ...@@ -105,7 +228,7 @@ type Context interface {
// graph, simply preallocates memory. Typically called with a // graph, simply preallocates memory. Typically called with a
// worst case graph to ensure all resources are available for // worst case graph to ensure all resources are available for
// for future inference. // for future inference.
Reserve() error Reserve()
MaxGraphNodes() int MaxGraphNodes() int
Close() Close()
...@@ -131,6 +254,8 @@ type Tensor interface { ...@@ -131,6 +254,8 @@ type Tensor interface {
Neg(ctx Context) Tensor Neg(ctx Context) Tensor
Add(ctx Context, t2 Tensor) Tensor Add(ctx Context, t2 Tensor) Tensor
Mul(ctx Context, t2 Tensor) Tensor Mul(ctx Context, t2 Tensor) Tensor
Div(ctx Context, t2 Tensor) Tensor
Mulmat(ctx Context, t2 Tensor) Tensor Mulmat(ctx Context, t2 Tensor) Tensor
MulmatFullPrec(ctx Context, t2 Tensor) Tensor MulmatFullPrec(ctx Context, t2 Tensor) Tensor
MulmatID(ctx Context, t2, ids Tensor) Tensor MulmatID(ctx Context, t2, ids Tensor) Tensor
...@@ -139,11 +264,11 @@ type Tensor interface { ...@@ -139,11 +264,11 @@ type Tensor interface {
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
Scale(ctx Context, s float64) Tensor Scale(ctx Context, s float64) Tensor
SumRows(ctx Context) Tensor
AvgPool2D(ctx Context, k, s int, p float32) Tensor AvgPool2D(ctx Context, k, s int, p float32) Tensor
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
Sin(ctx Context) Tensor Sin(ctx Context) Tensor
...@@ -160,7 +285,6 @@ type Tensor interface { ...@@ -160,7 +285,6 @@ type Tensor interface {
Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor
Pad(ctx Context, shape ...int) Tensor Pad(ctx Context, shape ...int) Tensor
Unpad(ctx Context, shape ...int) Tensor
Stack(ctx Context, dim int, s ...Tensor) Tensor Stack(ctx Context, dim int, s ...Tensor) Tensor
...@@ -172,6 +296,7 @@ type Tensor interface { ...@@ -172,6 +296,7 @@ type Tensor interface {
Duplicate(ctx Context) Tensor Duplicate(ctx Context) Tensor
TopK(ctx Context, k int) Tensor TopK(ctx Context, k int) Tensor
Argsort(ctx Context) Tensor
} }
// ScaledDotProductAttention implements a fused attention // ScaledDotProductAttention implements a fused attention
...@@ -214,35 +339,58 @@ func mul[T number](s ...T) T { ...@@ -214,35 +339,58 @@ func mul[T number](s ...T) T {
return p return p
} }
type DumpOptions struct { type DumpOptions func(*dumpOptions)
// Items is the number of elements to print at the beginning and end of each dimension.
Items int
// Precision is the number of decimal places to print. Applies to float32 and float64. // DumpWithPrecision sets the number of decimal places to print. Applies to float32 and float64.
Precision int func DumpWithPrecision(n int) DumpOptions {
return func(opts *dumpOptions) {
opts.Precision = n
}
} }
func Dump(ctx Context, t Tensor, opts ...DumpOptions) string { // DumpWithThreshold sets the threshold for printing the entire tensor. If the number of elements
if len(opts) < 1 { // is less than or equal to this value, the entire tensor will be printed. Otherwise, only the
opts = append(opts, DumpOptions{ // beginning and end of each dimension will be printed.
Items: 3, func DumpWithThreshold(n int) DumpOptions {
Precision: 4, return func(opts *dumpOptions) {
}) opts.Threshold = n
}
}
// DumpWithEdgeItems sets the number of elements to print at the beginning and end of each dimension.
func DumpWithEdgeItems(n int) DumpOptions {
return func(opts *dumpOptions) {
opts.EdgeItems = n
}
}
type dumpOptions struct {
Precision, Threshold, EdgeItems int
}
func Dump(ctx Context, t Tensor, optsFuncs ...DumpOptions) string {
opts := dumpOptions{Precision: 4, Threshold: 1000, EdgeItems: 3}
for _, optsFunc := range optsFuncs {
optsFunc(&opts)
}
if mul(t.Shape()...) <= opts.Threshold {
opts.EdgeItems = math.MaxInt
} }
switch t.DType() { switch t.DType() {
case DTypeF32: case DTypeF32:
return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string { return dump[[]float32](ctx, t, opts.EdgeItems, func(f float32) string {
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32) return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
}) })
case DTypeF16, DTypeQ80, DTypeQ40: case DTypeF16, DTypeQ80, DTypeQ40:
f32 := ctx.Input().Empty(DTypeF32, t.Shape()...) f32 := ctx.Input().Empty(DTypeF32, t.Shape()...)
f32 = t.Copy(ctx, f32) f32 = t.Copy(ctx, f32)
return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string { return dump[[]float32](ctx, f32, opts.EdgeItems, func(f float32) string {
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32) return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
}) })
case DTypeI32: case DTypeI32:
return dump[[]int32](ctx, t, opts[0].Items, func(i int32) string { return dump[[]int32](ctx, t, opts.EdgeItems, func(i int32) string {
return strconv.FormatInt(int64(i), 10) return strconv.FormatInt(int64(i), 10)
}) })
default: default:
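A short sketch of the new option-function call style for Dump (illustrative; the ctx and t arguments are assumed to come from an existing backend):

// Illustrative only: dump with 2 decimal places, print tensors of up to 64
// elements in full, and otherwise keep 4 edge items per dimension.
func dumpForDebug(ctx Context, t Tensor) string {
	return Dump(ctx, t,
		DumpWithPrecision(2),
		DumpWithThreshold(64),
		DumpWithEdgeItems(4),
	)
}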
...@@ -10,7 +10,6 @@ import "C" ...@@ -10,7 +10,6 @@ import "C"
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"io" "io"
"log/slog" "log/slog"
...@@ -27,8 +26,10 @@ import ( ...@@ -27,8 +26,10 @@ import (
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
fsggml "github.com/ollama/ollama/fs/ggml" fsggml "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src" ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
"github.com/ollama/ollama/ml/nn/rope"
"golang.org/x/sync/errgroup" "golang.org/x/sync/errgroup"
) )
...@@ -43,8 +44,15 @@ func devices() []*C.struct_ggml_backend_device { ...@@ -43,8 +44,15 @@ func devices() []*C.struct_ggml_backend_device {
} }
type Backend struct { type Backend struct {
// modelPath is the location of the model data
modelPath string
meta *fsggml.GGML meta *fsggml.GGML
// tensorLoadTargets maps from the name of the tensor in the file
// to the name that is used by the model definition
tensorLoadTargets map[string][]string
sched *C.struct_ggml_backend_sched sched *C.struct_ggml_backend_sched
schedBackends []*C.struct_ggml_backend schedBackends []*C.struct_ggml_backend
schedBufts []*C.struct_ggml_backend_buffer_type schedBufts []*C.struct_ggml_backend_buffer_type
...@@ -57,14 +65,26 @@ type Backend struct { ...@@ -57,14 +65,26 @@ type Backend struct {
// layers is the backend used for repeating layers // layers is the backend used for repeating layers
layers map[int]*C.struct_ggml_backend_buffer_type layers map[int]*C.struct_ggml_backend_buffer_type
// requiredMemory is the cumulative memory allocations needed by the backend
requiredMemory *ml.BackendMemory
// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
flashAttention bool flashAttention bool
// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler // maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
maxGraphNodes int maxGraphNodes int
} }
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) { func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
meta, n, err := fsggml.Decode(r, -1) r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()
meta, err := fsggml.Decode(r, -1)
if err != nil { if err != nil {
return nil, err return nil, err
} }
...@@ -79,6 +99,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -79,6 +99,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
"num_key_values", len(meta.KV()), "num_key_values", len(meta.KV()),
) )
var requiredMemory ml.BackendMemory
btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
type deviceBufferType struct { type deviceBufferType struct {
d *C.struct_ggml_backend_device d *C.struct_ggml_backend_device
bts []*C.struct_ggml_backend_buffer_type bts []*C.struct_ggml_backend_buffer_type
...@@ -99,6 +122,8 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -99,6 +122,8 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
} }
} }
blocks := int(meta.KV().BlockCount())
// create list of buffer types for the cpu // create list of buffer types for the cpu
cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)} cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
for _, d := range append(accels, append(gpus, cpus...)...) { for _, d := range append(accels, append(gpus, cpus...)...) {
...@@ -106,17 +131,33 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -106,17 +131,33 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
case C.GGML_BACKEND_DEVICE_TYPE_CPU, case C.GGML_BACKEND_DEVICE_TYPE_CPU,
C.GGML_BACKEND_DEVICE_TYPE_ACCEL: C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d)) cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
} }
} }
requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
requiredMemory.CPU.UUID = C.GoString(props.uuid)
requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
// create list of buffer types for each gpu // create list of buffer types for each gpu
var gpuDeviceBufferTypes []deviceBufferType var gpuDeviceBufferTypes []deviceBufferType
for _, d := range gpus { requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
for i, d := range gpus {
bt := C.ggml_backend_dev_buffer_type(d) bt := C.ggml_backend_dev_buffer_type(d)
gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{ gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
d: d, d: d,
bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...), bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
}) })
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(d, &props)
requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
} }
useDefaultSplit := true useDefaultSplit := true
...@@ -155,8 +196,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -155,8 +196,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
// inputs always use cpu // inputs always use cpu
input := cpuDeviceBufferType input := cpuDeviceBufferType
blocks := int(meta.KV().BlockCount())
// define a range of gpu layers. anything outside of this range is assigned to the cpu // define a range of gpu layers. anything outside of this range is assigned to the cpu
gpuRangeStart := max(0, blocks-params.NumGPULayers) gpuRangeStart := max(0, blocks-params.NumGPULayers)
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1) gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
...@@ -197,7 +236,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -197,7 +236,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
// contexts are shared by tensors of the same buffer type // contexts are shared by tensors of the same buffer type
ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context) ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor { createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
for _, bt := range bts { for _, bt := range bts {
if _, ok := ctxs[bt]; !ok { if _, ok := ctxs[bt]; !ok {
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{ ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
...@@ -222,7 +261,17 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -222,7 +261,17 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0]))) tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
C.ggml_set_name(tt, cname) C.ggml_set_name(tt, cname)
slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt))) slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
if layer == -1 {
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
requiredMemory.InputWeights.Status = ml.Allocated
requiredMemory.InputWeights.Size += uint64(size)
} else {
btDeviceMemory[bt].Weights[layer].Size += uint64(size)
}
//nolint:staticcheck // TODO: check if buffer type supports this tensor //nolint:staticcheck // TODO: check if buffer type supports this tensor
return tt return tt
} }
...@@ -244,22 +293,22 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -244,22 +293,22 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
for _, t := range meta.Tensors().Items() { for _, t := range meta.Tensors().Items() {
switch { switch {
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"): case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
createTensor(tensor{source: t}, input.bts) createTensor(tensor{source: t}, input.bts, -1)
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" { if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
createTensor(tensor{source: t, target: "output.weight"}, output.bts) createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
} }
case contains(t.Name, "cls", "output", "output_norm"): case contains(t.Name, "cls", "output", "output_norm"):
createTensor(tensor{source: t}, output.bts) createTensor(tensor{source: t}, output.bts, blocks)
case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."): case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
// TODO: assign vision tensors to the gpu if possible // TODO: assign vision tensors to the gpu if possible
createTensor(tensor{source: t}, output.bts) createTensor(tensor{source: t}, output.bts, blocks)
case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"): case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
// these tensors should be repeated per layer // these tensors should be repeated per layer
for i, layer := range layers { for i, layer := range layers {
createTensor(tensor{ createTensor(tensor{
source: t, source: t,
target: "blk." + strconv.Itoa(i) + "." + t.Name, target: "blk." + strconv.Itoa(i) + "." + t.Name,
}, layer.bts) }, layer.bts, i)
} }
default: default:
layerIndex := -1 layerIndex := -1
...@@ -270,10 +319,10 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -270,10 +319,10 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
} }
if layerIndex >= 0 { if layerIndex >= 0 {
createTensor(tensor{source: t}, layers[layerIndex].bts) createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
} else { } else {
// load all other tensors on the cpu // load all other tensors on the cpu
createTensor(tensor{source: t}, input.bts) createTensor(tensor{source: t}, input.bts, -1)
} }
} }
} }
...@@ -286,8 +335,18 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -286,8 +335,18 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
} }
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt) b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
for i := range btDeviceMemory[bt].Weights {
if btDeviceMemory[bt].Weights[i].Size != 0 {
if b != nil {
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
} else {
btDeviceMemory[bt].Weights[i].Status = ml.Failed
}
}
}
if b == nil { if b == nil {
return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt))) panic(ml.ErrNoMem{BackendMemory: requiredMemory})
} }
C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS) C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
...@@ -306,22 +365,79 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -306,22 +365,79 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
} }
} }
// map devices to backend buffer types so new tensors can be assigned to the correct device
deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
// create backends and buffer types used for the compute graph scheduler
var schedBackends []*C.struct_ggml_backend
var schedBufts []*C.struct_ggml_backend_buffer_type
for _, d := range append(gpus, append(accels, cpus...)...) {
b := C.ggml_backend_dev_init(d, nil)
bt := C.ggml_backend_get_default_buffer_type(b)
deviceBufferTypes[d] = bt
schedBackends = append(schedBackends, b)
schedBufts = append(schedBufts, bt)
if C.ggml_backend_is_cpu(b) {
// set number of threads for cpu backend
C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
}
}
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
return &Backend{
modelPath: modelPath,
flashAttention: params.FlashAttention,
meta: meta,
tensorLoadTargets: targets,
tensors: tensors,
sched: C.ggml_backend_sched_new(
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
C.int(len(schedBackends)),
C.size_t(maxGraphNodes),
C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
C._Bool(false),
),
schedBackends: schedBackends,
schedBufts: schedBufts,
input: deviceBufferTypes[input.d],
layers: func() map[int]*C.struct_ggml_backend_buffer_type {
m := make(map[int]*C.struct_ggml_backend_buffer_type)
for i, layer := range layers {
m[i] = deviceBufferTypes[layer.d]
}
return m
}(),
requiredMemory: &requiredMemory,
btDeviceMemory: btDeviceMemory,
maxGraphNodes: maxGraphNodes,
}, nil
}
func init() {
ml.RegisterBackend("ggml", New)
}
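As a rough sketch of the resulting two-step flow from a caller's side (the wrapper, path handling, and layer count below are hypothetical, not part of this change):

// Illustrative only: construct the backend first, then stream the weights
// with a progress callback, mirroring the split introduced above.
func loadModel(ctx context.Context, modelPath string) (ml.Backend, error) {
	b, err := ml.NewBackend(modelPath, ml.BackendParams{NumGPULayers: 999})
	if err != nil {
		return nil, err
	}
	if err := b.Load(ctx, func(progress float32) {
		slog.Info("loading model", "progress", progress)
	}); err != nil {
		return nil, err
	}
	return b, nil
}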
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
	var doneBytes atomic.Uint64
-	totalBytes := uint64(n) - meta.Tensors().Offset
+	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
-	for _, t := range meta.Tensors().Items() {
+	for _, t := range b.meta.Tensors().Items() {
		t := t
		g.Go(func() error {
-			tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
+			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
			for i := range tts {
-				target := targets[t.Name][i]
+				target := b.tensorLoadTargets[t.Name][i]
				if target == "" {
					target = t.Name
				}
-				tt, ok := tensors[target]
+				tt, ok := b.tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}
@@ -331,13 +447,13 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
			// seeking around within an FD shared between all goroutines.
-			file, err := os.Open(r.Name())
+			file, err := os.Open(b.modelPath)
			if err != nil {
-				slog.Warn("file open error", "file", r.Name(), "error", err)
+				slog.Warn("file open error", "file", b.modelPath, "error", err)
				return err
			}
			defer file.Close()
-			sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
+			sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
			bts := make([]byte, 128*format.KibiByte)
			var s uint64
@@ -349,7 +465,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
-					slog.Warn("file read error", "file", r.Name(), "error", err)
+					slog.Warn("file read error", "file", b.modelPath, "error", err)
					return err
				}
@@ -359,9 +475,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
				s += uint64(n)
-				if params.Progress != nil {
+				if progress != nil {
					done := doneBytes.Add(uint64(n))
-					params.Progress(float32(done) / float32(totalBytes))
+					progress(float32(done) / float32(totalBytes))
				}
			}
@@ -370,58 +486,14 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
	}
	if err := g.Wait(); err != nil {
-		return nil, err
+		return err
	}
-	// map devices to backend buffer types so new tensors can be assigned to the correct device
-	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
-	// create backends and buffer types used for the compute graph scheduler
-	var schedBackends []*C.struct_ggml_backend
-	var schedBufts []*C.struct_ggml_backend_buffer_type
-	for _, d := range append(gpus, append(accels, cpus...)...) {
-		b := C.ggml_backend_dev_init(d, nil)
-		bt := C.ggml_backend_get_default_buffer_type(b)
-		deviceBufferTypes[d] = bt
-		schedBackends = append(schedBackends, b)
-		schedBufts = append(schedBufts, bt)
-		if C.ggml_backend_is_cpu(b) {
-			// set number of threads for cpu backend
-			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
-		}
-	}
-	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
-	return &Backend{
-		flashAttention: params.FlashAttention,
-		meta: meta,
-		tensors: tensors,
-		sched: C.ggml_backend_sched_new(
-			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
-			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
-			C.int(len(schedBackends)),
-			C.size_t(maxGraphNodes),
-			C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
-		),
-		schedBackends: schedBackends,
-		schedBufts: schedBufts,
-		input: deviceBufferTypes[input.d],
-		layers: func() map[int]*C.struct_ggml_backend_buffer_type {
-			m := make(map[int]*C.struct_ggml_backend_buffer_type)
-			for i, layer := range layers {
-				m[i] = deviceBufferTypes[layer.d]
-			}
-			return m
-		}(),
-		maxGraphNodes: maxGraphNodes,
-	}, nil
+	return nil
}
-func init() {
-	ml.RegisterBackend("ggml", New)
+func (b *Backend) BackendMemory() ml.BackendMemory {
+	return *b.requiredMemory
}
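Reviewer note: the hunk above moves tensor I/O out of New and into the new Load method, which streams bytes from b.modelPath and reports progress through a plain callback instead of params.Progress. The following is a minimal, self-contained sketch of driving that callback; the loader interface and helper below are illustrative assumptions, not code from this change (only Load's signature is taken from the diff), and because Load invokes the callback from several errgroup workers the sketch tracks state atomically.

package main

import (
	"context"
	"fmt"
	"log"
	"sync/atomic"
)

// loader is a local stand-in for the *Backend type in this diff; only the
// Load signature matters for the sketch.
type loader interface {
	Load(ctx context.Context, progress func(float32)) error
}

// loadWithProgress drives Load and prints whole-percent progress updates.
// The callback may run concurrently, so the last printed value is atomic.
func loadWithProgress(ctx context.Context, b loader) error {
	var lastPct atomic.Int64
	lastPct.Store(-1)
	return b.Load(ctx, func(p float32) {
		pct := int64(p * 100)
		if prev := lastPct.Load(); pct > prev && lastPct.CompareAndSwap(prev, pct) {
			fmt.Printf("loading tensors: %d%%\n", pct)
		}
	})
}

func main() {
	var b loader // wiring up the concrete ggml Backend is outside this sketch
	if b == nil {
		log.Println("no backend configured; sketch only")
		return
	}
	if err := loadWithProgress(context.Background(), b); err != nil {
		log.Fatal(err)
	}
}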
func (b *Backend) Config() fs.Config {
@@ -455,6 +527,7 @@ func (b *Backend) NewContextSize(n int) ml.Context {
			no_alloc: true,
		}),
		allocatedBuffers: &allocatedBuffers,
+		layer:            -1,
	}
}
@@ -481,6 +554,9 @@ type Context struct {
	// maxGraphNodes is the maximum allowed number of graph nodes in this context
	maxGraphNodes int
+
+	// layer is the graph layer that this context is allocating for - assumed to be cache
+	layer int
}
func (c *Context) Input() ml.Context {
@@ -491,6 +567,7 @@ func (c *Context) Input() ml.Context {
			buft:             c.b.input,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
+			layer:            -1,
		}
	}
@@ -505,6 +582,7 @@ func (c *Context) Layer(i int) ml.Context {
		buft:             buft,
		allocatedBuffers: c.allocatedBuffers,
		maxGraphNodes:    c.maxGraphNodes,
+		layer:            i,
	}
}
@@ -524,7 +602,9 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
}
func (c *Context) Compute(tensors ...ml.Tensor) {
-	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
+	if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
+		panic(fmt.Errorf("error computing ggml graph: %v", status))
+	}
+
	C.ggml_backend_sched_reset(c.b.sched)
needSync := true needSync := true
...@@ -542,22 +622,34 @@ func (c *Context) Compute(tensors ...ml.Tensor) { ...@@ -542,22 +622,34 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
} }
} }
-func (c *Context) Reserve() error {
-	if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) {
-		C.ggml_backend_sched_reset(c.b.sched)
-		return errors.New("failed to reserve graph")
-	}
+func (c *Context) Reserve() {
+	reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)
	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
+	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
+	for _, bt := range c.b.schedBufts {
+		c.b.btDeviceMemory[bt].Graph = ml.Memory{}
+	}
	for i := range c.b.schedBackends {
-		size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i])
+		bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
+
+		graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
+		graph.Size += uint64(bufferStatus.size)
+		if bufferStatus.allocated && graph.Status != ml.Failed {
+			graph.Status = ml.Allocated
+		} else {
+			graph.Status = ml.Failed
+		}
+
		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
-			"size", format.HumanBytes2(uint64(size)))
+			"size", format.HumanBytes2(uint64(bufferStatus.size)))
	}
-	C.ggml_backend_sched_reset(c.b.sched)
-	return nil
+	if !reserved {
+		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
+	}
}
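Reserve no longer returns an error: a failed ggml_backend_sched_reserve now records per-buffer-type Graph sizes and statuses and then panics with ml.ErrNoMem carrying the accumulated requiredMemory. A hedged sketch of how a caller might convert that panic back into an error follows; the helper name is illustrative, and it assumes the ml import used elsewhere in this file plus the exported BackendMemory field shown in the panic above.

// estimateGraph runs fn, which is expected to build a graph and call
// Context.Reserve, and converts an ml.ErrNoMem panic into an error so the
// reported memory can be logged or used to pick a smaller configuration.
func estimateGraph(fn func()) (err error) {
	defer func() {
		if r := recover(); r == nil {
			return
		} else if noMem, ok := r.(ml.ErrNoMem); ok {
			// BackendMemory mirrors what Backend.BackendMemory() returns.
			err = fmt.Errorf("insufficient memory reserving compute graph: %+v", noMem.BackendMemory)
		} else {
			panic(r) // not a memory failure; re-raise
		}
	}()

	fn()
	return nil
}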
func (c *Context) MaxGraphNodes() int {
@@ -577,7 +669,7 @@ func pad(length, pad C.size_t) C.size_t {
	return ((length + pad - 1) / pad) * pad
}
-func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
+func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
	if c.buft == nil {
		panic("set Input or Layer before creating tensors")
	}
@@ -600,7 +692,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
	if len(shape) < 1 || shape[0] == 0 {
		var shape C.int64_t = 0
-		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
+		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
	} else if len(shape) > 4 {
		panic("unsupported number of dimensions")
	}
@@ -613,40 +705,43 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
+
+	if c.layer >= 0 {
+		cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
+
+		cache.Size += uint64(size)
+		if b != nil {
+			cache.Status = ml.Allocated
+		} else {
+			cache.Status = ml.Failed
+		}
+	}
+
	if b == nil {
-		return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
+		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
	}
	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
-	return &Tensor{b: c.b, t: t}, nil
+	return &Tensor{b: c.b, t: t}
}
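Together with the new layer field on Context, newTensor now books every allocation either to the device's per-layer Cache entry (layer >= 0) or leaves it untracked for input and graph contexts, and panics with ml.ErrNoMem instead of returning an error. The short sketch below illustrates the calling pattern this enables, allocating cache tensors through layer-scoped contexts so a failure is attributed to the layer that requested it; the function name and tensor sizes are assumptions, not code from this change.

// allocCacheTensors allocates one tensor per layer via Context.Layer(i), so
// the size and allocated/failed status of each allocation lands in
// btDeviceMemory[buft].Cache[i] for that layer's buffer type.
func allocCacheTensors(ctx ml.Context, numLayers, cells, dim int) []ml.Tensor {
	cache := make([]ml.Tensor, numLayers)
	for i := range cache {
		layerCtx := ctx.Layer(i) // allocations below are attributed to layer i
		cache[i] = layerCtx.Empty(ml.DTypeF32, dim, cells)
	}
	return cache
}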
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor { func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
t, err := c.newTensor(dtype, shape) return c.newTensor(dtype, shape)
if err != nil {
panic(err)
}
return t
} }
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor { func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
t, err := c.newTensor(dtype, shape) t := c.newTensor(dtype, shape)
if err != nil {
panic(err)
}
C.ggml_set_zero(t.(*Tensor).t) C.ggml_set_zero(t.(*Tensor).t)
return t return t
} }
func checkShape[S ~[]E, E any](s S, shape ...int) error { func checkShape[S ~[]E, E any](s S, shape ...int) {
n := len(s) n := len(s)
if n == 0 { if n == 0 {
return nil return
} }
for _, v := range shape { for _, v := range shape {
...@@ -654,44 +749,32 @@ func checkShape[S ~[]E, E any](s S, shape ...int) error { ...@@ -654,44 +749,32 @@ func checkShape[S ~[]E, E any](s S, shape ...int) error {
} }
if n != 1 { if n != 1 {
return fmt.Errorf("invalid shape: %v", shape) panic(fmt.Errorf("invalid shape: %v", shape))
} }
return nil
} }
func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) { func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
if err := checkShape(s, shape...); err != nil { checkShape(s, shape...)
return nil, err
}
t, err := c.newTensor(ml.DTypeF32, shape) t := c.newTensor(ml.DTypeF32, shape)
if err != nil {
return nil, err
}
if len(s) > 0 { if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t)) C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
} }
return t, nil return t
} }
func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) { func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
if err := checkShape(s, shape...); err != nil { checkShape(s, shape...)
return nil, err
}
t, err := c.newTensor(ml.DTypeI32, shape) t := c.newTensor(ml.DTypeI32, shape)
if err != nil {
return nil, err
}
if len(s) > 0 { if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t)) C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
} }
return t, nil return t
} }
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor { func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
...@@ -709,12 +792,7 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor { ...@@ -709,12 +792,7 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
arange = append(arange, int32(i)) arange = append(arange, int32(i))
} }
t, err := c.Input().FromIntSlice(arange, len(arange)) return c.Input().FromIntSlice(arange, len(arange))
if err != nil {
panic(err)
}
return t
default: default:
panic("unsupported dtype for arange") panic("unsupported dtype for arange")
} }
...@@ -865,6 +943,13 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor { ...@@ -865,6 +943,13 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
} }
} }
func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
}
}
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor { func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{ return &Tensor{
b: t.b, b: t.b,
...@@ -913,6 +998,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor { ...@@ -913,6 +998,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
+	} else if shape[3] != 0 {
+		panic("cuda does not support 4d tensors")
	}
return &Tensor{ return &Tensor{
...@@ -980,6 +1067,13 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor { ...@@ -980,6 +1067,13 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
} }
} }
func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
}
}
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor { func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
return &Tensor{ return &Tensor{
b: t.b, b: t.b,
...@@ -1015,17 +1109,6 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor { ...@@ -1015,17 +1109,6 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
} }
} }
func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
}
return &Tensor{
b: t.b,
t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
}
}
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
switch len(shape) { switch len(shape) {
case 1: case 1:
...@@ -1062,16 +1145,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { ...@@ -1062,16 +1145,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
} }
} }
-const (
-	ropeTypeNorm   C.int = 0
-	ropeTypeNeox   C.int = 2
-	ropeTypeMrope  C.int = 8
-	ropeTypeVision C.int = 24
-)
-
-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
-	if ropeFactors == nil {
-		ropeFactors = &Tensor{b: t.b}
+func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
+	// Default options
+	opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}
+
+	// Apply any provided options
+	for _, option := range options {
+		option(opts)
	}
	dequant := t.t
@@ -1082,16 +1162,19 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
	return &Tensor{
		b: t.b,
		t: C.ggml_rope_ext(
-			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
+			ctx.(*Context).ctx,
+			dequant,
+			positions.(*Tensor).t,
+			opts.Factors.(*Tensor).t,
			C.int(ropeDim),
-			C.int(ropeType),
-			131072, // YaRN n_ctx_train
+			C.int(opts.Type),
+			C.int(opts.OriginalContextLength),
			C.float(ropeBase),
			C.float(ropeScale),
-			0.,  // YaRN ext_factor
-			1.,  // YaRN attn_factor
-			32., // YaRN beta_fast
-			1.,  // YaRN beta_slow
+			C.float(0.0),
+			C.float(1.0),
+			C.float(32.0),
+			C.float(1.0),
		),
	}
}
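The RoPE method now takes variadic functional options over rope.Options instead of positional ropeFactors/ropeType arguments. Below is a usage sketch under the fields referenced in the call above (Type, Factors, OriginalContextLength); it assumes ml.Tensor exposes the same RoPE signature and that the surrounding ml and rope imports match this file, and the numeric rope type simply mirrors the removed ropeTypeNeox constant.

// applyRoPE shows both call styles of the variadic-options RoPE API.
func applyRoPE(ctx ml.Context, q, k, positions, freqFactors ml.Tensor, dim int, base, scale float32) (ml.Tensor, ml.Tensor) {
	// Defaults: zero rope type, OriginalContextLength 131072, empty Factors.
	q = q.RoPE(ctx, positions, dim, base, scale)

	// NeoX-style rope with explicit frequency factors and a shorter training context.
	k = k.RoPE(ctx, positions, dim, base, scale, func(o *rope.Options) {
		o.Type = 2 // value carried over from the removed ropeTypeNeox constant
		o.Factors = freqFactors
		o.OriginalContextLength = 32768
	})

	return q, k
}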
...@@ -1185,3 +1268,10 @@ func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor { ...@@ -1185,3 +1268,10 @@ func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)), t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
} }
} }
func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
}
}
...@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph ...@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
struct ggml_allocr_buffer_status {
size_t size;
bool allocated;
};
GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
// Utils // Utils
// Create a buffer and allocate all the tensors in a ggml_context // Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
......
...@@ -38,7 +38,7 @@ extern "C" { ...@@ -38,7 +38,7 @@ extern "C" {
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
...@@ -59,7 +59,7 @@ extern "C" { ...@@ -59,7 +59,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
...@@ -152,6 +152,7 @@ extern "C" { ...@@ -152,6 +152,7 @@ extern "C" {
struct ggml_backend_dev_props { struct ggml_backend_dev_props {
const char * name; const char * name;
const char * description; const char * description;
const char * uuid;
size_t memory_free; size_t memory_free;
size_t memory_total; size_t memory_total;
enum ggml_backend_dev_type type; enum ggml_backend_dev_type type;
...@@ -248,7 +249,7 @@ extern "C" { ...@@ -248,7 +249,7 @@ extern "C" {
// preferrably to run on the same backend as the buffer // preferrably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false); sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
// initialize buffers from a max size graph (optional) // initialize buffers from a max size graph (optional)
reserve_graph = build_graph(sched, max_batch_size); reserve_graph = build_graph(sched, max_batch_size);
...@@ -289,7 +290,7 @@ extern "C" { ...@@ -289,7 +290,7 @@ extern "C" {
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler, backends with low index are given priority over backends with high index // Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel); GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
...@@ -304,6 +305,12 @@ extern "C" { ...@@ -304,6 +305,12 @@ extern "C" {
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
struct ggml_backend_buffer_status {
size_t size;
bool allocated;
};
GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
......
...@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr; ...@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } }; struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr; typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend // ggml-backend
......
...@@ -37,13 +37,16 @@ extern "C" { ...@@ -37,13 +37,16 @@ extern "C" {
// ====== Dataset ====== // ====== Dataset ======
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init( GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
int64_t ne_datapoint, // number of elements per datapoint enum ggml_type type_data, // the type for the internal data tensor
int64_t ne_label, // number of elements per label enum ggml_type type_label, // the type for the internal labels tensor
int64_t ndata, // total number of datapoints/labels int64_t ne_datapoint, // number of elements per datapoint
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied) int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset); GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
// get underlying tensors that store the data // get underlying tensors that store the data
GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata] GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata] GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]
...@@ -56,13 +59,19 @@ extern "C" { ...@@ -56,13 +59,19 @@ extern "C" {
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch] struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch] struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
int64_t ibatch); int64_t ibatch);
GGML_API void ggml_opt_dataset_get_batch_host(
ggml_opt_dataset_t dataset,
void * data_batch,
size_t nb_data_batch,
void * labels_batch,
int64_t ibatch);
// ====== Model / Context ====== // ====== Model / Context ======
enum ggml_opt_build_type { enum ggml_opt_build_type {
GGML_OPT_BUILD_TYPE_FORWARD, GGML_OPT_BUILD_TYPE_FORWARD = 10,
GGML_OPT_BUILD_TYPE_GRAD, GGML_OPT_BUILD_TYPE_GRAD = 20,
GGML_OPT_BUILD_TYPE_OPT, GGML_OPT_BUILD_TYPE_OPT = 30,
}; };
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
...@@ -81,20 +90,22 @@ extern "C" { ...@@ -81,20 +90,22 @@ extern "C" {
// userdata can be used to pass arbitrary data // userdata can be used to pass arbitrary data
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata); typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
// returns the default optimizer params (constant) // returns the default optimizer params (constant, hard-coded values)
// userdata is not used // userdata is not used
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata); GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
// casts userdata to ggml_opt_optimizer_params and returns it
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
// parameters for initializing a new optimization context // parameters for initializing a new optimization context
struct ggml_opt_params { struct ggml_opt_params {
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
struct ggml_context * ctx_compute; // created in user code, holds non-static tensors // by default the forward graph needs to be reconstructed for each eval
// if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
// the forward graph is defined by inputs and outputs struct ggml_context * ctx_compute;
// those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts struct ggml_tensor * inputs;
struct ggml_tensor * inputs; struct ggml_tensor * outputs;
struct ggml_tensor * outputs;
enum ggml_opt_loss_type loss_type; enum ggml_opt_loss_type loss_type;
enum ggml_opt_build_type build_type; enum ggml_opt_build_type build_type;
...@@ -107,12 +118,9 @@ extern "C" { ...@@ -107,12 +118,9 @@ extern "C" {
// get parameters for an optimization context with defaults set where possible // get parameters for an optimization context with defaults set where possible
// parameters for which no sensible defaults exist are supplied as arguments to this function // parameters for which no sensible defaults exist are supplied as arguments to this function
GGML_API ggml_opt_params ggml_opt_default_params( GGML_API struct ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched, ggml_backend_sched_t backend_sched,
struct ggml_context * ctx_compute, enum ggml_opt_loss_type loss_type);
struct ggml_tensor * inputs,
struct ggml_tensor * outputs,
enum ggml_opt_loss_type loss_type);
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params); GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx); GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
...@@ -121,6 +129,7 @@ extern "C" { ...@@ -121,6 +129,7 @@ extern "C" {
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer); GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
// get underlying tensors that store data // get underlying tensors that store data
// if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
...@@ -128,11 +137,12 @@ extern "C" { ...@@ -128,11 +137,12 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
// get the gradient accumulator for a node from the forward graph
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
// ====== Optimization Result ====== // ====== Optimization Result ======
GGML_API ggml_opt_result_t ggml_opt_result_init(); GGML_API ggml_opt_result_t ggml_opt_result_init(void);
GGML_API void ggml_opt_result_free(ggml_opt_result_t result); GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result); GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
...@@ -144,11 +154,20 @@ extern "C" { ...@@ -144,11 +154,20 @@ extern "C" {
// ====== Computation ====== // ====== Computation ======
// do forward pass, increment result if not NULL // if not using static graphs, this function must be called prior to ggml_opt_alloc
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); GGML_API void ggml_opt_prepare_alloc(
ggml_opt_context_t opt_ctx,
struct ggml_context * ctx_compute,
struct ggml_cgraph * gf,
struct ggml_tensor * inputs,
struct ggml_tensor * outputs);
// allocate the next graph for evaluation, either forward or forward + backward
// must be called exactly once prior to calling ggml_opt_eval
GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
// do forward pass, increment result if not NULL, do backward pass // do forward pass, increment result if not NULL, do backward pass if allocated
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// ############################################################################ // ############################################################################
// ## The high-level functions start here. They do not depend on any private ## // ## The high-level functions start here. They do not depend on any private ##
...@@ -200,9 +219,9 @@ extern "C" { ...@@ -200,9 +219,9 @@ extern "C" {
// fit model defined by inputs and outputs to dataset // fit model defined by inputs and outputs to dataset
GGML_API void ggml_opt_fit( GGML_API void ggml_opt_fit(
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch] struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize enum ggml_opt_loss_type loss_type, // loss to minimize
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t) ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
......
...@@ -489,7 +489,6 @@ extern "C" { ...@@ -489,7 +489,6 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D, GGML_OP_PAD_REFLECT_1D,
GGML_OP_UNPAD,
GGML_OP_ARANGE, GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
...@@ -674,11 +673,15 @@ extern "C" { ...@@ -674,11 +673,15 @@ extern "C" {
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
// returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
...@@ -765,7 +768,7 @@ extern "C" { ...@@ -765,7 +768,7 @@ extern "C" {
// Tensor flags // Tensor flags
GGML_API void ggml_set_input(struct ggml_tensor * tensor); GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor); GGML_API void ggml_set_output(struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API void ggml_set_param(struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor); GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
// //
...@@ -935,7 +938,7 @@ extern "C" { ...@@ -935,7 +938,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_repeat_back( GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
// concat a and b along dim // concat a and b along dim
// used in stable-diffusion // used in stable-diffusion
...@@ -1778,15 +1781,6 @@ extern "C" { ...@@ -1778,15 +1781,6 @@ extern "C" {
int p0, int p0,
int p1); int p1);
// unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
GGML_API struct ggml_tensor * ggml_unpad(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1,
int p2,
int p3);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,] // timesteps: [N,]
// return: [N, dim] // return: [N, dim]
...@@ -2055,15 +2049,14 @@ extern "C" { ...@@ -2055,15 +2049,14 @@ extern "C" {
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand( GGML_API void ggml_build_backward_expand(
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation) struct ggml_context * ctx, // context for gradient computation
struct ggml_context * ctx_compute, // context for gradient computation struct ggml_cgraph * cgraph,
struct ggml_cgraph * cgraph, struct ggml_tensor ** grad_accs);
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
// graph allocation in a context // graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1 GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
......
...@@ -214,7 +214,7 @@ add_library(ggml ...@@ -214,7 +214,7 @@ add_library(ggml
target_link_libraries(ggml PUBLIC ggml-base) target_link_libraries(ggml PUBLIC ggml-base)
if (CMAKE_SYSTEM_NAME MATCHES "Linux") if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_libraries(ggml PRIVATE dl stdc++fs) target_link_libraries(ggml PRIVATE dl)
endif() endif()
function(ggml_add_backend_library backend) function(ggml_add_backend_library backend)
......
...@@ -364,6 +364,7 @@ struct node_alloc { ...@@ -364,6 +364,7 @@ struct node_alloc {
struct ggml_gallocr { struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers] ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers] ggml_backend_buffer_t * buffers; // [n_buffers]
size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers; int n_buffers;
...@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs ...@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)); galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
GGML_ASSERT(galloc->buffers != NULL); GGML_ASSERT(galloc->buffers != NULL);
galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
GGML_ASSERT(galloc->buffer_sizes != NULL);
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *)); galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL); GGML_ASSERT(galloc->buf_tallocs != NULL);
...@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { ...@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set); ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values); free(galloc->hash_values);
free(galloc->bufts); free(galloc->bufts);
free(galloc->buffer_sizes);
free(galloc->buffers); free(galloc->buffers);
free(galloc->buf_tallocs); free(galloc->buf_tallocs);
free(galloc->node_allocs); free(galloc->node_allocs);
...@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c ...@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
} }
} }
bool success = true;
// reallocate buffers if needed // reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) { for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer // if the buffer type is used multiple times, we reuse the same buffer
...@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c ...@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
			ggml_backend_buffer_free(galloc->buffers[i]);
			galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
-			if (galloc->buffers[i] == NULL) {
+			if (galloc->buffers[i]) {
+				galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+				ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+			} else {
				GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-				return false;
+				galloc->buffer_sizes[i] = new_size;
+				success = false;
			}
-			ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+		} else {
+			galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
		}
	}
-	return true;
+	return success;
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
...@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { ...@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
} }
struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
for (int i = 0; i < buffer_id; i++) {
if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
// This buffer is the same as a previous one due to the same buffer type being used multiple times
// (See above.) However, we need a different check because multiple buffers might be NULL in our
// case and we still want to know the attempted size.
struct ggml_allocr_buffer_status status = {0, true};
return status;
}
}
struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
return status;
}
// utils // utils
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
......
...@@ -178,9 +178,9 @@ struct ggml_backend_registry { ...@@ -178,9 +178,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN #ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg()); register_backend(ggml_backend_cann_reg());
#endif #endif
// #ifdef GGML_USE_BLAS #ifdef GGML_USE_BLAS
// register_backend(ggml_backend_blas_reg()); register_backend(ggml_backend_blas_reg());
// #endif #endif
#ifdef GGML_USE_RPC #ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg()); register_backend(ggml_backend_rpc_reg());
#endif #endif
......
...@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) { ...@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
return SIZE_MAX; return SIZE_MAX;
} }
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes // get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) { if (buft->iface.get_alloc_size) {
size_t size = buft->iface.get_alloc_size(buft, tensor); size_t size = buft->iface.get_alloc_size(buft, tensor);
...@@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) { ...@@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer)); return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
} }
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
} }
...@@ -674,6 +674,8 @@ struct ggml_backend_sched { ...@@ -674,6 +674,8 @@ struct ggml_backend_sched {
char * context_buffer; char * context_buffer;
size_t context_buffer_size; size_t context_buffer_size;
bool op_offload;
int debug; int debug;
}; };
...@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st ...@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op // check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) { for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off"); SET_CAUSE(tensor, "1.off");
...@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ...@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
const int node_backend_id = tensor_backend_id(node); const int node_backend_id = tensor_backend_id(node);
assert(node_backend_id != -1); // all nodes should be assigned by now assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
// check if we should start a new split based on the sources of the current node // check if we should start a new split based on the sources of the current node
bool need_new_split = false; bool need_new_split = false;
...@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new( ...@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
ggml_backend_buffer_type_t * bufts, ggml_backend_buffer_type_t * bufts,
int n_backends, int n_backends,
size_t graph_size, size_t graph_size,
bool parallel) { bool parallel,
bool op_offload) {
GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
...@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new( ...@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
} }
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
...@@ -1625,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe ...@@ -1625,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
} }
struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
return status;
}
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend); int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
......
...@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/ ${KLEIDIAI_SRC}/kai/ukernels/
${KLEIDIAI_SRC}/kai/ukernels/matmul/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}") set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
...@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS}) set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
if (NOT DOTPROD_ENABLED MATCHES -1) if (NOT DOTPROD_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
endif() endif()
if (NOT I8MM_ENABLED MATCHES -1) if (NOT I8MM_ENABLED MATCHES -1)
...@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif() endif()
if (NOT SME_ENABLED MATCHES -1) if (NOT SME_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2") ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
endif() endif()
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}") set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
......
...@@ -3,7 +3,7 @@ package cpu ...@@ -3,7 +3,7 @@ package cpu
// #cgo CFLAGS: -O3 -Wno-implicit-function-declaration // #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
// #cgo CXXFLAGS: -std=c++17 // #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include // #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE // #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE
// #cgo linux CPPFLAGS: -D_GNU_SOURCE // #cgo linux CPPFLAGS: -D_GNU_SOURCE
// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 // #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 LDFLAGS: -framework Accelerate // #cgo darwin,arm64 LDFLAGS: -framework Accelerate
......