Commit b2b270ad authored by Devon Rifkin

Merge branch 'main' into drifkin/array-head-count-simple

parents 20c5fd39 2bb69b40
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 18 Apr 2025 15:58:19 -0700
Subject: [PATCH] graph memory reporting on failure
---
ggml/include/ggml-alloc.h | 6 ++++++
ggml/include/ggml-backend.h | 6 ++++++
ggml/src/ggml-alloc.c | 38 +++++++++++++++++++++++++++++++++----
ggml/src/ggml-backend.cpp | 10 ++++++++++
4 files changed, 56 insertions(+), 4 deletions(-)
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 2cb150fd..781b1e10 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+struct ggml_allocr_buffer_status {
+ size_t size;
+ bool allocated;
+};
+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 778927f6..74e46716 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -304,6 +304,12 @@ extern "C" {
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ struct ggml_backend_buffer_status {
+ size_t size;
+ bool allocated;
+ };
+ GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 5fd379f6..04812990 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -364,6 +364,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers]
+ size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
GGML_ASSERT(galloc->buffers != NULL);
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
+ GGML_ASSERT(galloc->buffer_sizes != NULL);
+
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);
@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
+ free(galloc->buffer_sizes);
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
+ bool success = true;
+
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
- if (galloc->buffers[i] == NULL) {
+ if (galloc->buffers[i]) {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ } else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
- return false;
+ galloc->buffer_sizes[i] = new_size;
+ success = false;
}
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ } else {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
}
}
- return true;
+ return success;
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}
+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+ for (int i = 0; i < buffer_id; i++) {
+ if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
+ // This buffer is the same as a previous one due to the same buffer type being used multiple times
+ // (See above.) However, we need a different check because multiple buffers might be NULL in our
+ // case and we still want to know the attempted size.
+
+ struct ggml_allocr_buffer_status status = {0, true};
+ return status;
+ }
+ }
+
+ struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
+ return status;
+}
+
// utils
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 0ce73a99..be335e8c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}
+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+ struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+ struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
+
+ return status;
+}
+
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)
The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
pointer
- Build and allocate a new graph where that data is NULL
Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed
This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
ggml/src/ggml-alloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+ // If we previously had data but don't now then reallocate
+ if (talloc->buffer_id < 0) {
+ return false;
+ }
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Thu, 24 Apr 2025 14:48:51 -0700
Subject: [PATCH] ggml: Export GPU UUIDs
This enables matching up devices and information reported by the backend
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++++++++++++++++++++++++++++
ggml/src/ggml-metal/ggml-metal.m | 1 +
3 files changed, 35 insertions(+)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 74e46716..a880df33 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -152,6 +152,7 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
const char * description;
+ const char * uuid;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cb0d8528..4c829153 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
+ std::string uuid;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+ return ctx->uuid.c_str();
+}
+
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
+ props->uuid = ggml_backend_cuda_device_get_uuid(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
+ #if !defined(GGML_USE_HIP)
+ char uuid[64];
+ snprintf(uuid, sizeof(uuid),
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+ (unsigned char)prop.uuid.bytes[0],
+ (unsigned char)prop.uuid.bytes[1],
+ (unsigned char)prop.uuid.bytes[2],
+ (unsigned char)prop.uuid.bytes[3],
+ (unsigned char)prop.uuid.bytes[4],
+ (unsigned char)prop.uuid.bytes[5],
+ (unsigned char)prop.uuid.bytes[6],
+ (unsigned char)prop.uuid.bytes[7],
+ (unsigned char)prop.uuid.bytes[8],
+ (unsigned char)prop.uuid.bytes[9],
+ (unsigned char)prop.uuid.bytes[10],
+ (unsigned char)prop.uuid.bytes[11],
+ (unsigned char)prop.uuid.bytes[12],
+ (unsigned char)prop.uuid.bytes[13],
+ (unsigned char)prop.uuid.bytes[14],
+ (unsigned char)prop.uuid.bytes[15]
+ );
+ dev_ctx->uuid = uuid;
+ #else
+ dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
+ #endif
+
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 1b56f858..ee4f2dcb 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
+ props->uuid = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (struct ggml_backend_dev_caps) {
@@ -114,6 +114,9 @@ void grammar_free(struct llama_grammar *g) {
    if (g->vocab != nullptr) {
        delete g->vocab;
    }
+   if (g->o_vocab != nullptr) {
+       delete g->o_vocab;
+   }
    llama_grammar_free_impl(g);
    }
}
...@@ -82,8 +82,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -82,8 +82,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
var graphOffload uint64 var graphOffload uint64
// Projectors loaded into GPU0 only // Projectors loaded into GPU0 only
var projectorWeights uint64 var llamaEngineProjectorWeights uint64
var projectorGraph uint64
// Projectors loaded with output layer
var ollamaEngineProjectorWeights uint64
var ollamaEngineProjectorGraph uint64
// Conditional output size on GPU 0 // Conditional output size on GPU 0
var memoryLayerOutput uint64 var memoryLayerOutput uint64
...@@ -108,15 +111,14 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -108,15 +111,14 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList) slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors { for _, projector := range projectors {
weight, graph := projectorMemoryRequirements(projector) llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
projectorWeights += weight
projectorGraph += graph
// multimodal models require at least 2048 context // multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048) opts.NumCtx = max(opts.NumCtx, 2048)
} }
if projectorWeights == 0 && projectorGraph == 0 { if llamaEngineProjectorWeights == 0 {
projectorWeights, projectorGraph = f.VisionGraphSize() ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
opts.NumCtx = max(opts.NumCtx, 2048)
} }
layers := f.Tensors().GroupLayers() layers := f.Tensors().GroupLayers()
...@@ -168,6 +170,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -168,6 +170,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
graphFullOffload = graphPartialOffload graphFullOffload = graphPartialOffload
} }
// Output layer handled at the end if we have space
if layer, ok := layers["output_norm"]; ok { if layer, ok := layers["output_norm"]; ok {
memoryLayerOutput += layer.Size() memoryLayerOutput += layer.Size()
} }
...@@ -177,8 +180,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -177,8 +180,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
memoryLayerOutput += layer.Size() memoryLayerOutput += layer.Size()
} }
// Output layer handled at the end if we have space gpuZeroOverhead := llamaEngineProjectorWeights
gpuZeroOverhead := projectorWeights + projectorGraph
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int var layerCount int
...@@ -221,10 +223,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -221,10 +223,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
if len(gpusWithSpace) > 0 { if len(gpusWithSpace) > 0 {
gpuZeroID = gpusWithSpace[0].i gpuZeroID = gpusWithSpace[0].i
gpuAllocations[gpuZeroID] += gpuZeroOverhead gpuAllocations[gpuZeroID] += gpuZeroOverhead
} else {
overflow += gpuZeroOverhead
} }
// For all the layers, find where they can fit on the GPU(s) // For all the layers, find where they can fit on the GPU(s)
for i := range int(f.KV().BlockCount()) { for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
// Some models have inconsistent layer sizes // Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.Size() layerSize = blk.Size()
...@@ -234,6 +238,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -234,6 +238,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
// Stop allocating on GPU(s) once we hit the users target NumGPU // Stop allocating on GPU(s) once we hit the users target NumGPU
overflow += layerSize
continue continue
} }
...@@ -250,31 +255,34 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -250,31 +255,34 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
} }
} }
if len(gpusWithSpace) == 0 {
overflow += layerSize
}
} }
if layerCount >= int(f.KV().BlockCount()) { if layerCount >= int(f.KV().BlockCount()) {
fullyLoaded = true fullyLoaded = true
} else {
for i := layerCount; i < int(f.KV().BlockCount()); i++ {
overflow += layerSize
}
} }
// Determine if we need to consider output then find where it fits // Determine if we need to consider output then find where it fits
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) { memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
for j := len(gpusWithSpace); j > 0; j-- { if memoryLastLayer > 0 {
g := gpusWithSpace[layerCount%j] if opts.NumGPU < 0 || layerCount < opts.NumGPU {
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) for j := len(gpusWithSpace); j > 0; j-- {
if g.g.FreeMemory > overhead+used+memoryLayerOutput { g := gpusWithSpace[layerCount%j]
gpuAllocations[g.i] += memoryLayerOutput used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
layerCounts[g.i]++ if g.g.FreeMemory > overhead+used+memoryLastLayer {
layerCount++ gpuAllocations[g.i] += memoryLastLayer
break layerCounts[g.i]++
layerCount++
break
}
} }
} }
if layerCount < int(f.KV().BlockCount())+1 { if layerCount < int(f.KV().BlockCount())+1 {
fullyLoaded = false fullyLoaded = false
overflow += memoryLayerOutput overflow += memoryLastLayer
} }
} }
...@@ -332,8 +340,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -332,8 +340,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
memoryLayerOutput: memoryLayerOutput, memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload, graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload, graphPartialOffload: graphPartialOffload,
projectorWeights: projectorWeights, projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
projectorGraph: projectorGraph, projectorGraph: ollamaEngineProjectorGraph,
} }
if gpus[0].Library == "cpu" { if gpus[0].Library == "cpu" {
...@@ -412,51 +420,21 @@ func (m MemoryEstimate) LogValue() slog.Value { ...@@ -412,51 +420,21 @@ func (m MemoryEstimate) LogValue() slog.Value {
return slog.GroupValue(attrs...) return slog.GroupValue(attrs...)
} }
func projectorMemoryRequirements(filename string) (weights, graphSize uint64) { func projectorMemoryRequirements(filename string) (weights uint64) {
file, err := os.Open(filename) file, err := os.Open(filename)
if err != nil { if err != nil {
return 0, 0 return 0
} }
defer file.Close() defer file.Close()
ggml, _, err := ggml.Decode(file, 1024) ggml, err := ggml.Decode(file, 1024)
if err != nil { if err != nil {
return 0, 0 return 0
} }
for _, layer := range ggml.Tensors().GroupLayers() { for _, layer := range ggml.Tensors().GroupLayers() {
weights += layer.Size() weights += layer.Size()
} }
switch arch := ggml.KV().Architecture(); arch { return weights
case "mllama":
kv := func(n string) uint64 {
if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
return uint64(v)
}
return 0
}
imageSize := kv("image_size")
maxNumTiles := kv("max_num_tiles")
embeddingLength := kv("embedding_length")
headCount := kv("attention.head_count")
numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
numPatches++
}
numPaddedPatches := numPatches + 8 - (numPatches%8)%8
graphSize = 4 * (8 +
imageSize*imageSize*kv("num_channels")*maxNumTiles +
embeddingLength*numPatches*maxNumTiles +
9*embeddingLength*numPaddedPatches*maxNumTiles +
numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
}
return weights, graphSize
} }
...@@ -17,6 +17,7 @@ import ( ...@@ -17,6 +17,7 @@ import (
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"runtime" "runtime"
"slices"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
...@@ -30,9 +31,37 @@ import ( ...@@ -30,9 +31,37 @@ import (
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llama" "github.com/ollama/ollama/llama"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/model" "github.com/ollama/ollama/model"
) )
type filteredEnv []string
func (e filteredEnv) LogValue() slog.Value {
var attrs []slog.Attr
for _, env := range e {
if key, value, ok := strings.Cut(env, "="); ok {
switch {
case strings.HasPrefix(key, "OLLAMA_"),
strings.HasPrefix(key, "CUDA_"),
strings.HasPrefix(key, "ROCR_"),
strings.HasPrefix(key, "ROCM_"),
strings.HasPrefix(key, "HIP_"),
strings.HasPrefix(key, "GPU_"),
strings.HasPrefix(key, "HSA_"),
strings.HasPrefix(key, "GGML_"),
slices.Contains([]string{
"PATH",
"LD_LIBRARY_PATH",
"DYLD_LIBRARY_PATH",
}, key):
attrs = append(attrs, slog.String(key, value))
}
}
}
return slog.GroupValue(attrs...)
}
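A brief usage sketch (a hypothetical helper, not part of this change): filteredEnv implements slog.LogValuer, so passing it as an attribute defers the allow-list filtering until the record is actually emitted.

// Hypothetical helper for illustration only.
func logSubprocessEnv(cmd *exec.Cmd) {
	// Only the allow-listed variables (OLLAMA_*, CUDA_*, PATH, ...) are
	// included, and only when debug logging is enabled.
	slog.Debug("subprocess", "environment", filteredEnv(cmd.Env))
}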
type LlamaServer interface { type LlamaServer interface {
Ping(ctx context.Context) error Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error WaitUntilRunning(ctx context.Context) error
...@@ -92,7 +121,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { ...@@ -92,7 +121,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
} }
defer f.Close() defer f.Close()
ggml, _, err := ggml.Decode(f, maxArraySize) ggml, err := ggml.Decode(f, maxArraySize)
return ggml, err return ggml, err
} }
...@@ -148,10 +177,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a ...@@ -148,10 +177,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU)) params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU))
} }
if envconfig.Debug() {
params = append(params, "--verbose")
}
if opts.MainGPU > 0 { if opts.MainGPU > 0 {
params = append(params, "--main-gpu", strconv.Itoa(opts.MainGPU)) params = append(params, "--main-gpu", strconv.Itoa(opts.MainGPU))
} }
...@@ -286,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a ...@@ -286,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--mmproj", projectors[0]) params = append(params, "--mmproj", projectors[0])
} }
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc. // iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running // adding each library's respective path to the LD_LIBRARY_PATH, until finally running
// without any LD_LIBRARY_PATH flags // without any LD_LIBRARY_PATH flags
for { for {
...@@ -404,26 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a ...@@ -404,26 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
} }
slog.Info("starting llama server", "cmd", s.cmd) slog.Info("starting llama server", "cmd", s.cmd)
if envconfig.Debug() { slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "OLLAMA_") ||
strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCR_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||
strings.HasPrefix(ev, "GPU_") ||
strings.HasPrefix(ev, "HSA_") ||
strings.HasPrefix(ev, "GGML_") ||
strings.HasPrefix(ev, "PATH=") ||
strings.HasPrefix(ev, "LD_LIBRARY_PATH=") ||
strings.HasPrefix(ev, "DYLD_LIBRARY_PATH=") {
filteredEnv = append(filteredEnv, ev)
}
}
// Log at debug as the environment is inherited and might contain sensitive information
slog.Debug("subprocess", "environment", filteredEnv)
}
if err = s.cmd.Start(); err != nil { if err = s.cmd.Start(); err != nil {
var msg string var msg string
...@@ -673,9 +679,8 @@ ws ::= ([ \t\n] ws)? ...@@ -673,9 +679,8 @@ ws ::= ([ \t\n] ws)?
const maxBufferSize = 512 * format.KiloByte const maxBufferSize = 512 * format.KiloByte
type ImageData struct { type ImageData struct {
Data []byte `json:"data"` Data []byte `json:"data"`
ID int `json:"id"` ID int `json:"id"`
AspectRatioID int `json:"aspect_ratio_id"`
} }
type CompletionRequest struct { type CompletionRequest struct {
...@@ -721,6 +726,9 @@ type CompletionResponse struct { ...@@ -721,6 +726,9 @@ type CompletionResponse struct {
} }
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error { func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
if len(req.Format) > 0 { if len(req.Format) > 0 {
switch string(req.Format) { switch string(req.Format) {
case `null`, `""`: case `null`, `""`:
...@@ -789,7 +797,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu ...@@ -789,7 +797,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
res, err := http.DefaultClient.Do(serverReq) res, err := http.DefaultClient.Do(serverReq)
if err != nil { if err != nil {
return fmt.Errorf("POST predict: %v", err) slog.Error("post predict", "error", err)
return errors.New("model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details")
} }
defer res.Body.Close() defer res.Body.Close()
...@@ -884,6 +893,8 @@ type EmbeddingResponse struct { ...@@ -884,6 +893,8 @@ type EmbeddingResponse struct {
} }
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) { func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
if err := s.sem.Acquire(ctx, 1); err != nil { if err := s.sem.Acquire(ctx, 1); err != nil {
if errors.Is(err, context.Canceled) { if errors.Is(err, context.Canceled) {
slog.Info("aborting embedding request due to client closing the connection") slog.Info("aborting embedding request due to client closing the connection")
package logutil
import (
"io"
"log/slog"
"path/filepath"
)
const LevelTrace slog.Level = -8
func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
return slog.New(slog.NewTextHandler(w, &slog.HandlerOptions{
Level: level,
AddSource: true,
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
switch attr.Key {
case slog.LevelKey:
switch attr.Value.Any().(slog.Level) {
case LevelTrace:
attr.Value = slog.StringValue("TRACE")
}
case slog.SourceKey:
source := attr.Value.Any().(*slog.Source)
source.File = filepath.Base(source.File)
}
return attr
},
}))
}
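A minimal sketch of wiring up the new logger and TRACE level (the main function below is an assumption for illustration, not part of this change):

package main

import (
	"context"
	"log/slog"
	"os"

	"github.com/ollama/ollama/logutil"
)

func main() {
	// Install the handler so TRACE records print as "TRACE" and source
	// file paths are shortened to their base name.
	slog.SetDefault(logutil.NewLogger(os.Stderr, logutil.LevelTrace))

	slog.Debug("normal debug message")
	slog.Log(context.TODO(), logutil.LevelTrace, "very verbose message", "detail", 42)
}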
...@@ -5,7 +5,8 @@ import ( ...@@ -5,7 +5,8 @@ import (
"context" "context"
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"os" "log/slog"
"math"
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
...@@ -14,6 +15,11 @@ import ( ...@@ -14,6 +15,11 @@ import (
) )
type Backend interface { type Backend interface {
Load(ctx context.Context, progress func(float32)) error
// BackendMemory returns the memory allocations that were made for this model
BackendMemory() BackendMemory
Config() fs.Config Config() fs.Config
Get(name string) Tensor Get(name string) Tensor
NewContext() Context NewContext() Context
...@@ -51,10 +57,6 @@ type CacheConfig struct { ...@@ -51,10 +57,6 @@ type CacheConfig struct {
// BackendParams controls how the backend loads and executes models // BackendParams controls how the backend loads and executes models
type BackendParams struct { type BackendParams struct {
// Progress is a callback function that allows reporting percentage completion
// of model loading
Progress func(float32)
// NumThreads sets the number of threads to use if running on the CPU // NumThreads sets the number of threads to use if running on the CPU
NumThreads int NumThreads int
...@@ -71,9 +73,130 @@ type BackendParams struct { ...@@ -71,9 +73,130 @@ type BackendParams struct {
FlashAttention bool FlashAttention bool
} }
var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error)) // ErrNoMem is returned when panicking due to insufficient memory. It includes
// the attempted memory allocation.
type ErrNoMem struct {
BackendMemory
}
func (e ErrNoMem) Error() string {
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
}
type AllocationStatus int
const (
// Unallocated memory - have not yet attempted to allocate
Unallocated AllocationStatus = iota
// Failed memory - tried to allocate the memory and did not succeed
Failed
// Allocated memory - tried and succeeded to allocate memory
Allocated
)
// Memory is the size of an allocation and whether it was successful.
type Memory struct {
Size uint64
Status AllocationStatus
}
func (m Memory) String() string {
s := fmt.Sprint(m.Size)
func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) { switch m.Status {
case Unallocated:
s += "U"
case Failed:
s += "F"
case Allocated:
s += "A"
}
return s
}
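For example (an illustrative sketch, not part of this change), a successful 512 MiB allocation renders as the raw byte count followed by the one-letter status suffix:

// Illustrative only: returns "536870912A" (U = unallocated, F = failed,
// A = allocated).
func describeAllocation() string {
	m := Memory{Size: 512 * 1024 * 1024, Status: Allocated}
	return m.String()
}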
// DeviceMemory provides a breakdown of the memory needed
// per device, such as a CPU or GPU.
type DeviceMemory struct {
// Name is the name of the device as labeled by the backend. It
// may not be persistent across instances of the runner.
Name string
// UUID is a unique persistent identifier for the device for matching
// with system management libraries
UUID string
// Weights is the per-layer memory needed for the model weights.
Weights []Memory
// Cache is the per-layer memory needed for the KV cache.
Cache []Memory
// Graph is the size of the compute graph. It is not per-layer.
Graph Memory
}
func memoryPresent(mem []Memory) bool {
return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
}
func (m DeviceMemory) LogValue() slog.Value {
var attrs []slog.Attr
if memoryPresent(m.Weights) {
attrs = append(attrs, slog.Any("Weights", m.Weights))
}
if memoryPresent(m.Cache) {
attrs = append(attrs, slog.Any("Cache", m.Cache))
}
if m.Graph.Size != 0 {
attrs = append(attrs, slog.Any("Graph", m.Graph))
}
if len(attrs) > 0 && m.UUID != "" {
attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
}
return slog.GroupValue(attrs...)
}
// BackendMemory provides the amount of memory required to load the model
// per device based on the BackendParams. In some cases, not all required
// allocations will be known at this point. However, the size of the most recent
// allocation is guaranteed to be provided so that if it failed, the caller can
// accommodate that to make forward progress.
type BackendMemory struct {
// InputWeights are always located on the CPU and cannot be moved
InputWeights Memory
// CPU model components are located in system memory. This does not
// include unified memory allocated through the GPU.
CPU DeviceMemory
// GPU model components are located on one or more GPUs.
GPUs []DeviceMemory
}
func (m BackendMemory) LogValue() slog.Value {
var attrs []slog.Attr
if m.InputWeights.Size != 0 {
attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
}
attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
for _, g := range m.GPUs {
attrs = append(attrs, slog.Any(g.Name, g))
}
return slog.GroupValue(attrs...)
}
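A hedged sketch of how a caller might consume ErrNoMem (the recover wrapper below is an illustration, not the code from this change):

// Illustrative only: recover an ErrNoMem panic raised during model load and
// surface the per-device breakdown, so the caller can retry with a smaller
// GPU layer split.
func loadGuarded(load func()) (err error) {
	defer func() {
		if r := recover(); r != nil {
			noMem, ok := r.(ErrNoMem)
			if !ok {
				panic(r) // not a memory error, re-raise
			}
			slog.Warn("insufficient memory", "memory", noMem.BackendMemory)
			err = noMem
		}
	}()
	load()
	return nil
}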
var backends = make(map[string]func(string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
if _, ok := backends[name]; ok { if _, ok := backends[name]; ok {
panic("backend: backend already registered") panic("backend: backend already registered")
} }
...@@ -81,9 +204,9 @@ func RegisterBackend(name string, f func(context.Context, *os.File, BackendParam ...@@ -81,9 +204,9 @@ func RegisterBackend(name string, f func(context.Context, *os.File, BackendParam
backends[name] = f backends[name] = f
} }
func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) { func NewBackend(modelPath string, params BackendParams) (Backend, error) {
if backend, ok := backends["ggml"]; ok { if backend, ok := backends["ggml"]; ok {
return backend(ctx, f, params) return backend(modelPath, params)
} }
return nil, fmt.Errorf("unsupported backend") return nil, fmt.Errorf("unsupported backend")
...@@ -92,8 +215,8 @@ func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, ...@@ -92,8 +215,8 @@ func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend,
type Context interface { type Context interface {
Empty(dtype DType, shape ...int) Tensor Empty(dtype DType, shape ...int) Tensor
Zeros(dtype DType, shape ...int) Tensor Zeros(dtype DType, shape ...int) Tensor
FromFloatSlice(s []float32, shape ...int) (Tensor, error) FromFloatSlice(s []float32, shape ...int) Tensor
FromIntSlice(s []int32, shape ...int) (Tensor, error) FromIntSlice(s []int32, shape ...int) Tensor
// Arange creates a 1D tensor with values within an interval (start, stop] increased by step. // Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
Arange(start, stop, step float32, dtype DType) Tensor Arange(start, stop, step float32, dtype DType) Tensor
...@@ -105,7 +228,7 @@ type Context interface { ...@@ -105,7 +228,7 @@ type Context interface {
// graph, simply preallocates memory. Typically called with a // graph, simply preallocates memory. Typically called with a
// worst case graph to ensure all resources are available for // worst case graph to ensure all resources are available for
// for future inference. // for future inference.
Reserve() error Reserve()
MaxGraphNodes() int MaxGraphNodes() int
Close() Close()
...@@ -131,6 +254,8 @@ type Tensor interface { ...@@ -131,6 +254,8 @@ type Tensor interface {
Neg(ctx Context) Tensor Neg(ctx Context) Tensor
Add(ctx Context, t2 Tensor) Tensor Add(ctx Context, t2 Tensor) Tensor
Mul(ctx Context, t2 Tensor) Tensor Mul(ctx Context, t2 Tensor) Tensor
Div(ctx Context, t2 Tensor) Tensor
Mulmat(ctx Context, t2 Tensor) Tensor Mulmat(ctx Context, t2 Tensor) Tensor
MulmatFullPrec(ctx Context, t2 Tensor) Tensor MulmatFullPrec(ctx Context, t2 Tensor) Tensor
MulmatID(ctx Context, t2, ids Tensor) Tensor MulmatID(ctx Context, t2, ids Tensor) Tensor
...@@ -139,11 +264,11 @@ type Tensor interface { ...@@ -139,11 +264,11 @@ type Tensor interface {
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
Scale(ctx Context, s float64) Tensor Scale(ctx Context, s float64) Tensor
SumRows(ctx Context) Tensor
AvgPool2D(ctx Context, k, s int, p float32) Tensor AvgPool2D(ctx Context, k, s int, p float32) Tensor
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
Sin(ctx Context) Tensor Sin(ctx Context) Tensor
...@@ -160,7 +285,6 @@ type Tensor interface { ...@@ -160,7 +285,6 @@ type Tensor interface {
Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor
Pad(ctx Context, shape ...int) Tensor Pad(ctx Context, shape ...int) Tensor
Unpad(ctx Context, shape ...int) Tensor
Stack(ctx Context, dim int, s ...Tensor) Tensor Stack(ctx Context, dim int, s ...Tensor) Tensor
...@@ -172,6 +296,7 @@ type Tensor interface { ...@@ -172,6 +296,7 @@ type Tensor interface {
Duplicate(ctx Context) Tensor Duplicate(ctx Context) Tensor
TopK(ctx Context, k int) Tensor TopK(ctx Context, k int) Tensor
Argsort(ctx Context) Tensor
} }
// ScaledDotProductAttention implements a fused attention // ScaledDotProductAttention implements a fused attention
...@@ -214,35 +339,58 @@ func mul[T number](s ...T) T { ...@@ -214,35 +339,58 @@ func mul[T number](s ...T) T {
return p return p
} }
type DumpOptions struct { type DumpOptions func(*dumpOptions)
// Items is the number of elements to print at the beginning and end of each dimension.
Items int
// Precision is the number of decimal places to print. Applies to float32 and float64. // DumpWithPrecision sets the number of decimal places to print. Applies to float32 and float64.
Precision int func DumpWithPrecision(n int) DumpOptions {
return func(opts *dumpOptions) {
opts.Precision = n
}
} }
func Dump(ctx Context, t Tensor, opts ...DumpOptions) string { // DumpWithThreshold sets the threshold for printing the entire tensor. If the number of elements
if len(opts) < 1 { // is less than or equal to this value, the entire tensor will be printed. Otherwise, only the
opts = append(opts, DumpOptions{ // beginning and end of each dimension will be printed.
Items: 3, func DumpWithThreshold(n int) DumpOptions {
Precision: 4, return func(opts *dumpOptions) {
}) opts.Threshold = n
}
}
// DumpWithEdgeItems sets the number of elements to print at the beginning and end of each dimension.
func DumpWithEdgeItems(n int) DumpOptions {
return func(opts *dumpOptions) {
opts.EdgeItems = n
}
}
type dumpOptions struct {
Precision, Threshold, EdgeItems int
}
func Dump(ctx Context, t Tensor, optsFuncs ...DumpOptions) string {
opts := dumpOptions{Precision: 4, Threshold: 1000, EdgeItems: 3}
for _, optsFunc := range optsFuncs {
optsFunc(&opts)
}
if mul(t.Shape()...) <= opts.Threshold {
opts.EdgeItems = math.MaxInt
} }
switch t.DType() { switch t.DType() {
case DTypeF32: case DTypeF32:
return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string { return dump[[]float32](ctx, t, opts.EdgeItems, func(f float32) string {
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32) return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
}) })
case DTypeF16, DTypeQ80, DTypeQ40: case DTypeF16, DTypeQ80, DTypeQ40:
f32 := ctx.Input().Empty(DTypeF32, t.Shape()...) f32 := ctx.Input().Empty(DTypeF32, t.Shape()...)
f32 = t.Copy(ctx, f32) f32 = t.Copy(ctx, f32)
return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string { return dump[[]float32](ctx, f32, opts.EdgeItems, func(f float32) string {
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32) return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
}) })
case DTypeI32: case DTypeI32:
return dump[[]int32](ctx, t, opts[0].Items, func(i int32) string { return dump[[]int32](ctx, t, opts.EdgeItems, func(i int32) string {
return strconv.FormatInt(int64(i), 10) return strconv.FormatInt(int64(i), 10)
}) })
default: default:
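A short sketch of the new option-function call style for Dump (illustrative; the ctx and t arguments are assumed to come from an existing backend):

// Illustrative only: dump with 2 decimal places, print tensors of up to 64
// elements in full, and otherwise keep 4 edge items per dimension.
func dumpForDebug(ctx Context, t Tensor) string {
	return Dump(ctx, t,
		DumpWithPrecision(2),
		DumpWithThreshold(64),
		DumpWithEdgeItems(4),
	)
}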
...@@ -10,7 +10,6 @@ import "C" ...@@ -10,7 +10,6 @@ import "C"
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"io" "io"
"log/slog" "log/slog"
...@@ -27,8 +26,10 @@ import ( ...@@ -27,8 +26,10 @@ import (
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
fsggml "github.com/ollama/ollama/fs/ggml" fsggml "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src" ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
"github.com/ollama/ollama/ml/nn/rope"
"golang.org/x/sync/errgroup" "golang.org/x/sync/errgroup"
) )
...@@ -43,8 +44,15 @@ func devices() []*C.struct_ggml_backend_device { ...@@ -43,8 +44,15 @@ func devices() []*C.struct_ggml_backend_device {
} }
type Backend struct { type Backend struct {
// modelPath is the location of the model data
modelPath string
meta *fsggml.GGML meta *fsggml.GGML
// tensorLoadTargets maps from the name of the tensor in the file
// to the name that is used by the model definition
tensorLoadTargets map[string][]string
sched *C.struct_ggml_backend_sched sched *C.struct_ggml_backend_sched
schedBackends []*C.struct_ggml_backend schedBackends []*C.struct_ggml_backend
schedBufts []*C.struct_ggml_backend_buffer_type schedBufts []*C.struct_ggml_backend_buffer_type
...@@ -57,14 +65,26 @@ type Backend struct { ...@@ -57,14 +65,26 @@ type Backend struct {
// layers is the backend used for repeating layers // layers is the backend used for repeating layers
layers map[int]*C.struct_ggml_backend_buffer_type layers map[int]*C.struct_ggml_backend_buffer_type
// requiredMemory is the cumulative memory allocations needed by the backend
requiredMemory *ml.BackendMemory
// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
flashAttention bool flashAttention bool
// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler // maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
maxGraphNodes int maxGraphNodes int
} }
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) { func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
meta, n, err := fsggml.Decode(r, -1) r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()
meta, err := fsggml.Decode(r, -1)
if err != nil { if err != nil {
return nil, err return nil, err
} }
...@@ -79,6 +99,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -79,6 +99,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
"num_key_values", len(meta.KV()), "num_key_values", len(meta.KV()),
) )
var requiredMemory ml.BackendMemory
btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
type deviceBufferType struct { type deviceBufferType struct {
d *C.struct_ggml_backend_device d *C.struct_ggml_backend_device
bts []*C.struct_ggml_backend_buffer_type bts []*C.struct_ggml_backend_buffer_type
...@@ -99,6 +122,8 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -99,6 +122,8 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
} }
} }
blocks := int(meta.KV().BlockCount())
// create list of buffer types for the cpu // create list of buffer types for the cpu
cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)} cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
for _, d := range append(accels, append(gpus, cpus...)...) { for _, d := range append(accels, append(gpus, cpus...)...) {
...@@ -106,17 +131,33 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -106,17 +131,33 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
case C.GGML_BACKEND_DEVICE_TYPE_CPU, case C.GGML_BACKEND_DEVICE_TYPE_CPU,
C.GGML_BACKEND_DEVICE_TYPE_ACCEL: C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d)) cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
} }
} }
requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
requiredMemory.CPU.UUID = C.GoString(props.uuid)
requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
// create list of buffer types for each gpu // create list of buffer types for each gpu
var gpuDeviceBufferTypes []deviceBufferType var gpuDeviceBufferTypes []deviceBufferType
for _, d := range gpus { requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
for i, d := range gpus {
bt := C.ggml_backend_dev_buffer_type(d) bt := C.ggml_backend_dev_buffer_type(d)
gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{ gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
d: d, d: d,
bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...), bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
}) })
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(d, &props)
requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
} }
useDefaultSplit := true useDefaultSplit := true
...@@ -155,8 +196,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -155,8 +196,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
// inputs always use cpu // inputs always use cpu
input := cpuDeviceBufferType input := cpuDeviceBufferType
blocks := int(meta.KV().BlockCount())
// define a range of gpu layers. anything outside of this range is assigned to the cpu // define a range of gpu layers. anything outside of this range is assigned to the cpu
gpuRangeStart := max(0, blocks-params.NumGPULayers) gpuRangeStart := max(0, blocks-params.NumGPULayers)
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1) gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
...@@ -197,7 +236,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -197,7 +236,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
// contexts are shared by tensors of the same buffer type // contexts are shared by tensors of the same buffer type
ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context) ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor { createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
for _, bt := range bts { for _, bt := range bts {
if _, ok := ctxs[bt]; !ok { if _, ok := ctxs[bt]; !ok {
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{ ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
...@@ -222,7 +261,17 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -222,7 +261,17 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0]))) tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
C.ggml_set_name(tt, cname) C.ggml_set_name(tt, cname)
slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt))) slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
if layer == -1 {
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
requiredMemory.InputWeights.Status = ml.Allocated
requiredMemory.InputWeights.Size += uint64(size)
} else {
btDeviceMemory[bt].Weights[layer].Size += uint64(size)
}
//nolint:staticcheck // TODO: check if buffer type supports this tensor //nolint:staticcheck // TODO: check if buffer type supports this tensor
return tt return tt
} }
...@@ -244,22 +293,22 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -244,22 +293,22 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
for _, t := range meta.Tensors().Items() { for _, t := range meta.Tensors().Items() {
switch { switch {
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"): case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
createTensor(tensor{source: t}, input.bts) createTensor(tensor{source: t}, input.bts, -1)
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" { if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
createTensor(tensor{source: t, target: "output.weight"}, output.bts) createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
} }
case contains(t.Name, "cls", "output", "output_norm"): case contains(t.Name, "cls", "output", "output_norm"):
createTensor(tensor{source: t}, output.bts) createTensor(tensor{source: t}, output.bts, blocks)
case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."): case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
// TODO: assign vision tensors to the gpu if possible // TODO: assign vision tensors to the gpu if possible
createTensor(tensor{source: t}, output.bts) createTensor(tensor{source: t}, output.bts, blocks)
case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"): case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
// these tensors should be repeated per layer // these tensors should be repeated per layer
for i, layer := range layers { for i, layer := range layers {
createTensor(tensor{ createTensor(tensor{
source: t, source: t,
target: "blk." + strconv.Itoa(i) + "." + t.Name, target: "blk." + strconv.Itoa(i) + "." + t.Name,
}, layer.bts) }, layer.bts, i)
} }
default: default:
layerIndex := -1 layerIndex := -1
...@@ -270,10 +319,10 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -270,10 +319,10 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
} }
if layerIndex >= 0 { if layerIndex >= 0 {
createTensor(tensor{source: t}, layers[layerIndex].bts) createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
} else { } else {
// load all other tensors on the cpu // load all other tensors on the cpu
createTensor(tensor{source: t}, input.bts) createTensor(tensor{source: t}, input.bts, -1)
} }
} }
} }
...@@ -286,8 +335,18 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -286,8 +335,18 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
} }
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt) b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
for i := range btDeviceMemory[bt].Weights {
if btDeviceMemory[bt].Weights[i].Size != 0 {
if b != nil {
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
} else {
btDeviceMemory[bt].Weights[i].Status = ml.Failed
}
}
}
if b == nil { if b == nil {
return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt))) panic(ml.ErrNoMem{BackendMemory: requiredMemory})
} }
C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS) C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
...@@ -306,22 +365,79 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -306,22 +365,79 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
} }
} }
// map devices to backend buffer types so new tensors can be assigned to the correct device
deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
// create backends and buffer types used for the compute graph scheduler
var schedBackends []*C.struct_ggml_backend
var schedBufts []*C.struct_ggml_backend_buffer_type
for _, d := range append(gpus, append(accels, cpus...)...) {
b := C.ggml_backend_dev_init(d, nil)
bt := C.ggml_backend_get_default_buffer_type(b)
deviceBufferTypes[d] = bt
schedBackends = append(schedBackends, b)
schedBufts = append(schedBufts, bt)
if C.ggml_backend_is_cpu(b) {
// set number of threads for cpu backend
C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
}
}
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
return &Backend{
modelPath: modelPath,
flashAttention: params.FlashAttention,
meta: meta,
tensorLoadTargets: targets,
tensors: tensors,
sched: C.ggml_backend_sched_new(
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
C.int(len(schedBackends)),
C.size_t(maxGraphNodes),
C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
C._Bool(false),
),
schedBackends: schedBackends,
schedBufts: schedBufts,
input: deviceBufferTypes[input.d],
layers: func() map[int]*C.struct_ggml_backend_buffer_type {
m := make(map[int]*C.struct_ggml_backend_buffer_type)
for i, layer := range layers {
m[i] = deviceBufferTypes[layer.d]
}
return m
}(),
requiredMemory: &requiredMemory,
btDeviceMemory: btDeviceMemory,
maxGraphNodes: maxGraphNodes,
}, nil
}
func init() {
ml.RegisterBackend("ggml", New)
}
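As a rough sketch of the resulting two-step flow from a caller's side (the wrapper, path handling, and layer count below are hypothetical, not part of this change):

// Illustrative only: construct the backend first, then stream the weights
// with a progress callback, mirroring the split introduced above.
func loadModel(ctx context.Context, modelPath string) (ml.Backend, error) {
	b, err := ml.NewBackend(modelPath, ml.BackendParams{NumGPULayers: 999})
	if err != nil {
		return nil, err
	}
	if err := b.Load(ctx, func(progress float32) {
		slog.Info("loading model", "progress", progress)
	}); err != nil {
		return nil, err
	}
	return b, nil
}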
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
	var doneBytes atomic.Uint64
-	totalBytes := uint64(n) - meta.Tensors().Offset
+	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
-	for _, t := range meta.Tensors().Items() {
+	for _, t := range b.meta.Tensors().Items() {
		t := t
		g.Go(func() error {
-			tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
+			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
			for i := range tts {
-				target := targets[t.Name][i]
+				target := b.tensorLoadTargets[t.Name][i]
				if target == "" {
					target = t.Name
				}
-				tt, ok := tensors[target]
+				tt, ok := b.tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}
@@ -331,13 +447,13 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
			// seeking around within an FD shared between all goroutines.
-			file, err := os.Open(r.Name())
+			file, err := os.Open(b.modelPath)
			if err != nil {
-				slog.Warn("file open error", "file", r.Name(), "error", err)
+				slog.Warn("file open error", "file", b.modelPath, "error", err)
				return err
			}
			defer file.Close()
-			sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
+			sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
			bts := make([]byte, 128*format.KibiByte)
			var s uint64
@@ -349,7 +465,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
-					slog.Warn("file read error", "file", r.Name(), "error", err)
+					slog.Warn("file read error", "file", b.modelPath, "error", err)
					return err
				}
@@ -359,9 +475,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
				s += uint64(n)
-				if params.Progress != nil {
+				if progress != nil {
					done := doneBytes.Add(uint64(n))
-					params.Progress(float32(done) / float32(totalBytes))
+					progress(float32(done) / float32(totalBytes))
				}
			}
@@ -370,58 +486,14 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
	}
	if err := g.Wait(); err != nil {
-		return nil, err
+		return err
	}
-	// map devices to backend buffer types so new tensors can be assigned to the correct device
-	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
-	// create backends and buffer types used for the compute graph scheduler
-	var schedBackends []*C.struct_ggml_backend
-	var schedBufts []*C.struct_ggml_backend_buffer_type
-	for _, d := range append(gpus, append(accels, cpus...)...) {
-		b := C.ggml_backend_dev_init(d, nil)
-		bt := C.ggml_backend_get_default_buffer_type(b)
-		deviceBufferTypes[d] = bt
-		schedBackends = append(schedBackends, b)
-		schedBufts = append(schedBufts, bt)
-		if C.ggml_backend_is_cpu(b) {
-			// set number of threads for cpu backend
-			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
-		}
-	}
-	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
-	return &Backend{
-		flashAttention: params.FlashAttention,
-		meta: meta,
-		tensors: tensors,
-		sched: C.ggml_backend_sched_new(
-			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
-			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
-			C.int(len(schedBackends)),
-			C.size_t(maxGraphNodes),
-			C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
-		),
-		schedBackends: schedBackends,
-		schedBufts: schedBufts,
-		input: deviceBufferTypes[input.d],
-		layers: func() map[int]*C.struct_ggml_backend_buffer_type {
-			m := make(map[int]*C.struct_ggml_backend_buffer_type)
-			for i, layer := range layers {
-				m[i] = deviceBufferTypes[layer.d]
-			}
-			return m
-		}(),
-		maxGraphNodes: maxGraphNodes,
-	}, nil
+	return nil
}
-func init() {
-	ml.RegisterBackend("ggml", New)
+func (b *Backend) BackendMemory() ml.BackendMemory {
+	return *b.requiredMemory
}
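Reviewer note: the hunk above moves tensor I/O out of New and into the new Load method, which streams bytes from b.modelPath and reports progress through a plain callback instead of params.Progress. The following is a minimal, self-contained sketch of driving that callback; the loader interface and helper below are illustrative assumptions, not code from this change (only Load's signature is taken from the diff), and because Load invokes the callback from several errgroup workers the sketch tracks state atomically.

package main

import (
	"context"
	"fmt"
	"log"
	"sync/atomic"
)

// loader is a local stand-in for the *Backend type in this diff; only the
// Load signature matters for the sketch.
type loader interface {
	Load(ctx context.Context, progress func(float32)) error
}

// loadWithProgress drives Load and prints whole-percent progress updates.
// The callback may run concurrently, so the last printed value is atomic.
func loadWithProgress(ctx context.Context, b loader) error {
	var lastPct atomic.Int64
	lastPct.Store(-1)
	return b.Load(ctx, func(p float32) {
		pct := int64(p * 100)
		if prev := lastPct.Load(); pct > prev && lastPct.CompareAndSwap(prev, pct) {
			fmt.Printf("loading tensors: %d%%\n", pct)
		}
	})
}

func main() {
	var b loader // wiring up the concrete ggml Backend is outside this sketch
	if b == nil {
		log.Println("no backend configured; sketch only")
		return
	}
	if err := loadWithProgress(context.Background(), b); err != nil {
		log.Fatal(err)
	}
}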
func (b *Backend) Config() fs.Config {
@@ -455,6 +527,7 @@ func (b *Backend) NewContextSize(n int) ml.Context {
			no_alloc: true,
		}),
		allocatedBuffers: &allocatedBuffers,
+		layer:            -1,
	}
}
@@ -481,6 +554,9 @@ type Context struct {
	// maxGraphNodes is the maximum allowed number of graph nodes in this context
	maxGraphNodes int
+
+	// layer is the graph layer that this context is allocating for - assumed to be cache
+	layer int
}
func (c *Context) Input() ml.Context {
@@ -491,6 +567,7 @@ func (c *Context) Input() ml.Context {
			buft:             c.b.input,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
+			layer:            -1,
		}
	}
@@ -505,6 +582,7 @@ func (c *Context) Layer(i int) ml.Context {
		buft:             buft,
		allocatedBuffers: c.allocatedBuffers,
		maxGraphNodes:    c.maxGraphNodes,
+		layer:            i,
	}
}
@@ -524,7 +602,9 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
}
func (c *Context) Compute(tensors ...ml.Tensor) {
-	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
+	if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
+		panic(fmt.Errorf("error computing ggml graph: %v", status))
+	}
+
	C.ggml_backend_sched_reset(c.b.sched)
needSync := true needSync := true
...@@ -542,22 +622,34 @@ func (c *Context) Compute(tensors ...ml.Tensor) { ...@@ -542,22 +622,34 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
} }
} }
-func (c *Context) Reserve() error {
-	if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) {
-		C.ggml_backend_sched_reset(c.b.sched)
-		return errors.New("failed to reserve graph")
-	}
+func (c *Context) Reserve() {
+	reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)
	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
+	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
+	for _, bt := range c.b.schedBufts {
+		c.b.btDeviceMemory[bt].Graph = ml.Memory{}
+	}
	for i := range c.b.schedBackends {
-		size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i])
+		bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
+
+		graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
+		graph.Size += uint64(bufferStatus.size)
+		if bufferStatus.allocated && graph.Status != ml.Failed {
+			graph.Status = ml.Allocated
+		} else {
+			graph.Status = ml.Failed
+		}
+
		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
-			"size", format.HumanBytes2(uint64(size)))
+			"size", format.HumanBytes2(uint64(bufferStatus.size)))
	}
-	C.ggml_backend_sched_reset(c.b.sched)
-	return nil
+	if !reserved {
+		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
+	}
}
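Reserve no longer returns an error: a failed ggml_backend_sched_reserve now records per-buffer-type Graph sizes and statuses and then panics with ml.ErrNoMem carrying the accumulated requiredMemory. A hedged sketch of how a caller might convert that panic back into an error follows; the helper name is illustrative, and it assumes the ml import used elsewhere in this file plus the exported BackendMemory field shown in the panic above.

// estimateGraph runs fn, which is expected to build a graph and call
// Context.Reserve, and converts an ml.ErrNoMem panic into an error so the
// reported memory can be logged or used to pick a smaller configuration.
func estimateGraph(fn func()) (err error) {
	defer func() {
		if r := recover(); r == nil {
			return
		} else if noMem, ok := r.(ml.ErrNoMem); ok {
			// BackendMemory mirrors what Backend.BackendMemory() returns.
			err = fmt.Errorf("insufficient memory reserving compute graph: %+v", noMem.BackendMemory)
		} else {
			panic(r) // not a memory failure; re-raise
		}
	}()

	fn()
	return nil
}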
func (c *Context) MaxGraphNodes() int {
@@ -577,7 +669,7 @@ func pad(length, pad C.size_t) C.size_t {
	return ((length + pad - 1) / pad) * pad
}
-func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
+func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
	if c.buft == nil {
		panic("set Input or Layer before creating tensors")
	}
@@ -600,7 +692,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
	if len(shape) < 1 || shape[0] == 0 {
		var shape C.int64_t = 0
-		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
+		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
	} else if len(shape) > 4 {
		panic("unsupported number of dimensions")
	}
@@ -613,40 +705,43 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
+
+	if c.layer >= 0 {
+		cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
+
+		cache.Size += uint64(size)
+		if b != nil {
+			cache.Status = ml.Allocated
+		} else {
+			cache.Status = ml.Failed
+		}
+	}
+
	if b == nil {
-		return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
+		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
	}
	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
-	return &Tensor{b: c.b, t: t}, nil
+	return &Tensor{b: c.b, t: t}
}
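Together with the new layer field on Context, newTensor now books every allocation either to the device's per-layer Cache entry (layer >= 0) or leaves it untracked for input and graph contexts, and panics with ml.ErrNoMem instead of returning an error. The short sketch below illustrates the calling pattern this enables, allocating cache tensors through layer-scoped contexts so a failure is attributed to the layer that requested it; the function name and tensor sizes are assumptions, not code from this change.

// allocCacheTensors allocates one tensor per layer via Context.Layer(i), so
// the size and allocated/failed status of each allocation lands in
// btDeviceMemory[buft].Cache[i] for that layer's buffer type.
func allocCacheTensors(ctx ml.Context, numLayers, cells, dim int) []ml.Tensor {
	cache := make([]ml.Tensor, numLayers)
	for i := range cache {
		layerCtx := ctx.Layer(i) // allocations below are attributed to layer i
		cache[i] = layerCtx.Empty(ml.DTypeF32, dim, cells)
	}
	return cache
}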
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor { func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
t, err := c.newTensor(dtype, shape) return c.newTensor(dtype, shape)
if err != nil {
panic(err)
}
return t
} }
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor { func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
t, err := c.newTensor(dtype, shape) t := c.newTensor(dtype, shape)
if err != nil {
panic(err)
}
C.ggml_set_zero(t.(*Tensor).t) C.ggml_set_zero(t.(*Tensor).t)
return t return t
} }
func checkShape[S ~[]E, E any](s S, shape ...int) error { func checkShape[S ~[]E, E any](s S, shape ...int) {
n := len(s) n := len(s)
if n == 0 { if n == 0 {
return nil return
} }
for _, v := range shape { for _, v := range shape {
...@@ -654,44 +749,32 @@ func checkShape[S ~[]E, E any](s S, shape ...int) error { ...@@ -654,44 +749,32 @@ func checkShape[S ~[]E, E any](s S, shape ...int) error {
} }
if n != 1 { if n != 1 {
return fmt.Errorf("invalid shape: %v", shape) panic(fmt.Errorf("invalid shape: %v", shape))
} }
return nil
} }
func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) { func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
if err := checkShape(s, shape...); err != nil { checkShape(s, shape...)
return nil, err
}
t, err := c.newTensor(ml.DTypeF32, shape) t := c.newTensor(ml.DTypeF32, shape)
if err != nil {
return nil, err
}
if len(s) > 0 { if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t)) C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
} }
return t, nil return t
} }
func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) { func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
if err := checkShape(s, shape...); err != nil { checkShape(s, shape...)
return nil, err
}
t, err := c.newTensor(ml.DTypeI32, shape) t := c.newTensor(ml.DTypeI32, shape)
if err != nil {
return nil, err
}
if len(s) > 0 { if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t)) C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
} }
return t, nil return t
} }
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor { func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
...@@ -709,12 +792,7 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor { ...@@ -709,12 +792,7 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
arange = append(arange, int32(i)) arange = append(arange, int32(i))
} }
t, err := c.Input().FromIntSlice(arange, len(arange)) return c.Input().FromIntSlice(arange, len(arange))
if err != nil {
panic(err)
}
return t
default: default:
panic("unsupported dtype for arange") panic("unsupported dtype for arange")
} }
...@@ -865,6 +943,13 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor { ...@@ -865,6 +943,13 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
} }
} }
func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
}
}
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor { func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{ return &Tensor{
b: t.b, b: t.b,
...@@ -913,6 +998,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor { ...@@ -913,6 +998,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
+	} else if shape[3] != 0 {
+		panic("cuda does not support 4d tensors")
	}
return &Tensor{ return &Tensor{
...@@ -980,6 +1067,13 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor { ...@@ -980,6 +1067,13 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
} }
} }
func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
}
}
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor { func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
return &Tensor{ return &Tensor{
b: t.b, b: t.b,
...@@ -1015,17 +1109,6 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor { ...@@ -1015,17 +1109,6 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
} }
} }
func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
}
return &Tensor{
b: t.b,
t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
}
}
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
switch len(shape) { switch len(shape) {
case 1: case 1:
...@@ -1062,16 +1145,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { ...@@ -1062,16 +1145,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
} }
} }
-const (
-	ropeTypeNorm   C.int = 0
-	ropeTypeNeox   C.int = 2
-	ropeTypeMrope  C.int = 8
-	ropeTypeVision C.int = 24
-)
-
-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
-	if ropeFactors == nil {
-		ropeFactors = &Tensor{b: t.b}
+func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
+	// Default options
+	opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}
+
+	// Apply any provided options
+	for _, option := range options {
+		option(opts)
	}
	dequant := t.t
@@ -1082,16 +1162,19 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
	return &Tensor{
		b: t.b,
		t: C.ggml_rope_ext(
-			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
+			ctx.(*Context).ctx,
+			dequant,
+			positions.(*Tensor).t,
+			opts.Factors.(*Tensor).t,
			C.int(ropeDim),
-			C.int(ropeType),
-			131072, // YaRN n_ctx_train
+			C.int(opts.Type),
+			C.int(opts.OriginalContextLength),
			C.float(ropeBase),
			C.float(ropeScale),
-			0.,  // YaRN ext_factor
-			1.,  // YaRN attn_factor
-			32., // YaRN beta_fast
-			1.,  // YaRN beta_slow
+			C.float(0.0),
+			C.float(1.0),
+			C.float(32.0),
+			C.float(1.0),
		),
	}
}
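The RoPE method now takes variadic functional options over rope.Options instead of positional ropeFactors/ropeType arguments. Below is a usage sketch under the fields referenced in the call above (Type, Factors, OriginalContextLength); it assumes ml.Tensor exposes the same RoPE signature and that the surrounding ml and rope imports match this file, and the numeric rope type simply mirrors the removed ropeTypeNeox constant.

// applyRoPE shows both call styles of the variadic-options RoPE API.
func applyRoPE(ctx ml.Context, q, k, positions, freqFactors ml.Tensor, dim int, base, scale float32) (ml.Tensor, ml.Tensor) {
	// Defaults: zero rope type, OriginalContextLength 131072, empty Factors.
	q = q.RoPE(ctx, positions, dim, base, scale)

	// NeoX-style rope with explicit frequency factors and a shorter training context.
	k = k.RoPE(ctx, positions, dim, base, scale, func(o *rope.Options) {
		o.Type = 2 // value carried over from the removed ropeTypeNeox constant
		o.Factors = freqFactors
		o.OriginalContextLength = 32768
	})

	return q, k
}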
...@@ -1185,3 +1268,10 @@ func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor { ...@@ -1185,3 +1268,10 @@ func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)), t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
} }
} }
func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
}
}
...@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph ...@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
struct ggml_allocr_buffer_status {
size_t size;
bool allocated;
};
GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
// Utils // Utils
// Create a buffer and allocate all the tensors in a ggml_context // Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
......
...@@ -38,7 +38,7 @@ extern "C" { ...@@ -38,7 +38,7 @@ extern "C" {
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
...@@ -59,7 +59,7 @@ extern "C" { ...@@ -59,7 +59,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
...@@ -152,6 +152,7 @@ extern "C" { ...@@ -152,6 +152,7 @@ extern "C" {
struct ggml_backend_dev_props { struct ggml_backend_dev_props {
const char * name; const char * name;
const char * description; const char * description;
const char * uuid;
size_t memory_free; size_t memory_free;
size_t memory_total; size_t memory_total;
enum ggml_backend_dev_type type; enum ggml_backend_dev_type type;
...@@ -248,7 +249,7 @@ extern "C" { ...@@ -248,7 +249,7 @@ extern "C" {
// preferrably to run on the same backend as the buffer // preferrably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false); sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
// initialize buffers from a max size graph (optional) // initialize buffers from a max size graph (optional)
reserve_graph = build_graph(sched, max_batch_size); reserve_graph = build_graph(sched, max_batch_size);
...@@ -289,7 +290,7 @@ extern "C" { ...@@ -289,7 +290,7 @@ extern "C" {
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler, backends with low index are given priority over backends with high index // Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel); GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
...@@ -304,6 +305,12 @@ extern "C" { ...@@ -304,6 +305,12 @@ extern "C" {
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
struct ggml_backend_buffer_status {
size_t size;
bool allocated;
};
GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
......
...@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr; ...@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } }; struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr; typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend // ggml-backend
......
...@@ -37,13 +37,16 @@ extern "C" { ...@@ -37,13 +37,16 @@ extern "C" {
// ====== Dataset ====== // ====== Dataset ======
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init( GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
int64_t ne_datapoint, // number of elements per datapoint enum ggml_type type_data, // the type for the internal data tensor
int64_t ne_label, // number of elements per label enum ggml_type type_label, // the type for the internal labels tensor
int64_t ndata, // total number of datapoints/labels int64_t ne_datapoint, // number of elements per datapoint
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied) int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset); GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
// get underlying tensors that store the data // get underlying tensors that store the data
GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata] GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata] GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]
...@@ -56,13 +59,19 @@ extern "C" { ...@@ -56,13 +59,19 @@ extern "C" {
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch] struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch] struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
int64_t ibatch); int64_t ibatch);
GGML_API void ggml_opt_dataset_get_batch_host(
ggml_opt_dataset_t dataset,
void * data_batch,
size_t nb_data_batch,
void * labels_batch,
int64_t ibatch);
// ====== Model / Context ====== // ====== Model / Context ======
enum ggml_opt_build_type { enum ggml_opt_build_type {
GGML_OPT_BUILD_TYPE_FORWARD, GGML_OPT_BUILD_TYPE_FORWARD = 10,
GGML_OPT_BUILD_TYPE_GRAD, GGML_OPT_BUILD_TYPE_GRAD = 20,
GGML_OPT_BUILD_TYPE_OPT, GGML_OPT_BUILD_TYPE_OPT = 30,
}; };
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
...@@ -81,20 +90,22 @@ extern "C" { ...@@ -81,20 +90,22 @@ extern "C" {
// userdata can be used to pass arbitrary data // userdata can be used to pass arbitrary data
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata); typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
// returns the default optimizer params (constant) // returns the default optimizer params (constant, hard-coded values)
// userdata is not used // userdata is not used
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata); GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
// casts userdata to ggml_opt_optimizer_params and returns it
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
// parameters for initializing a new optimization context // parameters for initializing a new optimization context
struct ggml_opt_params { struct ggml_opt_params {
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
struct ggml_context * ctx_compute; // created in user code, holds non-static tensors // by default the forward graph needs to be reconstructed for each eval
// if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
// the forward graph is defined by inputs and outputs struct ggml_context * ctx_compute;
// those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts struct ggml_tensor * inputs;
struct ggml_tensor * inputs; struct ggml_tensor * outputs;
struct ggml_tensor * outputs;
enum ggml_opt_loss_type loss_type; enum ggml_opt_loss_type loss_type;
enum ggml_opt_build_type build_type; enum ggml_opt_build_type build_type;
...@@ -107,12 +118,9 @@ extern "C" { ...@@ -107,12 +118,9 @@ extern "C" {
// get parameters for an optimization context with defaults set where possible // get parameters for an optimization context with defaults set where possible
// parameters for which no sensible defaults exist are supplied as arguments to this function // parameters for which no sensible defaults exist are supplied as arguments to this function
GGML_API ggml_opt_params ggml_opt_default_params( GGML_API struct ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched, ggml_backend_sched_t backend_sched,
struct ggml_context * ctx_compute, enum ggml_opt_loss_type loss_type);
struct ggml_tensor * inputs,
struct ggml_tensor * outputs,
enum ggml_opt_loss_type loss_type);
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params); GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx); GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
...@@ -121,6 +129,7 @@ extern "C" { ...@@ -121,6 +129,7 @@ extern "C" {
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer); GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
// get underlying tensors that store data // get underlying tensors that store data
// if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
...@@ -128,11 +137,12 @@ extern "C" { ...@@ -128,11 +137,12 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
// get the gradient accumulator for a node from the forward graph
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
// ====== Optimization Result ====== // ====== Optimization Result ======
GGML_API ggml_opt_result_t ggml_opt_result_init(); GGML_API ggml_opt_result_t ggml_opt_result_init(void);
GGML_API void ggml_opt_result_free(ggml_opt_result_t result); GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result); GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
...@@ -144,11 +154,20 @@ extern "C" { ...@@ -144,11 +154,20 @@ extern "C" {
// ====== Computation ====== // ====== Computation ======
// do forward pass, increment result if not NULL // if not using static graphs, this function must be called prior to ggml_opt_alloc
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); GGML_API void ggml_opt_prepare_alloc(
ggml_opt_context_t opt_ctx,
struct ggml_context * ctx_compute,
struct ggml_cgraph * gf,
struct ggml_tensor * inputs,
struct ggml_tensor * outputs);
// allocate the next graph for evaluation, either forward or forward + backward
// must be called exactly once prior to calling ggml_opt_eval
GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
// do forward pass, increment result if not NULL, do backward pass // do forward pass, increment result if not NULL, do backward pass if allocated
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// ############################################################################ // ############################################################################
// ## The high-level functions start here. They do not depend on any private ## // ## The high-level functions start here. They do not depend on any private ##
...@@ -200,9 +219,9 @@ extern "C" { ...@@ -200,9 +219,9 @@ extern "C" {
// fit model defined by inputs and outputs to dataset // fit model defined by inputs and outputs to dataset
GGML_API void ggml_opt_fit( GGML_API void ggml_opt_fit(
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch] struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize enum ggml_opt_loss_type loss_type, // loss to minimize
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t) ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
......
...@@ -489,7 +489,6 @@ extern "C" { ...@@ -489,7 +489,6 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D, GGML_OP_PAD_REFLECT_1D,
GGML_OP_UNPAD,
GGML_OP_ARANGE, GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
...@@ -674,11 +673,15 @@ extern "C" { ...@@ -674,11 +673,15 @@ extern "C" {
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
// returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
...@@ -765,7 +768,7 @@ extern "C" { ...@@ -765,7 +768,7 @@ extern "C" {
// Tensor flags // Tensor flags
GGML_API void ggml_set_input(struct ggml_tensor * tensor); GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor); GGML_API void ggml_set_output(struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API void ggml_set_param(struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor); GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
// //
...@@ -935,7 +938,7 @@ extern "C" { ...@@ -935,7 +938,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_repeat_back( GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
// concat a and b along dim // concat a and b along dim
// used in stable-diffusion // used in stable-diffusion
...@@ -1778,15 +1781,6 @@ extern "C" { ...@@ -1778,15 +1781,6 @@ extern "C" {
int p0, int p0,
int p1); int p1);
// unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
GGML_API struct ggml_tensor * ggml_unpad(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1,
int p2,
int p3);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,] // timesteps: [N,]
// return: [N, dim] // return: [N, dim]
...@@ -2055,15 +2049,14 @@ extern "C" { ...@@ -2055,15 +2049,14 @@ extern "C" {
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand( GGML_API void ggml_build_backward_expand(
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation) struct ggml_context * ctx, // context for gradient computation
struct ggml_context * ctx_compute, // context for gradient computation struct ggml_cgraph * cgraph,
struct ggml_cgraph * cgraph, struct ggml_tensor ** grad_accs);
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
// graph allocation in a context // graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1 GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
......
...@@ -214,7 +214,7 @@ add_library(ggml ...@@ -214,7 +214,7 @@ add_library(ggml
target_link_libraries(ggml PUBLIC ggml-base) target_link_libraries(ggml PUBLIC ggml-base)
if (CMAKE_SYSTEM_NAME MATCHES "Linux") if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_libraries(ggml PRIVATE dl stdc++fs) target_link_libraries(ggml PRIVATE dl)
endif() endif()
function(ggml_add_backend_library backend) function(ggml_add_backend_library backend)
......
...@@ -364,6 +364,7 @@ struct node_alloc { ...@@ -364,6 +364,7 @@ struct node_alloc {
struct ggml_gallocr { struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers] ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers] ggml_backend_buffer_t * buffers; // [n_buffers]
size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers; int n_buffers;
...@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs ...@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)); galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
GGML_ASSERT(galloc->buffers != NULL); GGML_ASSERT(galloc->buffers != NULL);
galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
GGML_ASSERT(galloc->buffer_sizes != NULL);
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *)); galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL); GGML_ASSERT(galloc->buf_tallocs != NULL);
...@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { ...@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set); ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values); free(galloc->hash_values);
free(galloc->bufts); free(galloc->bufts);
free(galloc->buffer_sizes);
free(galloc->buffers); free(galloc->buffers);
free(galloc->buf_tallocs); free(galloc->buf_tallocs);
free(galloc->node_allocs); free(galloc->node_allocs);
...@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c ...@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
} }
} }
bool success = true;
// reallocate buffers if needed // reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) { for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer // if the buffer type is used multiple times, we reuse the same buffer
...@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c ...@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
			ggml_backend_buffer_free(galloc->buffers[i]);
			galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
-			if (galloc->buffers[i] == NULL) {
+			if (galloc->buffers[i]) {
+				galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+				ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+			} else {
				GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-				return false;
+				galloc->buffer_sizes[i] = new_size;
+				success = false;
			}
-			ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+		} else {
+			galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
		}
	}
-	return true;
+	return success;
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
...@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { ...@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
} }
struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
for (int i = 0; i < buffer_id; i++) {
if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
// This buffer is the same as a previous one due to the same buffer type being used multiple times
// (See above.) However, we need a different check because multiple buffers might be NULL in our
// case and we still want to know the attempted size.
struct ggml_allocr_buffer_status status = {0, true};
return status;
}
}
struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
return status;
}
// utils // utils
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
......
...@@ -178,9 +178,9 @@ struct ggml_backend_registry { ...@@ -178,9 +178,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN #ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg()); register_backend(ggml_backend_cann_reg());
#endif #endif
// #ifdef GGML_USE_BLAS #ifdef GGML_USE_BLAS
// register_backend(ggml_backend_blas_reg()); register_backend(ggml_backend_blas_reg());
// #endif #endif
#ifdef GGML_USE_RPC #ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg()); register_backend(ggml_backend_rpc_reg());
#endif #endif
......
...@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) { ...@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
return SIZE_MAX; return SIZE_MAX;
} }
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes // get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) { if (buft->iface.get_alloc_size) {
size_t size = buft->iface.get_alloc_size(buft, tensor); size_t size = buft->iface.get_alloc_size(buft, tensor);
...@@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) { ...@@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer)); return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
} }
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
} }
...@@ -674,6 +674,8 @@ struct ggml_backend_sched { ...@@ -674,6 +674,8 @@ struct ggml_backend_sched {
char * context_buffer; char * context_buffer;
size_t context_buffer_size; size_t context_buffer_size;
bool op_offload;
int debug; int debug;
}; };
...@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st ...@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op // check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) { for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off"); SET_CAUSE(tensor, "1.off");
...@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ...@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
const int node_backend_id = tensor_backend_id(node); const int node_backend_id = tensor_backend_id(node);
assert(node_backend_id != -1); // all nodes should be assigned by now assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
// check if we should start a new split based on the sources of the current node // check if we should start a new split based on the sources of the current node
bool need_new_split = false; bool need_new_split = false;
...@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new( ...@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
ggml_backend_buffer_type_t * bufts, ggml_backend_buffer_type_t * bufts,
int n_backends, int n_backends,
size_t graph_size, size_t graph_size,
bool parallel) { bool parallel,
bool op_offload) {
GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
...@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new( ...@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
} }
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
...@@ -1625,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe ...@@ -1625,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
} }
struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
return status;
}
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend); int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
......
...@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/ ${KLEIDIAI_SRC}/kai/ukernels/
${KLEIDIAI_SRC}/kai/ukernels/matmul/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}") set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
...@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS}) set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
if (NOT DOTPROD_ENABLED MATCHES -1) if (NOT DOTPROD_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
endif() endif()
if (NOT I8MM_ENABLED MATCHES -1) if (NOT I8MM_ENABLED MATCHES -1)
...@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif() endif()
if (NOT SME_ENABLED MATCHES -1) if (NOT SME_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2") ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
endif() endif()
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}") set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
......
...@@ -3,7 +3,7 @@ package cpu ...@@ -3,7 +3,7 @@ package cpu
// #cgo CFLAGS: -O3 -Wno-implicit-function-declaration // #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
// #cgo CXXFLAGS: -std=c++17 // #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include // #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE // #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE
// #cgo linux CPPFLAGS: -D_GNU_SOURCE // #cgo linux CPPFLAGS: -D_GNU_SOURCE
// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 // #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 LDFLAGS: -framework Accelerate // #cgo darwin,arm64 LDFLAGS: -framework Accelerate
......