Commit b2b270ad authored by Devon Rifkin's avatar Devon Rifkin
Browse files

Merge branch 'main' into drifkin/array-head-count-simple

parents 20c5fd39 2bb69b40
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 18 Apr 2025 15:58:19 -0700
Subject: [PATCH] graph memory reporting on failure
---
ggml/include/ggml-alloc.h | 6 ++++++
ggml/include/ggml-backend.h | 6 ++++++
ggml/src/ggml-alloc.c | 38 +++++++++++++++++++++++++++++++++----
ggml/src/ggml-backend.cpp | 10 ++++++++++
4 files changed, 56 insertions(+), 4 deletions(-)
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 2cb150fd..781b1e10 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+struct ggml_allocr_buffer_status {
+ size_t size;
+ bool allocated;
+};
+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 778927f6..74e46716 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -304,6 +304,12 @@ extern "C" {
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ struct ggml_backend_buffer_status {
+ size_t size;
+ bool allocated;
+ };
+ GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 5fd379f6..04812990 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -364,6 +364,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers]
+ size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
GGML_ASSERT(galloc->buffers != NULL);
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
+ GGML_ASSERT(galloc->buffer_sizes != NULL);
+
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);
@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
+ free(galloc->buffer_sizes);
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
+ bool success = true;
+
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
- if (galloc->buffers[i] == NULL) {
+ if (galloc->buffers[i]) {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ } else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
- return false;
+ galloc->buffer_sizes[i] = new_size;
+ success = false;
}
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ } else {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
}
}
- return true;
+ return success;
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}
+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+ for (int i = 0; i < buffer_id; i++) {
+ if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
+ // This buffer is the same as a previous one due to the same buffer type being used multiple times
+ // (See above.) However, we need a different check because multiple buffers might be NULL in our
+ // case and we still want to know the attempted size.
+
+ struct ggml_allocr_buffer_status status = {0, true};
+ return status;
+ }
+ }
+
+ struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
+ return status;
+}
+
// utils
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 0ce73a99..be335e8c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}
+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+ struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+ struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
+
+ return status;
+}
+
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)
The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
pointer
- Build and allocate a new graph where that data is NULL
Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed
This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
ggml/src/ggml-alloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+ // If we previously had data but don't now then reallocate
+ if (talloc->buffer_id < 0) {
+ return false;
+ }
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Thu, 24 Apr 2025 14:48:51 -0700
Subject: [PATCH] ggml: Export GPU UUIDs
This enables matching up devices and information reported by the backend
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++++++++++++++++++++++++++++
ggml/src/ggml-metal/ggml-metal.m | 1 +
3 files changed, 35 insertions(+)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 74e46716..a880df33 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -152,6 +152,7 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
const char * description;
+ const char * uuid;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cb0d8528..4c829153 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
+ std::string uuid;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+ return ctx->uuid.c_str();
+}
+
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
+ props->uuid = ggml_backend_cuda_device_get_uuid(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
+ #if !defined(GGML_USE_HIP)
+ char uuid[64];
+ snprintf(uuid, sizeof(uuid),
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+ (unsigned char)prop.uuid.bytes[0],
+ (unsigned char)prop.uuid.bytes[1],
+ (unsigned char)prop.uuid.bytes[2],
+ (unsigned char)prop.uuid.bytes[3],
+ (unsigned char)prop.uuid.bytes[4],
+ (unsigned char)prop.uuid.bytes[5],
+ (unsigned char)prop.uuid.bytes[6],
+ (unsigned char)prop.uuid.bytes[7],
+ (unsigned char)prop.uuid.bytes[8],
+ (unsigned char)prop.uuid.bytes[9],
+ (unsigned char)prop.uuid.bytes[10],
+ (unsigned char)prop.uuid.bytes[11],
+ (unsigned char)prop.uuid.bytes[12],
+ (unsigned char)prop.uuid.bytes[13],
+ (unsigned char)prop.uuid.bytes[14],
+ (unsigned char)prop.uuid.bytes[15]
+ );
+ dev_ctx->uuid = uuid;
+ #else
+ dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
+ #endif
+
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 1b56f858..ee4f2dcb 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
+ props->uuid = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (struct ggml_backend_dev_caps) {
......@@ -114,6 +114,9 @@ void grammar_free(struct llama_grammar *g) {
if (g->vocab != nullptr) {
delete g->vocab;
}
if (g->o_vocab != nullptr) {
delete g->o_vocab;
}
llama_grammar_free_impl(g);
}
}
......
......@@ -82,8 +82,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
var graphOffload uint64
// Projectors loaded into GPU0 only
var projectorWeights uint64
var projectorGraph uint64
var llamaEngineProjectorWeights uint64
// Projectors loaded with output layer
var ollamaEngineProjectorWeights uint64
var ollamaEngineProjectorGraph uint64
// Conditional output size on GPU 0
var memoryLayerOutput uint64
......@@ -108,15 +111,14 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors {
weight, graph := projectorMemoryRequirements(projector)
projectorWeights += weight
projectorGraph += graph
llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
if projectorWeights == 0 && projectorGraph == 0 {
projectorWeights, projectorGraph = f.VisionGraphSize()
if llamaEngineProjectorWeights == 0 {
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
opts.NumCtx = max(opts.NumCtx, 2048)
}
layers := f.Tensors().GroupLayers()
......@@ -168,6 +170,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
graphFullOffload = graphPartialOffload
}
// Output layer handled at the end if we have space
if layer, ok := layers["output_norm"]; ok {
memoryLayerOutput += layer.Size()
}
......@@ -177,8 +180,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
memoryLayerOutput += layer.Size()
}
// Output layer handled at the end if we have space
gpuZeroOverhead := projectorWeights + projectorGraph
gpuZeroOverhead := llamaEngineProjectorWeights
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int
......@@ -221,10 +223,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
if len(gpusWithSpace) > 0 {
gpuZeroID = gpusWithSpace[0].i
gpuAllocations[gpuZeroID] += gpuZeroOverhead
} else {
overflow += gpuZeroOverhead
}
// For all the layers, find where they can fit on the GPU(s)
for i := range int(f.KV().BlockCount()) {
for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.Size()
......@@ -234,6 +238,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
// Stop allocating on GPU(s) once we hit the users target NumGPU
overflow += layerSize
continue
}
......@@ -250,31 +255,34 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
}
}
if len(gpusWithSpace) == 0 {
overflow += layerSize
}
}
if layerCount >= int(f.KV().BlockCount()) {
fullyLoaded = true
} else {
for i := layerCount; i < int(f.KV().BlockCount()); i++ {
overflow += layerSize
}
}
// Determine if we need to consider output then find where it fits
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[layerCount%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+memoryLayerOutput {
gpuAllocations[g.i] += memoryLayerOutput
layerCounts[g.i]++
layerCount++
break
memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
if memoryLastLayer > 0 {
if opts.NumGPU < 0 || layerCount < opts.NumGPU {
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[layerCount%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+memoryLastLayer {
gpuAllocations[g.i] += memoryLastLayer
layerCounts[g.i]++
layerCount++
break
}
}
}
if layerCount < int(f.KV().BlockCount())+1 {
fullyLoaded = false
overflow += memoryLayerOutput
overflow += memoryLastLayer
}
}
......@@ -332,8 +340,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload,
projectorWeights: projectorWeights,
projectorGraph: projectorGraph,
projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
projectorGraph: ollamaEngineProjectorGraph,
}
if gpus[0].Library == "cpu" {
......@@ -412,51 +420,21 @@ func (m MemoryEstimate) LogValue() slog.Value {
return slog.GroupValue(attrs...)
}
func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
func projectorMemoryRequirements(filename string) (weights uint64) {
file, err := os.Open(filename)
if err != nil {
return 0, 0
return 0
}
defer file.Close()
ggml, _, err := ggml.Decode(file, 1024)
ggml, err := ggml.Decode(file, 1024)
if err != nil {
return 0, 0
return 0
}
for _, layer := range ggml.Tensors().GroupLayers() {
weights += layer.Size()
}
switch arch := ggml.KV().Architecture(); arch {
case "mllama":
kv := func(n string) uint64 {
if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
return uint64(v)
}
return 0
}
imageSize := kv("image_size")
maxNumTiles := kv("max_num_tiles")
embeddingLength := kv("embedding_length")
headCount := kv("attention.head_count")
numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
numPatches++
}
numPaddedPatches := numPatches + 8 - (numPatches%8)%8
graphSize = 4 * (8 +
imageSize*imageSize*kv("num_channels")*maxNumTiles +
embeddingLength*numPatches*maxNumTiles +
9*embeddingLength*numPaddedPatches*maxNumTiles +
numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
}
return weights, graphSize
return weights
}
......@@ -17,6 +17,7 @@ import (
"os/exec"
"path/filepath"
"runtime"
"slices"
"strconv"
"strings"
"sync"
......@@ -30,9 +31,37 @@ import (
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/model"
)
type filteredEnv []string
func (e filteredEnv) LogValue() slog.Value {
var attrs []slog.Attr
for _, env := range e {
if key, value, ok := strings.Cut(env, "="); ok {
switch {
case strings.HasPrefix(key, "OLLAMA_"),
strings.HasPrefix(key, "CUDA_"),
strings.HasPrefix(key, "ROCR_"),
strings.HasPrefix(key, "ROCM_"),
strings.HasPrefix(key, "HIP_"),
strings.HasPrefix(key, "GPU_"),
strings.HasPrefix(key, "HSA_"),
strings.HasPrefix(key, "GGML_"),
slices.Contains([]string{
"PATH",
"LD_LIBRARY_PATH",
"DYLD_LIBRARY_PATH",
}, key):
attrs = append(attrs, slog.String(key, value))
}
}
}
return slog.GroupValue(attrs...)
}
type LlamaServer interface {
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
......@@ -92,7 +121,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
}
defer f.Close()
ggml, _, err := ggml.Decode(f, maxArraySize)
ggml, err := ggml.Decode(f, maxArraySize)
return ggml, err
}
......@@ -148,10 +177,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU))
}
if envconfig.Debug() {
params = append(params, "--verbose")
}
if opts.MainGPU > 0 {
params = append(params, "--main-gpu", strconv.Itoa(opts.MainGPU))
}
......@@ -286,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--mmproj", projectors[0])
}
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
// without any LD_LIBRARY_PATH flags
for {
......@@ -404,26 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
}
slog.Info("starting llama server", "cmd", s.cmd)
if envconfig.Debug() {
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "OLLAMA_") ||
strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCR_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||
strings.HasPrefix(ev, "GPU_") ||
strings.HasPrefix(ev, "HSA_") ||
strings.HasPrefix(ev, "GGML_") ||
strings.HasPrefix(ev, "PATH=") ||
strings.HasPrefix(ev, "LD_LIBRARY_PATH=") ||
strings.HasPrefix(ev, "DYLD_LIBRARY_PATH=") {
filteredEnv = append(filteredEnv, ev)
}
}
// Log at debug as the environment is inherited and might contain sensitive information
slog.Debug("subprocess", "environment", filteredEnv)
}
slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))
if err = s.cmd.Start(); err != nil {
var msg string
......@@ -673,9 +679,8 @@ ws ::= ([ \t\n] ws)?
const maxBufferSize = 512 * format.KiloByte
type ImageData struct {
Data []byte `json:"data"`
ID int `json:"id"`
AspectRatioID int `json:"aspect_ratio_id"`
Data []byte `json:"data"`
ID int `json:"id"`
}
type CompletionRequest struct {
......@@ -721,6 +726,9 @@ type CompletionResponse struct {
}
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
if len(req.Format) > 0 {
switch string(req.Format) {
case `null`, `""`:
......@@ -789,7 +797,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
res, err := http.DefaultClient.Do(serverReq)
if err != nil {
return fmt.Errorf("POST predict: %v", err)
slog.Error("post predict", "error", err)
return errors.New("model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details")
}
defer res.Body.Close()
......@@ -884,6 +893,8 @@ type EmbeddingResponse struct {
}
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
if err := s.sem.Acquire(ctx, 1); err != nil {
if errors.Is(err, context.Canceled) {
slog.Info("aborting embedding request due to client closing the connection")
......
package logutil
import (
"io"
"log/slog"
"path/filepath"
)
const LevelTrace slog.Level = -8
func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
return slog.New(slog.NewTextHandler(w, &slog.HandlerOptions{
Level: level,
AddSource: true,
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
switch attr.Key {
case slog.LevelKey:
switch attr.Value.Any().(slog.Level) {
case LevelTrace:
attr.Value = slog.StringValue("TRACE")
}
case slog.SourceKey:
source := attr.Value.Any().(*slog.Source)
source.File = filepath.Base(source.File)
}
return attr
},
}))
}
......@@ -5,7 +5,8 @@ import (
"context"
"encoding/binary"
"fmt"
"os"
"log/slog"
"math"
"slices"
"strconv"
"strings"
......@@ -14,6 +15,11 @@ import (
)
type Backend interface {
Load(ctx context.Context, progress func(float32)) error
// BackendMemory returns the memory allocations that were made for this model
BackendMemory() BackendMemory
Config() fs.Config
Get(name string) Tensor
NewContext() Context
......@@ -51,10 +57,6 @@ type CacheConfig struct {
// BackendParams controls how the backend loads and executes models
type BackendParams struct {
// Progress is a callback function that allows reporting percentage completion
// of model loading
Progress func(float32)
// NumThreads sets the number of threads to use if running on the CPU
NumThreads int
......@@ -71,9 +73,130 @@ type BackendParams struct {
FlashAttention bool
}
var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error))
// ErrNoMem is returned when panicing due to insufficient memory. It includes
// the attempted memory allocation.
type ErrNoMem struct {
BackendMemory
}
func (e ErrNoMem) Error() string {
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
}
type AllocationStatus int
const (
// Unallocated memory - have not yet attempted to allocate
Unallocated AllocationStatus = iota
// Failed memory - tried to allocate the memory and did not succeed
Failed
// Allocated memory = tried and succeeded to allocate memory
Allocated
)
// Memory is the size of an allocation and whether it was successful.
type Memory struct {
Size uint64
Status AllocationStatus
}
func (m Memory) String() string {
s := fmt.Sprint(m.Size)
func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) {
switch m.Status {
case Unallocated:
s += "U"
case Failed:
s += "F"
case Allocated:
s += "A"
}
return s
}
// DeviceMemory provides a breakdown of the memory needed
// per device, such as a CPU or GPU.
type DeviceMemory struct {
// Name is the name of the device as labeled by the backend. It
// may not be persistent across instances of the runner.
Name string
// UUID is a unique persistent identifier for the device for matching
// with system management libraries
UUID string
// Weights is the per-layer memory needed for the model weights.
Weights []Memory
// Cache is the per-layer memory needed for the KV cache.
Cache []Memory
// Graph is the size of the compute graph. It is not per-layer.
Graph Memory
}
func memoryPresent(mem []Memory) bool {
return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
}
func (m DeviceMemory) LogValue() slog.Value {
var attrs []slog.Attr
if memoryPresent(m.Weights) {
attrs = append(attrs, slog.Any("Weights", m.Weights))
}
if memoryPresent(m.Cache) {
attrs = append(attrs, slog.Any("Cache", m.Cache))
}
if m.Graph.Size != 0 {
attrs = append(attrs, slog.Any("Graph", m.Graph))
}
if len(attrs) > 0 && m.UUID != "" {
attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
}
return slog.GroupValue(attrs...)
}
// BackendMemory provides the amount of memory required to load the model
// per device based on the BackendParams. In some cases, not all required
// allocations will be known at this point. However, the size of the most recent
// allocation is guaranteed to be provided so that if it failed, the caller can
// accommodate that to make forward progress.
type BackendMemory struct {
// InputsWeights are always located on the CPU and cannot be moved
InputWeights Memory
// CPU model components are located in system memory. This does not
// include unified memory allocated through the GPU.
CPU DeviceMemory
// GPU model components are located on one or more GPUs.
GPUs []DeviceMemory
}
func (m BackendMemory) LogValue() slog.Value {
var attrs []slog.Attr
if m.InputWeights.Size != 0 {
attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
}
attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
for _, g := range m.GPUs {
attrs = append(attrs, slog.Any(g.Name, g))
}
return slog.GroupValue(attrs...)
}
var backends = make(map[string]func(string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
if _, ok := backends[name]; ok {
panic("backend: backend already registered")
}
......@@ -81,9 +204,9 @@ func RegisterBackend(name string, f func(context.Context, *os.File, BackendParam
backends[name] = f
}
func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) {
func NewBackend(modelPath string, params BackendParams) (Backend, error) {
if backend, ok := backends["ggml"]; ok {
return backend(ctx, f, params)
return backend(modelPath, params)
}
return nil, fmt.Errorf("unsupported backend")
......@@ -92,8 +215,8 @@ func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend,
type Context interface {
Empty(dtype DType, shape ...int) Tensor
Zeros(dtype DType, shape ...int) Tensor
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
FromIntSlice(s []int32, shape ...int) (Tensor, error)
FromFloatSlice(s []float32, shape ...int) Tensor
FromIntSlice(s []int32, shape ...int) Tensor
// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
Arange(start, stop, step float32, dtype DType) Tensor
......@@ -105,7 +228,7 @@ type Context interface {
// graph, simply preallocates memory. Typically called with a
// worst case graph to ensure all resources are available for
// for future inference.
Reserve() error
Reserve()
MaxGraphNodes() int
Close()
......@@ -131,6 +254,8 @@ type Tensor interface {
Neg(ctx Context) Tensor
Add(ctx Context, t2 Tensor) Tensor
Mul(ctx Context, t2 Tensor) Tensor
Div(ctx Context, t2 Tensor) Tensor
Mulmat(ctx Context, t2 Tensor) Tensor
MulmatFullPrec(ctx Context, t2 Tensor) Tensor
MulmatID(ctx Context, t2, ids Tensor) Tensor
......@@ -139,11 +264,11 @@ type Tensor interface {
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
Scale(ctx Context, s float64) Tensor
SumRows(ctx Context) Tensor
AvgPool2D(ctx Context, k, s int, p float32) Tensor
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
Sin(ctx Context) Tensor
......@@ -160,7 +285,6 @@ type Tensor interface {
Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor
Pad(ctx Context, shape ...int) Tensor
Unpad(ctx Context, shape ...int) Tensor
Stack(ctx Context, dim int, s ...Tensor) Tensor
......@@ -172,6 +296,7 @@ type Tensor interface {
Duplicate(ctx Context) Tensor
TopK(ctx Context, k int) Tensor
Argsort(ctx Context) Tensor
}
// ScaledDotProductAttention implements a fused attention
......@@ -214,35 +339,58 @@ func mul[T number](s ...T) T {
return p
}
type DumpOptions struct {
// Items is the number of elements to print at the beginning and end of each dimension.
Items int
type DumpOptions func(*dumpOptions)
// Precision is the number of decimal places to print. Applies to float32 and float64.
Precision int
// DumpWithPrecision sets the number of decimal places to print. Applies to float32 and float64.
func DumpWithPrecision(n int) DumpOptions {
return func(opts *dumpOptions) {
opts.Precision = n
}
}
func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {
if len(opts) < 1 {
opts = append(opts, DumpOptions{
Items: 3,
Precision: 4,
})
// DumpWithThreshold sets the threshold for printing the entire tensor. If the number of elements
// is less than or equal to this value, the entire tensor will be printed. Otherwise, only the
// beginning and end of each dimension will be printed.
func DumpWithThreshold(n int) DumpOptions {
return func(opts *dumpOptions) {
opts.Threshold = n
}
}
// DumpWithEdgeItems sets the number of elements to print at the beginning and end of each dimension.
func DumpWithEdgeItems(n int) DumpOptions {
return func(opts *dumpOptions) {
opts.EdgeItems = n
}
}
type dumpOptions struct {
Precision, Threshold, EdgeItems int
}
func Dump(ctx Context, t Tensor, optsFuncs ...DumpOptions) string {
opts := dumpOptions{Precision: 4, Threshold: 1000, EdgeItems: 3}
for _, optsFunc := range optsFuncs {
optsFunc(&opts)
}
if mul(t.Shape()...) <= opts.Threshold {
opts.EdgeItems = math.MaxInt
}
switch t.DType() {
case DTypeF32:
return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string {
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
return dump[[]float32](ctx, t, opts.EdgeItems, func(f float32) string {
return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
})
case DTypeF16, DTypeQ80, DTypeQ40:
f32 := ctx.Input().Empty(DTypeF32, t.Shape()...)
f32 = t.Copy(ctx, f32)
return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string {
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
return dump[[]float32](ctx, f32, opts.EdgeItems, func(f float32) string {
return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
})
case DTypeI32:
return dump[[]int32](ctx, t, opts[0].Items, func(i int32) string {
return dump[[]int32](ctx, t, opts.EdgeItems, func(i int32) string {
return strconv.FormatInt(int64(i), 10)
})
default:
......
......@@ -10,7 +10,6 @@ import "C"
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
......@@ -27,8 +26,10 @@ import (
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs"
fsggml "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
"github.com/ollama/ollama/ml/nn/rope"
"golang.org/x/sync/errgroup"
)
......@@ -43,8 +44,15 @@ func devices() []*C.struct_ggml_backend_device {
}
type Backend struct {
// modelPath is the location of the model data
modelPath string
meta *fsggml.GGML
// tensorLoadTargets maps from the name of the tensor in the file
// to the name that is used by the model definition
tensorLoadTargets map[string][]string
sched *C.struct_ggml_backend_sched
schedBackends []*C.struct_ggml_backend
schedBufts []*C.struct_ggml_backend_buffer_type
......@@ -57,14 +65,26 @@ type Backend struct {
// layers is the backend used for repeating layers
layers map[int]*C.struct_ggml_backend_buffer_type
// requiredMemory is the cumulative memory allocations needed by the backend
requiredMemory *ml.BackendMemory
// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
flashAttention bool
// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
maxGraphNodes int
}
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
meta, n, err := fsggml.Decode(r, -1)
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()
meta, err := fsggml.Decode(r, -1)
if err != nil {
return nil, err
}
......@@ -79,6 +99,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
"num_key_values", len(meta.KV()),
)
var requiredMemory ml.BackendMemory
btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
type deviceBufferType struct {
d *C.struct_ggml_backend_device
bts []*C.struct_ggml_backend_buffer_type
......@@ -99,6 +122,8 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
}
}
blocks := int(meta.KV().BlockCount())
// create list of buffer types for the cpu
cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
for _, d := range append(accels, append(gpus, cpus...)...) {
......@@ -106,17 +131,33 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
}
}
requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
requiredMemory.CPU.UUID = C.GoString(props.uuid)
requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
// create list of buffer types for each gpu
var gpuDeviceBufferTypes []deviceBufferType
for _, d := range gpus {
requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
for i, d := range gpus {
bt := C.ggml_backend_dev_buffer_type(d)
gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
d: d,
bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
})
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(d, &props)
requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
}
useDefaultSplit := true
......@@ -155,8 +196,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
// inputs always use cpu
input := cpuDeviceBufferType
blocks := int(meta.KV().BlockCount())
// define a range of gpu layers. anything outside of this range is assigned to the cpu
gpuRangeStart := max(0, blocks-params.NumGPULayers)
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
......@@ -197,7 +236,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
// contexts are shared by tensors of the same buffer type
ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
for _, bt := range bts {
if _, ok := ctxs[bt]; !ok {
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
......@@ -222,7 +261,17 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
C.ggml_set_name(tt, cname)
slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
if layer == -1 {
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
requiredMemory.InputWeights.Status = ml.Allocated
requiredMemory.InputWeights.Size += uint64(size)
} else {
btDeviceMemory[bt].Weights[layer].Size += uint64(size)
}
//nolint:staticcheck // TODO: check if buffer type supports this tensor
return tt
}
......@@ -244,22 +293,22 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
for _, t := range meta.Tensors().Items() {
switch {
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
createTensor(tensor{source: t}, input.bts)
createTensor(tensor{source: t}, input.bts, -1)
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
createTensor(tensor{source: t, target: "output.weight"}, output.bts)
createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
}
case contains(t.Name, "cls", "output", "output_norm"):
createTensor(tensor{source: t}, output.bts)
createTensor(tensor{source: t}, output.bts, blocks)
case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
// TODO: assign vision tensors to the gpu if possible
createTensor(tensor{source: t}, output.bts)
createTensor(tensor{source: t}, output.bts, blocks)
case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
// these tensors should be repeated per layer
for i, layer := range layers {
createTensor(tensor{
source: t,
target: "blk." + strconv.Itoa(i) + "." + t.Name,
}, layer.bts)
}, layer.bts, i)
}
default:
layerIndex := -1
......@@ -270,10 +319,10 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
}
if layerIndex >= 0 {
createTensor(tensor{source: t}, layers[layerIndex].bts)
createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
} else {
// load all other tensors on the cpu
createTensor(tensor{source: t}, input.bts)
createTensor(tensor{source: t}, input.bts, -1)
}
}
}
......@@ -286,8 +335,18 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
}
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
for i := range btDeviceMemory[bt].Weights {
if btDeviceMemory[bt].Weights[i].Size != 0 {
if b != nil {
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
} else {
btDeviceMemory[bt].Weights[i].Status = ml.Failed
}
}
}
if b == nil {
return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt)))
panic(ml.ErrNoMem{BackendMemory: requiredMemory})
}
C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
......@@ -306,22 +365,79 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
}
}
// map devices to backend buffer types so new tensors can be assigned to the correct device
deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
// create backends and buffer types used for the compute graph scheduler
var schedBackends []*C.struct_ggml_backend
var schedBufts []*C.struct_ggml_backend_buffer_type
for _, d := range append(gpus, append(accels, cpus...)...) {
b := C.ggml_backend_dev_init(d, nil)
bt := C.ggml_backend_get_default_buffer_type(b)
deviceBufferTypes[d] = bt
schedBackends = append(schedBackends, b)
schedBufts = append(schedBufts, bt)
if C.ggml_backend_is_cpu(b) {
// set number of threads for cpu backend
C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
}
}
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
return &Backend{
modelPath: modelPath,
flashAttention: params.FlashAttention,
meta: meta,
tensorLoadTargets: targets,
tensors: tensors,
sched: C.ggml_backend_sched_new(
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
C.int(len(schedBackends)),
C.size_t(maxGraphNodes),
C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
C._Bool(false),
),
schedBackends: schedBackends,
schedBufts: schedBufts,
input: deviceBufferTypes[input.d],
layers: func() map[int]*C.struct_ggml_backend_buffer_type {
m := make(map[int]*C.struct_ggml_backend_buffer_type)
for i, layer := range layers {
m[i] = deviceBufferTypes[layer.d]
}
return m
}(),
requiredMemory: &requiredMemory,
btDeviceMemory: btDeviceMemory,
maxGraphNodes: maxGraphNodes,
}, nil
}
func init() {
ml.RegisterBackend("ggml", New)
}
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
var doneBytes atomic.Uint64
totalBytes := uint64(n) - meta.Tensors().Offset
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range meta.Tensors().Items() {
for _, t := range b.meta.Tensors().Items() {
t := t
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
for i := range tts {
target := targets[t.Name][i]
target := b.tensorLoadTargets[t.Name][i]
if target == "" {
target = t.Name
}
tt, ok := tensors[target]
tt, ok := b.tensors[target]
if !ok {
return fmt.Errorf("unassigned tensor: %s", t.Name)
}
......@@ -331,13 +447,13 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
// Create a new FD for each goroutine so that each FD is read sequentially, rather than
// seeking around within an FD shared between all goroutines.
file, err := os.Open(r.Name())
file, err := os.Open(b.modelPath)
if err != nil {
slog.Warn("file open error", "file", r.Name(), "error", err)
slog.Warn("file open error", "file", b.modelPath, "error", err)
return err
}
defer file.Close()
sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
bts := make([]byte, 128*format.KibiByte)
var s uint64
......@@ -349,7 +465,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
if err != nil {
slog.Warn("file read error", "file", r.Name(), "error", err)
slog.Warn("file read error", "file", b.modelPath, "error", err)
return err
}
......@@ -359,9 +475,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
s += uint64(n)
if params.Progress != nil {
if progress != nil {
done := doneBytes.Add(uint64(n))
params.Progress(float32(done) / float32(totalBytes))
progress(float32(done) / float32(totalBytes))
}
}
......@@ -370,58 +486,14 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
}
if err := g.Wait(); err != nil {
return nil, err
return err
}
// map devices to backend buffer types so new tensors can be assigned to the correct device
deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
// create backends and buffer types used for the compute graph scheduler
var schedBackends []*C.struct_ggml_backend
var schedBufts []*C.struct_ggml_backend_buffer_type
for _, d := range append(gpus, append(accels, cpus...)...) {
b := C.ggml_backend_dev_init(d, nil)
bt := C.ggml_backend_get_default_buffer_type(b)
deviceBufferTypes[d] = bt
schedBackends = append(schedBackends, b)
schedBufts = append(schedBufts, bt)
if C.ggml_backend_is_cpu(b) {
// set number of threads for cpu backend
C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
}
}
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
return &Backend{
flashAttention: params.FlashAttention,
meta: meta,
tensors: tensors,
sched: C.ggml_backend_sched_new(
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
C.int(len(schedBackends)),
C.size_t(maxGraphNodes),
C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
),
schedBackends: schedBackends,
schedBufts: schedBufts,
input: deviceBufferTypes[input.d],
layers: func() map[int]*C.struct_ggml_backend_buffer_type {
m := make(map[int]*C.struct_ggml_backend_buffer_type)
for i, layer := range layers {
m[i] = deviceBufferTypes[layer.d]
}
return m
}(),
maxGraphNodes: maxGraphNodes,
}, nil
return nil
}
func init() {
ml.RegisterBackend("ggml", New)
func (b *Backend) BackendMemory() ml.BackendMemory {
return *b.requiredMemory
}
func (b *Backend) Config() fs.Config {
......@@ -455,6 +527,7 @@ func (b *Backend) NewContextSize(n int) ml.Context {
no_alloc: true,
}),
allocatedBuffers: &allocatedBuffers,
layer: -1,
}
}
......@@ -481,6 +554,9 @@ type Context struct {
// maxGraphNodes is the maximum allowed number of graph nodes in this context
maxGraphNodes int
// layer is the graph layer that this context is allocating for - assumed to be cache
layer int
}
func (c *Context) Input() ml.Context {
......@@ -491,6 +567,7 @@ func (c *Context) Input() ml.Context {
buft: c.b.input,
allocatedBuffers: c.allocatedBuffers,
maxGraphNodes: c.maxGraphNodes,
layer: -1,
}
}
......@@ -505,6 +582,7 @@ func (c *Context) Layer(i int) ml.Context {
buft: buft,
allocatedBuffers: c.allocatedBuffers,
maxGraphNodes: c.maxGraphNodes,
layer: i,
}
}
......@@ -524,7 +602,9 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
}
func (c *Context) Compute(tensors ...ml.Tensor) {
C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
panic(fmt.Errorf("error computing ggml graph: %v", status))
}
C.ggml_backend_sched_reset(c.b.sched)
needSync := true
......@@ -542,22 +622,34 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
}
}
func (c *Context) Reserve() error {
if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) {
C.ggml_backend_sched_reset(c.b.sched)
return errors.New("failed to reserve graph")
}
func (c *Context) Reserve() {
reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)
slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
for _, bt := range c.b.schedBufts {
c.b.btDeviceMemory[bt].Graph = ml.Memory{}
}
for i := range c.b.schedBackends {
size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i])
bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
graph.Size += uint64(bufferStatus.size)
if bufferStatus.allocated && graph.Status != ml.Failed {
graph.Status = ml.Allocated
} else {
graph.Status = ml.Failed
}
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
"size", format.HumanBytes2(uint64(size)))
"size", format.HumanBytes2(uint64(bufferStatus.size)))
}
C.ggml_backend_sched_reset(c.b.sched)
return nil
if !reserved {
panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
}
}
func (c *Context) MaxGraphNodes() int {
......@@ -577,7 +669,7 @@ func pad(length, pad C.size_t) C.size_t {
return ((length + pad - 1) / pad) * pad
}
func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
if c.buft == nil {
panic("set Input or Layer before creating tensors")
}
......@@ -600,7 +692,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
if len(shape) < 1 || shape[0] == 0 {
var shape C.int64_t = 0
return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
} else if len(shape) > 4 {
panic("unsupported number of dimensions")
}
......@@ -613,40 +705,43 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
if c.layer >= 0 {
cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
cache.Size += uint64(size)
if b != nil {
cache.Status = ml.Allocated
} else {
cache.Status = ml.Failed
}
}
if b == nil {
return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
}
*c.allocatedBuffers = append(*c.allocatedBuffers, b)
*c.allocatedBuffers = append(*c.allocatedBuffers, b)
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
return &Tensor{b: c.b, t: t}, nil
return &Tensor{b: c.b, t: t}
}
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
t, err := c.newTensor(dtype, shape)
if err != nil {
panic(err)
}
return t
return c.newTensor(dtype, shape)
}
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
t, err := c.newTensor(dtype, shape)
if err != nil {
panic(err)
}
t := c.newTensor(dtype, shape)
C.ggml_set_zero(t.(*Tensor).t)
return t
}
func checkShape[S ~[]E, E any](s S, shape ...int) error {
func checkShape[S ~[]E, E any](s S, shape ...int) {
n := len(s)
if n == 0 {
return nil
return
}
for _, v := range shape {
......@@ -654,44 +749,32 @@ func checkShape[S ~[]E, E any](s S, shape ...int) error {
}
if n != 1 {
return fmt.Errorf("invalid shape: %v", shape)
panic(fmt.Errorf("invalid shape: %v", shape))
}
return nil
}
func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
if err := checkShape(s, shape...); err != nil {
return nil, err
}
func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
checkShape(s, shape...)
t, err := c.newTensor(ml.DTypeF32, shape)
if err != nil {
return nil, err
}
t := c.newTensor(ml.DTypeF32, shape)
if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
return t, nil
return t
}
func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
if err := checkShape(s, shape...); err != nil {
return nil, err
}
func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
checkShape(s, shape...)
t, err := c.newTensor(ml.DTypeI32, shape)
if err != nil {
return nil, err
}
t := c.newTensor(ml.DTypeI32, shape)
if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
return t, nil
return t
}
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
......@@ -709,12 +792,7 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
arange = append(arange, int32(i))
}
t, err := c.Input().FromIntSlice(arange, len(arange))
if err != nil {
panic(err)
}
return t
return c.Input().FromIntSlice(arange, len(arange))
default:
panic("unsupported dtype for arange")
}
......@@ -865,6 +943,13 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
}
}
func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
}
}
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
......@@ -913,6 +998,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
} else if shape[3] != 0 {
panic("cuda does not support 4d tensors")
}
return &Tensor{
......@@ -980,6 +1067,13 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
}
}
func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
}
}
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
......@@ -1015,17 +1109,6 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
}
}
func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
}
return &Tensor{
b: t.b,
t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
}
}
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
switch len(shape) {
case 1:
......@@ -1062,16 +1145,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
}
}
const (
ropeTypeNorm C.int = 0
ropeTypeNeox C.int = 2
ropeTypeMrope C.int = 8
ropeTypeVision C.int = 24
)
func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
// Default options
opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}
func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
if ropeFactors == nil {
ropeFactors = &Tensor{b: t.b}
// Apply any provided options
for _, option := range options {
option(opts)
}
dequant := t.t
......@@ -1082,16 +1162,19 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
return &Tensor{
b: t.b,
t: C.ggml_rope_ext(
ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
ctx.(*Context).ctx,
dequant,
positions.(*Tensor).t,
opts.Factors.(*Tensor).t,
C.int(ropeDim),
C.int(ropeType),
131072, // YaRN n_ctx_train
C.int(opts.Type),
C.int(opts.OriginalContextLength),
C.float(ropeBase),
C.float(ropeScale),
0., // YaRN ext_factor
1., // YaRN attn_factor
32., // YaRN beta_fast
1., // YaRN beta_slow
C.float(0.0),
C.float(1.0),
C.float(32.0),
C.float(1.0),
),
}
}
......@@ -1185,3 +1268,10 @@ func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
}
}
func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
}
}
......@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
struct ggml_allocr_buffer_status {
size_t size;
bool allocated;
};
GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
......
......@@ -38,7 +38,7 @@ extern "C" {
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
......@@ -59,7 +59,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
......@@ -152,6 +152,7 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
const char * description;
const char * uuid;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
......@@ -248,7 +249,7 @@ extern "C" {
// preferrably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
// initialize buffers from a max size graph (optional)
reserve_graph = build_graph(sched, max_batch_size);
......@@ -289,7 +290,7 @@ extern "C" {
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
......@@ -304,6 +305,12 @@ extern "C" {
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
struct ggml_backend_buffer_status {
size_t size;
bool allocated;
};
GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
......
......@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend
......
......@@ -37,13 +37,16 @@ extern "C" {
// ====== Dataset ======
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
int64_t ne_datapoint, // number of elements per datapoint
int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
enum ggml_type type_data, // the type for the internal data tensor
enum ggml_type type_label, // the type for the internal labels tensor
int64_t ne_datapoint, // number of elements per datapoint
int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
// get underlying tensors that store the data
GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]
......@@ -56,13 +59,19 @@ extern "C" {
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
int64_t ibatch);
GGML_API void ggml_opt_dataset_get_batch_host(
ggml_opt_dataset_t dataset,
void * data_batch,
size_t nb_data_batch,
void * labels_batch,
int64_t ibatch);
// ====== Model / Context ======
enum ggml_opt_build_type {
GGML_OPT_BUILD_TYPE_FORWARD,
GGML_OPT_BUILD_TYPE_GRAD,
GGML_OPT_BUILD_TYPE_OPT,
GGML_OPT_BUILD_TYPE_FORWARD = 10,
GGML_OPT_BUILD_TYPE_GRAD = 20,
GGML_OPT_BUILD_TYPE_OPT = 30,
};
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
......@@ -81,20 +90,22 @@ extern "C" {
// userdata can be used to pass arbitrary data
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
// returns the default optimizer params (constant)
// returns the default optimizer params (constant, hard-coded values)
// userdata is not used
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
// casts userdata to ggml_opt_optimizer_params and returns it
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
// parameters for initializing a new optimization context
struct ggml_opt_params {
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
// the forward graph is defined by inputs and outputs
// those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
struct ggml_tensor * inputs;
struct ggml_tensor * outputs;
// by default the forward graph needs to be reconstructed for each eval
// if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
struct ggml_context * ctx_compute;
struct ggml_tensor * inputs;
struct ggml_tensor * outputs;
enum ggml_opt_loss_type loss_type;
enum ggml_opt_build_type build_type;
......@@ -107,12 +118,9 @@ extern "C" {
// get parameters for an optimization context with defaults set where possible
// parameters for which no sensible defaults exist are supplied as arguments to this function
GGML_API ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched,
struct ggml_context * ctx_compute,
struct ggml_tensor * inputs,
struct ggml_tensor * outputs,
enum ggml_opt_loss_type loss_type);
GGML_API struct ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched,
enum ggml_opt_loss_type loss_type);
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
......@@ -121,6 +129,7 @@ extern "C" {
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
// get underlying tensors that store data
// if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
......@@ -128,11 +137,12 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
// get the gradient accumulator for a node from the forward graph
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
// ====== Optimization Result ======
GGML_API ggml_opt_result_t ggml_opt_result_init();
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
......@@ -144,11 +154,20 @@ extern "C" {
// ====== Computation ======
// do forward pass, increment result if not NULL
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// if not using static graphs, this function must be called prior to ggml_opt_alloc
GGML_API void ggml_opt_prepare_alloc(
ggml_opt_context_t opt_ctx,
struct ggml_context * ctx_compute,
struct ggml_cgraph * gf,
struct ggml_tensor * inputs,
struct ggml_tensor * outputs);
// allocate the next graph for evaluation, either forward or forward + backward
// must be called exactly once prior to calling ggml_opt_eval
GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
// do forward pass, increment result if not NULL, do backward pass
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// do forward pass, increment result if not NULL, do backward pass if allocated
GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// ############################################################################
// ## The high-level functions start here. They do not depend on any private ##
......@@ -200,9 +219,9 @@ extern "C" {
// fit model defined by inputs and outputs to dataset
GGML_API void ggml_opt_fit(
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
......
......@@ -489,7 +489,6 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
GGML_OP_UNPAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
......@@ -674,11 +673,15 @@ extern "C" {
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
// returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
......@@ -765,7 +768,7 @@ extern "C" {
// Tensor flags
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
//
......@@ -935,7 +938,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
// concat a and b along dim
// used in stable-diffusion
......@@ -1778,15 +1781,6 @@ extern "C" {
int p0,
int p1);
// unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
GGML_API struct ggml_tensor * ggml_unpad(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1,
int p2,
int p3);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
......@@ -2055,15 +2049,14 @@ extern "C" {
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
struct ggml_context * ctx_compute, // context for gradient computation
struct ggml_cgraph * cgraph,
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
struct ggml_context * ctx, // context for gradient computation
struct ggml_cgraph * cgraph,
struct ggml_tensor ** grad_accs);
// graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
......
......@@ -214,7 +214,7 @@ add_library(ggml
target_link_libraries(ggml PUBLIC ggml-base)
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_libraries(ggml PRIVATE dl stdc++fs)
target_link_libraries(ggml PRIVATE dl)
endif()
function(ggml_add_backend_library backend)
......
......@@ -364,6 +364,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers]
size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
......@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
GGML_ASSERT(galloc->buffers != NULL);
galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
GGML_ASSERT(galloc->buffer_sizes != NULL);
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);
......@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
free(galloc->buffer_sizes);
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
......@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
bool success = true;
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
......@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
if (galloc->buffers[i]) {
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
} else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
galloc->buffer_sizes[i] = new_size;
success = false;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
} else {
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
}
}
return true;
return success;
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
......@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}
struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
for (int i = 0; i < buffer_id; i++) {
if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
// This buffer is the same as a previous one due to the same buffer type being used multiple times
// (See above.) However, we need a different check because multiple buffers might be NULL in our
// case and we still want to know the attempted size.
struct ggml_allocr_buffer_status status = {0, true};
return status;
}
}
struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
return status;
}
// utils
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
......
......@@ -178,9 +178,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
// #ifdef GGML_USE_BLAS
// register_backend(ggml_backend_blas_reg());
// #endif
#ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg());
#endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
......
......@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
return SIZE_MAX;
}
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) {
size_t size = buft->iface.get_alloc_size(buft, tensor);
......@@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
}
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
}
......@@ -674,6 +674,8 @@ struct ggml_backend_sched {
char * context_buffer;
size_t context_buffer_size;
bool op_offload;
int debug;
};
......@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
......@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
const int node_backend_id = tensor_backend_id(node);
assert(node_backend_id != -1); // all nodes should be assigned by now
assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
// check if we should start a new split based on the sources of the current node
bool need_new_split = false;
......@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
ggml_backend_buffer_type_t * bufts,
int n_backends,
size_t graph_size,
bool parallel) {
bool parallel,
bool op_offload) {
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
......@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
}
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
ggml_backend_sched_reset(sched);
......@@ -1625,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}
struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
return status;
}
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
......
......@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/
${KLEIDIAI_SRC}/kai/ukernels/matmul/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
......@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
list(APPEND GGML_KLEIDIAI_SOURCES
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
if (NOT DOTPROD_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
list(APPEND GGML_KLEIDIAI_SOURCES
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
endif()
if (NOT I8MM_ENABLED MATCHES -1)
......@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
if (NOT SME_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
list(APPEND GGML_KLEIDIAI_SOURCES
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
endif()
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
......
......@@ -3,7 +3,7 @@ package cpu
// #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
// #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE
// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE
// #cgo linux CPPFLAGS: -D_GNU_SOURCE
// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 LDFLAGS: -framework Accelerate
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment