Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
544b6739
Unverified
Commit
544b6739
authored
Nov 06, 2025
by
Daniel Hiltgen
Committed by
GitHub
Nov 06, 2025
Browse files
ggml update to b6840 (#12791)
parent
c4ba257c
Changes
103
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
272 additions
and
140 deletions
+272
-140
llama/patches/0022-ggml-No-alloc-mode.patch
llama/patches/0022-ggml-No-alloc-mode.patch
+19
-19
llama/patches/0023-decode-disable-output_all.patch
llama/patches/0023-decode-disable-output_all.patch
+1
-1
llama/patches/0024-ggml-Enable-resetting-backend-devices.patch
.../patches/0024-ggml-Enable-resetting-backend-devices.patch
+9
-9
llama/patches/0025-harden-uncaught-exception-registration.patch
...patches/0025-harden-uncaught-exception-registration.patch
+1
-1
llama/patches/0026-GPU-discovery-enhancements.patch
llama/patches/0026-GPU-discovery-enhancements.patch
+22
-22
llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch
...UDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch
+0
-49
llama/patches/0028-report-LoadLibrary-failures.patch
llama/patches/0028-report-LoadLibrary-failures.patch
+2
-2
llama/patches/0029-interleave-multi-rope.patch
llama/patches/0029-interleave-multi-rope.patch
+3
-3
llama/patches/0030-Add-memory-detection-using-DXGI-PDH.patch
llama/patches/0030-Add-memory-detection-using-DXGI-PDH.patch
+9
-9
ml/backend/ggml/ggml/include/ggml-hexagon.h
ml/backend/ggml/ggml/include/ggml-hexagon.h
+19
-0
ml/backend/ggml/ggml/include/ggml-rpc.h
ml/backend/ggml/ggml/include/ggml-rpc.h
+1
-2
ml/backend/ggml/ggml/include/ggml.h
ml/backend/ggml/ggml/include/ggml.h
+44
-0
ml/backend/ggml/ggml/src/CMakeLists.txt
ml/backend/ggml/ggml/src/CMakeLists.txt
+13
-0
ml/backend/ggml/ggml/src/ggml-alloc.c
ml/backend/ggml/ggml/src/ggml-alloc.c
+22
-0
ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+8
-0
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
+37
-21
ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
+10
-2
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
+16
-0
ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.cpp
ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.cpp
+32
-0
ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.h
ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.h
+4
-0
No files found.
llama/patches/0022-ggml-No-alloc-mode.patch
View file @
544b6739
...
...
@@ -219,7 +219,7 @@ index 41eef3b5f..c81a2e48a 100644
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index
e0abde542..e98044bd8
100644
index
41ff89c4d..2931c15ca
100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,41 @@
...
...
@@ -274,7 +274,7 @@ index e0abde542..e98044bd8 100644
};
template<typename T>
@@ -99
9
,11 +103
7
,11 @@
struct ggml_backend_cuda_context {
@@ -99
2
,11 +103
0
,11 @@
struct ggml_backend_cuda_context {
// pool
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
...
...
@@ -288,7 +288,7 @@ index e0abde542..e98044bd8 100644
}
return *pools[device];
}
@@ -10
11
,4 +104
9
,20 @@
struct ggml_backend_cuda_context {
@@ -10
04
,4 +104
2
,20 @@
struct ggml_backend_cuda_context {
ggml_cuda_pool & pool() {
return pool(device);
}
...
...
@@ -310,10 +310,10 @@ index e0abde542..e98044bd8 100644
+ }
};
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index
c555cd30f..eb3db0f19
100644
index
02d413467..f79e5d65c
100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -35
0
,6 +35
0
,8 @@
const ggml_cuda_device_info & ggml_cuda_info() {
@@ -35
9
,6 +35
9
,8 @@
const ggml_cuda_device_info & ggml_cuda_info() {
// #define DEBUG_CUDA_MALLOC
...
...
@@ -322,7 +322,7 @@ index c555cd30f..eb3db0f19 100644
// buffer pool for cuda (legacy)
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
static const int MAX_BUFFERS = 256;
@@ -3
62
,9 +3
64
,12 @@
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
@@ -3
71
,9 +3
73
,12 @@
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
size_t pool_size = 0;
...
...
@@ -337,7 +337,7 @@ index c555cd30f..eb3db0f19 100644
}
~ggml_cuda_pool_leg() {
@@ -3
72
,7 +3
77
,9 @@
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
@@ -3
81
,7 +3
86
,9 @@
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
for (int i = 0; i < MAX_BUFFERS; ++i) {
ggml_cuda_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
...
...
@@ -348,7 +348,7 @@ index c555cd30f..eb3db0f19 100644
pool_size -= b.size;
}
}
@@ -42
0
,8 +4
27
,15 @@
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
@@ -42
9
,8 +4
36
,15 @@
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
void * ptr;
size_t look_ahead_size = (size_t) (1.05 * size);
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
...
...
@@ -366,7 +366,7 @@ index c555cd30f..eb3db0f19 100644
*actual_size = look_ahead_size;
pool_size += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC
@@ -4
41
,10 +4
55
,20 @@
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
@@ -4
50
,10 +4
64
,20 @@
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
}
}
GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
...
...
@@ -389,7 +389,7 @@ index c555cd30f..eb3db0f19 100644
};
// pool with virtual memory
@@ -4
5
6,18 +48
0
,24 @@
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
@@ -46
5
,18 +48
9
,24 @@
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
CUdeviceptr pool_addr = 0;
size_t pool_used = 0;
size_t pool_size = 0;
...
...
@@ -417,7 +417,7 @@ index c555cd30f..eb3db0f19 100644
#if defined(GGML_USE_HIP)
// Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
@@ -
494
,35 +5
24
,49 @@
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
@@ -
503
,35 +5
33
,49 @@
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
...
...
@@ -493,7 +493,7 @@ index c555cd30f..eb3db0f19 100644
// add to the pool
pool_size += reserve_size;
@@ -5
55
,16 +
599
,24 @@
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
@@ -5
64
,16 +
608
,24 @@
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
// all deallocations must be in reverse order of the allocations
GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
}
...
...
@@ -521,7 +521,7 @@ index c555cd30f..eb3db0f19 100644
}
// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
@@ -7
48
,11 +80
0
,20 @@
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
@@ -7
57
,11 +80
9
,20 @@
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
}
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
...
...
@@ -543,7 +543,7 @@ index c555cd30f..eb3db0f19 100644
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
size_t size = ggml_nbytes(tensor);
int64_t ne0 = tensor->ne[0];
@@ -7
76
,6 +8
37
,7 @@
static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
@@ -7
85
,6 +8
46
,7 @@
static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .is_host = */ NULL,
...
...
@@ -551,7 +551,7 @@ index c555cd30f..eb3db0f19 100644
};
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -
3003
,6 +30
65
,7 @@
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
@@ -
2986
,6 +30
48
,7 @@
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
...
...
@@ -559,7 +559,7 @@ index c555cd30f..eb3db0f19 100644
// flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
@@ -301
8
,6 +30
81
,11 @@
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -30
0
1,6 +30
64
,11 @@
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}
...
...
@@ -571,7 +571,7 @@ index c555cd30f..eb3db0f19 100644
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
@@ -314
4
,6 +32
12
,7 @@
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -314
0
,6 +32
08
,7 @@
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
...
...
@@ -579,7 +579,7 @@ index c555cd30f..eb3db0f19 100644
ggml_cuda_set_device(cuda_ctx->device);
@@ -32
23
,6 +32
92
,71 @@
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
@@ -32
15
,6 +32
84
,71 @@
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
}
...
...
@@ -651,7 +651,7 @@ index c555cd30f..eb3db0f19 100644
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -32
63
,6 +339
7
,9 @@
static const ggml_backend_i ggml_backend_cuda_interface = {
@@ -32
55
,6 +33
8
9,9 @@
static const ggml_backend_i ggml_backend_cuda_interface = {
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ NULL,
...
...
llama/patches/0023-decode-disable-output_all.patch
View file @
544b6739
...
...
@@ -8,7 +8,7 @@ Subject: [PATCH] decode: disable output_all
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index
e7526e7d..53a5e3a9
100644
index
bd348bcad..8b4a89d38
100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -974,8 +974,7 @@
int llama_context::decode(const llama_batch & batch_inp) {
...
...
llama/patches/0024-ggml-Enable-resetting-backend-devices.patch
View file @
544b6739
...
...
@@ -16,7 +16,7 @@ unused then it can be reset to free these data structures.
6 files changed, 32 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index
1ff53ed03..ba181d09d
100644
index
b3b5b356a..69223c488
100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -178,6 +178,7 @@
extern "C" {
...
...
@@ -28,7 +28,7 @@ index 1ff53ed03..ba181d09d 100644
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index
3c3f22fc0..43c91d9f2
100644
index
7bdf9d81f..21b35ac5c
100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -195,6 +195,10 @@
extern "C" {
...
...
@@ -43,7 +43,7 @@ index 3c3f22fc0..43c91d9f2 100644
struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index
6ef5eeafa..0b757af59
100644
index
c81a2e48a..9b0a9b91f
100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -526,6 +526,14 @@
ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
...
...
@@ -62,7 +62,7 @@ index 6ef5eeafa..0b757af59 100644
GGML_ASSERT(device);
return device->iface.get_buffer_type(device);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index
811462c79..87c6c34a4
100644
index
f79e5d65c..c9333689f
100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -107,6 +107,11 @@
int ggml_cuda_get_device() {
...
...
@@ -77,7 +77,7 @@ index 811462c79..87c6c34a4 100644
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
cudaError_t err;
@@ -3
515
,7 +35
2
0,10 @@
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -3
499
,7 +350
4
,10 @@
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
...
...
@@ -89,7 +89,7 @@ index 811462c79..87c6c34a4 100644
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
@@ -39
48
,6 +39
56
,11 @@
static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
@@ -39
36
,6 +39
44
,11 @@
static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}
...
...
@@ -101,7 +101,7 @@ index 811462c79..87c6c34a4 100644
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -39
64
,6 +39
77
,7 @@
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
@@ -39
52
,6 +39
65
,7 @@
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
...
...
@@ -122,10 +122,10 @@ index 890c10364..1f06be80e 100644
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
diff --git a/src/llama.cpp b/src/llama.cpp
index
fe5a7a835..d821a96a0
100644
index
ab2e9868a..74c49e651
100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2
6
7,10 +2
6
7,12 @@
static struct llama_model * llama_model_load_from_file_impl(
@@ -27
0
,10 +27
0
,12 @@
static struct llama_model * llama_model_load_from_file_impl(
for (auto * dev : model->devices) {
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
...
...
llama/patches/0025-harden-uncaught-exception-registration.patch
View file @
544b6739
...
...
@@ -8,7 +8,7 @@ Subject: [PATCH] harden uncaught exception registration
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml.cpp b/ggml/src/ggml.cpp
index 0d388d45..f5bcb446 100644
index 0d388d45
5
..f5bcb446
d
100644
--- a/ggml/src/ggml.cpp
+++ b/ggml/src/ggml.cpp
@@ -19,8 +19,12 @@
static bool ggml_uncaught_exception_init = []{
...
...
llama/patches/0026-GPU-discovery-enhancements.patch
View file @
544b6739
...
...
@@ -45,7 +45,7 @@ index 69223c488..6510e0cba 100644
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index
0609c6503..aefe43bdd
100644
index
f9a6587f1..03f359ae9
100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -209,6 +209,8 @@
add_library(ggml-base
...
...
@@ -58,7 +58,7 @@ index 0609c6503..aefe43bdd 100644
target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index
5787e8cd5..d232bf828
100644
index
c9333689f..41b00af83
100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -261,6 +261,16 @@
static ggml_cuda_device_info ggml_cuda_init() {
...
...
@@ -90,7 +90,7 @@ index 5787e8cd5..d232bf828 100644
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str());
@@ -34
7
6,6 +34
91
,11 @@
struct ggml_backend_cuda_device_context {
@@ -346
8
,6 +34
83
,11 @@
struct ggml_backend_cuda_device_context {
std::string description;
std::string pci_bus_id;
std::string id;
...
...
@@ -102,7 +102,7 @@ index 5787e8cd5..d232bf828 100644
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -34
96
,6 +35
16
,28 @@
static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
@@ -34
88
,6 +35
08
,28 @@
static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
...
...
@@ -131,7 +131,7 @@ index 5787e8cd5..d232bf828 100644
CUDA_CHECK(cudaMemGetInfo(free, total));
}
@@ -3
504
,6 +35
46
,7 @@
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
@@ -3
496
,6 +35
38
,7 @@
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU;
}
...
...
@@ -139,7 +139,7 @@ index 5787e8cd5..d232bf828 100644
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
@@ -35
17
,6 +35
60
,19 @@
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -35
09
,6 +35
52
,19 @@
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0;
...
...
@@ -159,7 +159,7 @@ index 5787e8cd5..d232bf828 100644
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
bool events = false;
@@ -407
9
,6 +413
5
,7 @@
ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -407
5
,6 +413
1
,7 @@
ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
...
...
@@ -167,7 +167,7 @@ index 5787e8cd5..d232bf828 100644
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -409
4
,6 +41
51
,14 @@
ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -409
0
,6 +41
47
,14 @@
ggml_backend_reg_t ggml_backend_cuda_reg() {
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->pci_bus_id = pci_bus_id;
...
...
@@ -204,11 +204,11 @@ index 1f06be80e..2f9ef2dc0 100644
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index
d0fb3bcca..b63edd0c1
100644
index
e9201cdc6..44ae76d66
100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -6
38
,6 +6
38
,14 @@
static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph,
int node_idx
return ggml_can_fuse_ext(cgraph, idxs,
ops, num_op
s);
@@ -6
77
,6 +6
77
,14 @@
static inline bool ggml_can_fuse
_subgraph
(const struct ggml_cgraph * cgraph,
return ggml_can_fuse_
subgraph_
ext(cgraph, idxs,
count, ops, outputs, num_output
s);
}
+// Management libraries for fetching more accurate free VRAM data
...
...
@@ -243,10 +243,10 @@ index 05ff6a5a6..032dee76d 100644
/* .async = */ true,
/* .host_buffer = */ false,
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index
bd3ece516..7cfb14a54
100644
index
3a6bbe564..d2c278a35
100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -2
31
,6 +2
31
,7 @@
class vk_memory_logger;
@@ -2
29
,6 +2
29
,7 @@
class vk_memory_logger;
#endif
class vk_perf_logger;
static void ggml_vk_destroy_buffer(vk_buffer& buf);
...
...
@@ -254,7 +254,7 @@ index bd3ece516..7cfb14a54 100644
static constexpr uint32_t mul_mat_vec_max_cols = 8;
static constexpr uint32_t p021_max_gqa_ratio = 8;
@@ -11
585
,6 +11
586
,29 @@
static void ggml_vk_get_device_description(int device, char * description, size_
@@ -11
813
,6 +11
814
,29 @@
static void ggml_vk_get_device_description(int device, char * description, size_
snprintf(description, description_size, "%s", props.deviceName.data());
}
...
...
@@ -284,7 +284,7 @@ index bd3ece516..7cfb14a54 100644
// backend interface
#define UNUSED GGML_UNUSED
@@ -12
392
,31 +12
416
,102 @@
void ggml_backend_vk_get_device_description(int device, char * description, size
@@ -12
761
,31 +12
785
,102 @@
void ggml_backend_vk_get_device_description(int device, char * description, size
ggml_vk_get_device_description(dev_idx, description, description_size);
}
...
...
@@ -404,7 +404,7 @@ index bd3ece516..7cfb14a54 100644
break;
}
}
@@ -12
449
,8 +12
544
,13 @@
static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
@@ -12
818
,8 +12
913
,13 @@
static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
}
}
...
...
@@ -419,7 +419,7 @@ index bd3ece516..7cfb14a54 100644
}
vk::PhysicalDeviceProperties2 props = {};
@@ -12
467
,19 +12
567
,24 @@
static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
@@ -12
836
,19 +12
936
,24 @@
static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
...
...
@@ -453,7 +453,7 @@ index bd3ece516..7cfb14a54 100644
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
@@ -12
491
,9 +12
5
96,14 @@
static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
@@ -12
860
,9 +1296
5
,14 @@
static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
return ctx->description.c_str();
}
...
...
@@ -469,7 +469,7 @@ index bd3ece516..7cfb14a54 100644
}
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
@@ -12
517
,8 +12
627
,9 @@
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
@@ -12
886
,8 +12
996
,9 @@
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
props->name = ggml_backend_vk_device_get_name(dev);
props->description = ggml_backend_vk_device_get_description(dev);
...
...
@@ -480,7 +480,7 @@ index bd3ece516..7cfb14a54 100644
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ false,
@@ -12
526
,6 +1
2637
,13 @@
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
@@ -12
895
,6 +1
3006
,13 @@
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
/* .buffer_from_host_ptr = */ false,
/* .events = */ false,
};
...
...
@@ -494,7 +494,7 @@ index bd3ece516..7cfb14a54 100644
}
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -1
2954
,6 +13
072
,8 @@
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
@@ -1
3365
,6 +13
483
,8 @@
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
...
...
@@ -503,7 +503,7 @@ index bd3ece516..7cfb14a54 100644
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
char desc[256];
@@ -1
2962
,12 +13
082
,41 @@
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
@@ -1
3373
,12 +13
493
,41 @@
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
ctx->name = GGML_VK_NAME + std::to_string(i);
ctx->description = desc;
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
...
...
llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch
deleted
100644 → 0
View file @
c4ba257c
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Julius Tischbein <ju.tischbein@gmail.com>
Date: Wed, 15 Oct 2025 13:54:15 +0200
Subject: [PATCH] CUDA: Changing the CUDA scheduling strategy to spin (#16585)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* CUDA set scheduling strategy to spinning for cc121
* Using prop.major and prop.minor, include HIP and MUSA
* Exclude HIP and MUSA
* Remove trailing whitespace
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* Remove empty line
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
ggml/src/ggml-cuda/ggml-cuda.cu | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index b075a18be..d62f412d6 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -340,6 +340,15 @@
static ggml_cuda_device_info ggml_cuda_init() {
} else if (device_name.substr(0, 21) == "NVIDIA GeForce GTX 16") {
turing_devices_without_mma.push_back({ id, device_name });
}
+
+ // Temporary performance fix:
+ // Setting device scheduling strategy for iGPUs with cc121 to "spinning" to avoid delays in cuda synchronize calls.
+ // TODO: Check for future drivers the default scheduling strategy and
+ // remove this call again when cudaDeviceScheduleSpin is default.
+ if (prop.major == 12 && prop.minor == 1) {
+ CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
+ }
+
#endif // defined(GGML_USE_HIP)
}
llama/patches/002
9
-report-LoadLibrary-failures.patch
→
llama/patches/002
8
-report-LoadLibrary-failures.patch
View file @
544b6739
...
...
@@ -8,10 +8,10 @@ Subject: [PATCH] report LoadLibrary failures
1 file changed, 12 insertions(+)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index
f794d9cfa..3a855ab2e
100644
index
a55d9b280..ec6f7f1e9
100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -1
18
,6 +1
18
,18 @@
static dl_handle * dl_load_library(const fs::path & path) {
@@ -1
22
,6 +1
22
,18 @@
static dl_handle * dl_load_library(const fs::path & path) {
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryW(path.wstring().c_str());
...
...
llama/patches/00
31
-interleave-multi-rope.patch
→
llama/patches/00
29
-interleave-multi-rope.patch
View file @
544b6739
...
...
@@ -13,7 +13,7 @@ interleaved version used for qwen3vl
4 files changed, 11 insertions(+), 30 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index
31478dd8e..4d1ed207e
100644
index
902fdad69..70955347d
100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5509,15 +5509,12 @@
static void ggml_mrope_cache_init(
...
...
@@ -62,10 +62,10 @@ index d058504cd..287fe9d2c 100644
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index
375a0c7fd..9866c96b4
100644
index
50b8071de..65a3183c8
100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -38
5
8,15 +38
5
8,11 @@
kernel void kernel_rope_multi(
@@ -38
8
8,15 +38
8
8,11 @@
kernel void kernel_rope_multi(
const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
const int sector = ic % sect_dims;
...
...
llama/patches/0030-Add-memory-detection-using-DXGI-PDH.patch
View file @
544b6739
...
...
@@ -12,7 +12,7 @@ Subject: [PATCH] Add memory detection using DXGI + PDH
create mode 100644 ggml/src/mem_dxgi_pdh.cpp
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index
aefe43bdd..21fe4640c
100644
index
03f359ae9..4b3e5efb5
100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -211,6 +211,7 @@
add_library(ggml-base
...
...
@@ -24,10 +24,10 @@ index aefe43bdd..21fe4640c 100644
target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index
b63edd0c1..81cad8cf3
100644
index
44ae76d66..639d551a2
100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -64
5
,6 +64
5
,9 @@
GGML_API void ggml_nvml_release();
@@ -6
8
4,6 +6
8
4,9 @@
GGML_API void ggml_nvml_release();
GGML_API int ggml_hip_mgmt_init();
GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
GGML_API void ggml_hip_mgmt_release();
...
...
@@ -38,7 +38,7 @@ index b63edd0c1..81cad8cf3 100644
#ifdef __cplusplus
}
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index
7cfb14a54..a1c46d0b3
100644
index
d2c278a35..221e29509
100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -73,6 +73,7 @@
DispatchLoaderDynamic & ggml_vk_default_dispatcher();
...
...
@@ -49,7 +49,7 @@ index 7cfb14a54..a1c46d0b3 100644
typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
VkStructureType sType;
@@ -12
433
,6 +12
434
,7 @@
struct ggml_backend_vk_device_context {
@@ -12
802
,6 +12
803
,7 @@
struct ggml_backend_vk_device_context {
std::string pci_id;
std::string id;
std::string uuid;
...
...
@@ -57,7 +57,7 @@ index 7cfb14a54..a1c46d0b3 100644
int major;
int minor;
int driver_major;
@@ -12
448
,8 +12
450
,22 @@
void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
@@ -12
817
,8 +12
819
,22 @@
void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
vk::PhysicalDeviceProperties2 props2;
vkdev.getProperties2(&props2);
...
...
@@ -81,7 +81,7 @@ index 7cfb14a54..a1c46d0b3 100644
{
// Use vendor specific management libraries for best VRAM reporting if available
switch (props2.properties.vendorID) {
@@ -12
477
,8 +12
493
,8 @@
void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
@@ -12
846
,8 +12
862
,8 @@
void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
break;
}
}
...
...
@@ -91,7 +91,7 @@ index 7cfb14a54..a1c46d0b3 100644
*total = 0;
*free = 0;
vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
@@ -13
089
,7 +13
105
,6 @@
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
@@ -13
500
,7 +13
516
,6 @@
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
/* .reg = */ reg,
/* .context = */ ctx,
});
...
...
@@ -99,7 +99,7 @@ index 7cfb14a54..a1c46d0b3 100644
// Gather additional information about the device
int dev_idx = vk_instance.device_indices[i];
vk::PhysicalDeviceProperties props1;
@@ -13
112
,6 +13
127
,14 @@
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
@@ -13
523
,6 +13
538
,14 @@
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
}
}
ctx->uuid = oss.str();
...
...
ml/backend/ggml/ggml/include/ggml-hexagon.h
0 → 100644
View file @
544b6739
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern
"C"
{
#endif
// backend API
GGML_BACKEND_API
ggml_backend_t
ggml_backend_hexagon_init
(
void
);
GGML_BACKEND_API
bool
ggml_backend_is_hexagon
(
ggml_backend_t
backend
);
GGML_BACKEND_API
ggml_backend_reg_t
ggml_backend_hexagon_reg
(
void
);
#ifdef __cplusplus
}
#endif
ml/backend/ggml/ggml/include/ggml-rpc.h
View file @
544b6739
...
...
@@ -21,8 +21,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
GGML_BACKEND_API
void
ggml_backend_rpc_get_device_memory
(
const
char
*
endpoint
,
uint32_t
device
,
size_t
*
free
,
size_t
*
total
);
GGML_BACKEND_API
void
ggml_backend_rpc_start_server
(
const
char
*
endpoint
,
const
char
*
cache_dir
,
size_t
n_threads
,
size_t
n_devices
,
ggml_backend_dev_t
*
devices
,
size_t
*
free_mem
,
size_t
*
total_mem
);
size_t
n_threads
,
size_t
n_devices
,
ggml_backend_dev_t
*
devices
);
GGML_BACKEND_API
ggml_backend_reg_t
ggml_backend_rpc_reg
(
void
);
GGML_BACKEND_API
ggml_backend_reg_t
ggml_backend_rpc_add_server
(
const
char
*
endpoint
);
...
...
ml/backend/ggml/ggml/include/ggml.h
View file @
544b6739
...
...
@@ -577,6 +577,10 @@ extern "C" {
GGML_UNARY_OP_EXP
,
GGML_UNARY_OP_GELU_ERF
,
GGML_UNARY_OP_XIELU
,
GGML_UNARY_OP_FLOOR
,
GGML_UNARY_OP_CEIL
,
GGML_UNARY_OP_ROUND
,
GGML_UNARY_OP_TRUNC
,
GGML_UNARY_OP_COUNT
,
};
...
...
@@ -1151,6 +1155,46 @@ extern "C" {
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
);
GGML_API
struct
ggml_tensor
*
ggml_floor
(
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
);
GGML_API
struct
ggml_tensor
*
ggml_floor_inplace
(
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
);
GGML_API
struct
ggml_tensor
*
ggml_ceil
(
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
);
GGML_API
struct
ggml_tensor
*
ggml_ceil_inplace
(
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
);
GGML_API
struct
ggml_tensor
*
ggml_round
(
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
);
GGML_API
struct
ggml_tensor
*
ggml_round_inplace
(
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
);
/**
* Truncates the fractional part of each element in the tensor (towards zero).
* For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
* Similar to std::trunc in C/C++.
*/
GGML_API
struct
ggml_tensor
*
ggml_trunc
(
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
);
GGML_API
struct
ggml_tensor
*
ggml_trunc_inplace
(
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
);
// xIELU activation function
// x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
// where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
...
...
ml/backend/ggml/ggml/src/CMakeLists.txt
View file @
544b6739
...
...
@@ -310,6 +310,10 @@ function(ggml_add_cpu_backend_variant tag_name)
foreach
(
feat
${
ARGN
}
)
set
(
GGML_INTERNAL_
${
feat
}
ON
)
endforeach
()
elseif
(
GGML_SYSTEM_ARCH STREQUAL
"s390x"
)
foreach
(
feat
${
ARGN
}
)
set
(
GGML_INTERNAL_
${
feat
}
ON
)
endforeach
()
endif
()
ggml_add_cpu_backend_variant_impl
(
${
tag_name
}
)
...
...
@@ -372,6 +376,14 @@ if (GGML_CPU_ALL_VARIANTS)
else
()
message
(
FATAL_ERROR
"Unsupported PowerPC target OS:
${
CMAKE_SYSTEM_NAME
}
"
)
endif
()
elseif
(
GGML_SYSTEM_ARCH STREQUAL
"s390x"
)
if
(
CMAKE_SYSTEM_NAME MATCHES
"Linux"
)
ggml_add_cpu_backend_variant
(
s390x_z15 Z15 VXE
)
# ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
# ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
else
()
message
(
FATAL_ERROR
"Unsupported s390x target OS:
${
CMAKE_SYSTEM_NAME
}
"
)
endif
()
else
()
message
(
FATAL_ERROR
"GGML_CPU_ALL_VARIANTS not yet supported with
${
GGML_SYSTEM_ARCH
}
on
${
CMAKE_SYSTEM_NAME
}
"
)
endif
()
...
...
@@ -391,6 +403,7 @@ ggml_add_backend(Vulkan)
ggml_add_backend
(
WebGPU
)
ggml_add_backend
(
zDNN
)
ggml_add_backend
(
OpenCL
)
ggml_add_backend
(
Hexagon
)
foreach
(
target ggml-base ggml
)
target_include_directories
(
${
target
}
PUBLIC $<BUILD_INTERFACE:
${
CMAKE_CURRENT_SOURCE_DIR
}
/../include> $<INSTALL_INTERFACE:include>
)
...
...
ml/backend/ggml/ggml/src/ggml-alloc.c
View file @
544b6739
...
...
@@ -603,6 +603,26 @@ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor
return
t
->
data
!=
NULL
||
ggml_gallocr_hash_get
(
galloc
,
t
)
->
allocated
;
}
// free the extra space at the end if the new tensor is smaller
//
// When a node reuses its parent's allocation, any tail of the parent's
// reserved region beyond the node's own size is no longer needed; this
// returns that tail to the dynamic allocator so it can be reused.
static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_tensor * node, struct ggml_tensor * parent) {
    struct hash_node * hn   = ggml_gallocr_hash_get(galloc, node);
    struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);

    // effective allocation sizes as reported by each tensor's buffer type
    size_t parent_size = ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
    size_t node_size   = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);

    // the node must fit inside the parent's allocation for reuse to be valid
    GGML_ASSERT(parent_size >= node_size);

    if (parent_size > node_size) {
        struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
        // the freed region starts immediately after the bytes the node keeps
        struct buffer_address p_addr = p_hn->addr;
        p_addr.offset += node_size;
        size_t extra_size = parent_size - node_size;
        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
        ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
    }
}
static
void
ggml_gallocr_allocate_node
(
ggml_gallocr_t
galloc
,
struct
ggml_tensor
*
node
,
int
buffer_id
)
{
GGML_ASSERT
(
buffer_id
>=
0
);
struct
hash_node
*
hn
=
ggml_gallocr_hash_get
(
galloc
,
node
);
...
...
@@ -648,6 +668,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
hn
->
addr
=
p_hn
->
addr
;
p_hn
->
allocated
=
false
;
// avoid freeing the parent
view_src_hn
->
allocated
=
false
;
ggml_gallocr_free_extra_space
(
galloc
,
node
,
view_src
);
return
;
}
}
else
{
...
...
@@ -655,6 +676,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
hn
->
buffer_id
=
p_hn
->
buffer_id
;
hn
->
addr
=
p_hn
->
addr
;
p_hn
->
allocated
=
false
;
// avoid freeing the parent
ggml_gallocr_free_extra_space
(
galloc
,
node
,
parent
);
return
;
}
}
...
...
ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
View file @
544b6739
...
...
@@ -57,6 +57,10 @@
#include "ggml-opencl.h"
#endif
#ifdef GGML_USE_HEXAGON
#include "ggml-hexagon.h"
#endif
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
...
...
@@ -211,6 +215,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_OPENCL
register_backend
(
ggml_backend_opencl_reg
());
#endif
#ifdef GGML_USE_HEXAGON
register_backend
(
ggml_backend_hexagon_reg
());
#endif
#ifdef GGML_USE_CANN
register_backend
(
ggml_backend_cann_reg
());
#endif
...
...
@@ -615,6 +622,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best
(
"sycl"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"vulkan"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"opencl"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"hexagon"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"musa"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"cpu"
,
silent
,
dir_path
);
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
...
...
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
View file @
544b6739
...
...
@@ -466,29 +466,45 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list
(
APPEND ARCH_FLAGS
"-march=
${
MARCH_STR
}
"
-mabi=lp64d
)
elseif
(
GGML_SYSTEM_ARCH STREQUAL
"s390x"
)
message
(
STATUS
"s390x detected"
)
list
(
APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c
)
file
(
READ
"/proc/cpuinfo"
CPUINFO_CONTENTS
)
string
(
REGEX REPLACE
"machine[
\t\r\n
]*=[
\t\r\n
]*([0-9]+)"
"
\\
1"
S390X_M
${
CPUINFO_CONTENTS
}
)
# TODO: Separation to determine activation of VX/VXE/VXE2
if
(
${
S390X_M
}
MATCHES
"8561|8562"
)
message
(
STATUS
"z15 target"
)
list
(
APPEND ARCH_FLAGS -march=z15
)
elseif
(
${
S390X_M
}
MATCHES
"3931"
)
message
(
STATUS
"z16 target"
)
list
(
APPEND ARCH_FLAGS -march=z16
)
elseif
(
${
S390X_M
}
MATCHES
"9175|9176"
)
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
# binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
message
(
STATUS
"z17 target"
)
list
(
APPEND ARCH_FLAGS -march=arch15
)
else
()
message
(
STATUS
"Unknown target"
)
message
(
WARNING
"Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF."
)
list
(
APPEND ARCH_FLAGS -march=native -mtune=native
)
list
(
APPEND GGML_CPU_SOURCES
ggml-cpu/arch/s390/quants.c
)
# for native compilation
if
(
GGML_NATIVE
)
# check machine level to determine target
file
(
READ
"/proc/cpuinfo"
CPUINFO_CONTENTS
)
string
(
REGEX REPLACE
"machine[
\t\r\n
]*=[
\t\r\n
]*([0-9]+)"
"
\\
1"
S390X_M
${
CPUINFO_CONTENTS
}
)
# TODO: Separation to determine activation of VX/VXE/VXE2
if
(
${
S390X_M
}
MATCHES
"8561|8562"
)
message
(
STATUS
"z15 target"
)
list
(
APPEND ARCH_FLAGS -march=z15
)
elseif
(
${
S390X_M
}
MATCHES
"3931"
)
message
(
STATUS
"z16 target"
)
list
(
APPEND ARCH_FLAGS -march=z16
)
elseif
(
${
S390X_M
}
MATCHES
"9175|9176"
)
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
# binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
message
(
STATUS
"z17 target"
)
list
(
APPEND ARCH_FLAGS -march=arch15
)
else
()
message
(
STATUS
"Unknown target"
)
message
(
WARNING
"Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF."
)
list
(
APPEND ARCH_FLAGS -march=native -mtune=native
)
endif
()
# for cross-compilation
elseif
(
GGML_CPU_ALL_VARIANTS
)
# range through IBM z15 to z17
# NOTE: update when a new hardware level is released
foreach
(
ZHW RANGE 15 17
)
if
(
DEFINED GGML_INTERNAL_Z
${
ZHW
}
)
message
(
STATUS
"z
${
ZHW
}
cross-compile target"
)
list
(
APPEND ARCH_FLAGS -march=z
${
ZHW
}
)
endif
()
endforeach
()
endif
()
if
(
GGML_VXE
)
if
(
GGML_VXE
OR GGML_INTERNAL_VXE
)
message
(
STATUS
"VX/VXE/VXE2 enabled"
)
list
(
APPEND ARCH_FLAGS -mvx -mzvector
)
list
(
APPEND ARCH_DEFINITIONS GGML_VXE
)
...
...
ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
View file @
544b6739
...
...
@@ -2186,6 +2186,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case
GGML_UNARY_OP_HARDSWISH
:
case
GGML_UNARY_OP_HARDSIGMOID
:
case
GGML_UNARY_OP_EXP
:
case
GGML_UNARY_OP_FLOOR
:
case
GGML_UNARY_OP_CEIL
:
case
GGML_UNARY_OP_ROUND
:
case
GGML_UNARY_OP_TRUNC
:
{
n_tasks
=
1
;
}
break
;
...
...
@@ -3569,13 +3573,17 @@ void ggml_cpu_init(void) {
#ifdef GGML_USE_OPENMP
//if (!getenv("OMP_WAIT_POLICY")) {
// // set the wait policy to active, so that OpenMP threads don't sleep
//
pu
tenv("OMP_WAIT_POLICY
=
active")
;
//
se
tenv("OMP_WAIT_POLICY
", "
active"
, 0
)
//}
if
(
!
getenv
(
"KMP_BLOCKTIME"
))
{
// set the time to wait before sleeping a thread
// this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
putenv
(
"KMP_BLOCKTIME=200"
);
// 200ms
#ifdef _WIN32
_putenv_s
(
"KMP_BLOCKTIME"
,
"200"
);
// 200ms
#else
setenv
(
"KMP_BLOCKTIME"
,
"200"
,
0
);
// 200ms
#endif
}
#endif
}
...
...
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
View file @
544b6739
...
...
@@ -9033,6 +9033,22 @@ void ggml_compute_forward_unary(
{
ggml_compute_forward_exp
(
params
,
dst
);
}
break
;
case
GGML_UNARY_OP_FLOOR
:
{
ggml_compute_forward_floor
(
params
,
dst
);
}
break
;
case
GGML_UNARY_OP_CEIL
:
{
ggml_compute_forward_ceil
(
params
,
dst
);
}
break
;
case
GGML_UNARY_OP_ROUND
:
{
ggml_compute_forward_round
(
params
,
dst
);
}
break
;
case
GGML_UNARY_OP_TRUNC
:
{
ggml_compute_forward_trunc
(
params
,
dst
);
}
break
;
case
GGML_UNARY_OP_XIELU
:
{
ggml_compute_forward_xielu
(
params
,
dst
);
...
...
ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.cpp
View file @
544b6739
...
...
@@ -73,6 +73,22 @@ static inline float op_log(float x) {
return
logf
(
x
);
}
// Largest integral value not greater than the input (wraps floorf).
static inline float op_floor(float value) {
    float rounded_down = floorf(value);
    return rounded_down;
}
// Smallest integral value not less than the input (wraps ceilf).
static inline float op_ceil(float value) {
    float rounded_up = ceilf(value);
    return rounded_up;
}
// Nearest integral value, halfway cases away from zero (wraps roundf).
static inline float op_round(float value) {
    float nearest = roundf(value);
    return nearest;
}
// Integral part of the input, fractional part discarded toward zero (wraps truncf).
static inline float op_trunc(float value) {
    float integral_part = truncf(value);
    return integral_part;
}
template
<
float
(
*
op
)(
float
),
typename
src0_t
,
typename
dst_t
>
static
inline
void
vec_unary_op
(
int64_t
n
,
dst_t
*
y
,
const
src0_t
*
x
)
{
constexpr
auto
src0_to_f32
=
type_conversion_table
<
src0_t
>::
to_f32
;
...
...
@@ -274,6 +290,22 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor *
unary_op
<
op_log
>
(
params
,
dst
);
}
// Element-wise floor of the source tensor into dst, dispatched through
// the generic unary_op kernel with op_floor.
void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
    unary_op<op_floor>(params, dst);
}
// Element-wise ceiling of the source tensor into dst, dispatched through
// the generic unary_op kernel with op_ceil.
void ggml_compute_forward_ceil(const ggml_compute_params * params, ggml_tensor * dst) {
    unary_op<op_ceil>(params, dst);
}
// Element-wise rounding of the source tensor into dst, dispatched through
// the generic unary_op kernel with op_round (halfway cases away from zero).
void ggml_compute_forward_round(const ggml_compute_params * params, ggml_tensor * dst) {
    unary_op<op_round>(params, dst);
}
// Element-wise truncation (toward zero) of the source tensor into dst,
// dispatched through the generic unary_op kernel with op_trunc.
void ggml_compute_forward_trunc(const ggml_compute_params * params, ggml_tensor * dst) {
    unary_op<op_trunc>(params, dst);
}
void
ggml_compute_forward_xielu
(
const
ggml_compute_params
*
params
,
ggml_tensor
*
dst
)
{
const
float
alpha_n
=
ggml_get_op_params_f32
(
dst
,
1
);
const
float
alpha_p
=
ggml_get_op_params_f32
(
dst
,
2
);
...
...
ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.h
View file @
544b6739
...
...
@@ -22,6 +22,10 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
void
ggml_compute_forward_sin
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_cos
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_log
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_floor
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_ceil
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_round
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_trunc
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_xielu
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
#ifdef __cplusplus
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment