OpenDAS / ollama · Commits · 544b6739

Unverified commit 544b6739, authored Nov 06, 2025 by Daniel Hiltgen, committed by GitHub Nov 06, 2025

ggml update to b6840 (#12791)

parent c4ba257c

Showing 20 changed files with 272 additions and 140 deletions (+272 -140)
llama/patches/0022-ggml-No-alloc-mode.patch                                    +19 -19
llama/patches/0023-decode-disable-output_all.patch                              +1  -1
llama/patches/0024-ggml-Enable-resetting-backend-devices.patch                  +9  -9
llama/patches/0025-harden-uncaught-exception-registration.patch                 +1  -1
llama/patches/0026-GPU-discovery-enhancements.patch                            +22 -22
llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch   +0 -49
llama/patches/0028-report-LoadLibrary-failures.patch                            +2  -2
llama/patches/0029-interleave-multi-rope.patch                                  +3  -3
llama/patches/0030-Add-memory-detection-using-DXGI-PDH.patch                    +9  -9
ml/backend/ggml/ggml/include/ggml-hexagon.h                                    +19  -0
ml/backend/ggml/ggml/include/ggml-rpc.h                                         +1  -2
ml/backend/ggml/ggml/include/ggml.h                                            +44  -0
ml/backend/ggml/ggml/src/CMakeLists.txt                                        +13  -0
ml/backend/ggml/ggml/src/ggml-alloc.c                                          +22  -0
ml/backend/ggml/ggml/src/ggml-backend-reg.cpp                                   +8  -0
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt                               +37 -21
ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c                                   +10  -2
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp                                      +16  -0
ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.cpp                                +32  -0
ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.h                                   +4  -0
llama/patches/0022-ggml-No-alloc-mode.patch

...
@@ -219,7 +219,7 @@ index 41eef3b5f..c81a2e48a 100644
  void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
  diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
-index e0abde542..e98044bd8 100644
+index 41ff89c4d..2931c15ca 100644
  --- a/ggml/src/ggml-cuda/common.cuh
  +++ b/ggml/src/ggml-cuda/common.cuh
  @@ -35,6 +35,41 @@
...
@@ -274,7 +274,7 @@ index e0abde542..e98044bd8 100644
  };
  template<typename T>
-@@ -999,11 +1037,11 @@ struct ggml_backend_cuda_context {
+@@ -992,11 +1030,11 @@ struct ggml_backend_cuda_context {
      // pool
      std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
...
@@ -288,7 +288,7 @@ index e0abde542..e98044bd8 100644
      }
      return *pools[device];
  }
-@@ -1011,4 +1049,20 @@ struct ggml_backend_cuda_context {
+@@ -1004,4 +1042,20 @@ struct ggml_backend_cuda_context {
      ggml_cuda_pool & pool() {
          return pool(device);
      }
...
@@ -310,10 +310,10 @@ index e0abde542..e98044bd8 100644
  +    }
  };
  diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index c555cd30f..eb3db0f19 100644
+index 02d413467..f79e5d65c 100644
  --- a/ggml/src/ggml-cuda/ggml-cuda.cu
  +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
+@@ -359,6 +359,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
  // #define DEBUG_CUDA_MALLOC
...
@@ -322,7 +322,7 @@ index c555cd30f..eb3db0f19 100644
  // buffer pool for cuda (legacy)
  struct ggml_cuda_pool_leg : public ggml_cuda_pool {
      static const int MAX_BUFFERS = 256;
-@@ -362,9 +364,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
+@@ -371,9 +373,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
      ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
      size_t pool_size = 0;
...
@@ -337,7 +337,7 @@ index c555cd30f..eb3db0f19 100644
      }
      ~ggml_cuda_pool_leg() {
-@@ -372,7 +377,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
+@@ -381,7 +386,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
      for (int i = 0; i < MAX_BUFFERS; ++i) {
          ggml_cuda_buffer & b = buffer_pool[i];
          if (b.ptr != nullptr) {
...
@@ -348,7 +348,7 @@ index c555cd30f..eb3db0f19 100644
              pool_size -= b.size;
          }
      }
-@@ -420,8 +427,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
+@@ -429,8 +436,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
      void * ptr;
      size_t look_ahead_size = (size_t) (1.05 * size);
      look_ahead_size = 256 * ((look_ahead_size + 255)/256);
...
@@ -366,7 +366,7 @@ index c555cd30f..eb3db0f19 100644
      *actual_size = look_ahead_size;
      pool_size += look_ahead_size;
  #ifdef DEBUG_CUDA_MALLOC
-@@ -441,10 +455,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
+@@ -450,10 +464,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
          }
      }
      GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
...
@@ -389,7 +389,7 @@ index c555cd30f..eb3db0f19 100644
  };
  // pool with virtual memory
-@@ -456,18 +480,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
+@@ -465,18 +489,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
      CUdeviceptr pool_addr = 0;
      size_t pool_used = 0;
      size_t pool_size = 0;
...
@@ -417,7 +417,7 @@ index c555cd30f..eb3db0f19 100644
  #if defined(GGML_USE_HIP)
      // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
      for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
-@@ -494,35 +524,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
+@@ -503,35 +533,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
      GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
...
@@ -493,7 +493,7 @@ index c555cd30f..eb3db0f19 100644
      // add to the pool
      pool_size += reserve_size;
-@@ -555,16 +599,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
+@@ -564,16 +608,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
      // all deallocations must be in reverse order of the allocations
      GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
  }
...
@@ -521,7 +521,7 @@ index c555cd30f..eb3db0f19 100644
  }
  // destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
-@@ -748,11 +800,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
+@@ -757,11 +809,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
  }
  static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
...
@@ -543,7 +543,7 @@ index c555cd30f..eb3db0f19 100644
  static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
      size_t size = ggml_nbytes(tensor);
      int64_t ne0 = tensor->ne[0];
-@@ -776,6 +837,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
+@@ -785,6 +846,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
      /* .get_max_size = */ NULL, // defaults to SIZE_MAX
      /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
      /* .is_host = */ NULL,
...
@@ -551,7 +551,7 @@ index c555cd30f..eb3db0f19 100644
  };
  ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-@@ -3003,6 +3065,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
+@@ -2986,6 +3048,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
  static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
      bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
...
@@ -559,7 +559,7 @@ index c555cd30f..eb3db0f19 100644
      // flag used to determine whether it is an integrated_gpu
      const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
-@@ -3018,6 +3081,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+@@ -3001,6 +3064,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
              continue;
          }
...
@@ -571,7 +571,7 @@ index c555cd30f..eb3db0f19 100644
      static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
      if (!disable_fusion) {
-@@ -3144,6 +3212,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+@@ -3140,6 +3208,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
  static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
      ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
...
@@ -579,7 +579,7 @@ index c555cd30f..eb3db0f19 100644
      ggml_cuda_set_device(cuda_ctx->device);
-@@ -3223,6 +3292,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
+@@ -3215,6 +3284,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
      return GGML_STATUS_SUCCESS;
  }
...
@@ -651,7 +651,7 @@ index c555cd30f..eb3db0f19 100644
  static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
      ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-@@ -3263,6 +3397,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
+@@ -3255,6 +3389,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
      /* .event_record = */ ggml_backend_cuda_event_record,
      /* .event_wait = */ ggml_backend_cuda_event_wait,
      /* .graph_optimize = */ NULL,
...
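
The pool hunks above keep ggml's look-ahead sizing: each request is padded by 5% and rounded up to a 256-byte boundary. A standalone C sketch of that arithmetic, with a toy main() added here purely for illustration:

#include <stdio.h>
#include <stddef.h>

// Same arithmetic as ggml_cuda_pool_leg above: pad the request by 5%,
// then round up to the next multiple of 256 bytes.
static size_t look_ahead(size_t size) {
    size_t look_ahead_size = (size_t) (1.05 * size);
    return 256 * ((look_ahead_size + 255) / 256);
}

int main(void) {
    printf("%zu\n", look_ahead(1000)); // 1000 -> 1050 after padding -> 1280 after rounding
    return 0;
}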
llama/patches/0023-decode-disable-output_all.patch

...
@@ -8,7 +8,7 @@ Subject: [PATCH] decode: disable output_all
  1 file changed, 1 insertion(+), 2 deletions(-)
  diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index e7526e7d..53a5e3a9 100644
+index bd348bcad..8b4a89d38 100644
  --- a/src/llama-context.cpp
  +++ b/src/llama-context.cpp
  @@ -974,8 +974,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
...
llama/patches/0024-ggml-Enable-resetting-backend-devices.patch

...
@@ -16,7 +16,7 @@ unused then it can be reset to free these data structures.
  6 files changed, 32 insertions(+), 2 deletions(-)
  diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 1ff53ed03..ba181d09d 100644
+index b3b5b356a..69223c488 100644
  --- a/ggml/include/ggml-backend.h
  +++ b/ggml/include/ggml-backend.h
  @@ -178,6 +178,7 @@ extern "C" {
...
@@ -28,7 +28,7 @@ index 1ff53ed03..ba181d09d 100644
  GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
  GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
  diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
-index 3c3f22fc0..43c91d9f2 100644
+index 7bdf9d81f..21b35ac5c 100644
  --- a/ggml/src/ggml-backend-impl.h
  +++ b/ggml/src/ggml-backend-impl.h
  @@ -195,6 +195,10 @@ extern "C" {
...
@@ -43,7 +43,7 @@ index 3c3f22fc0..43c91d9f2 100644
  struct ggml_backend_device {
  diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 6ef5eeafa..0b757af59 100644
+index c81a2e48a..9b0a9b91f 100644
  --- a/ggml/src/ggml-backend.cpp
  +++ b/ggml/src/ggml-backend.cpp
  @@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
...
@@ -62,7 +62,7 @@ index 6ef5eeafa..0b757af59 100644
      GGML_ASSERT(device);
      return device->iface.get_buffer_type(device);
  diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 811462c79..87c6c34a4 100644
+index f79e5d65c..c9333689f 100644
  --- a/ggml/src/ggml-cuda/ggml-cuda.cu
  +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
  @@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
...
@@ -77,7 +77,7 @@ index 811462c79..87c6c34a4 100644
  static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
      ggml_cuda_set_device(device);
      cudaError_t err;
-@@ -3515,7 +3520,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -3499,7 +3504,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
      props->id = ggml_backend_cuda_device_get_id(dev);
      props->type = ggml_backend_cuda_device_get_type(dev);
      props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
...
@@ -89,7 +89,7 @@ index 811462c79..87c6c34a4 100644
      bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
  #ifdef GGML_CUDA_NO_PEER_COPY
-@@ -3948,6 +3956,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
+@@ -3936,6 +3944,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
      CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
  }
...
@@ -101,7 +101,7 @@ index 811462c79..87c6c34a4 100644
  static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
      /* .get_name = */ ggml_backend_cuda_device_get_name,
      /* .get_description = */ ggml_backend_cuda_device_get_description,
-@@ -3964,6 +3977,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
+@@ -3952,6 +3965,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
      /* .event_new = */ ggml_backend_cuda_device_event_new,
      /* .event_free = */ ggml_backend_cuda_device_event_free,
      /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
...
@@ -122,10 +122,10 @@ index 890c10364..1f06be80e 100644
  #define cudaError_t hipError_t
  #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
  diff --git a/src/llama.cpp b/src/llama.cpp
-index fe5a7a835..d821a96a0 100644
+index ab2e9868a..74c49e651 100644
  --- a/src/llama.cpp
  +++ b/src/llama.cpp
-@@ -267,10 +267,12 @@ static struct llama_model * llama_model_load_from_file_impl(
+@@ -270,10 +270,12 @@ static struct llama_model * llama_model_load_from_file_impl(
      for (auto * dev : model->devices) {
          ggml_backend_dev_props props;
          ggml_backend_dev_get_props(dev, &props);
...
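
The patch description above says a device that is no longer used "can be reset to free these data structures", but the hunk that declares the new entry point is collapsed. A minimal sketch of the intended call pattern, assuming the added function is named ggml_backend_dev_reset (the name is an assumption, not confirmed by the visible hunks):

#include "ggml-backend.h"

// Hypothetical usage of the reset API added by this patch; the function
// name is assumed, since the declaring hunk is collapsed above.
static void release_unused_device(ggml_backend_dev_t dev) {
    // once no backend instance references the device, resetting it
    // frees the driver-side data structures it holds
    ggml_backend_dev_reset(dev);
}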
llama/patches/0025-harden-uncaught-exception-registration.patch

...
@@ -8,7 +8,7 @@ Subject: [PATCH] harden uncaught exception registration
  1 file changed, 6 insertions(+), 2 deletions(-)
  diff --git a/ggml/src/ggml.cpp b/ggml/src/ggml.cpp
-index 0d388d45..f5bcb446 100644
+index 0d388d455..f5bcb446d 100644
  --- a/ggml/src/ggml.cpp
  +++ b/ggml/src/ggml.cpp
  @@ -19,8 +19,12 @@ static bool ggml_uncaught_exception_init = []{
...
llama/patches/0026-GPU-discovery-enhancements.patch

...
@@ -45,7 +45,7 @@ index 69223c488..6510e0cba 100644
  GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
  diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 0609c6503..aefe43bdd 100644
+index f9a6587f1..03f359ae9 100644
  --- a/ggml/src/CMakeLists.txt
  +++ b/ggml/src/CMakeLists.txt
  @@ -209,6 +209,8 @@ add_library(ggml-base
...
@@ -58,7 +58,7 @@ index 0609c6503..aefe43bdd 100644
  target_include_directories(ggml-base PRIVATE .)
  diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 5787e8cd5..d232bf828 100644
+index c9333689f..41b00af83 100644
  --- a/ggml/src/ggml-cuda/ggml-cuda.cu
  +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
  @@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
...
@@ -90,7 +90,7 @@ index 5787e8cd5..d232bf828 100644
  GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
      id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
      ggml_cuda_parse_uuid(prop, id).c_str());
-@@ -3476,6 +3491,11 @@ struct ggml_backend_cuda_device_context {
+@@ -3468,6 +3483,11 @@ struct ggml_backend_cuda_device_context {
      std::string description;
      std::string pci_bus_id;
      std::string id;
...
@@ -102,7 +102,7 @@ index 5787e8cd5..d232bf828 100644
  };
  static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -3496,6 +3516,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
+@@ -3488,6 +3508,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
  static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
      ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
      ggml_cuda_set_device(ctx->device);
...
@@ -131,7 +131,7 @@ index 5787e8cd5..d232bf828 100644
      CUDA_CHECK(cudaMemGetInfo(free, total));
  }
-@@ -3504,6 +3546,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
+@@ -3496,6 +3538,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
      return GGML_BACKEND_DEVICE_TYPE_GPU;
  }
...
@@ -139,7 +139,7 @@ index 5787e8cd5..d232bf828 100644
  static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
      ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-@@ -3517,6 +3560,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -3509,6 +3552,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
      // If you need the memory data, call ggml_backend_dev_memory() explicitly.
      props->memory_total = props->memory_free = 0;
...
@@ -159,7 +159,7 @@ index 5787e8cd5..d232bf828 100644
      bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
  #ifdef GGML_CUDA_NO_PEER_COPY
      bool events = false;
-@@ -4079,6 +4135,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4075,6 +4131,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
      std::lock_guard<std::mutex> lock(mutex);
      if (!initialized) {
          ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
...
@@ -167,7 +167,7 @@ index 5787e8cd5..d232bf828 100644
      for (int i = 0; i < ggml_cuda_info().device_count; i++) {
          ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
-@@ -4094,6 +4151,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4090,6 +4147,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
          snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
          dev_ctx->pci_bus_id = pci_bus_id;
...
@@ -204,11 +204,11 @@ index 1f06be80e..2f9ef2dc0 100644
  #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
  #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
  diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
-index d0fb3bcca..b63edd0c1 100644
+index e9201cdc6..44ae76d66 100644
  --- a/ggml/src/ggml-impl.h
  +++ b/ggml/src/ggml-impl.h
-@@ -638,6 +638,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
+@@ -677,6 +677,14 @@ static inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
-     return ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
+     return ggml_can_fuse_subgraph_ext(cgraph, idxs, count, ops, outputs, num_outputs);
  }
  +// Management libraries for fetching more accurate free VRAM data
...
@@ -243,10 +243,10 @@ index 05ff6a5a6..032dee76d 100644
      /* .async = */ true,
      /* .host_buffer = */ false,
  diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index bd3ece516..7cfb14a54 100644
+index 3a6bbe564..d2c278a35 100644
  --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
  +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -231,6 +231,7 @@ class vk_memory_logger;
+@@ -229,6 +229,7 @@ class vk_memory_logger;
  #endif
  class vk_perf_logger;
  static void ggml_vk_destroy_buffer(vk_buffer& buf);
...
@@ -254,7 +254,7 @@ index bd3ece516..7cfb14a54 100644
  static constexpr uint32_t mul_mat_vec_max_cols = 8;
  static constexpr uint32_t p021_max_gqa_ratio = 8;
-@@ -11585,6 +11586,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
+@@ -11813,6 +11814,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
      snprintf(description, description_size, "%s", props.deviceName.data());
  }
...
@@ -284,7 +284,7 @@ index bd3ece516..7cfb14a54 100644
  // backend interface
  #define UNUSED GGML_UNUSED
-@@ -12392,31 +12416,102 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
+@@ -12761,31 +12785,102 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
      ggml_vk_get_device_description(dev_idx, description, description_size);
  }
...
@@ -404,7 +404,7 @@ index bd3ece516..7cfb14a54 100644
          break;
      }
  }
-@@ -12449,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
+@@ -12818,8 +12913,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
      }
  }
...
@@ -419,7 +419,7 @@ index bd3ece516..7cfb14a54 100644
  }
  vk::PhysicalDeviceProperties2 props = {};
-@@ -12467,19 +12567,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
+@@ -12836,19 +12936,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
      char pci_bus_id[16] = {};
      snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
...
@@ -453,7 +453,7 @@ index bd3ece516..7cfb14a54 100644
  static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
      ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-@@ -12491,9 +12596,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
+@@ -12860,9 +12965,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
      return ctx->description.c_str();
  }
...
@@ -469,7 +469,7 @@ index bd3ece516..7cfb14a54 100644
  }
  static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
-@@ -12517,8 +12627,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
+@@ -12886,8 +12996,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
      props->name = ggml_backend_vk_device_get_name(dev);
      props->description = ggml_backend_vk_device_get_description(dev);
...
@@ -480,7 +480,7 @@ index bd3ece516..7cfb14a54 100644
      ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
      props->caps = {
          /* .async = */ false,
-@@ -12526,6 +12637,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
+@@ -12895,6 +13006,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
      /* .buffer_from_host_ptr = */ false,
      /* .events = */ false,
  };
...
@@ -494,7 +494,7 @@ index bd3ece516..7cfb14a54 100644
  }
  static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
-@@ -12954,6 +13072,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+@@ -13365,6 +13483,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
      static std::mutex mutex;
      std::lock_guard<std::mutex> lock(mutex);
      if (!initialized) {
...
@@ -503,7 +503,7 @@ index bd3ece516..7cfb14a54 100644
      for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
          ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
          char desc[256];
-@@ -12962,12 +13082,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+@@ -13373,12 +13493,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
      ctx->name = GGML_VK_NAME + std::to_string(i);
      ctx->description = desc;
      ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
...
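
One visible effect of this patch: ggml_backend_cuda_device_get_props() no longer populates memory figures (props->memory_total = props->memory_free = 0) and the added comment tells callers to query memory explicitly. A minimal sketch using the stock ggml-backend API:

#include <stdio.h>
#include "ggml-backend.h"

// After this patch, device props deliberately report zero memory;
// ggml_backend_dev_memory() is the explicit query the comment points to.
static void print_vram(ggml_backend_dev_t dev) {
    size_t free_mem, total_mem;
    ggml_backend_dev_memory(dev, &free_mem, &total_mem);
    printf("%s: %zu of %zu bytes free\n", ggml_backend_dev_name(dev), free_mem, total_mem);
}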
llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch
deleted 100644 → 0
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Julius Tischbein <ju.tischbein@gmail.com>
Date: Wed, 15 Oct 2025 13:54:15 +0200
Subject: [PATCH] CUDA: Changing the CUDA scheduling strategy to spin (#16585)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* CUDA set scheduling strategy to spinning for cc121
* Using prop.major and prop.minor, include HIP and MUSA
* Exclude HIP and MUSA
* Remove trailing whitespace
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* Remove empty line
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
ggml/src/ggml-cuda/ggml-cuda.cu | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index b075a18be..d62f412d6 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -340,6 +340,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
} else if (device_name.substr(0, 21) == "NVIDIA GeForce GTX 16") {
turing_devices_without_mma.push_back({ id, device_name });
}
+
+ // Temporary performance fix:
+ // Setting device scheduling strategy for iGPUs with cc121 to "spinning" to avoid delays in cuda synchronize calls.
+ // TODO: Check for future drivers the default scheduling strategy and
+ // remove this call again when cudaDeviceScheduleSpin is default.
+ if (prop.major == 12 && prop.minor == 1) {
+ CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
+ }
+
#endif // defined(GGML_USE_HIP)
}
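
This vendored patch is removed because the b6840 update carries the equivalent change upstream. For reference, a standalone C sketch of the same idea against the CUDA runtime API (illustrative, not ollama code):

#include <cuda_runtime.h>

// Same idea as the deleted patch: on compute-capability 12.1 iGPUs,
// select spin-wait scheduling to avoid delays in synchronize calls.
static void set_spin_for_cc121(int device) {
    struct cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, device) != cudaSuccess) {
        return;
    }
    if (prop.major == 12 && prop.minor == 1) {
        cudaSetDevice(device);                      // flags apply to the current device
        cudaSetDeviceFlags(cudaDeviceScheduleSpin);
    }
}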
llama/patches/0029-report-LoadLibrary-failures.patch → llama/patches/0028-report-LoadLibrary-failures.patch
...
@@ -8,10 +8,10 @@ Subject: [PATCH] report LoadLibrary failures
  1 file changed, 12 insertions(+)
  diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index f794d9cfa..3a855ab2e 100644
+index a55d9b280..ec6f7f1e9 100644
  --- a/ggml/src/ggml-backend-reg.cpp
  +++ b/ggml/src/ggml-backend-reg.cpp
-@@ -118,6 +118,18 @@ static dl_handle * dl_load_library(const fs::path & path) {
+@@ -122,6 +122,18 @@ static dl_handle * dl_load_library(const fs::path & path) {
      SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
      HMODULE handle = LoadLibraryW(path.wstring().c_str());
...
llama/patches/0031-interleave-multi-rope.patch → llama/patches/0029-interleave-multi-rope.patch
...
@@ -13,7 +13,7 @@ interleaved version used for qwen3vl
  4 files changed, 11 insertions(+), 30 deletions(-)
  diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
-index 31478dd8e..4d1ed207e 100644
+index 902fdad69..70955347d 100644
  --- a/ggml/src/ggml-cpu/ops.cpp
  +++ b/ggml/src/ggml-cpu/ops.cpp
  @@ -5509,15 +5509,12 @@ static void ggml_mrope_cache_init(
...
@@ -62,10 +62,10 @@ index d058504cd..287fe9d2c 100644
      const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
  diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index 375a0c7fd..9866c96b4 100644
+index 50b8071de..65a3183c8 100644
  --- a/ggml/src/ggml-metal/ggml-metal.metal
  +++ b/ggml/src/ggml-metal/ggml-metal.metal
-@@ -3858,15 +3858,11 @@ kernel void kernel_rope_multi(
+@@ -3888,15 +3888,11 @@ kernel void kernel_rope_multi(
      const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
      const int sector = ic % sect_dims;
...
llama/patches/0030-Add-memory-detection-using-DXGI-PDH.patch

...
@@ -12,7 +12,7 @@ Subject: [PATCH] Add memory detection using DXGI + PDH
  create mode 100644 ggml/src/mem_dxgi_pdh.cpp
  diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index aefe43bdd..21fe4640c 100644
+index 03f359ae9..4b3e5efb5 100644
  --- a/ggml/src/CMakeLists.txt
  +++ b/ggml/src/CMakeLists.txt
  @@ -211,6 +211,7 @@ add_library(ggml-base
...
@@ -24,10 +24,10 @@ index aefe43bdd..21fe4640c 100644
  target_include_directories(ggml-base PRIVATE .)
  diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
-index b63edd0c1..81cad8cf3 100644
+index 44ae76d66..639d551a2 100644
  --- a/ggml/src/ggml-impl.h
  +++ b/ggml/src/ggml-impl.h
-@@ -645,6 +645,9 @@ GGML_API void ggml_nvml_release();
+@@ -684,6 +684,9 @@ GGML_API void ggml_nvml_release();
  GGML_API int ggml_hip_mgmt_init();
  GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
  GGML_API void ggml_hip_mgmt_release();
...
@@ -38,7 +38,7 @@ index b63edd0c1..81cad8cf3 100644
  #ifdef __cplusplus
  }
  diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 7cfb14a54..a1c46d0b3 100644
+index d2c278a35..221e29509 100644
  --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
  +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
  @@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
...
@@ -49,7 +49,7 @@ index 7cfb14a54..a1c46d0b3 100644
  typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
      VkStructureType sType;
-@@ -12433,6 +12434,7 @@ struct ggml_backend_vk_device_context {
+@@ -12802,6 +12803,7 @@ struct ggml_backend_vk_device_context {
      std::string pci_id;
      std::string id;
      std::string uuid;
...
@@ -57,7 +57,7 @@ index 7cfb14a54..a1c46d0b3 100644
      int major;
      int minor;
      int driver_major;
-@@ -12448,8 +12450,22 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
+@@ -12817,8 +12819,22 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
      vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
      vk::PhysicalDeviceProperties2 props2;
      vkdev.getProperties2(&props2);
...
@@ -81,7 +81,7 @@ index 7cfb14a54..a1c46d0b3 100644
  {
      // Use vendor specific management libraries for best VRAM reporting if available
      switch (props2.properties.vendorID) {
-@@ -12477,8 +12493,8 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
+@@ -12846,8 +12862,8 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
          break;
      }
  }
...
@@ -91,7 +91,7 @@ index 7cfb14a54..a1c46d0b3 100644
      *total = 0;
      *free = 0;
      vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
-@@ -13089,7 +13105,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+@@ -13500,7 +13516,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
      /* .reg = */ reg,
      /* .context = */ ctx,
  });
...
@@ -99,7 +99,7 @@ index 7cfb14a54..a1c46d0b3 100644
      // Gather additional information about the device
      int dev_idx = vk_instance.device_indices[i];
      vk::PhysicalDeviceProperties props1;
-@@ -13112,6 +13127,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+@@ -13523,6 +13538,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
      }
  }
  ctx->uuid = oss.str();
...
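
The ggml-impl.h context above declares the vendor-management helpers this patch builds on (ggml_hip_mgmt_init / ggml_hip_get_device_memory / ggml_hip_mgmt_release, with a DXGI+PDH analogue added for Windows). A sketch of the init/query/release flow using the declared signatures; the 0-on-success convention is an assumption:

#include <stddef.h>
#include "ggml-impl.h"

// Init the management library once, query free/total VRAM for a device
// id, then release. Return-code meanings are assumed, not shown above.
static int query_hip_vram(const char *id, size_t *free_mem, size_t *total_mem) {
    if (ggml_hip_mgmt_init() != 0) {
        return -1; // library unavailable; caller falls back to runtime numbers
    }
    int status = ggml_hip_get_device_memory(id, free_mem, total_mem);
    ggml_hip_mgmt_release();
    return status;
}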
ml/backend/ggml/ggml/include/ggml-hexagon.h
new file 0 → 100644

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

// backend API
GGML_BACKEND_API ggml_backend_t     ggml_backend_hexagon_init(void);
GGML_BACKEND_API bool               ggml_backend_is_hexagon(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);

#ifdef __cplusplus
}
#endif
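
A minimal sketch of how the new header is consumed: initialize the Hexagon backend, check its type, and free it with the standard ggml-backend destructor. That init returns NULL on non-Hexagon hosts is an assumption:

#include <stdio.h>
#include "ggml-hexagon.h"

int main(void) {
    ggml_backend_t backend = ggml_backend_hexagon_init();
    if (backend == NULL) {
        fprintf(stderr, "Hexagon backend unavailable\n"); // assumed failure mode
        return 1;
    }
    GGML_ASSERT(ggml_backend_is_hexagon(backend));
    ggml_backend_free(backend); // standard ggml-backend teardown
    return 0;
}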
ml/backend/ggml/ggml/include/ggml-rpc.h

...
@@ -21,8 +21,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);
 GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
                                                     size_t n_threads, size_t n_devices,
-                                                    ggml_backend_dev_t * devices);
+                                                    ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
...
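
The signature change above makes the RPC server take the free/total memory it should report for its devices instead of discovering it itself. A sketch of the updated call, assuming the two new pointers parallel the devices array (only a single device is passed here, so either reading holds):

#include <stddef.h>
#include "ggml-rpc.h"

static void serve_one_device(ggml_backend_dev_t dev) {
    size_t free_mem  = (size_t) 8  * 1024 * 1024 * 1024; // report 8 GiB free (illustrative)
    size_t total_mem = (size_t) 16 * 1024 * 1024 * 1024; // report 16 GiB total (illustrative)
    ggml_backend_rpc_start_server("0.0.0.0:50052", NULL /* cache_dir */,
                                  1 /* n_threads */, 1 /* n_devices */,
                                  &dev, &free_mem, &total_mem);
}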
ml/backend/ggml/ggml/include/ggml.h

...
@@ -577,6 +577,10 @@ extern "C" {
         GGML_UNARY_OP_EXP,
         GGML_UNARY_OP_GELU_ERF,
         GGML_UNARY_OP_XIELU,
+        GGML_UNARY_OP_FLOOR,
+        GGML_UNARY_OP_CEIL,
+        GGML_UNARY_OP_ROUND,
+        GGML_UNARY_OP_TRUNC,
         GGML_UNARY_OP_COUNT,
     };
...
@@ -1151,6 +1155,46 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_floor(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_floor_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    /**
+     * Truncates the fractional part of each element in the tensor (towards zero).
+     * For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
+     * Similar to std::trunc in C/C++.
+     */
+    GGML_API struct ggml_tensor * ggml_trunc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_trunc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // xIELU activation function
     // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
     // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
...
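
A short sketch exercising the new element-wise rounding ops on the CPU backend: build a graph around ggml_trunc and compute it. ggml_graph_compute_with_ctx is assumed to be available via ggml-cpu.h, as in recent ggml trees:

#include <stdio.h>
#include "ggml.h"
#include "ggml-cpu.h"

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    float * in = (float *) a->data;
    in[0] = 3.7f; in[1] = -2.9f; in[2] = 0.5f; in[3] = -1.5f;

    struct ggml_tensor * t = ggml_trunc(ctx, a); // per the header comment: towards zero

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, t);
    ggml_graph_compute_with_ctx(ctx, gf, /* n_threads */ 1);

    for (int i = 0; i < 4; i++) {
        printf("trunc(%g) = %g\n", in[i], ((float *) t->data)[i]); // 3, -2, 0, -1
    }

    ggml_free(ctx);
    return 0;
}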
ml/backend/ggml/ggml/src/CMakeLists.txt

...
@@ -310,6 +310,10 @@ function(ggml_add_cpu_backend_variant tag_name)
         foreach (feat ${ARGN})
             set(GGML_INTERNAL_${feat} ON)
         endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
     endif()

     ggml_add_cpu_backend_variant_impl(${tag_name})
...
@@ -372,6 +376,14 @@ if (GGML_CPU_ALL_VARIANTS)
         else()
             message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
         endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            ggml_add_cpu_backend_variant(s390x_z15 Z15 VXE)
+            # ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
+            # ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
+        else()
+            message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
     else()
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
     endif()
...
@@ -391,6 +403,7 @@ ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
+ggml_add_backend(Hexagon)

 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
...
ml/backend/ggml/ggml/src/ggml-alloc.c

...
@@ -603,6 +603,26 @@ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor
     return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
 }

+// free the extra space at the end if the new tensor is smaller
+static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_tensor * node, struct ggml_tensor * parent) {
+    struct hash_node * hn   = ggml_gallocr_hash_get(galloc, node);
+    struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+
+    size_t parent_size = ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
+    size_t node_size   = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+
+    GGML_ASSERT(parent_size >= node_size);
+
+    if (parent_size > node_size) {
+        struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+        struct buffer_address p_addr = p_hn->addr;
+        p_addr.offset += node_size;
+        size_t extra_size = parent_size - node_size;
+        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
+        ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
+    }
+}
+
 static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
     GGML_ASSERT(buffer_id >= 0);
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
...
@@ -648,6 +668,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                 hn->addr = p_hn->addr;
                 p_hn->allocated = false; // avoid freeing the parent
                 view_src_hn->allocated = false;
+                ggml_gallocr_free_extra_space(galloc, node, view_src);
                 return;
             }
         } else {
...
@@ -655,6 +676,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
             hn->buffer_id = p_hn->buffer_id;
             hn->addr = p_hn->addr;
             p_hn->allocated = false; // avoid freeing the parent
+            ggml_gallocr_free_extra_space(galloc, node, parent);
             return;
         }
     }
...
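
The new ggml_gallocr_free_extra_space above hands the unused tail of a reused parent allocation back to the dynamic allocator. A toy C illustration of the offset/size arithmetic, with made-up numbers:

#include <stdio.h>
#include <stddef.h>

int main(void) {
    size_t parent_offset = 8192; // illustrative offset of the parent in its buffer
    size_t parent_size   = 4096; // bytes the parent allocation occupies
    size_t node_size     = 1024; // bytes the in-place node actually needs

    size_t free_offset = parent_offset + node_size; // tail begins right after the node
    size_t extra_size  = parent_size - node_size;   // 3072 bytes returned to the allocator

    printf("free %zu bytes at offset %zu\n", extra_size, free_offset);
    return 0;
}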
ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
View file @ 544b6739
@@ -57,6 +57,10 @@
 #include "ggml-opencl.h"
 #endif
 
+#ifdef GGML_USE_HEXAGON
+#include "ggml-hexagon.h"
+#endif
+
 #ifdef GGML_USE_BLAS
 #include "ggml-blas.h"
 #endif
@@ -211,6 +215,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_OPENCL
         register_backend(ggml_backend_opencl_reg());
 #endif
+#ifdef GGML_USE_HEXAGON
+        register_backend(ggml_backend_hexagon_reg());
+#endif
 #ifdef GGML_USE_CANN
         register_backend(ggml_backend_cann_reg());
 #endif
@@ -615,6 +622,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("sycl", silent, dir_path);
     ggml_backend_load_best("vulkan", silent, dir_path);
     ggml_backend_load_best("opencl", silent, dir_path);
+    ggml_backend_load_best("hexagon", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);
     ggml_backend_load_best("cpu", silent, dir_path);
 
     // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
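Note: once registered, the Hexagon backend participates in ordinary device enumeration. A minimal sketch using the public registry API from ggml-backend.h, assuming a build with GGML_USE_HEXAGON defined or a loadable "hexagon" backend on the search path:

    /* Sketch: list registered backend devices; a hexagon device should appear
     * when the backend above is compiled in or found by ggml_backend_load_all(). */
    #include <stdio.h>
    #include "ggml-backend.h"

    int main(void) {
        ggml_backend_load_all(); // probe dynamic backends, including "hexagon"
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            printf("device %zu: %s (%s)\n", i,
                   ggml_backend_dev_name(dev),
                   ggml_backend_dev_description(dev));
        }
        return 0;
    }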
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
View file @ 544b6739
@@ -466,29 +466,45 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
     elseif(GGML_SYSTEM_ARCH STREQUAL "s390x")
         message(STATUS "s390x detected")
         list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
-        file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
-        string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
 
-        # TODO: Separation to determine activation of VX/VXE/VXE2
-        if(${S390X_M} MATCHES "8561|8562")
-            message(STATUS "z15 target")
-            list(APPEND ARCH_FLAGS -march=z15)
-        elseif(${S390X_M} MATCHES "3931")
-            message(STATUS "z16 target")
-            list(APPEND ARCH_FLAGS -march=z16)
-        elseif(${S390X_M} MATCHES "9175|9176")
-            # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
-            # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
-            message(STATUS "z17 target")
-            list(APPEND ARCH_FLAGS -march=arch15)
-        else()
-            message(STATUS "Unknown target")
-            message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
-            list(APPEND ARCH_FLAGS -march=native -mtune=native)
+        # for native compilation
+        if(GGML_NATIVE)
+            # check machine level to determine target
+            file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
+            string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
+
+            # TODO: Separation to determine activation of VX/VXE/VXE2
+            if(${S390X_M} MATCHES "8561|8562")
+                message(STATUS "z15 target")
+                list(APPEND ARCH_FLAGS -march=z15)
+            elseif(${S390X_M} MATCHES "3931")
+                message(STATUS "z16 target")
+                list(APPEND ARCH_FLAGS -march=z16)
+            elseif(${S390X_M} MATCHES "9175|9176")
+                # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+                # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
+                message(STATUS "z17 target")
+                list(APPEND ARCH_FLAGS -march=arch15)
+            else()
+                message(STATUS "Unknown target")
+                message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
+                list(APPEND ARCH_FLAGS -march=native -mtune=native)
+            endif()
+        # for cross-compilation
+        elseif(GGML_CPU_ALL_VARIANTS)
+            # range through IBM z15 to z17
+            # NOTE: update when a new hardware level is released
+            foreach(ZHW RANGE 15 17)
+                if(DEFINED GGML_INTERNAL_Z${ZHW})
+                    message(STATUS "z${ZHW} cross-compile target")
+                    list(APPEND ARCH_FLAGS -march=z${ZHW})
+                endif()
+            endforeach()
         endif()
 
-        if(GGML_VXE)
+        if(GGML_VXE OR GGML_INTERNAL_VXE)
             message(STATUS "VX/VXE/VXE2 enabled")
             list(APPEND ARCH_FLAGS -mvx -mzvector)
             list(APPEND ARCH_DEFINITIONS GGML_VXE)
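Note: when VX/VXE/VXE2 is enabled, GGML_VXE lands in ARCH_DEFINITIONS, so C sources can gate s390x vector paths at compile time. A minimal sketch of that gating (the messages are ours, not from the build system):

    /* Sketch: GGML_VXE is a compile definition set by the CMake logic above. */
    #include <stdio.h>

    int main(void) {
    #if defined(GGML_VXE)
        printf("s390x build with VX/VXE/VXE2 vector support\n");
    #else
        printf("scalar s390x build (e.g. -DGGML_VXE=OFF for z14 and earlier)\n");
    #endif
        return 0;
    }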
ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
View file @ 544b6739
@@ -2186,6 +2186,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_FLOOR:
+                case GGML_UNARY_OP_CEIL:
+                case GGML_UNARY_OP_ROUND:
+                case GGML_UNARY_OP_TRUNC:
                     {
                         n_tasks = 1;
                     } break;
@@ -3569,13 +3573,17 @@ void ggml_cpu_init(void) {
 #ifdef GGML_USE_OPENMP
         //if (!getenv("OMP_WAIT_POLICY")) {
         //    // set the wait policy to active, so that OpenMP threads don't sleep
-        //    putenv("OMP_WAIT_POLICY=active");
+        //    setenv("OMP_WAIT_POLICY", "active", 0)
         //}
 
         if (!getenv("KMP_BLOCKTIME")) {
             // set the time to wait before sleeping a thread
             // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
-            putenv("KMP_BLOCKTIME=200"); // 200ms
+#ifdef _WIN32
+            _putenv_s("KMP_BLOCKTIME", "200"); // 200ms
+#else
+            setenv("KMP_BLOCKTIME", "200", 0); // 200ms
+#endif
         }
 #endif
 }
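Note: the switch from putenv() to setenv()/_putenv_s() is the portable "default only" pattern: setenv()'s third argument (overwrite = 0) leaves a user-supplied value untouched, and the Windows CRT has no setenv(). A minimal sketch of the pattern; set_default_env() is a hypothetical wrapper, not a ggml function:

    /* Sketch: set an environment variable only if the user has not set it. */
    #include <stdlib.h>

    static void set_default_env(const char * name, const char * value) {
    #ifdef _WIN32
        if (!getenv(name)) {
            _putenv_s(name, value);        // _putenv_s always overwrites, so check first
        }
    #else
        setenv(name, value, /*overwrite=*/0); // 0: keep any existing value
    #endif
    }

    int main(void) {
        set_default_env("KMP_BLOCKTIME", "200"); // same default as ggml_cpu_init()
        return 0;
    }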
...
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
View file @
544b6739
...
@@ -9033,6 +9033,22 @@ void ggml_compute_forward_unary(
...
@@ -9033,6 +9033,22 @@ void ggml_compute_forward_unary(
{
{
ggml_compute_forward_exp
(
params
,
dst
);
ggml_compute_forward_exp
(
params
,
dst
);
}
break
;
}
break
;
case
GGML_UNARY_OP_FLOOR
:
{
ggml_compute_forward_floor
(
params
,
dst
);
}
break
;
case
GGML_UNARY_OP_CEIL
:
{
ggml_compute_forward_ceil
(
params
,
dst
);
}
break
;
case
GGML_UNARY_OP_ROUND
:
{
ggml_compute_forward_round
(
params
,
dst
);
}
break
;
case
GGML_UNARY_OP_TRUNC
:
{
ggml_compute_forward_trunc
(
params
,
dst
);
}
break
;
case
GGML_UNARY_OP_XIELU
:
case
GGML_UNARY_OP_XIELU
:
{
{
ggml_compute_forward_xielu
(
params
,
dst
);
ggml_compute_forward_xielu
(
params
,
dst
);
...
...
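Note: with these cases wired into the dispatcher, the new ops are reachable through the regular graph API. A minimal sketch of computing floor() on the CPU backend; it assumes this update also added a ggml_floor() builder to the public ggml.h alongside the new enum values:

    /* Sketch: build and run a one-op graph applying floor() elementwise.
     * ggml_floor() is assumed to exist in the updated public API. */
    #include <stdio.h>
    #include <string.h>
    #include "ggml.h"
    #include "ggml-cpu.h"

    int main(void) {
        struct ggml_init_params ip = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        float in[4] = { -1.5f, -0.5f, 0.5f, 1.5f };
        memcpy(a->data, in, sizeof(in));

        struct ggml_tensor * out = ggml_floor(ctx, a);
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, out);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

        for (int i = 0; i < 4; i++) {
            printf("floor(%+.1f) = %+.1f\n", in[i], ((float *) out->data)[i]);
        }
        ggml_free(ctx);
        return 0;
    }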
ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.cpp
View file @ 544b6739
@@ -73,6 +73,22 @@ static inline float op_log(float x) {
     return logf(x);
 }
 
+static inline float op_floor(float x) {
+    return floorf(x);
+}
+
+static inline float op_ceil(float x) {
+    return ceilf(x);
+}
+
+static inline float op_round(float x) {
+    return roundf(x);
+}
+
+static inline float op_trunc(float x) {
+    return truncf(x);
+}
+
 template <float (*op)(float), typename src0_t, typename dst_t>
 static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
     constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
@@ -274,6 +290,22 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor *
     unary_op<op_log>(params, dst);
 }
 
+void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_floor>(params, dst);
+}
+
+void ggml_compute_forward_ceil(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_ceil>(params, dst);
+}
+
+void ggml_compute_forward_round(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_round>(params, dst);
+}
+
+void ggml_compute_forward_trunc(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_trunc>(params, dst);
+}
+
 void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
     const float alpha_n = ggml_get_op_params_f32(dst, 1);
     const float alpha_p = ggml_get_op_params_f32(dst, 2);
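Note: the four wrappers inherit C99 rounding semantics and differ only in how the fractional part is resolved, which matters for negative inputs. A minimal sketch:

    /* Sketch: C99 semantics the new ops inherit, shown on a negative half. */
    #include <math.h>
    #include <stdio.h>

    int main(void) {
        float x = -1.5f;
        printf("floorf(%.1f) = %.1f\n", x, floorf(x)); // -2.0 (toward -inf)
        printf("ceilf (%.1f) = %.1f\n", x, ceilf(x));  // -1.0 (toward +inf)
        printf("truncf(%.1f) = %.1f\n", x, truncf(x)); // -1.0 (toward zero)
        printf("roundf(%.1f) = %.1f\n", x, roundf(x)); // -2.0 (half away from zero)
        return 0;
    }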
ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.h
View file @ 544b6739
@@ -22,6 +22,10 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_trunc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus