Unverified Commit c68f367e authored by Daniel Hiltgen, committed by GitHub

Update GGML to b6646 (#12245)

Notable EOLs with this change:
- MacOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
parent fdb10946
...
@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index db1f0b23..f4de7e34 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -308,7 +308,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
...
...
@@ -11,10 +11,10 @@ with the fastest acceleration is loaded
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 136afec7..f794d9cf 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -175,7 +175,7 @@ struct ggml_backend_reg_entry {
struct ggml_backend_registry {
std::vector<ggml_backend_reg_entry> backends;
...
@@ -23,7 +23,7 @@ index 6c315137..3040b2aa 100644
ggml_backend_registry() {
#ifdef GGML_USE_CUDA
@@ -223,7 +223,7 @@ struct ggml_backend_registry {
}
}
...
@@ -32,7 +32,7 @@ index 6c315137..3040b2aa 100644
if (!reg) {
return;
}
@@ -234,15 +234,20 @@ struct ggml_backend_registry {
#endif
backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
...
@@ -56,7 +56,7 @@ index 6c315137..3040b2aa 100644
}
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
@@ -286,7 +291,7 @@ struct ggml_backend_registry {
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
...
@@ -65,7 +65,7 @@ index 6c315137..3040b2aa 100644
return reg;
}
@@ -309,7 +314,7 @@ struct ggml_backend_registry {
// remove devices
devices.erase(
std::remove_if(devices.begin(), devices.end(),
...
@@ -74,7 +74,7 @@ index 6c315137..3040b2aa 100644
devices.end());
// remove backend
@@ -367,7 +372,7 @@ size_t ggml_backend_dev_count() {
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count());
...
...
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index c8f3d859..ff6229a0 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -307,6 +307,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endif()
ggml_add_cpu_backend_variant_impl(${tag_name})
...
@@ -19,7 +19,7 @@ index 177fb282..f5a5079a 100644
endfunction()
ggml_add_backend(CPU)
@@ -317,6 +318,7 @@ if (GGML_CPU_ALL_VARIANTS)
elseif (GGML_CPU_ARM_ARCH)
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
endif()
...
...
@@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ff6229a0..33b3a15f 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -327,10 +327,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
...
...
@@ -25,7 +25,7 @@ index 79ee2020..3efb22f0 100644
// get ith C string from array with given key_id
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index 8cc4ef1c..d950dbdf 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
...
@@ -53,10 +53,10 @@ index 53504399..0f71d5f3 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 26fa9fad..64c78a16 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1767,9 +1767,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
...
@@ -66,4 +66,4 @@ index c011008f..fa388b03 100644
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
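Aside: the vocab hunk above reads the charsmap blob through the new gguf_get_arr_data_n helper instead of an element-count accessor. A minimal caller-side sketch of the same pattern, assuming only gguf_get_arr_data_n (added by this patch) plus the stock gguf_find_key/gguf_get_arr_data API; the literal key string is the standard tokenizer key and is shown here for illustration:

    // Sketch only: read a raw byte-array KV the way the patched loader does.
    #include <vector>

    static std::vector<char> read_charsmap(const struct gguf_context * ctx) {
        const int64_t keyidx = gguf_find_key(ctx, "tokenizer.ggml.precompiled_charsmap");
        if (keyidx == -1) {
            return {};
        }
        const size_t n  = gguf_get_arr_data_n(ctx, keyidx);              // element count
        const char * pc = (const char *) gguf_get_arr_data(ctx, keyidx); // raw bytes
        return std::vector<char>(pc, pc + n);
    }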
...
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index dbc07301..f8574d01 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
...
@@ -20,7 +20,7 @@ index d89cd8f4..a5689c18 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2881,6 +2883,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);
...
...
@@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 2186f827..8fb86009 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
}
...
@@ -196,7 +196,7 @@ index bfbf5fa2..11f93f42 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
@@ -1645,7 +1645,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
/* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,
...
...
@@ -4,17 +4,18 @@ Date: Thu, 1 May 2025 13:45:12 -0700
Subject: [PATCH] add argsort and cuda copy for i32
---
ggml/src/ggml-cpu/ops.cpp | 43 +++++++++++
ggml/src/ggml-cuda/argsort.cu | 102 ++++++++++++++++++++++++++-
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 64 +++++++++++++++++
5 files changed, 256 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 14f7dcf4..f7f8da35 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -7893,6 +7893,45 @@ static void ggml_compute_forward_argsort_f32(
}
}
...
@@ -60,7 +61,7 @@ index 854f1c2b..a2924757 100644
void ggml_compute_forward_argsort(
const ggml_compute_params * params,
ggml_tensor * dst) {
@@ -7904,6 +7943,10 @@ void ggml_compute_forward_argsort(
{
ggml_compute_forward_argsort_f32(params, dst);
} break;
...
@@ -196,12 +197,12 @@ index 607ded85..53b02634 100644
+ }
}
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index e621cb98..597c0c8b 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -215,3 +215,9 @@ template<typename src_t, typename dst_t>
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
}
+
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
...
@@ -210,10 +211,10 @@ index 410c12b7..b8e9e107 100644
+ *dst = *src;
+}
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 746f4396..911220e9 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -277,6 +277,47 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
}
...
@@ -261,7 +262,7 @@ index f9bb0256..9c3774e5 100644
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
@@ -372,6 +413,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
...
@@ -270,3 +271,80 @@ index f9bb0256..9c3774e5 100644
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 96df6f0c..44dc31c0 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4428,8 +4428,72 @@ kernel void kernel_argsort_f32_i32(
}
}
+typedef void (i32_argsort_t)(
+ constant ggml_metal_kargs_argsort & args,
+ device const int32_t * x,
+ device int32_t * dst,
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]]);
+
+template<ggml_sort_order order>
+kernel void kernel_argsort_i32_i32(
+ constant ggml_metal_kargs_argsort & args,
+ device const int32_t * x,
+ device int32_t * dst,
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]]) {
+ // bitonic sort
+ int col = tpitg[0];
+ int row = tgpig[1];
+
+ if (col >= args.ncols_pad) return;
+
+ device const int32_t * x_row = x + row * args.ncols;
+ threadgroup int32_t * dst_row = shared_values;
+
+ // initialize indices
+ dst_row[col] = col;
+
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+
+ for (int k = 2; k <= args.ncols_pad; k *= 2) {
+ for (int j = k / 2; j > 0; j /= 2) {
+ int ixj = col ^ j;
+ if (ixj > col) {
+ if ((col & k) == 0) {
+ if (dst_row[col] >= args.ncols ||
+ (dst_row[ixj] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj]);
+ }
+ } else {
+ if (dst_row[ixj] >= args.ncols ||
+ (dst_row[col] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj]);
+ }
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ }
+ }
+
+ // copy the result to dst without the padding
+ if (col < args.ncols) {
+ dst[row * args.ncols + col] = dst_row[col];
+ }
+}
+
template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_ASC>;
template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_DESC>;
+template [[host_name("kernel_argsort_i32_i32_asc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_ASC>;
+template [[host_name("kernel_argsort_i32_i32_desc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_DESC>;
kernel void kernel_leaky_relu_f32(
constant ggml_metal_kargs_leaky_relu & args,
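The Metal kernel above is a textbook bitonic argsort: the column count is padded to a power of two, indices are sorted in threadgroup memory, and any index >= ncols compares as +infinity so the padding sinks to the end. For readers unfamiliar with the pattern, here is a sequential C++ sketch of the same compare-and-swap network (illustrative only; the function name and helpers are hypothetical, and it assumes the same padding convention as the kernel):

    // Sequential sketch of the bitonic argsort the kernel parallelizes
    // (one thread per column in the kernel; a plain loop here).
    #include <cstdint>
    #include <utility>
    #include <vector>

    std::vector<int32_t> argsort_bitonic_asc(const std::vector<int32_t> & x) {
        const int ncols = (int) x.size();
        int ncols_pad = 1;
        while (ncols_pad < ncols) ncols_pad *= 2;   // pad to a power of two

        std::vector<int32_t> idx(ncols_pad);
        for (int i = 0; i < ncols_pad; i++) idx[i] = i;

        // entry a is "greater" than entry b if it must move toward the end;
        // padding entries (idx >= ncols) behave like +infinity
        auto greater = [&](int a, int b) {
            if (idx[a] >= ncols) return true;
            if (idx[b] >= ncols) return false;
            return x[idx[a]] > x[idx[b]];
        };

        for (int k = 2; k <= ncols_pad; k *= 2) {       // merge stage size
            for (int j = k / 2; j > 0; j /= 2) {        // compare distance
                for (int col = 0; col < ncols_pad; col++) {
                    const int ixj = col ^ j;
                    if (ixj <= col) continue;           // visit each pair once
                    const bool asc = (col & k) == 0;    // block direction
                    if (asc ? greater(col, ixj) : greater(ixj, col)) {
                        std::swap(idx[col], idx[ixj]);
                    }
                }
            }
        }
        idx.resize(ncols);                              // drop the padding
        return idx;
    }

Because the compare-exchange network is data-independent and each (k, j) stage touches disjoint pairs, the kernel can run every column comparison in parallel with a threadgroup barrier between stages, which is exactly what the barrier calls above do.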
...
@@ -6,12 +6,12 @@ Subject: [PATCH] graph memory reporting on failure
---
ggml/include/ggml-alloc.h | 1 +
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-alloc.c | 34 +++++++++++++++++++++++++++++++---
ggml/src/ggml-backend.cpp | 7 +++++++
4 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 2cb150fd..7ab3f019 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
...
@@ -23,31 +23,31 @@ index 2cb150fd2..7ab3f0192 100644
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 62b6d65e..fe20dca3 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -316,6 +316,7 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index fa46f3b4..421ff7c7 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -492,6 +492,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
struct vbuffer ** buffers; // [n_buffers]
+ size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@@ -515,6 +516,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
GGML_ASSERT(galloc->buffers != NULL);
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
...
@@ -56,7 +56,7 @@ index 8b6e60283..b58bd671d 100644
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);
@@ -582,6 +586,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
...
@@ -64,7 +64,7 @@ index 8b6e60283..b58bd671d 100644
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
@@ -875,6 +880,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
...
@@ -73,23 +73,21 @@ index 8b6e60283..b58bd671d 100644
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
@@ -896,14 +903,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
- if (galloc->buffers[i] == NULL) {
+ if (galloc->buffers[i]) {
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
+ } else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
- return false;
+ galloc->buffer_sizes[i] = new_size;
+ success = false;
}
+ } else {
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
}
}
...
@@ -98,8 +96,8 @@ index 8b6e60283..b58bd671d 100644
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -1058,6 +1070,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
}
+size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
...
@@ -122,10 +120,10 @@ index 8b6e60283..b58bd671d 100644
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 8ba86f82..cb2b9956 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1809,6 +1809,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}
...
@@ -137,5 +135,5 @@ index 97f47abd2..d02a40e60 100644
+}
+
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
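The point of this patch is that a failed reservation now records how much memory was attempted instead of just returning false. A hedged sketch of how a caller might use the new accessor (an assumed calling pattern, not code from this commit; everything except ggml_backend_sched_get_attempted_buffer_size is stock ggml-backend API):

    // Sketch: after a failed reserve, report what each backend tried to allocate.
    #include <cstdio>

    static void report_reserve_failure(ggml_backend_sched_t sched,
                                       ggml_backend_t * backends, int n_backends,
                                       struct ggml_cgraph * graph) {
        if (!ggml_backend_sched_reserve(sched, graph)) {
            for (int i = 0; i < n_backends; i++) {
                const size_t attempted =
                    ggml_backend_sched_get_attempted_buffer_size(sched, backends[i]);
                fprintf(stderr, "%s: attempted %zu bytes for compute buffers\n",
                        ggml_backend_name(backends[i]), attempted);
            }
        }
    }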
...
@@ -6,28 +6,28 @@ Subject: [PATCH] ggml: Export GPU UUIDs
This enables matching up devices and information reported by the backend
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
ggml/src/ggml-metal/ggml-metal.cpp | 1 +
3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index fe20dca3..48777212 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,7 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
const char * description;
// device free memory in bytes
size_t memory_free;
+ const char * id;
// device total memory in bytes
size_t memory_total;
// device type
enum ggml_backend_dev_type type;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index fdf8c63d..ad389ece 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
}
#endif // defined(GGML_USE_HIP)
...
@@ -77,9 +77,9 @@ index 37ee2a6d..57eae461 100644
+}
+
static ggml_cuda_device_info ggml_cuda_init() {
ggml_cuda_device_info info = {};
@@ -249,22 +294,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
...
@@ -107,18 +107,18 @@ index 37ee2a6d..57eae461 100644
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ ggml_cuda_parse_uuid(prop, id).c_str());
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@@ -3273,6 +3320,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
std::string pci_bus_id;
+ std::string id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3285,6 +3333,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
...
@@ -130,31 +130,31 @@ index 37ee2a6d..57eae461 100644
+ return ctx->id.c_str();
+}
+
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -3301,6 +3354,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
+ props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3871,6 +3925,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
+ dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 909e17de..08ab4fc9 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
+ props->id = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (struct ggml_backend_dev_caps) {
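With the id field in place, callers can line ggml devices up against what nvidia-smi or NVML reports (a UUID string on CUDA, "0" on Metal per the hunks above). A minimal enumeration sketch using the public device API, for illustration only:

    // Sketch: print each device's new id alongside its memory counters.
    #include <cstdio>

    static void list_devices(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            struct ggml_backend_dev_props props;
            ggml_backend_dev_get_props(dev, &props);
            printf("%s: id=%s free=%zu total=%zu\n",
                   props.name, props.id ? props.id : "n/a",
                   props.memory_free, props.memory_total);
        }
    }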
...
@@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2 files changed, 13 insertions(+)
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index cd022c5e..3d680945 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -79,6 +79,16 @@ enum mtmd_slice_tmpl {
...
...
@@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index f8574d01..530efce0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2431,7 +2431,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
// all our threads onto the first 4 cores which results in terrible performance with
// n_threads > 4
...
...
@@ -5,23 +5,24 @@ Subject: [PATCH] BF16 macos version guard
Only enable BF16 on supported MacOS versions (v14+)
---
ggml/src/ggml-metal/ggml-metal-context.m | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
index 052efb7a..b47dc787 100644
--- a/ggml/src/ggml-metal/ggml-metal-context.m
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
@@ -125,7 +125,12 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
- res->use_bfloat = props_dev->has_bfloat;
+ if (@available(macOS 14.0, *)) {
+ res->use_bfloat = props_dev->has_bfloat;
+ } else {
+ res->use_bfloat = false;
+ }
+
res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
...
@@ -13,10 +13,10 @@ checks.
1 file changed, 18 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ad389ece..e51c5035 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2686,14 +2686,26 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
...
@@ -36,12 +36,14 @@ index 57eae461..c7f9dc3a 100644
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
+
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@@ -2717,6 +2729,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
if (node->op == GGML_OP_ADD &&
node->src[1] && node->src[1]->ne[1] > 1 &&
...
...
@@ -8,10 +8,10 @@ Subject: [PATCH] Disable ggml-blas on macos v13 and older
1 file changed, 5 insertions(+)
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index 5b888cdd..2a9ff7f6 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -506,6 +506,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
};
ggml_backend_reg_t ggml_backend_blas_reg(void) {
...
...@@ -16,10 +16,10 @@ must be recreated with no-alloc set to false before loading data. ...@@ -16,10 +16,10 @@ must be recreated with no-alloc set to false before loading data.
5 files changed, 310 insertions(+), 44 deletions(-) 5 files changed, 310 insertions(+), 44 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 2773cc310..ae94887dd 100644 index 48777212..d4352663 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -291,6 +291,7 @@ extern "C" { @@ -303,6 +303,7 @@ extern "C" {
// Initialize a backend scheduler, backends with low index are given priority over backends with high index // Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload); GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
...@@ -28,7 +28,7 @@ index 2773cc310..ae94887dd 100644 ...@@ -28,7 +28,7 @@ index 2773cc310..ae94887dd 100644
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index c36c12d65..369e9e25a 100644 index 07784d6f..869dc07d 100644
--- a/ggml/src/ggml-backend-impl.h --- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h
@@ -26,12 +26,17 @@ extern "C" { @@ -26,12 +26,17 @@ extern "C" {
...@@ -57,10 +57,10 @@ index c36c12d65..369e9e25a 100644 ...@@ -57,10 +57,10 @@ index c36c12d65..369e9e25a 100644
}; };
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -114,6 +120,16 @@ extern "C" { @@ -117,6 +123,16 @@ extern "C" {
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
// wait for an event on on a different stream // (optional) sort/optimize the nodes in the graph
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event); void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ +
+ // (optional) reserves intermediate buffers needed for the compution + // (optional) reserves intermediate buffers needed for the compution
+ // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size + // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
...@@ -75,7 +75,7 @@ index c36c12d65..369e9e25a 100644 ...@@ -75,7 +75,7 @@ index c36c12d65..369e9e25a 100644
struct ggml_backend { struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index d02a40e60..6b4dee4c7 100644 index cb2b9956..6ef5eeaf 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t @@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
...@@ -95,10 +95,10 @@ index d02a40e60..6b4dee4c7 100644 ...@@ -95,10 +95,10 @@ index d02a40e60..6b4dee4c7 100644
+ return buf; + return buf;
+ } + }
+ +
GGML_ASSERT(buft);
return buft->iface.alloc_buffer(buft, size); return buft->iface.alloc_buffer(buft, size);
} }
@@ -95,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -89,7 +102,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft, /* .buft = */ buft,
/* .context = */ context, /* .context = */ context,
/* .size = */ size, /* .size = */ size,
...@@ -108,7 +108,7 @@ index d02a40e60..6b4dee4c7 100644 ...@@ -108,7 +108,7 @@ index d02a40e60..6b4dee4c7 100644
}; };
return buffer; return buffer;
@@ -119,6 +133,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { @@ -127,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return NULL; return NULL;
} }
...@@ -121,7 +121,7 @@ index d02a40e60..6b4dee4c7 100644 ...@@ -121,7 +121,7 @@ index d02a40e60..6b4dee4c7 100644
void * base = buffer->iface.get_base(buffer); void * base = buffer->iface.get_base(buffer);
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -663,6 +683,12 @@ struct ggml_backend_sched { @@ -723,6 +743,12 @@ struct ggml_backend_sched {
bool op_offload; bool op_offload;
int debug; int debug;
...@@ -134,7 +134,7 @@ index d02a40e60..6b4dee4c7 100644 ...@@ -134,7 +134,7 @@ index d02a40e60..6b4dee4c7 100644
}; };
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1449,6 +1475,17 @@ ggml_backend_sched_t ggml_backend_sched_new( @@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size, size_t graph_size,
bool parallel, bool parallel,
bool op_offload) { bool op_offload) {
...@@ -152,7 +152,7 @@ index d02a40e60..6b4dee4c7 100644 ...@@ -152,7 +152,7 @@ index d02a40e60..6b4dee4c7 100644
GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1490,10 +1527,13 @@ ggml_backend_sched_t ggml_backend_sched_new( @@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device); sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
} }
} }
...@@ -166,7 +166,7 @@ index d02a40e60..6b4dee4c7 100644 ...@@ -166,7 +166,7 @@ index d02a40e60..6b4dee4c7 100644
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
@@ -1508,6 +1548,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { @@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) { for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]); ggml_backend_event_free(sched->events[b][c]);
} }
...@@ -177,7 +177,7 @@ index d02a40e60..6b4dee4c7 100644 ...@@ -177,7 +177,7 @@ index d02a40e60..6b4dee4c7 100644
} }
ggml_gallocr_free(sched->galloc); ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx); ggml_free(sched->ctx);
@@ -1547,6 +1591,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * @@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false; return false;
} }
...@@ -202,7 +202,7 @@ index d02a40e60..6b4dee4c7 100644 ...@@ -202,7 +202,7 @@ index d02a40e60..6b4dee4c7 100644
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
return true; return true;
@@ -1635,7 +1697,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, @@ -1813,7 +1875,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend); int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
...@@ -218,7 +218,7 @@ index d02a40e60..6b4dee4c7 100644 ...@@ -218,7 +218,7 @@ index d02a40e60..6b4dee4c7 100644
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 2e5d48797..b915ee1b8 100644 index c4246b65..448badf0 100644
--- a/ggml/src/ggml-cuda/common.cuh --- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,31 @@ @@ -35,6 +35,31 @@
...@@ -253,7 +253,7 @@ index 2e5d48797..b915ee1b8 100644 ...@@ -253,7 +253,7 @@ index 2e5d48797..b915ee1b8 100644
#define STRINGIZE_IMPL(...) #__VA_ARGS__ #define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__) #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -771,6 +796,9 @@ struct ggml_cuda_pool { @@ -880,6 +905,9 @@ struct ggml_cuda_pool {
virtual void * alloc(size_t size, size_t * actual_size) = 0; virtual void * alloc(size_t size, size_t * actual_size) = 0;
virtual void free(void * ptr, size_t size) = 0; virtual void free(void * ptr, size_t size) = 0;
...@@ -263,7 +263,7 @@ index 2e5d48797..b915ee1b8 100644 ...@@ -263,7 +263,7 @@ index 2e5d48797..b915ee1b8 100644
}; };
template<typename T> template<typename T>
@@ -914,11 +942,11 @@ struct ggml_backend_cuda_context { @@ -1023,11 +1051,11 @@ struct ggml_backend_cuda_context {
// pool // pool
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES]; std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
...@@ -277,7 +277,7 @@ index 2e5d48797..b915ee1b8 100644 ...@@ -277,7 +277,7 @@ index 2e5d48797..b915ee1b8 100644
} }
return *pools[device]; return *pools[device];
} }
@@ -926,4 +954,20 @@ struct ggml_backend_cuda_context { @@ -1035,4 +1063,20 @@ struct ggml_backend_cuda_context {
ggml_cuda_pool & pool() { ggml_cuda_pool & pool() {
return pool(device); return pool(device);
} }
...@@ -299,7 +299,7 @@ index 2e5d48797..b915ee1b8 100644 ...@@ -299,7 +299,7 @@ index 2e5d48797..b915ee1b8 100644
+ } + }
}; };
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..d5abe09e0 100644 index e51c5035..d324bc68 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() { @@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
...@@ -540,7 +540,7 @@ index c7f9dc3a5..d5abe09e0 100644 ...@@ -540,7 +540,7 @@ index c7f9dc3a5..d5abe09e0 100644
}; };
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -2936,6 +2998,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, @@ -3008,6 +3070,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
...@@ -548,7 +548,7 @@ index c7f9dc3a5..d5abe09e0 100644 ...@@ -548,7 +548,7 @@ index c7f9dc3a5..d5abe09e0 100644
// flag used to determine whether it is an integrated_gpu // flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
@@ -2951,6 +3014,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx @@ -3023,6 +3086,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue; continue;
} }
...@@ -559,8 +559,8 @@ index c7f9dc3a5..d5abe09e0 100644 ...@@ -559,8 +559,8 @@ index c7f9dc3a5..d5abe09e0 100644
+ +
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) { if (!disable_fusion) {
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
@@ -3022,6 +3090,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx @@ -3149,6 +3217,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
...@@ -568,7 +568,7 @@ index c7f9dc3a5..d5abe09e0 100644 ...@@ -568,7 +568,7 @@ index c7f9dc3a5..d5abe09e0 100644
ggml_cuda_set_device(cuda_ctx->device); ggml_cuda_set_device(cuda_ctx->device);
@@ -3101,6 +3170,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, @@ -3228,6 +3297,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS; return GGML_STATUS_SUCCESS;
} }
...@@ -640,10 +640,10 @@ index c7f9dc3a5..d5abe09e0 100644 ...@@ -640,10 +640,10 @@ index c7f9dc3a5..d5abe09e0 100644
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) { static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -3140,6 +3274,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = { @@ -3268,6 +3402,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .event_record = */ ggml_backend_cuda_event_record, /* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait, /* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ NULL,
+ /* .graph_reserve = */ ggml_backend_cuda_graph_reserve, + /* .graph_reserve = */ ggml_backend_cuda_graph_reserve,
+ /* .buffer_size = */ ggml_backend_cuda_buffer_size, + /* .buffer_size = */ ggml_backend_cuda_buffer_size,
+ /* .reset = */ ggml_backend_cuda_reset, + /* .reset = */ ggml_backend_cuda_reset,
......
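The three `ggml_backend_i` entries added above (`.graph_reserve`, `.buffer_size`, `.reset`) are optional: backends that do not implement them leave the pointers NULL, so callers must check before dispatching. A caller-side sketch; the helper and the `graph_reserve` signature are inferred from its placement next to `.graph_compute`, not confirmed by the hunk:

```cpp
// Hypothetical dispatch helper; the graph_reserve signature is an assumption.
static enum ggml_status backend_graph_reserve(ggml_backend_t backend, ggml_cgraph * graph) {
    if (backend->iface.graph_reserve == NULL) {
        return GGML_STATUS_SUCCESS;  // optional entry point: nothing to reserve
    }
    // size pools and streams for a worst-case graph without executing it
    return backend->iface.graph_reserve(backend, graph);
}
```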
...@@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all ...@@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all
1 file changed, 1 insertion(+), 2 deletions(-) 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 26a5cf9c..6ece5263 100644 index d8a8b5e6..09247cef 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) { @@ -974,8 +974,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens(); const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
......
...@@ -15,10 +15,10 @@ unused then it can be reset to free these data structures. ...@@ -15,10 +15,10 @@ unused then it can be reset to free these data structures.
5 files changed, 29 insertions(+), 2 deletions(-) 5 files changed, 29 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index b602a7c78..fda5ceb24 100644 index d4352663..0a2dae26 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -167,6 +167,7 @@ extern "C" { @@ -178,6 +178,7 @@ extern "C" {
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props); GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
...@@ -27,10 +27,10 @@ index b602a7c78..fda5ceb24 100644 ...@@ -27,10 +27,10 @@ index b602a7c78..fda5ceb24 100644
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device); GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size); GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 81749a5a3..6f10c353b 100644 index 869dc07d..4889df79 100644
--- a/ggml/src/ggml-backend-impl.h --- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h
@@ -178,6 +178,10 @@ extern "C" { @@ -195,6 +195,10 @@ extern "C" {
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev); ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event); void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event); void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
...@@ -42,10 +42,10 @@ index 81749a5a3..6f10c353b 100644 ...@@ -42,10 +42,10 @@ index 81749a5a3..6f10c353b 100644
struct ggml_backend_device { struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 05a842ed5..6556943b0 100644 index 6ef5eeaf..0b757af5 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -477,6 +477,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par @@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
return device->iface.init_backend(device, params); return device->iface.init_backend(device, params);
} }
...@@ -58,13 +58,13 @@ index 05a842ed5..6556943b0 100644 ...@@ -58,13 +58,13 @@ index 05a842ed5..6556943b0 100644
+} +}
+ +
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) { ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_buffer_type(device); return device->iface.get_buffer_type(device);
}
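From the hunk above, `ggml_backend_dev_reset` forwards to the new optional `iface.reset` entry and is a no-op when the backend does not provide one. A minimal usage sketch, assuming everything allocated on the device has already been freed (the index-0 lookup is illustrative):

```cpp
// Minimal sketch: release a device's cached pools/contexts once it is idle.
static void release_idle_device(void) {
    ggml_backend_dev_t dev = ggml_backend_dev_get(0);  // illustrative: first device
    ggml_backend_dev_reset(dev);  // forwards to iface.reset; no-op if unset
}
```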
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..e43fde523 100644 index d324bc68..531d6e27 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -103,6 +103,11 @@ int ggml_cuda_get_device() { @@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
return id; return id;
} }
...@@ -76,10 +76,10 @@ index c7f9dc3a5..e43fde523 100644 ...@@ -76,10 +76,10 @@ index c7f9dc3a5..e43fde523 100644
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device); ggml_cuda_set_device(device);
cudaError_t err; cudaError_t err;
@@ -3243,7 +3248,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back @@ -3512,7 +3517,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->description = ggml_backend_cuda_device_get_description(dev);
props->id = ggml_backend_cuda_device_get_id(dev); props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev); props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
- ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); - ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+ +
+ // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device). + // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
...@@ -88,7 +88,7 @@ index c7f9dc3a5..e43fde523 100644 ...@@ -88,7 +88,7 @@ index c7f9dc3a5..e43fde523 100644
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY #ifdef GGML_CUDA_NO_PEER_COPY
@@ -3700,6 +3708,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g @@ -3945,6 +3953,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context)); CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
} }
...@@ -100,7 +100,7 @@ index c7f9dc3a5..e43fde523 100644 ...@@ -100,7 +100,7 @@ index c7f9dc3a5..e43fde523 100644
static const ggml_backend_device_i ggml_backend_cuda_device_interface = { static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name, /* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description, /* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -3716,6 +3729,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = { @@ -3961,6 +3974,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new, /* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free, /* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize, /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
...@@ -108,7 +108,7 @@ index c7f9dc3a5..e43fde523 100644 ...@@ -108,7 +108,7 @@ index c7f9dc3a5..e43fde523 100644
}; };
// backend reg // backend reg
@@ -3835,7 +3849,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -4076,7 +4090,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->device = i; dev_ctx->device = i;
dev_ctx->name = GGML_CUDA_NAME + std::to_string(i); dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
...@@ -117,10 +117,10 @@ index c7f9dc3a5..e43fde523 100644 ...@@ -117,10 +117,10 @@ index c7f9dc3a5..e43fde523 100644
CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name; dev_ctx->description = prop.name;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index c31f31923..cf22e60d2 100644 index 37386afc..06f9e7c1 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h --- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -40,6 +40,7 @@ @@ -41,6 +41,7 @@
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t #define cudaDeviceProp hipDeviceProp_t
......
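Since `get_props` in this patch now reports zero for `memory_free`/`memory_total` instead of touching the CUDA runtime, callers that actually need the figures must query them explicitly and accept that this may instantiate the ~300 MB primary context. A sketch using the existing public API the in-diff comment points to:

```cpp
// Explicit query; may allocate the CUDA primary context the props path avoids.
static size_t device_free_vram(ggml_backend_dev_t dev) {
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_dev_memory(dev, &free_mem, &total_mem);
    return free_mem;
}
```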
...@@ -6,25 +6,25 @@ Subject: [PATCH] GPU discovery enhancements ...@@ -6,25 +6,25 @@ Subject: [PATCH] GPU discovery enhancements
Expose more information about the devices through backend props, and leverage Expose more information about the devices through backend props, and leverage
management libraries for more accurate VRAM usage reporting if available. management libraries for more accurate VRAM usage reporting if available.
--- ---
ggml/include/ggml-backend.h | 9 + ggml/include/ggml-backend.h | 9 +
ggml/src/CMakeLists.txt | 2 + ggml/src/CMakeLists.txt | 2 +
ggml/src/ggml-cuda/ggml-cuda.cu | 75 +++++- ggml/src/ggml-cuda/ggml-cuda.cu | 72 +++++
ggml/src/ggml-cuda/vendors/hip.h | 1 + ggml/src/ggml-cuda/vendors/hip.h | 4 +
ggml/src/ggml-impl.h | 8 + ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.m | 2 + ggml/src/ggml-metal/ggml-metal.cpp | 3 +-
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++++ ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 172 ++++++++++++ ggml/src/mem_nvml.cpp | 172 +++++++++++
8 files changed, 717 insertions(+), 1 deletion(-) 8 files changed, 718 insertions(+), 1 deletion(-)
create mode 100644 ggml/src/mem_hip.cpp create mode 100644 ggml/src/mem_hip.cpp
create mode 100644 ggml/src/mem_nvml.cpp create mode 100644 ggml/src/mem_nvml.cpp
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index fda5ceb24..7c2d86703 100644 index 0a2dae26..a6bf3378 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,15 @@ extern "C" { @@ -169,6 +169,15 @@ extern "C" {
size_t memory_total; const char * device_id;
enum ggml_backend_dev_type type; // device capabilities
struct ggml_backend_dev_caps caps; struct ggml_backend_dev_caps caps;
+ int driver_major; + int driver_major;
+ int driver_minor; + int driver_minor;
...@@ -39,10 +39,10 @@ index fda5ceb24..7c2d86703 100644 ...@@ -39,10 +39,10 @@ index fda5ceb24..7c2d86703 100644
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 5158acd6a..3a428a22d 100644 index 33b3a15f..86191ef2 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -203,6 +203,8 @@ add_library(ggml-base @@ -206,6 +206,8 @@ add_library(ggml-base
ggml-threading.h ggml-threading.h
ggml-quants.c ggml-quants.c
ggml-quants.h ggml-quants.h
...@@ -52,10 +52,10 @@ index 5158acd6a..3a428a22d 100644 ...@@ -52,10 +52,10 @@ index 5158acd6a..3a428a22d 100644
target_include_directories(ggml-base PRIVATE .) target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e43fde523..14baf0fb1 100644 index 531d6e27..3fa3a057 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -279,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) { for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0; int device_vmm = 0;
...@@ -72,7 +72,7 @@ index e43fde523..14baf0fb1 100644 ...@@ -72,7 +72,7 @@ index e43fde523..14baf0fb1 100644
#if defined(GGML_USE_VMM) #if defined(GGML_USE_VMM)
CUdevice device; CUdevice device;
CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGet(&device, id));
@@ -332,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -314,6 +324,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else #else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor; info.devices[id].cc = 100*prop.major + 10*prop.minor;
...@@ -84,33 +84,29 @@ index e43fde523..14baf0fb1 100644 ...@@ -84,33 +84,29 @@ index e43fde523..14baf0fb1 100644
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str()); ggml_cuda_parse_uuid(prop, id).c_str());
+ @@ -3481,6 +3496,14 @@ struct ggml_backend_cuda_device_context {
#endif // defined(GGML_USE_HIP)
}
@@ -3215,6 +3231,14 @@ struct ggml_backend_cuda_device_context {
std::string name;
std::string description; std::string description;
std::string pci_bus_id;
std::string id; std::string id;
+ int major; + int major;
+ int minor; + int minor;
+ int driver_major; + int driver_major;
+ int driver_minor; + int driver_minor;
+ int integrated; + int integrated;
+ int pci_bus_id; + int pciBusID;
+ int pci_device_id; + int pciDeviceID;
+ int pci_domain_id; + int pciDomainID;
}; };
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3235,6 +3259,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) { @@ -3501,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
+ +
+#if defined(GGML_USE_HIP) +#if defined(GGML_USE_HIP)
+ if (ggml_hip_mgmt_init() == 0) { + if (ggml_hip_mgmt_init() == 0) {
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total); + int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
+ if (status == 0) { + if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total); + GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_hip_mgmt_release(); + ggml_hip_mgmt_release();
...@@ -132,19 +128,18 @@ index e43fde523..14baf0fb1 100644 ...@@ -132,19 +128,18 @@ index e43fde523..14baf0fb1 100644
CUDA_CHECK(cudaMemGetInfo(free, total)); CUDA_CHECK(cudaMemGetInfo(free, total));
} }
@@ -3243,6 +3289,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend @@ -3509,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU; return GGML_BACKEND_DEVICE_TYPE_GPU;
} }
+#define GGML_HIP_NAME "HIP" +#define GGML_HIP_NAME "HIP"
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev); ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3253,6 +3300,23 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back @@ -3522,6 +3568,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly. // If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0; props->memory_total = props->memory_free = 0;
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+#if defined(GGML_USE_HIP) +#if defined(GGML_USE_HIP)
+ int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD; + int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
+ props->compute_major = cc / 0x100; + props->compute_major = cc / 0x100;
...@@ -156,15 +151,15 @@ index e43fde523..14baf0fb1 100644 ...@@ -156,15 +151,15 @@ index e43fde523..14baf0fb1 100644
+ props->driver_major = ctx->driver_major; + props->driver_major = ctx->driver_major;
+ props->driver_minor = ctx->driver_minor; + props->driver_minor = ctx->driver_minor;
+ props->integrated = ctx->integrated; + props->integrated = ctx->integrated;
+ props->pci_bus_id = ctx->pci_bus_id; + props->pci_bus_id = ctx->pciBusID;
+ props->pci_device_id = ctx->pci_device_id; + props->pci_device_id = ctx->pciDeviceID;
+ props->pci_domain_id = ctx->pci_domain_id; + props->pci_domain_id = ctx->pciDomainID;
+ props->library = GGML_CUDA_NAME; + props->library = GGML_CUDA_NAME;
+ +
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY #ifdef GGML_CUDA_NO_PEER_COPY
bool events = false; bool events = false;
@@ -3843,6 +3907,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -4084,6 +4146,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex); std::lock_guard<std::mutex> lock(mutex);
if (!initialized) { if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context; ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
...@@ -173,27 +168,36 @@ index e43fde523..14baf0fb1 100644 ...@@ -173,27 +168,36 @@ index e43fde523..14baf0fb1 100644
for (int i = 0; i < ggml_cuda_info().device_count; i++) { for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context; ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -3853,7 +3919,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -4099,6 +4163,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->description = prop.name; dev_ctx->pci_bus_id = pci_bus_id;
dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
-
+ dev_ctx->major = prop.major; + dev_ctx->major = prop.major;
+ dev_ctx->minor = prop.minor; + dev_ctx->minor = prop.minor;
+ dev_ctx->driver_major = driverVersion / 1000; + dev_ctx->driver_major = driverVersion / 1000;
+ dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10; + dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
+ dev_ctx->integrated = prop.integrated; + dev_ctx->integrated = prop.integrated;
+ dev_ctx->pci_bus_id = prop.pciBusID; + dev_ctx->pciBusID = prop.pciBusID;
+ dev_ctx->pci_device_id = prop.pciDeviceID; + dev_ctx->pciDeviceID = prop.pciDeviceID;
+ dev_ctx->pci_domain_id = prop.pciDomainID; + dev_ctx->pciDomainID = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device { ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface, /* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg, /* .reg = */ &reg,
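For reference, the CUDA runtime encodes the driver version as 1000*major + 10*minor, which is what the decode above unpacks. Worked example with an assumed `driverVersion` of 12040:

```cpp
int driver_major = 12040 / 1000;                        // 12
int driver_minor = (12040 - driver_major * 1000) / 10;  // 4  -> driver 12.4
```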
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index cf22e60d2..957a795f2 100644 index 06f9e7c1..eb8f66cb 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h --- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -42,6 +42,7 @@ @@ -5,6 +5,9 @@
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -43,6 +46,7 @@
#define cudaDeviceProp hipDeviceProp_t #define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceReset hipDeviceReset #define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize #define cudaDeviceSynchronize hipDeviceSynchronize
...@@ -202,11 +206,11 @@ index cf22e60d2..957a795f2 100644 ...@@ -202,11 +206,11 @@ index cf22e60d2..957a795f2 100644
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 19a7adb2d..b9b102a5e 100644 index 86a1ebf6..9fc9fbfc 100644
--- a/ggml/src/ggml-impl.h --- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h
@@ -602,6 +602,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx @@ -635,6 +635,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
return true; return ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
} }
+// Management libraries for fetching more accurate free VRAM data +// Management libraries for fetching more accurate free VRAM data
...@@ -220,28 +224,30 @@ index 19a7adb2d..b9b102a5e 100644 ...@@ -220,28 +224,30 @@ index 19a7adb2d..b9b102a5e 100644
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index e4c31268f..ec6b385ba 100644 index 08ab4fc9..17999a61 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -6523,12 +6523,14 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen @@ -535,6 +535,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }
+#define GGML_METAL_NAME "Metal" +#define GGML_METAL_NAME "Metal"
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev); props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev); props->description = ggml_backend_metal_device_get_description(dev);
props->id = "0"; @@ -542,7 +543,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
props->type = ggml_backend_metal_device_get_type(dev); props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
-
+ props->library = GGML_METAL_NAME; + props->library = GGML_METAL_NAME;
props->caps = (struct ggml_backend_dev_caps) { props->caps = {
/* .async = */ false, /* .async = */ true,
/* .host_buffer = */ false, /* .host_buffer = */ false,
diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
new file mode 100644 new file mode 100644
index 000000000..8ef19b8cf index 00000000..8ef19b8c
--- /dev/null --- /dev/null
+++ b/ggml/src/mem_hip.cpp +++ b/ggml/src/mem_hip.cpp
@@ -0,0 +1,449 @@ @@ -0,0 +1,449 @@
...@@ -697,7 +703,7 @@ index 000000000..8ef19b8cf ...@@ -697,7 +703,7 @@ index 000000000..8ef19b8cf
\ No newline at end of file \ No newline at end of file
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
new file mode 100644 new file mode 100644
index 000000000..aa05e9dc1 index 00000000..aa05e9dc
--- /dev/null --- /dev/null
+++ b/ggml/src/mem_nvml.cpp +++ b/ggml/src/mem_nvml.cpp
@@ -0,0 +1,172 @@ @@ -0,0 +1,172 @@
......
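The memory hunk in this patch uses the same fallback shape on both vendors: try the management library first (ADLX on HIP, NVML on CUDA), release it either way, and fall back to `cudaMemGetInfo` when it is unavailable or fails. Condensed from the hunk above, with the error handling simplified:

```cpp
// Condensed illustration of the HIP branch; guards simplified from the hunk.
#if defined(GGML_USE_HIP)
    if (ggml_hip_mgmt_init() == 0) {
        int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
        ggml_hip_mgmt_release();
        if (status == 0) {
            return;  // accurate management-library numbers
        }
    }
#endif
    CUDA_CHECK(cudaMemGetInfo(free, total));  // runtime fallback
```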
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Tue, 23 Sep 2025 15:41:58 -0700
Subject: [PATCH] ggml: Backport scale kernel fixes
The GGML scale kernel uses signed 32-bit ints to represent
the number of elements in the tensor. For large images,
mistral-small3.2 overflows this, triggering CUDA errors due
to negative arguments.
Currently, this can happen when the user passes a large image
to mistral-small3.2. However, with upcoming changes to reserve
CUDA memory, it will happen every time mistral-small is loaded, as
we reserve using a worst-case batch.
This patch is part of an upstream GGML commit and should be removed
after GGML is updated past 0a1b398 "ggml: add ops for WAN video model
(cuda && cpu) (#15669)".
Fixes #10388
---
ggml/src/ggml-cuda/scale.cu | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
index 2ee9e5889..0ddeff6a1 100644
--- a/ggml/src/ggml-cuda/scale.cu
+++ b/ggml/src/ggml-cuda/scale.cu
@@ -1,18 +1,19 @@
#include "scale.cuh"
-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+#define MAX_GRIDDIM_X 0x7FFFFFFF
- if (i >= k) {
- return;
- }
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
+ int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
+ int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;
- dst[i] = scale * x[i] + bias;
+ for (int64_t i = tid; i < nelements; i += stride) {
+ dst[i] = scale * x[i] + bias;
+ }
}
-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
- scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
+ const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+ scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
}
void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
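The overflow being fixed is easy to reproduce on paper: a tensor of 3 billion floats does not fit in a signed 32-bit `k`, so the old kernel saw a negative element count. Worked arithmetic, assuming `CUDA_SCALE_BLOCK_SIZE` is 256 (its usual value, not shown in this hunk):

```cpp
int64_t nelements = 3000000000LL;                 // > INT32_MAX (2147483647)
int     k32       = (int) nelements;              // wraps to -1294967296 pre-patch
int64_t blocks    = (nelements + 256 - 1) / 256;  // 11718750 blocks post-patch
// the launch clamps to MIN(MAX_GRIDDIM_X, blocks); the grid-stride loop covers the rest
```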