Unverified commit 943464cc authored by Jeffrey Morgan, committed by GitHub

llama: update to commit 71e90e88 (#10192)

parent 369de832
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 14 Jan 2025 12:01:24 -0800
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:31:38 -0700
Subject: [PATCH] sort devices by score
In the ggml backend loading code, devices are now sorted by score,
ensuring the device with the fastest acceleration is loaded first.
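A minimal sketch of the idea (the types and names below are hypothetical, not the
actual ggml-backend-reg.cpp code): keep each device paired with its backend's
score and sort so that higher-scoring devices come first.

#include <algorithm>
#include <utility>
#include <vector>

struct fake_device { const char * name; };

// sort (score, device) pairs so the highest-scoring device is tried first
static void sort_devices_by_score(std::vector<std::pair<int, fake_device *>> & devices) {
    std::stable_sort(devices.begin(), devices.end(),
        [](const auto & a, const auto & b) {
            return a.first > b.first; // higher score sorts earlier
        });
}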
---
ggml/src/ggml-backend-reg.cpp | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 95036ef8..98d5e14d 100644
index 82ae1b5b..1487f322 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
@@ -157,7 +157,7 @@ struct ggml_backend_reg_entry {
struct ggml_backend_registry {
std::vector<ggml_backend_reg_entry> backends;
......@@ -20,7 +23,7 @@ index 95036ef8..98d5e14d 100644
ggml_backend_registry() {
#ifdef GGML_USE_CUDA
@@ -195,7 +195,7 @@ struct ggml_backend_registry {
@@ -202,7 +202,7 @@ struct ggml_backend_registry {
}
}
......@@ -29,7 +32,7 @@ index 95036ef8..98d5e14d 100644
if (!reg) {
return;
}
@@ -206,15 +206,20 @@ struct ggml_backend_registry {
@@ -213,15 +213,20 @@ struct ggml_backend_registry {
#endif
backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
......@@ -52,17 +55,17 @@ index 95036ef8..98d5e14d 100644
+ );
}
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
@@ -257,7 +262,7 @@ struct ggml_backend_registry {
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
@@ -265,7 +270,7 @@ struct ggml_backend_registry {
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
- register_backend(reg, std::move(handle));
+ register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
return reg;
}
@@ -280,7 +285,7 @@ struct ggml_backend_registry {
@@ -288,7 +293,7 @@ struct ggml_backend_registry {
// remove devices
devices.erase(
std::remove_if(devices.begin(), devices.end(),
......@@ -71,7 +74,7 @@ index 95036ef8..98d5e14d 100644
devices.end());
// remove backend
@@ -338,7 +343,7 @@ size_t ggml_backend_dev_count() {
@@ -346,7 +351,7 @@ size_t ggml_backend_dev_count() {
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count());
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 4 Jan 2025 22:52:48 -0800
Subject: [PATCH] use dynamic backend loading for clip
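A hedged usage sketch of the dynamic-loading pattern this patch moves clip.cpp
to, using the public ggml backend API (standalone program for illustration only):

#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_backend_load_all();                            // discover backend shared libraries
    ggml_backend_t backend = ggml_backend_init_best();  // pick the best available device
    if (backend == nullptr) {
        fprintf(stderr, "failed to initialize a backend\n");
        return 1;
    }
    printf("using %s backend\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}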
---
examples/llava/clip.cpp | 74 +++++++++++++++--------------------------
1 file changed, 27 insertions(+), 47 deletions(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 205af1eb..560021c7 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -9,25 +9,25 @@
#include "ggml-backend.h"
#include "gguf.h"
-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//#include "ggml-sycl.h"
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//#include "ggml-metal.h"
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//#include "ggml-cann.h"
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//#include "ggml-vulkan.h"
-//#endif
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -1309,35 +1309,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}
-//#ifdef GGML_USE_CUDA
-// new_clip->backend = ggml_backend_cuda_init(0);
-// LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_METAL
-// new_clip->backend = ggml_backend_metal_init();
-// LOG_INF("%s: CLIP using Metal backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_CANN
-// new_clip->backend = ggml_backend_cann_init(0);
-// LOG_INF("%s: CLIP using CANN backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-// new_clip->backend = ggml_backend_vk_init(0);
-// LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-// new_clip->backend = ggml_backend_sycl_init(0);
-// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-//#endif
-
- if (!new_clip->backend) {
- new_clip->backend = ggml_backend_cpu_init();
- LOG_INF("%s: CLIP using CPU backend\n", __func__);
+ ggml_backend_t backend = ggml_backend_init_best();
+ if (backend == nullptr) {
+ LOG_ERR("%s: failed to initialize backend\n", __func__);
+ clip_free(new_clip);
+ gguf_free(ctx);
+ return nullptr;
}
+ LOG_INF("%s: using %s backend\n", __func__, ggml_backend_name(backend));
+ new_clip->backend = backend;
// model size and capabilities
{
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 14 Jan 2025 15:59:04 -0800
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:32:07 -0700
Subject: [PATCH] add phony target ggml-cpu for all cpu variants
---
......@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 0002ac18..0a8d1092 100644
index f00700da..91d6a7d5 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -297,6 +297,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
......@@ -19,11 +19,11 @@ index 0002ac18..0a8d1092 100644
endfunction()
ggml_add_backend(CPU)
@@ -305,6 +306,7 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
+ add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 18 Feb 2025 14:47:21 -0800
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:33:01 -0700
Subject: [PATCH] remove amx
Disable AMX as it reduces performance on some systems.
---
ggml/src/CMakeLists.txt | 4 ----
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 0a8d1092..4564df91 100644
index 91d6a7d5..d6b393a2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -312,10 +312,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
- if (NOT MSVC)
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Wed, 5 Mar 2025 17:41:07 -0800
Date: Tue, 8 Apr 2025 20:35:53 -0700
Subject: [PATCH] fix string arr kv loading
Certain models would error when loading KV metadata fields that
contain an array of strings, such as vocab fields.
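A hedged sketch of reading such a field through the public gguf API (the key
name below is just an example):

#include "gguf.h"
#include <cstdio>

static void print_string_array(const struct gguf_context * ctx, const char * key) {
    int64_t key_id = gguf_find_key(ctx, key);
    if (key_id < 0 || gguf_get_arr_type(ctx, key_id) != GGUF_TYPE_STRING) {
        return; // key missing or not an array of strings
    }
    const size_t n = gguf_get_arr_n(ctx, key_id);
    for (size_t i = 0; i < n; i++) {
        printf("%s\n", gguf_get_arr_str(ctx, key_id, i)); // i-th string in the array
    }
}

// e.g. print_string_array(ctx, "tokenizer.ggml.tokens");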
---
ggml/include/gguf.h | 1 +
ggml/src/gguf.cpp | 7 +++++--
......@@ -22,7 +25,7 @@ index 79ee2020..3efb22f0 100644
// get ith C string from array with given key_id
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index ab13669c..f75b923f 100644
index 381a9c7d..e45b453d 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -777,10 +777,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
......@@ -50,10 +53,10 @@ index ab13669c..f75b923f 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index c7ff28be..7a185443 100644
index d74919d2..c90f636c 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1443,7 +1443,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 16 Feb 2025 20:00:22 -0500
Subject: [PATCH] use std::filesystem::path instead of wstring
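A minimal sketch of the std::filesystem usage the patch switches to (the paths
below are illustrative only):

#include <filesystem>
#include <iostream>

int main() {
    namespace fs = std::filesystem;
    fs::path exe = "/usr/local/bin/ollama";   // e.g. result of locating the executable
    fs::path dir = exe.parent_path();         // strip the executable name
    fs::path lib = dir / "libggml-cuda.so";   // portable path concatenation
    std::cout << lib.string() << "\n";
    return 0;
}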
---
ggml/src/ggml-backend-reg.cpp | 199 +++++++++++++++-------------------
1 file changed, 88 insertions(+), 111 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 98d5e14d..799af5f3 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -66,26 +66,6 @@
#include "ggml-kompute.h"
#endif
-// disable C++17 deprecation warning for std::codecvt_utf8
-#if defined(__clang__)
-# pragma clang diagnostic push
-# pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-static std::wstring utf8_to_utf16(const std::string & str) {
- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
- return converter.from_bytes(str);
-}
-
-static std::string utf16_to_utf8(const std::wstring & str) {
- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
- return converter.to_bytes(str);
-}
-
-#if defined(__clang__)
-# pragma clang diagnostic pop
-#endif
-
#ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>;
@@ -96,7 +76,7 @@ struct dl_handle_deleter {
}
};
-static dl_handle * dl_load_library(const std::wstring & path) {
+static dl_handle * dl_load_library(const std::filesystem::path & path) {
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -129,8 +109,8 @@ struct dl_handle_deleter {
}
};
-static void * dl_load_library(const std::wstring & path) {
- dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::filesystem::path & path) {
+ dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
return handle;
}
@@ -141,6 +121,25 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
#endif
+static std::string path_to_string(const std::filesystem::path & path)
+{
+#ifdef _WIN32
+ const std::wstring wstr = path.wstring();
+ const int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, nullptr, 0, nullptr, nullptr);
+ if (size_needed <= 0) {
+ return std::string();
+ }
+
+ // size_needed includes the null terminator
+ std::string str(size_needed - 1, '\0');
+ WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, str.data(), size_needed, nullptr, nullptr);
+ return str;
+#else
+ return path.string();
+#endif
+}
+
+
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
struct ggml_backend_reg_entry {
@@ -222,11 +221,11 @@ struct ggml_backend_registry {
);
}
- ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
+ ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
dl_handle_ptr handle { dl_load_library(path) };
if (!handle) {
if (!silent) {
- GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(path).c_str());
}
return nullptr;
}
@@ -234,7 +233,7 @@ struct ggml_backend_registry {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn && score_fn() == 0) {
if (!silent) {
- GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
+ GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path_to_string(path).c_str());
}
return nullptr;
}
@@ -242,7 +241,7 @@ struct ggml_backend_registry {
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
if (!backend_init_fn) {
if (!silent) {
- GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
+ GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path_to_string(path).c_str());
}
return nullptr;
}
@@ -251,16 +250,16 @@ struct ggml_backend_registry {
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) {
if (!reg) {
- GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
+ GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path_to_string(path).c_str());
} else {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
- __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
+ __func__, path_to_string(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
}
}
return nullptr;
}
- GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
+ GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_to_string(path).c_str());
register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
@@ -396,14 +395,14 @@ ggml_backend_t ggml_backend_init_best(void) {
// Dynamic loading
ggml_backend_reg_t ggml_backend_load(const char * path) {
- return get_reg().load_backend(utf8_to_utf16(path), false);
+ return get_reg().load_backend(path, false);
}
void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true);
}
-static std::wstring get_executable_path() {
+static std::filesystem::path get_executable_path() {
#if defined(__APPLE__)
// get executable path
std::vector<char> path;
@@ -415,15 +414,9 @@ static std::wstring get_executable_path() {
}
path.resize(size);
}
- std::string base_path(path.data(), size);
- // remove executable name
- auto last_slash = base_path.find_last_of('/');
- if (last_slash != std::string::npos) {
- base_path = base_path.substr(0, last_slash);
- }
- return utf8_to_utf16(base_path + "/");
+
+ return std::filesystem::path(path.data()).parent_path();
#elif defined(__linux__) || defined(__FreeBSD__)
- std::string base_path = ".";
std::vector<char> path(1024);
while (true) {
// get executable path
@@ -436,76 +429,55 @@ static std::wstring get_executable_path() {
break;
}
if (len < (ssize_t) path.size()) {
- base_path = std::string(path.data(), len);
- // remove executable name
- auto last_slash = base_path.find_last_of('/');
- if (last_slash != std::string::npos) {
- base_path = base_path.substr(0, last_slash);
- }
- break;
+ return std::filesystem::path(path.data()).parent_path();
}
path.resize(path.size() * 2);
}
-
- return utf8_to_utf16(base_path + "/");
#elif defined(_WIN32)
std::vector<wchar_t> path(MAX_PATH);
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
if (len == 0) {
return {};
}
- std::wstring base_path(path.data(), len);
- // remove executable name
- auto last_slash = base_path.find_last_of('\\');
- if (last_slash != std::string::npos) {
- base_path = base_path.substr(0, last_slash);
- }
- return base_path + L"\\";
-#else
- return {};
-#endif
-}
-static std::wstring backend_filename_prefix() {
-#ifdef _WIN32
- return L"ggml-";
-#else
- return L"libggml-";
+ return std::filesystem::path(path.data()).parent_path();
#endif
+ return {};
}
-static std::wstring backend_filename_suffix() {
+static std::string backend_filename_prefix() {
#ifdef _WIN32
- return L".dll";
+ return "ggml-";
#else
- return L".so";
+ return "libggml-";
#endif
}
-static std::wstring path_separator() {
+static std::string backend_filename_suffix() {
#ifdef _WIN32
- return L"\\";
+ return ".dll";
#else
- return L"/";
+ return ".so";
#endif
}
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths
- std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
- std::vector<std::wstring> search_paths;
+ namespace fs = std::filesystem;
+ std::string file_prefix = backend_filename_prefix() + name + "-";
+ std::vector<fs::path> search_paths;
+
if (user_search_path == nullptr) {
- search_paths.push_back(L"." + path_separator());
+ search_paths.push_back(fs::current_path());
search_paths.push_back(get_executable_path());
} else {
- search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
+ search_paths.push_back(fs::u8path(user_search_path));
}
int best_score = 0;
- std::wstring best_path;
+ fs::path best_path;
- namespace fs = std::filesystem;
for (const auto & search_path : search_paths) {
if (!fs::exists(search_path)) {
continue;
@@ -513,29 +485,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) {
- std::wstring filename = entry.path().filename().wstring();
- std::wstring ext = entry.path().extension().wstring();
+ std::string filename = entry.path().filename().string();
+ std::string ext = entry.path().extension().string();
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
- dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
- if (!handle && !silent) {
- GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
+ dl_handle_ptr handle { dl_load_library(entry.path()) };
+ if (!handle) {
+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(entry.path()).c_str());
+ continue;
}
- if (handle) {
- auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
- if (score_fn) {
- int s = score_fn();
-#ifndef NDEBUG
- GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
-#endif
- if (s > best_score) {
- best_score = s;
- best_path = entry.path().wstring();
- }
- } else {
- if (!silent) {
- GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
- }
- }
+
+ auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+ if (!score_fn) {
+ GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, path_to_string(entry.path()).c_str());
+ continue;
+ }
+
+ int s = score_fn();
+ GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_to_string(entry.path()).c_str(), s);
+ if (s > best_score) {
+ best_score = s;
+ best_path = entry.path();
}
}
}
@@ -545,7 +514,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
if (best_score == 0) {
// try to load the base backend
for (const auto & search_path : search_paths) {
- std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
+ fs::path path = fs::path(search_path) / (backend_filename_prefix() + name + backend_filename_suffix());
if (fs::exists(path)) {
return get_reg().load_backend(path, silent);
}
@@ -560,6 +529,14 @@ void ggml_backend_load_all() {
ggml_backend_load_all_from_path(nullptr);
}
+static void ggml_backend_try_load_best(const char * name, bool silent, const char * user_search_path) {
+ try {
+ ggml_backend_load_best(name, silent, user_search_path);
+ } catch (const std::exception & e) {
+ GGML_LOG_DEBUG("%s: failed to load %s: %s\n", __func__, name, e.what());
+ }
+}
+
void ggml_backend_load_all_from_path(const char * dir_path) {
#ifdef NDEBUG
bool silent = true;
@@ -567,18 +544,18 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
bool silent = false;
#endif
- ggml_backend_load_best("blas", silent, dir_path);
- ggml_backend_load_best("cann", silent, dir_path);
- ggml_backend_load_best("cuda", silent, dir_path);
- ggml_backend_load_best("hip", silent, dir_path);
- ggml_backend_load_best("kompute", silent, dir_path);
- ggml_backend_load_best("metal", silent, dir_path);
- ggml_backend_load_best("rpc", silent, dir_path);
- ggml_backend_load_best("sycl", silent, dir_path);
- ggml_backend_load_best("vulkan", silent, dir_path);
- ggml_backend_load_best("opencl", silent, dir_path);
- ggml_backend_load_best("musa", silent, dir_path);
- ggml_backend_load_best("cpu", silent, dir_path);
+ ggml_backend_try_load_best("blas", silent, dir_path);
+ ggml_backend_try_load_best("cann", silent, dir_path);
+ ggml_backend_try_load_best("cuda", silent, dir_path);
+ ggml_backend_try_load_best("hip", silent, dir_path);
+ ggml_backend_try_load_best("kompute", silent, dir_path);
+ ggml_backend_try_load_best("metal", silent, dir_path);
+ ggml_backend_try_load_best("rpc", silent, dir_path);
+ ggml_backend_try_load_best("sycl", silent, dir_path);
+ ggml_backend_try_load_best("vulkan", silent, dir_path);
+ ggml_backend_try_load_best("opencl", silent, dir_path);
+ ggml_backend_try_load_best("musa", silent, dir_path);
+ ggml_backend_try_load_best("cpu", silent, dir_path);
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
const char * backend_path = std::getenv("GGML_BACKEND_PATH");
if (backend_path) {
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Sun, 9 Mar 2025 14:44:16 -0700
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:36:41 -0700
Subject: [PATCH] ollama debug tensor
---
......@@ -8,11 +8,11 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 2f606d82..ec60e8fc 100644
index 432942bf..6d4abe4c 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -11,6 +11,8 @@
#include "ggml-threading.h"
@@ -15,6 +15,8 @@
#include "ops.h"
#include "ggml.h"
+#include "ollama-debug.h"
......@@ -20,7 +20,7 @@ index 2f606d82..ec60e8fc 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -14103,6 +14105,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Patrick Devine <patrick@infrahq.com>
Date: Fri, 14 Mar 2025 16:33:23 -0700
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:39:32 -0700
Subject: [PATCH] add model quantizations
- gemma3
- mistral3
A temporary patch to add model quantization support for
models not yet supported in llama.cpp.
---
src/llama-arch.cpp | 36 ++++++++++++++++++++++++++++++++++++
src/llama-arch.h | 2 ++
src/llama-model.cpp | 10 ++++++++++
src/llama-arch.cpp | 17 +++++++++++++++++
src/llama-arch.h | 1 +
src/llama-model.cpp | 2 ++
src/llama-quant.cpp | 4 ++++
4 files changed, 52 insertions(+)
4 files changed, 24 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index b6f20286..13a0a988 100644
index c1f78618..bdf3d898 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -37,6 +37,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MINICPM3, "minicpm3" },
{ LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_GEMMA2, "gemma2" },
+ { LLM_ARCH_GEMMA3, "gemma3" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
@@ -64,6 +65,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_CHAMELEON, "chameleon" },
{ LLM_ARCH_SOLAR, "solar" },
@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
+ { LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -804,6 +806,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
+ {
+ LLM_ARCH_GEMMA3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
{
LLM_ARCH_STARCODER2,
{
@@ -1352,6 +1372,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
@@ -1582,6 +1583,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
+ {
......@@ -81,86 +48,42 @@ index b6f20286..13a0a988 100644
LLM_ARCH_UNKNOWN,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index ec742224..8476ae0a 100644
index f987844d..ee081fbf 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -41,6 +41,7 @@ enum llm_arch {
LLM_ARCH_MINICPM3,
LLM_ARCH_GEMMA,
LLM_ARCH_GEMMA2,
+ LLM_ARCH_GEMMA3,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
@@ -68,6 +69,7 @@ enum llm_arch {
@@ -75,6 +75,7 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
+ LLM_ARCH_MISTRAL3,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
};
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ab1a07d1..db4f2685 100644
index d5ad466e..cd1d239c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -878,6 +878,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1423,6 +1423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_GEMMA3:
+ {
+ } break;
case LLM_ARCH_STARCODER2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1274,6 +1277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
}
} break;
+ case LLM_ARCH_MISTRAL3: break;
default: throw std::runtime_error("unsupported model architecture");
}
@@ -2537,6 +2541,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
}
} break;
+ case LLM_ARCH_GEMMA3:
+ {
+ } break;
case LLM_ARCH_STARCODER2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3531,6 +3538,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
} break;
+ case LLM_ARCH_MISTRAL3: break;
default:
throw std::runtime_error("unknown architecture");
}
@@ -4009,6 +4017,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE_MOE:
@@ -13652,6 +13653,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
+ case LLM_ARCH_MISTRAL3:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
@@ -4029,6 +4038,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHIMOE:
case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
+ case LLM_ARCH_GEMMA3:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_OPENELM:
case LLM_ARCH_GPTNEOX:
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 6eb1da08..ebcbafa1 100644
index 223e1f3f..8ae6dde8 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -737,6 +737,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 25 Feb 2025 19:14:51 -0800
Subject: [PATCH] fix-clip-compiler-error
---
examples/llava/clip.cpp | 2 +-
examples/llava/clip.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 560021c7..54265beb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1788,7 +1788,7 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
}
}
-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img) {
img->nx = nx;
img->ny = ny;
img->buf.resize(3 * nx * ny);
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index ce6f6194..f9f80d7d 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -75,7 +75,7 @@ CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
-CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img);
+CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Wed, 2 Apr 2025 15:26:15 -0700
Subject: [PATCH] metal: add op_neg
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:41:24 -0700
Subject: [PATCH] add op_neg
Adds the NEG operator to the ggml Metal backend.
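A hedged sketch of building a ggml graph that uses the NEG op (CPU-side graph
construction only; buffer allocation and backend dispatch are omitted):

#include "ggml.h"

int main() {
    struct ggml_init_params params = { 16 * 1024 * 1024, nullptr, false };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * y = ggml_neg(ctx, x);        // y[i] = -x[i]
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);                 // graph now contains GGML_UNARY_OP_NEG
    ggml_free(ctx);
    return 0;
}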
---
ggml/src/ggml-metal/ggml-metal.m | 15 +++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 7 +++++++
2 files changed, 22 insertions(+)
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index e4c093f9..d8422f1b 100644
index b121ab9e..fea50521 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -423,6 +423,7 @@ enum ggml_metal_kernel_type {
@@ -461,6 +461,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_SQRT,
GGML_METAL_KERNEL_TYPE_SIN,
GGML_METAL_KERNEL_TYPE_COS,
......@@ -20,23 +21,23 @@ index e4c093f9..d8422f1b 100644
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
@@ -1039,6 +1040,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
@@ -1202,6 +1204,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1119,6 +1120,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
@@ -1280,6 +1282,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_ELU:
+ case GGML_UNARY_OP_NEG:
return ggml_is_contiguous(op->src[0]);
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
default:
return false;
@@ -1873,6 +1876,18 @@ static void ggml_metal_encode_node(
@@ -1966,6 +1969,18 @@ static void ggml_metal_encode_node(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
......@@ -56,10 +57,10 @@ index e4c093f9..d8422f1b 100644
{
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index f38909d0..bb0ff668 100644
index e3185e5b..ede9d1e6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -945,6 +945,13 @@ kernel void kernel_cos(
@@ -949,6 +949,13 @@ kernel void kernel_cos(
dst[tpig] = cos(src0[tpig]);
}
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Thu, 27 Feb 2025 15:12:26 -0800
Subject: [PATCH] add phi4 support
---
include/llama.h | 1 +
src/llama-model.cpp | 10 +++++++---
src/llama-vocab.cpp | 11 +++++++++++
3 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/include/llama.h b/include/llama.h
index cc948005..16774711 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -105,6 +105,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+ LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
};
enum llama_rope_type {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 21819080..ab1a07d1 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2283,7 +2283,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
@@ -2298,8 +2302,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
}
} break;
case LLM_ARCH_PHIMOE:
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 1ca827eb..c7ff28be 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -392,6 +392,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
};
break;
+ case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+ // original regex from tokenizer.json
+ // [^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+
+ regex_exprs = {
+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ };
+ break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@@ -1583,6 +1590,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} else if (
tokenizer_pre == "megrez") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "gpt-4o") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+ clean_spaces = false;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:49:50 -0700
Subject: [PATCH] fix compiler error in clip.h
fixes an error that occurs in clip.h when compiling
using CGo
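A hedged illustration of the issue (hypothetical declarations, not the actual
clip.h contents): C, unlike C++, does not let a struct type be named without the
struct keyword unless a typedef exists, so a header consumed from C via cgo must
spell out the tag.

struct clip_image_u8;                                   // forward declaration
void use_image(struct clip_image_u8 * img);             // valid in both C and C++
// void use_image(clip_image_u8 * img);                 // C++ only; a C compiler rejects it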
---
examples/llava/clip.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index cc133a58..5fc45d3e 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -30,12 +30,13 @@ struct clip_image_size {
int height;
};
+struct clip_image_f32;
struct clip_image_u8_batch;
struct clip_image_f32_batch;
struct clip_context_params {
bool use_gpu;
- ggml_log_level verbosity;
+ enum ggml_log_level verbosity;
};
// deprecated, use clip_init
@@ -84,7 +85,7 @@ CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
-CLIP_API clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
/**
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 12 Apr 2025 13:06:57 -0700
Subject: [PATCH] Revert "Simplify and improve CUDA graphs through use of
indirect copy pointers (#9017)"
This commit in llama.cpp causes errors when running Llama 3.2 Vision,
so temporarily revert it.
This reverts commit 3f9da22c2b21a2cef216de50006436ef1cab8764.
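A hedged illustration of the indirection scheme being reverted (names are
hypothetical): instead of baking a destination pointer into each captured copy,
the kernel reads it from an array at a per-node index, so a reused CUDA graph
can be retargeted by rewriting the array rather than re-capturing the graph.

#include <cstddef>
#include <cstring>

// host-side analogue of the indirect-destination lookup done inside the kernels
static void copy_with_indirection(const char * src, std::size_t n,
                                  char * const * dest_ptrs, int node_index) {
    std::memcpy(dest_ptrs[node_index], src, n); // destination resolved at run time
}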
---
ggml/src/ggml-cuda/common.cuh | 8 +-
ggml/src/ggml-cuda/cpy.cu | 149 ++++++++++++--------------------
ggml/src/ggml-cuda/cpy.cuh | 2 -
ggml/src/ggml-cuda/ggml-cuda.cu | 93 +++++++++++++++-----
4 files changed, 124 insertions(+), 128 deletions(-)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 8284a001..a718b6a1 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -729,13 +729,7 @@ struct ggml_cuda_graph {
bool disable_due_to_failed_graph_capture = false;
int number_consecutive_updates = 0;
std::vector<ggml_graph_node_properties> ggml_graph_properties;
- bool use_cpy_indirection = false;
- std::vector<char *> cpy_dest_ptrs;
- char ** dest_ptrs_d;
- int dest_ptrs_size = 0;
- // Index to allow each cpy kernel to be aware of it's position within the graph
- // relative to other cpy nodes.
- int graph_cpynode_index = -1;
+ std::vector<char **> updated_kernel_arg;
#endif
};
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 4f4faa3e..8396df28 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -39,18 +39,16 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
}
template <cpy_kernel_t cpy_1>
-static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) {
return;
}
- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
// then combine those indices with the corresponding byte offsets to get the total offsets
const int64_t i03 = i/(ne00 * ne01 * ne02);
@@ -297,18 +295,16 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
}
template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
if (i >= ne) {
return;
}
- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
const int i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
@@ -325,18 +321,16 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int
}
template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
if (i >= ne) {
return;
}
- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
const int i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
@@ -352,97 +346,76 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
cpy_blck(cx + x_offset, cdst + dst_offset);
}
-// Copy destination pointers to GPU to be available when pointer indirection is in use
-
-void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
- if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
- CUDA_CHECK(cudaStreamSynchronize(stream));
- if (cuda_graph->dest_ptrs_d != nullptr) {
- CUDA_CHECK(cudaFree(cuda_graph->dest_ptrs_d));
- }
- CUDA_CHECK(cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *)));
- cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
- }
- // copy destination pointers to GPU
- CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream));
- cuda_graph->graph_cpynode_index = 0; // reset index
-#else
- GGML_UNUSED(cuda_graph); GGML_UNUSED(host_dest_ptrs);
- GGML_UNUSED(host_dest_ptrs_size); GGML_UNUSED(stream);
-#endif
-}
-
static void ggml_cpy_f16_f32_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_f32_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_bf16_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_f16_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q8_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK8_0 == 0);
const int num_blocks = ne / QK8_0;
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q8_0_f32_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q4_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_0 == 0);
const int num_blocks = ne / QK4_0;
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q4_0_f32_cuda(
@@ -451,22 +424,22 @@ static void ggml_cpy_q4_0_f32_cuda(
const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12,
const int nb10, const int nb11, const int nb12, const int nb13,
- cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ cudaStream_t stream) {
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q4_1_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_1 == 0);
const int num_blocks = ne / QK4_1;
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q4_1_f32_cuda(
@@ -475,22 +448,22 @@ static void ggml_cpy_q4_1_f32_cuda(
const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12,
const int nb10, const int nb11, const int nb12, const int nb13,
- cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ cudaStream_t stream) {
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q5_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK5_0 == 0);
const int num_blocks = ne / QK5_0;
cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q5_0_f32_cuda(
@@ -499,22 +472,22 @@ static void ggml_cpy_q5_0_f32_cuda(
const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12,
const int nb10, const int nb11, const int nb12, const int nb13,
- cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ cudaStream_t stream) {
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q5_1_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK5_1 == 0);
const int num_blocks = ne / QK5_1;
cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q5_1_f32_cuda(
@@ -523,32 +496,32 @@ static void ggml_cpy_q5_1_f32_cuda(
const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12,
const int nb10, const int nb11, const int nb12, const int nb13,
- cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ cudaStream_t stream) {
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_iq4_nl_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_NL == 0);
const int num_blocks = ne / QK4_NL;
cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f16_f16_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
@@ -585,62 +558,48 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
char * src0_ddc = (char *) src0->data;
char * src1_ddc = (char *) src1->data;
- char ** dest_ptrs_d = nullptr;
- int graph_cpynode_index = -1;
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
- if(ctx.cuda_graph->use_cpy_indirection) {
- dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
- graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
- }
-#endif
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
- ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
- ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
- ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
- ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
- ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
- ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
- nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
- ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
- nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
- ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
- nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
- ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
- ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
- ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
- ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
- ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
}
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
- if(ctx.cuda_graph->use_cpy_indirection) {
- ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
- }
-#endif
-
}
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh
index 6bed0564..28b06cdd 100644
--- a/ggml/src/ggml-cuda/cpy.cuh
+++ b/ggml/src/ggml-cuda/cpy.cuh
@@ -7,5 +7,3 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
-
-void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 67208cba..a44788db 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2477,11 +2477,10 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
#ifdef USE_CUDA_GRAPH
static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
- bool use_cuda_graph) {
+ std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
- cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
-
+ cuda_ctx->cuda_graph->updated_kernel_arg.clear();
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@@ -2513,11 +2512,8 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
}
if (node->op == GGML_OP_CPY) {
-
- // Store the pointers which are updated for each token, such that these can be sent
- // to the device and accessed using indirection from CUDA graph
- cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
-
+ // store the copy op parameter which changes with each token.
+ cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
// store a pointer to each copy op CUDA kernel to identify it later
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
if (!ptr) {
@@ -2525,6 +2521,10 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
#endif
+ } else {
+ if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+ ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+ }
}
}
@@ -2533,12 +2533,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
}
}
- if (use_cuda_graph) {
- cuda_ctx->cuda_graph->use_cpy_indirection = true;
- // copy pointers to GPU so they can be accessed via indirection within CUDA graph
- ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
- }
-
return use_cuda_graph;
}
@@ -2593,6 +2587,51 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
return true;
}
+static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
+
+ if (cuda_graph_update_required) {
+ // Extract nodes from graph
+ // First call with null argument gets number of nodes in graph
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+ // Subsequent call with non-null argument gets nodes
+ cuda_ctx->cuda_graph->nodes.clear();
+ cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+ cuda_ctx->cuda_graph->params.clear();
+ cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+ if (cuda_ctx->cuda_graph->num_nodes > 0) {
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+ // Loop over nodes, and extract kernel parameters from each node
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ cudaGraphNodeType node_type;
+ CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+ if (node_type == cudaGraphNodeTypeKernel) {
+ cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+ if (stat == cudaErrorInvalidDeviceFunction) {
+ // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+ // We don't need to update blas nodes, so clear error and move on.
+ (void)cudaGetLastError();
+ } else {
+ GGML_ASSERT(stat == cudaSuccess);
+ }
+ }
+ }
+ }
+ } else {
+ // One of the arguments to the copy kernel is updated for each token, hence we need to
+ // replace that argument with the updated value in the CUDA graph
+ // on update steps, the live parameters will already be captured
+ int k = 0;
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
+ char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+ *(void**)cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void**)updated_kernel_arg_ptr;
+ CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+ }
+ }
+ }
+}
+
static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
bool cuda_graph_update_required = false;
@@ -2652,7 +2691,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
#endif
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
- bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+ [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph,
+ bool & cuda_graph_update_required) {
while (!graph_evaluated_or_captured) {
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2702,9 +2742,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
}
- if (cuda_graph_update_required) { // Update graph executable
- update_cuda_graph_executable(cuda_ctx);
- }
+
+ // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+ maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
+
+ // Update graph executable
+ update_cuda_graph_executable(cuda_ctx);
+
// Launch graph
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
#else
@@ -2718,6 +2762,10 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
ggml_cuda_set_device(cuda_ctx->device);
+ // vector of pointers to CUDA cpy kernels, which are required to identify
+ // kernel parameters which need updated in the graph for each token
+ std::vector<void *> ggml_cuda_cpy_fn_ptrs;
+
#ifdef USE_CUDA_GRAPH
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
@@ -2751,7 +2799,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
- use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
+ use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
+ ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (use_cuda_graph && cuda_graph_update_required) {
@@ -2772,10 +2821,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
}
- if (!use_cuda_graph) {
- cuda_ctx->cuda_graph->use_cpy_indirection = false;
- }
-
#else
bool use_cuda_graph = false;
bool cuda_graph_update_required = false;
@@ -2783,7 +2828,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
bool graph_evaluated_or_captured = false;
- evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+ evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
return GGML_STATUS_SUCCESS;
}
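
The net effect of this patch is to drop the destination-pointer indirection (dest_ptrs_d / graph_cpynode_index) from the copy kernels and instead patch the captured CUDA graph directly: the copy kernels' destination argument is rewritten in each kernel node before the graph is relaunched. Below is a minimal sketch of that node-parameter update pattern using only the CUDA runtime graph API; the helper name patch_copy_dst and the assumption that kernelParams[1] is the destination pointer (cdst is the second argument of the cpy kernels above) are illustrative, and error checking is trimmed.

#include <cuda_runtime.h>
#include <vector>

// Sketch: rewrite the destination-pointer argument of every copy-kernel node in a
// captured CUDA graph, mirroring what maintain_cuda_graph() does on non-update steps.
static void patch_copy_dst(cudaGraph_t graph, void * copy_kernel_ptr, void * new_dst) {
    size_t n = 0;
    cudaGraphGetNodes(graph, nullptr, &n);        // first call returns the node count
    std::vector<cudaGraphNode_t> nodes(n);
    cudaGraphGetNodes(graph, nodes.data(), &n);   // second call fills the node handles

    for (size_t i = 0; i < n; ++i) {
        cudaGraphNodeType type;
        cudaGraphNodeGetType(nodes[i], &type);
        if (type != cudaGraphNodeTypeKernel) {
            continue;
        }
        cudaKernelNodeParams params;
        if (cudaGraphKernelNodeGetParams(nodes[i], &params) != cudaSuccess) {
            (void) cudaGetLastError();            // e.g. cuBLAS nodes: clear the error and skip
            continue;
        }
        if (params.func == copy_kernel_ptr) {     // identified via ggml_cuda_cpy_fn()
            *(void **) params.kernelParams[1] = new_dst;   // assumed: argument 1 is cdst
            cudaGraphKernelNodeSetParams(nodes[i], &params);
        }
    }
}

In the patch itself the replacement pointers come from updated_kernel_arg, which stores &node->src[1]->data so the current destination is read at update time rather than at capture time.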
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 12 Apr 2025 21:13:44 -0400
Subject: [PATCH] remove ggml git build info
---
ggml/CMakeLists.txt | 25 -------------------------
1 file changed, 25 deletions(-)
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index d33f843b..a6c59f22 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -287,31 +287,6 @@ if (GGML_STANDALONE)
DESTINATION share/pkgconfig)
endif()
-#
-# Create CMake package
-#
-
-# Generate version info based on git commit.
-
-if(NOT DEFINED GGML_BUILD_NUMBER)
- find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
- execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- OUTPUT_VARIABLE GGML_BUILD_NUMBER
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
-
- if(GGML_BUILD_NUMBER EQUAL 1)
- message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
- endif()
-
- execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- OUTPUT_VARIABLE GGML_BUILD_COMMIT
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
-endif()
-
# Capture variables prefixed with GGML_.
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Saman <saman.khatir@amd.com>
Date: Wed, 19 Mar 2025 14:02:26 -0700
Subject: [PATCH] add rdna4 support
---
ggml/src/ggml-cuda/common.cuh | 6 ++++--
ggml/src/ggml-cuda/mmq.cu | 2 +-
ggml/src/ggml-cuda/mmq.cuh | 4 ++--
ggml/src/ggml-cuda/mmvq.cu | 4 ++--
ggml/src/ggml-cuda/vendors/hip.h | 4 ++++
5 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index adf0d3ec..b24593fc 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -61,11 +61,13 @@
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
@@ -386,7 +388,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
+#elif defined(RDNA3) || defined(RDNA4)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
int tmp1;
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 10f2ebb1..933d945c 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -149,5 +149,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
- return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+ return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 0451c65f..66ce2bc9 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2577,9 +2577,9 @@ static __device__ void mul_mat_q_process_tile(
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
__launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#else
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
__launch_bounds__(WARP_SIZE*nwarps, 1)
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 4fb466ca..23ae7abc 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4))
constexpr int nwarps = 1;
constexpr int rows_per_cuda_block = 1;
#else
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) && !defined(RDNA4)
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
const int row0 = rows_per_cuda_block*blockIdx.x;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 81964611..a62544b5 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -150,6 +150,10 @@
#define CDNA
#endif
+#if defined(__gfx1200__) || defined(__gfx1201__)
+#define RDNA4
+#endif
+
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
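
Besides defining RDNA4, this patch narrows GGML_CUDA_CC_IS_RDNA3 from an open-ended check to a half-open range, so RDNA4 parts no longer match RDNA3-only branches (e.g. the mmq batch-size heuristic). A self-contained sketch of how the checks now resolve follows; the offset value is illustrative, only the relative layout of the constants matters here.

#include <cstdio>

// Illustrative copies of the compute-capability macros from common.cuh above.
#define CC_OFFSET_AMD 0x1000000
#define CC_RDNA3 (CC_OFFSET_AMD + 0x1100)   // RX 7000
#define CC_RDNA4 (CC_OFFSET_AMD + 0x1200)   // RX 9000
#define CC_IS_RDNA3(cc) ((cc) >= CC_RDNA3 && (cc) < CC_RDNA4)
#define CC_IS_RDNA4(cc) ((cc) >= CC_RDNA4)

int main() {
    const int gfx1100 = CC_OFFSET_AMD + 0x1100;   // RDNA3 device
    const int gfx1200 = CC_OFFSET_AMD + 0x1200;   // RDNA4 device
    printf("gfx1100: RDNA3=%d RDNA4=%d\n", CC_IS_RDNA3(gfx1100), CC_IS_RDNA4(gfx1100));  // 1 0
    printf("gfx1200: RDNA3=%d RDNA4=%d\n", CC_IS_RDNA3(gfx1200), CC_IS_RDNA4(gfx1200));  // 0 1
    return 0;
}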
......@@ -73,7 +73,7 @@ struct llama_vocab * llama_load_vocab_from_file(const char * fname) {
try {
const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
std::vector<std::string> splits = {};
-        llama_model_loader ml(std::string(fname), splits, false, false, nullptr);
+        llama_model_loader ml(std::string(fname), splits, false, false, nullptr, nullptr);
vocab->load(ml, kv);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
......
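
The only change in this hunk is the call site: the vendored llama_model_loader constructor gained an extra trailing parameter in the updated code, so the vocab-only loader now passes one more nullptr. For context, a hedged usage sketch of the loader declared above; the declaration is reproduced from the excerpt, and the matching cleanup call is not shown in this commit, so it is omitted here.

#include <cstdio>

// Forward declarations taken from the excerpt above (defined in the vendored llama code).
struct llama_vocab;
struct llama_vocab * llama_load_vocab_from_file(const char * fname);

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }
    struct llama_vocab * vocab = llama_load_vocab_from_file(argv[1]);
    if (vocab == nullptr) {
        return 1;   // the loader already logs the underlying error
    }
    printf("loaded vocab from %s\n", argv[1]);
    return 0;
}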
protect *.go
protect *-embed.*
include cmake/
include include/
include src/
include src/CMakeLists.txt
......@@ -13,6 +14,7 @@ include src/ggml-cuda/vendors/
include src/ggml-cuda/template-instances/
include src/ggml-hip/
include src/ggml-metal/
include CMakeLists.txt
include *.c
include *.h
include *.cpp
......@@ -20,4 +22,6 @@ include *.cu
include *.cuh
include *.m
include *.metal
include common.cmake
include ggml-config.cmake.in
exclude *
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("ggml" C CXX)
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(GGML_STANDALONE ON)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
# configure project version
# TODO
else()
set(GGML_STANDALONE OFF)
endif()
if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
else()
if (MINGW)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
else()
set(BUILD_SHARED_LIBS_DEFAULT ON)
endif()
endif()
# remove the lib prefix on win32 mingw
if (WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX "")
set(CMAKE_SHARED_LIBRARY_PREFIX "")
set(CMAKE_SHARED_MODULE_PREFIX "")
endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
#
# option list
#
# TODO: mark all options as advanced when not GGML_STANDALONE
if (APPLE)
set(GGML_METAL_DEFAULT ON)
set(GGML_BLAS_DEFAULT ON)
set(GGML_BLAS_VENDOR_DEFAULT "Apple")
else()
set(GGML_METAL_DEFAULT OFF)
set(GGML_BLAS_DEFAULT OFF)
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
endif()
if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH})
message(STATUS "Setting GGML_NATIVE_DEFAULT to OFF")
set(GGML_NATIVE_DEFAULT OFF)
else()
set(GGML_NATIVE_DEFAULT ON)
endif()
# defaults
if (NOT GGML_LLAMAFILE_DEFAULT)
set(GGML_LLAMAFILE_DEFAULT OFF)
endif()
if (NOT GGML_CUDA_GRAPHS_DEFAULT)
set(GGML_CUDA_GRAPHS_DEFAULT OFF)
endif()
# general
option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
option(GGML_LTO "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON)
# debug
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
option(GGML_GPROF "ggml: enable gprof" OFF)
# build
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)
# sanitizers
option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF)
option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
# instruction set specific
if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
set(INS_ENB OFF)
else()
set(INS_ENB ON)
endif()
message(DEBUG "GGML_NATIVE : ${GGML_NATIVE}")
message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
message(DEBUG "INS_ENB : ${INS_ENB}")
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
option(GGML_BMI2 "ggml: enable BMI2" ${INS_ENB})
option(GGML_AVX512 "ggml: enable AVX512F" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
if (NOT MSVC)
# in MSVC F16C and FMA is implied with AVX2/AVX512
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
option(GGML_F16C "ggml: enable F16C" ${INS_ENB})
# MSVC does not seem to support AMX
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
if (WIN32)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
endif()
# ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
option(GGML_CPU "ggml: enable CPU backend" ON)
# 3rd party libs / backends
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
"ggml: cuda link binary compression mode; requires cuda 12.8+")
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"ggml: metal minimum macOS version")
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
"gmml: OpenCL API version to target")
# toolchain for vulkan-shaders-gen
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
#
# dependencies
#
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
include(GNUInstallDirs)
#
# build the library
#
add_subdirectory(src)
#
# tests and examples
#
if (GGML_BUILD_TESTS)
enable_testing()
add_subdirectory(tests)
endif ()
if (GGML_BUILD_EXAMPLES)
add_subdirectory(examples)
endif ()
#
# install
#
include(CMakePackageConfigHelpers)
# all public headers
set(GGML_PUBLIC_HEADERS
include/ggml.h
include/ggml-cpu.h
include/ggml-alloc.h
include/ggml-backend.h
include/ggml-blas.h
include/ggml-cann.h
include/ggml-cpp.h
include/ggml-cuda.h
include/ggml-kompute.h
include/ggml-opt.h
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-sycl.h
include/ggml-vulkan.h
include/gguf.h)
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL)
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
#endif()
install(TARGETS ggml LIBRARY PUBLIC_HEADER)
install(TARGETS ggml-base LIBRARY)
if (GGML_STANDALONE)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
@ONLY)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
DESTINATION share/pkgconfig)
endif()
# Capture variables prefixed with GGML_.
set(variable_set_statements
"
####### Expanded from @GGML_VARIABLES_EXPANDED@ by configure_package_config_file() #######
####### Any changes to this file will be overwritten by the next CMake run #######
")
set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS})
get_cmake_property(all_variables VARIABLES)
foreach(variable_name IN LISTS all_variables)
if(variable_name MATCHES "^GGML_")
string(REPLACE ";" "\\;"
variable_value "${${variable_name}}")
set(variable_set_statements
"${variable_set_statements}set(${variable_name} \"${variable_value}\")\n")
endif()
endforeach()
set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
# Create the CMake package and set install location.
set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/ggml-config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml
PATH_VARS GGML_INCLUDE_INSTALL_DIR
GGML_LIB_INSTALL_DIR
GGML_BIN_INSTALL_DIR)
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
VERSION ${GGML_INSTALL_VERSION}
COMPATIBILITY SameMajorVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
function(ggml_get_flags CCID CCVER)
set(C_FLAGS "")
set(CXX_FLAGS "")
if (CCID MATCHES "Clang")
set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
if (
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
)
list(APPEND C_FLAGS -Wdouble-promotion)
endif()
elseif (CCID STREQUAL "GNU")
set(C_FLAGS -Wdouble-promotion)
set(CXX_FLAGS -Wno-array-bounds)
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
list(APPEND CXX_FLAGS -Wextra-semi)
endif()
endif()
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
endfunction()
@GGML_VARIABLES_EXPANDED@
@PACKAGE_INIT@
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
find_package(Threads REQUIRED)
find_library(GGML_LIBRARY ggml
REQUIRED
HINTS ${GGML_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH)
add_library(ggml::ggml UNKNOWN IMPORTED)
set_target_properties(ggml::ggml
PROPERTIES
IMPORTED_LOCATION "${GGML_LIBRARY}")
find_library(GGML_BASE_LIBRARY ggml-base
REQUIRED
HINTS ${GGML_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH)
add_library(ggml::ggml-base UNKNOWN IMPORTED)
set_target_properties(ggml::ggml-base
PROPERTIES
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
if (NOT GGML_SHARED_LIB)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
endif()
if (GGML_OPENMP)
find_package(OpenMP REQUIRED)
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
endif()
if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
endif()
if (GGML_BLAS)
find_package(BLAS REQUIRED)
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
endif()
if (GGML_CUDA)
find_package(CUDAToolkit REQUIRED)
endif()
if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
endif()
if (GGML_VULKAN)
find_package(Vulkan REQUIRED)
list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
endif()
if (GGML_HIP)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
endif()
if (GGML_SYCL)
find_package(DNNL)
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
endif()
if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
endif()
endif()
endif()
set(_ggml_all_targets "")
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
REQUIRED
HINTS ${GGML_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH)
message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
set_target_properties(ggml::${_ggml_backend}
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
INTERFACE_COMPILE_FEATURES c_std_90
POSITION_INDEPENDENT_CODE ON)
string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
if(is_cpu_variant)
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
set_target_properties(ggml::${_ggml_backend}
PROPERTIES
INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
if(GGML_CPU_INTERFACE_LINK_OPTIONS)
set_target_properties(ggml::${_ggml_backend}
PROPERTIES
INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
endif()
else()
list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
set_target_properties(ggml::${_ggml_backend}
PROPERTIES
INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
set_target_properties(ggml::${_ggml_backend}
PROPERTIES
INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
endif()
endif()
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
endforeach()
list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
set_target_properties(ggml::ggml
PROPERTIES
INTERFACE_LINK_LIBRARIES "${GGML_INTERFACE_LINK_LIBRARIES}")
add_library(ggml::all INTERFACE IMPORTED)
set_target_properties(ggml::all
PROPERTIES
INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
check_required_components(ggml)
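
For reference, a hedged sketch of a downstream consumer of the package configured above: after find_package(ggml REQUIRED) and linking against ggml::ggml (or ggml::all), a program only needs the public headers. The context size below is arbitrary.

#include "ggml.h"

int main(void) {
    // Allocate a small ggml context and create one tensor to confirm the link works.
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);
    if (ctx == NULL) {
        return 1;
    }
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    (void) t;
    ggml_free(ctx);
    return 0;
}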