Unverified commit c68f367e authored by Daniel Hiltgen, committed by GitHub

Update GGML to b6646 (#12245)

Notable EOLs with this change:
- macOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
parent fdb10946
@@ -789,6 +789,7 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
 }
 
 struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
+    LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
     const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
     if (cur == NULL) {
...
@@ -7,6 +7,7 @@
 #include "llama-memory.h"
 #include "llama-vocab.h"
 
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -28,6 +29,7 @@ enum llm_type {
     LLM_TYPE_80M,
     LLM_TYPE_109M,
     LLM_TYPE_137M,
+    LLM_TYPE_140M,
     LLM_TYPE_160M,
     LLM_TYPE_190M,
     LLM_TYPE_220M,
@@ -36,12 +38,15 @@ enum llm_type {
     LLM_TYPE_270M,
     LLM_TYPE_335M,
     LLM_TYPE_350M,
+    LLM_TYPE_360M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
     LLM_TYPE_475M,
+    LLM_TYPE_558M,
     LLM_TYPE_700M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
+    LLM_TYPE_950M,
     LLM_TYPE_0_3B,
     LLM_TYPE_0_5B,
     LLM_TYPE_0_6B,
@@ -54,6 +59,7 @@ enum llm_type {
     LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
+    LLM_TYPE_2_6B,
     LLM_TYPE_2_8B,
     LLM_TYPE_2_9B,
     LLM_TYPE_3B,
@@ -76,9 +82,11 @@ enum llm_type {
     LLM_TYPE_32B,
     LLM_TYPE_34B,
     LLM_TYPE_35B,
+    LLM_TYPE_36B,
     LLM_TYPE_40B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
+    LLM_TYPE_120B,
     LLM_TYPE_142B,
     LLM_TYPE_236B,
     LLM_TYPE_290B,
@@ -268,6 +276,11 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_shexp = nullptr;
     struct ggml_tensor * ffn_up_shexp = nullptr;
 
+    // ff adjugate experts (chexps)
+    struct ggml_tensor * ffn_gate_chexps = nullptr;
+    struct ggml_tensor * ffn_down_chexps = nullptr;
+    struct ggml_tensor * ffn_up_chexps = nullptr;
+
     // ff bias
     struct ggml_tensor * ffn_gate_b = nullptr;
     struct ggml_tensor * ffn_down_b = nullptr; // b2
@@ -449,10 +462,12 @@ struct llama_model {
     std::string desc() const;
 
-    size_t size() const;
+    size_t size() const; // file size
     size_t n_tensors() const;
     size_t n_devices() const;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
 
     // total number of parameters in the model
     uint64_t n_elements() const;
...
@@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // attention layers have a non-zero number of kv heads
         int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
         if (llama_model_has_encoder(&model)) {
-            n_attn_layer *= 3;
+            // now n_attn_layer is the number of attention layers in the encoder
+            // for each decoder block, there are 2 attention layers
+            n_attn_layer += 2 * model.hparams.dec_n_layer;
         }
         GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
     }
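For context, a minimal sketch of the arithmetic behind the updated check, using hypothetical layer counts (12 encoder layers, 12 decoder blocks): each encoder layer contributes one attention layer and each decoder block contributes two (self-attention and cross-attention), so the old `*= 3` shortcut only matches when encoder and decoder depths are equal.

    #include <cstdint>
    #include <cstdio>

    // Sketch only - the layer counts below are hypothetical, not from a real model.
    int main() {
        const int32_t n_enc_layer = 12; // hypothetical encoder layer count
        const int32_t n_dec_layer = 12; // hypothetical decoder block count

        int32_t n_attn_layer = n_enc_layer;  // one attention layer per encoder layer
        n_attn_layer += 2 * n_dec_layer;     // self-attention + cross-attention per decoder block

        // The previous "n_attn_layer *= 3" only coincides with this when
        // n_enc_layer == n_dec_layer (12 + 2*12 == 3*12 == 36 here).
        std::printf("expected attention layers: %d\n", n_attn_layer);
        return 0;
    }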
@@ -920,7 +922,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         new_type = tensor->type;
         new_data = tensor->data;
         new_size = ggml_nbytes(tensor);
-        LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
     } else {
         const int64_t nelements = ggml_nelements(tensor);
@@ -1037,8 +1039,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     close_ofstream();
 
-    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
 
     if (qs.n_fallback > 0) {
         LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
...
@@ -434,6 +434,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
+            case LLAMA_VOCAB_PRE_TYPE_GROK_2:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@@ -1763,7 +1770,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
            const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
            precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
-#ifdef IS_BIG_ENDIAN
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
            // correct endiannes of data in precompiled_charsmap binary blob
            uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
            *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
@@ -1944,7 +1951,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
                clean_spaces = false;
            } else if (
-                tokenizer_pre == "bailingmoe") {
+                tokenizer_pre == "bailingmoe" ||
+                tokenizer_pre == "llada-moe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                clean_spaces = false;
            } else if (
@@ -1963,6 +1971,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "kimi-k2") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "grok-2") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
+                clean_spaces = false;
            } else {
                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2331,7 +2343,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        // @ngxson : quick hack for gpt-oss, always render these tokens
        for (const auto & t : token_to_id) {
-            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
            }
        }
@@ -2378,6 +2390,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            if (has_return && has_call && has_end) {
                special_eog_ids.erase(end_id);
+                id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
            }
        }
@@ -2459,7 +2472,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        // set attributes by model/tokenizer/architecture name
        if (false
            || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
-            || _contains_any(general_arch, {"nomic-bert-moe"})
+            || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
            ) {
            if (token_to_id.count("<mask>") == 0) {
                LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
...
@@ -47,6 +47,7 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
    LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
+    LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
 };
 
 struct LLM_KV;
...
@@ -25,6 +25,18 @@
 // interface implementation
 //
 
+const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
+    switch (flash_attn_type) {
+        case LLAMA_FLASH_ATTN_TYPE_AUTO:
+            return "auto";
+        case LLAMA_FLASH_ATTN_TYPE_DISABLED:
+            return "disabled";
+        case LLAMA_FLASH_ATTN_TYPE_ENABLED:
+            return "enabled";
+    }
+    GGML_ABORT("fatal error");
+}
+
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
         /*.no_perf =*/ true,
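A small usage sketch for the helper added above, assuming it is declared in llama.h as its llama_ prefix suggests:

    #include <cstdio>
    #include "llama.h"

    int main() {
        // prints "auto" - handy when logging how the flash-attention setting was resolved
        std::printf("flash_attn_type: %s\n", llama_flash_attn_type_name(LLAMA_FLASH_ATTN_TYPE_AUTO));
        return 0;
    }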
@@ -47,6 +59,7 @@ bool llama_supports_mlock(void) {
 bool llama_supports_gpu_offload(void) {
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
            llama_supports_rpc();
 }
@@ -71,7 +84,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
         GGML_ASSERT(dev && "CPU backend is not loaded");
         auto * reg = ggml_backend_dev_backend_reg(dev);
         auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
-        numa_init_fn(numa);
+        if (numa_init_fn) {
+            numa_init_fn(numa);
+        }
     }
 }
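The added null check follows the usual pattern for optional backend entry points that are resolved by name. An illustrative standalone sketch of that pattern (the lookup function below is hypothetical, not the ggml API):

    #include <cstdio>

    using init_fn_t = void (*)(int);

    // Hypothetical registry lookup standing in for a by-name proc-address query.
    init_fn_t lookup_proc_address(const char * name) {
        (void) name;
        return nullptr; // pretend this backend does not export the symbol
    }

    int main() {
        init_fn_t numa_init_fn = lookup_proc_address("ggml_backend_cpu_numa_init");
        if (numa_init_fn) {
            numa_init_fn(0);            // only call the entry point if it exists
        } else {
            std::printf("optional entry point not available, skipping\n");
        }
        return 0;
    }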
@@ -170,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        // default device selection
+        // build list of available devices
+        std::vector<ggml_backend_dev_t> gpus;
+        std::vector<ggml_backend_dev_t> igpus;
         std::vector<ggml_backend_dev_t> rpc_servers;
-        // use all available devices
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
             switch (ggml_backend_dev_type(dev)) {
@@ -180,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
                     // skip CPU backends since they are handled separately
                     break;
-                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                     ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                     if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                         rpc_servers.push_back(dev);
                     } else {
-                        model->devices.push_back(dev);
+                        // check if there is already a GPU with the same device id
+                        ggml_backend_dev_props props;
+                        ggml_backend_dev_get_props(dev, &props);
+                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+                            ggml_backend_dev_props d_props;
+                            ggml_backend_dev_get_props(d, &d_props);
+                            if (props.device_id && d_props.device_id) {
+                                return strcmp(props.device_id, d_props.device_id) == 0;
+                            }
+                            return false;
+                        });
+                        if (it != gpus.end()) {
+                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+                                    __func__,
+                                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                                    props.device_id ? props.device_id : "unknown id",
+                                    ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+                        } else {
+                            gpus.push_back(dev);
+                        }
                     }
                     break;
+                }
+                case GGML_BACKEND_DEVICE_TYPE_IGPU:
+                    igpus.push_back(dev);
+                    break;
             }
         }
-        // add RPC servers at the front of the list
-        if (!rpc_servers.empty()) {
-            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
-        }
+        // add RPC servers at the front of the list to minimize network transfers
+        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add GPUs
+        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+        // add integrated GPUs only if no other devices were found
+        if (model->devices.empty()) {
+            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
+        }
     }
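Taken together, the default selection policy is: RPC devices first (to minimize network transfers), then discrete GPUs de-duplicated by device id, and integrated GPUs only when nothing else is available. A standalone sketch of that ordering, using strings as stand-ins for ggml_backend_dev_t handles and hypothetical device names (not the actual llama.cpp implementation):

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        // hypothetical devices discovered during enumeration
        std::vector<std::string> rpc_servers = {"RPC[host:50052]"};
        std::vector<std::string> gpus        = {"CUDA0", "CUDA1"};
        std::vector<std::string> igpus       = {"Vulkan0 (integrated)"};

        std::vector<std::string> devices;
        // RPC servers go first to minimize network transfers
        devices.insert(devices.begin(), rpc_servers.begin(), rpc_servers.end());
        // then discrete GPUs
        devices.insert(devices.end(), gpus.begin(), gpus.end());
        // integrated GPUs are used only when no other devices were found
        if (devices.empty()) {
            devices.insert(devices.end(), igpus.begin(), igpus.end());
        }

        for (const auto & d : devices) {
            std::printf("using device: %s\n", d.c_str());
        }
        return 0;
    }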
@@ -213,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
     }
 
     for (auto * dev : model->devices) {
-        size_t free, total; // NOLINT
-        ggml_backend_dev_memory(dev, &free, &total);
-        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                props.device_id ? props.device_id : "unknown id",
+                props.memory_free/1024/1024);
     }
 
     const int status = llama_model_load(path_model, splits, *model, params);
...
@@ -4,6 +4,7 @@
 #include <string>
 #include <vector>
 
+// TODO: reimplement this structure in endian-independent way
 struct unicode_cpt_flags {
     enum {
         UNDEFINED = 0x0001,
@@ -15,6 +16,10 @@ struct unicode_cpt_flags {
         SYMBOL = 0x0040, // regex: \p{S}
         CONTROL = 0x0080, // regex: \p{C}
         MASK_CATEGORIES = 0x00FF,
+        WHITESPACE = 0x0100,
+        LOWERCASE = 0x0200,
+        UPPERCASE = 0x0400,
+        NFD = 0x0800,
     };
 
     // codepoint type
@@ -34,11 +39,49 @@ struct unicode_cpt_flags {
     // decode from uint16
     inline unicode_cpt_flags(const uint16_t flags = 0) {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         *reinterpret_cast<uint16_t*>(this) = flags;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        is_undefined   = (flags & UNDEFINED)   ? 1 : 0;
+        is_number      = (flags & NUMBER)      ? 1 : 0;
+        is_letter      = (flags & LETTER)      ? 1 : 0;
+        is_separator   = (flags & SEPARATOR)   ? 1 : 0;
+        is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
+        is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
+        is_symbol      = (flags & SYMBOL)      ? 1 : 0;
+        is_control     = (flags & CONTROL)     ? 1 : 0;
+        is_whitespace  = (flags & WHITESPACE)  ? 1 : 0;
+        is_lowercase   = (flags & LOWERCASE)   ? 1 : 0;
+        is_uppercase   = (flags & UPPERCASE)   ? 1 : 0;
+        is_nfd         = (flags & NFD)         ? 1 : 0;
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
     }
 
     inline uint16_t as_uint() const {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         return *reinterpret_cast<const uint16_t*>(this);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        uint16_t result =
+              is_undefined   * UNDEFINED
+            + is_number      * NUMBER
+            + is_letter      * LETTER
+            + is_separator   * SEPARATOR
+            + is_accent_mark * ACCENT_MARK
+            + is_punctuation * PUNCTUATION
+            + is_symbol      * SYMBOL
+            + is_control     * CONTROL
+            + is_whitespace  * WHITESPACE
+            + is_lowercase   * LOWERCASE
+            + is_uppercase   * UPPERCASE
+            + is_nfd         * NFD
+            ;
+        return result;
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
     }
 
     inline uint16_t category_flag() const {
...
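A minimal illustration of the portability technique used above: reinterpreting the bitfield struct as its packed integer is only valid when the bit layout matches the serialized flags (the little-endian branch), otherwise each flag is packed and unpacked explicitly. The sketch below uses a made-up two-flag struct, not the llama.cpp one, and unpacks explicitly on every target:

    #include <cstdint>
    #include <cstdio>

    struct tiny_flags {
        enum {
            IS_NUMBER = 0x0001,
            IS_LETTER = 0x0002,
        };

        uint16_t is_number : 1;
        uint16_t is_letter : 1;

        explicit tiny_flags(uint16_t flags = 0) {
            // explicit unpacking works regardless of byte order
            is_number = (flags & IS_NUMBER) ? 1 : 0;
            is_letter = (flags & IS_LETTER) ? 1 : 0;
        }

        uint16_t as_uint() const {
            // explicit packing mirrors the big-endian branch above
            return static_cast<uint16_t>(is_number * IS_NUMBER + is_letter * IS_LETTER);
        }
    };

    int main() {
        tiny_flags f(tiny_flags::IS_LETTER);
        std::printf("round-trip: 0x%04x\n", (unsigned) f.as_uint()); // 0x0002 on any endianness
        return 0;
    }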
@@ -44,6 +44,7 @@
 #define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
 #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
+#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
 
 // audio-specific
 #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
@@ -81,6 +82,7 @@
 #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE "model.image_newline"
 #define TN_MM_INP_NORM "mm.input_norm.weight"
+#define TN_MM_INP_NORM_B "mm.input_norm.bias"
 #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
@@ -132,6 +134,8 @@ enum projector_type {
     PROJECTOR_TYPE_QWEN2A,
     PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
     PROJECTOR_TYPE_VOXTRAL,
+    PROJECTOR_TYPE_LFM2,
+    PROJECTOR_TYPE_KIMIVL,
     PROJECTOR_TYPE_UNKNOWN,
 };
@@ -152,6 +156,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_QWEN2A, "qwen2a"},
     { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
     { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
+    { PROJECTOR_TYPE_LFM2, "lfm2"},
+    { PROJECTOR_TYPE_KIMIVL, "kimivl"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
...
@@ -82,11 +82,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
  */
 void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
 
-bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
-
-/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
-bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
-
 /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
 bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
...