ggml update to b6840 (#12791)

544b6739 · Daniel Hiltgen · GitHub · c4ba257c · 544b6739 · 544b6739
Unverified Commit 544b6739 authored Nov 06, 2025 by Daniel Hiltgen Committed by GitHub Nov 06, 2025
20 changed files
--- a/Makefile.sync
+++ b/Makefile.sync
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=7049736b2dd9011bf819e298b844ebbc4b5afdc9
+FETCH_HEAD=3cfa9c3f125763305b4226bc032f1954f08990dc
 .PHONY: help
 help:

--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "7049736b2dd9011bf819e298b844ebbc4b5afdc9";
+char const *LLAMA_COMMIT = "3cfa9c3f125763305b4226bc032f1954f08990dc";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
--- a/llama/llama.cpp/common/json-schema-to-grammar.cpp
+++ b/llama/llama.cpp/common/json-schema-to-grammar.cpp
@@ -41,9 +41,9 @@ static std::string build_repetition(const std::string & item_rule, int min_items
    return result;
 }
-static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
-    auto has_min = min_value != std::numeric_limits<int>::min();
+    auto has_min = min_value != std::numeric_limits<int64_t>::min();
-    auto has_max = max_value != std::numeric_limits<int>::max();
+    auto has_max = max_value != std::numeric_limits<int64_t>::max();
    auto digit_range = [&](char from, char to) {
        out << "[";
@@ -159,7 +159,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
    if (has_min) {
        if (min_value < 0) {
            out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
            out << ") | [0] | [1-9] ";
            more_digits(0, decimals_left - 1);
        } else if (min_value == 0) {
@@ -194,7 +194,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
            }
            digit_range(c, c);
            out << " (";
-            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
            out << ")";
            if (c < '9') {
                out << " | ";
@@ -216,7 +216,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
        } else {
            out << "\"-\" (";
-            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
            out << ")";
        }
        return;
@@ -925,17 +925,17 @@ public:
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
        } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
-            int min_value = std::numeric_limits<int>::min();
+            int64_t min_value = std::numeric_limits<int64_t>::min();
-            int max_value = std::numeric_limits<int>::max();
+            int64_t max_value = std::numeric_limits<int64_t>::max();
            if (schema.contains("minimum")) {
-                min_value = schema["minimum"].get<int>();
+                min_value = schema["minimum"].get<int64_t>();
            } else if (schema.contains("exclusiveMinimum")) {
-                min_value = schema["exclusiveMinimum"].get<int>() + 1;
+                min_value = schema["exclusiveMinimum"].get<int64_t>() + 1;
            }
            if (schema.contains("maximum")) {
-                max_value = schema["maximum"].get<int>();
+                max_value = schema["maximum"].get<int64_t>();
            } else if (schema.contains("exclusiveMaximum")) {
-                max_value = schema["exclusiveMaximum"].get<int>() - 1;
+                max_value = schema["exclusiveMaximum"].get<int64_t>() - 1;
            }
            std::stringstream out;
            out << "(";

--- a/llama/llama.cpp/src/llama-arch.cpp
+++ b/llama/llama.cpp/src/llama-arch.cpp
@@ -5,6 +5,7 @@
 #include <map>
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,             "clip"             }, // dummy, only used by llama-quantize
    { LLM_ARCH_LLAMA,            "llama"            },
    { LLM_ARCH_LLAMA4,           "llama4"           },
    { LLM_ARCH_DECI,             "deci"             },
@@ -85,6 +86,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
    { LLM_ARCH_PLM,              "plm"              },
    { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
+    { LLM_ARCH_BAILINGMOE2,      "bailingmoe2"      },
    { LLM_ARCH_DOTS1,            "dots1"            },
    { LLM_ARCH_ARCEE,            "arcee"            },
    { LLM_ARCH_ERNIE4_5,         "ernie4_5"         },
@@ -135,6 +137,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_EXPERT_COUNT,                      "%s.expert_count"                      },
    { LLM_KV_EXPERT_USED_COUNT,                 "%s.expert_used_count"                 },
    { LLM_KV_EXPERT_SHARED_COUNT,               "%s.expert_shared_count"               },
+    { LLM_KV_EXPERT_GROUP_COUNT,                "%s.expert_group_count"                },
+    { LLM_KV_EXPERT_GROUP_USED_COUNT,           "%s.expert_group_used_count"           },
    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
    { LLM_KV_EXPERT_WEIGHTS_NORM,               "%s.expert_weights_norm"               },
    { LLM_KV_EXPERT_GATING_FUNC,                "%s.expert_gating_func"                },
@@ -277,6 +281,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
    {
        LLM_ARCH_LLAMA,
        {
@@ -1961,6 +1969,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
        },
    },
+    {
+        LLM_ARCH_BAILINGMOE2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM,        "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,        "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_QKV,           "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,    "blk.%d.exp_probs_b" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_NEXTN_EH_PROJ,      "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM,        "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM,        "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+            { LLM_TENSOR_LAYER_OUT_NORM,     "blk.%d.layer_output_norm" },
+        },
+    },
    {
        LLM_ARCH_DOTS1,
        {

--- a/llama/llama.cpp/src/llama-arch.h
+++ b/llama/llama.cpp/src/llama-arch.h
@@ -9,6 +9,7 @@
 //
 enum llm_arch {
+    LLM_ARCH_CLIP,
    LLM_ARCH_LLAMA,
    LLM_ARCH_LLAMA4,
    LLM_ARCH_DECI,
@@ -89,6 +90,7 @@ enum llm_arch {
    LLM_ARCH_WAVTOKENIZER_DEC,
    LLM_ARCH_PLM,
    LLM_ARCH_BAILINGMOE,
+    LLM_ARCH_BAILINGMOE2,
    LLM_ARCH_DOTS1,
    LLM_ARCH_ARCEE,
    LLM_ARCH_ERNIE4_5,
@@ -139,6 +141,8 @@ enum llm_kv {
    LLM_KV_EXPERT_COUNT,
    LLM_KV_EXPERT_USED_COUNT,
    LLM_KV_EXPERT_SHARED_COUNT,
+    LLM_KV_EXPERT_GROUP_COUNT,
+    LLM_KV_EXPERT_GROUP_USED_COUNT,
    LLM_KV_EXPERT_WEIGHTS_SCALE,
    LLM_KV_EXPERT_WEIGHTS_NORM,
    LLM_KV_EXPERT_GATING_FUNC,

--- a/llama/llama.cpp/src/llama-batch.h
+++ b/llama/llama.cpp/src/llama-batch.h
@@ -123,7 +123,7 @@ private:
    uint32_t n_seq_max;
    uint32_t n_outputs;
-    std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+    std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id
    std::vector<llama_pos>      pos;
    std::vector<int32_t>        n_seq_id;

--- a/llama/llama.cpp/src/llama-chat.cpp
+++ b/llama/llama.cpp/src/llama-chat.cpp
@@ -63,6 +63,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "megrez",            LLM_CHAT_TEMPLATE_MEGREZ            },
    { "yandex",            LLM_CHAT_TEMPLATE_YANDEX            },
    { "bailing",           LLM_CHAT_TEMPLATE_BAILING           },
+    { "bailing-think",     LLM_CHAT_TEMPLATE_BAILING_THINK     },
+    { "bailing2",          LLM_CHAT_TEMPLATE_BAILING2          },
    { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4            },
    { "smolvlm",           LLM_CHAT_TEMPLATE_SMOLVLM           },
    { "hunyuan-moe",       LLM_CHAT_TEMPLATE_HUNYUAN_MOE       },
@@ -191,6 +193,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_YANDEX;
    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
        return LLM_CHAT_TEMPLATE_BAILING;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("\"HUMAN\"") && tmpl_contains("<think>")) {
+        return LLM_CHAT_TEMPLATE_BAILING_THINK;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("<role>HUMAN</role>") && tmpl_contains("<|role_end|>")) {
+        return LLM_CHAT_TEMPLATE_BAILING2;
    } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
        return LLM_CHAT_TEMPLATE_LLAMA4;
    } else if (tmpl_contains("<|endofuserprompt|>")) {
@@ -644,8 +650,8 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << " Ассистент:[SEP]";
        }
-    }  else if (tmpl == LLM_CHAT_TEMPLATE_BAILING) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING || tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
-        // Bailing (Ling) template
+        // Bailing (Ling/Ring) template
        for (auto message : chat) {
            std::string role(message->role);
@@ -658,6 +664,33 @@ int32_t llm_chat_apply_template(
            ss << "<role>" << role << "</role>" << message->content;
        }
+        if (add_ass) {
+            ss << "<role>ASSISTANT</role>";
+            if (tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
+                ss << "<think>";
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING2) {
+        // Bailing2 (Ling 2.0) template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+        if (!has_system) {
+            ss << "<role>SYSTEM</role>detailed thinking off<|role_end|>";
+        }
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "user") {
+                role = "HUMAN";
+            } else {
+                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+            }
+            ss << "<role>" << role << "</role>" << message->content << "<|role_end|>";
+        }
        if (add_ass) {
            ss << "<role>ASSISTANT</role>";
        }

--- a/llama/llama.cpp/src/llama-chat.h
+++ b/llama/llama.cpp/src/llama-chat.h
@@ -42,6 +42,8 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_MEGREZ,
    LLM_CHAT_TEMPLATE_YANDEX,
    LLM_CHAT_TEMPLATE_BAILING,
+    LLM_CHAT_TEMPLATE_BAILING_THINK,
+    LLM_CHAT_TEMPLATE_BAILING2,
    LLM_CHAT_TEMPLATE_LLAMA4,
    LLM_CHAT_TEMPLATE_SMOLVLM,
    LLM_CHAT_TEMPLATE_DOTS1,

--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
@@ -2345,7 +2345,8 @@ llama_context * llama_init_from_model(
        return nullptr;
    }
-    if (params.pooling_type != model->hparams.pooling_type) {
+    if (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED &&
+        params.pooling_type != model->hparams.pooling_type) {
        //user-specified pooling-type is different from the model default
        LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
                       model->hparams.pooling_type, params.pooling_type);

--- a/llama/llama.cpp/src/llama-graph.cpp
+++ b/llama/llama.cpp/src/llama-graph.cpp
@@ -261,12 +261,17 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
    }
 }
-static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
    LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
-    const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
+    const char * swa_type_str = "unknown";
-                          (swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
-                          (swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
+    switch (swa_type) {
-                          (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
+        case LLAMA_SWA_TYPE_NONE:      swa_type_str = "LLAMA_SWA_TYPE_NONE"; break;
+        case LLAMA_SWA_TYPE_STANDARD:  swa_type_str = "LLAMA_SWA_TYPE_STANDARD"; break;
+        case LLAMA_SWA_TYPE_CHUNKED:   swa_type_str = "LLAMA_SWA_TYPE_CHUNKED"; break;
+        case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break;
+    };
    LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
    LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
    LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
@@ -295,50 +300,67 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
    const int64_t n_kv     = ubatch->n_tokens;
    const int64_t n_tokens = ubatch->n_tokens;
-    GGML_ASSERT(kq_mask);
+    const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
-    GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
+        for (int h = 0; h < 1; ++h) {
+            for (int i1 = 0; i1 < n_tokens; ++i1) {
-    float * data = (float *) kq_mask->data;
+                const llama_seq_id s1 = ubatch->seq_id[i1][0];
+                const llama_pos    p1 = ubatch->pos[i1];
-    // [TAG_NO_CACHE_ISWA]
-    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
-    for (int h = 0; h < 1; ++h) {
-        for (int i1 = 0; i1 < n_tokens; ++i1) {
-            const llama_seq_id s1 = ubatch->seq_id[i1][0];
-            for (int i0 = 0; i0 < n_tokens; ++i0) {
+                const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
-                float f = -INFINITY;
-                for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
+                for (int i0 = 0; i0 < n_tokens; ++i0) {
                    const llama_seq_id s0 = ubatch->seq_id[i0][0];
+                    const llama_pos p0    = ubatch->pos[i0];
+                    // mask different sequences
                    if (s0 != s1) {
-                        continue; // skip different sequences
+                        continue;
                    }
-                    if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
+                    // mask future tokens
-                        continue; // skip future tokens for causal attention
+                    if (cparams.causal_attn && p0 > p1) {
+                        continue;
                    }
-                    // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
+                    // apply SWA if any
-                    //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
+                    if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
-                    //    continue; // skip masked tokens for SWA
+                        continue;
-                    //}
-                    // TODO: reimplement this like in llama_kv_cache_unified
-                    if (hparams.use_alibi) {
-                        f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
-                    } else {
-                        f = 0.0f;
                    }
+                    data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
                }
-                data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
            }
        }
+    };
+    {
+        GGML_ASSERT(self_kq_mask);
+        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+        float * data = (float *) self_kq_mask->data;
+        std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);
+        fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);
+        if (debug) {
+            print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
+        }
    }
-    if (debug) {
-        print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        GGML_ASSERT(self_kq_mask_swa);
+        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
+        float * data = (float *) self_kq_mask_swa->data;
+        std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);
+        fill_mask(data, hparams.n_swa, hparams.swa_type);
+        if (debug) {
+            print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+        }
    }
 }
@@ -928,6 +950,31 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        cb(selection_probs, "ffn_moe_probs_biased", il);
    }
+    // select top n_group_used expert groups
+    // https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/e815299b0bcbac849fa540c768ef21845365c9eb/modeling_deepseek.py#L440-L457
+    if (hparams.n_expert_groups > 1 && n_tokens > 0) {
+        const int64_t n_exp_per_group = n_expert / hparams.n_expert_groups;
+        // organize experts into n_expert_groups
+        ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
+        ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
+        group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
+        // get top n_group_used expert groups
+        group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
+        group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
+        ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
+        cb(expert_groups, "ffn_moe_group_topk", il);
+        // mask out the other groups
+        selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
+        selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
+        selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
+        cb(selection_probs, "ffn_moe_probs_masked", il);
+    }
    // select experts
    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
    cb(selected_experts->src[0], "ffn_moe_argsort", il);
@@ -959,6 +1006,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
        cb(weights_sum, "ffn_moe_weights_sum", il);
+        if (arch == LLM_ARCH_BAILINGMOE2) {
+            weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
+            cb(weights_sum, "ffn_moe_weights_sum_biased", il);
+        }
        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
        cb(weights, "ffn_moe_weights_norm", il);
@@ -1299,12 +1351,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
    k = ggml_permute(ctx0, k, 0, 2, 1, 3);
    v = ggml_permute(ctx0, v, 0, 2, 1, 3);
-    const auto n_kv = k->ne[1];
    ggml_tensor * cur;
-    // TODO: replace hardcoded padding with ggml-provided padding
+    if (cparams.flash_attn && kq_b == nullptr) {
-    if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
        GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");
        if (v_trans) {
@@ -1419,10 +1468,20 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
    auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
    // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
-    ggml_set_input(inp->kq_mask);
+    ggml_set_input(inp->self_kq_mask);
+    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
-    inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        ggml_set_input(inp->self_kq_mask_swa);
+        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+    } else {
+        inp->self_kq_mask_swa     = nullptr;
+        inp->self_kq_mask_swa_cnv = nullptr;
+    }
    return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
 }
@@ -1447,7 +1506,9 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_build_forward_expand(gf, k_cur);
    ggml_build_forward_expand(gf, v_cur);
-    const auto & kq_mask = inp->get_kq_mask();
+    const bool is_swa = hparams.is_swa(il);
+    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
    // [TAG_NO_CACHE_PAD]
    // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams

--- a/llama/llama.cpp/src/llama-graph.h
+++ b/llama/llama.cpp/src/llama-graph.h
@@ -257,10 +257,14 @@ public:
    void set_input(const llama_ubatch * ubatch) override;
-    ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }
+    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
+    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
-    ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch, 1, 1]
+    // n_tokens == n_batch
-    ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]
+    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
    const llama_hparams hparams;
    const llama_cparams cparams;

--- a/llama/llama.cpp/src/llama-hparams.h
+++ b/llama/llama.cpp/src/llama-hparams.h
@@ -74,6 +74,8 @@ struct llama_hparams {
    uint32_t n_ff_chexp         = 0;
    uint32_t n_expert_shared    = 0;
    uint32_t n_norm_groups      = 0;
+    uint32_t n_expert_groups    = 0;
+    uint32_t n_group_used       = 0;
    uint32_t n_group_experts    = 0;
    float    expert_group_scale   = 0.05f;

--- a/llama/llama.cpp/src/llama-model.cpp
+++ b/llama/llama.cpp/src/llama-model.cpp
--- a/llama/llama.cpp/src/llama-model.h
+++ b/llama/llama.cpp/src/llama-model.h
@@ -108,9 +108,12 @@ enum llm_type {
    LLM_TYPE_17B_16E, // llama4 Scout
    LLM_TYPE_17B_128E, // llama4 Maverick
    LLM_TYPE_A13B,
+    LLM_TYPE_7B_A1B,
    LLM_TYPE_8B_A1B, // lfm2moe
+    LLM_TYPE_16B_A1B,
    LLM_TYPE_21B_A3B, // Ernie MoE small
    LLM_TYPE_30B_A3B,
+    LLM_TYPE_100B_A6B,
    LLM_TYPE_106B_A12B, // GLM-4.5-Air
    LLM_TYPE_235B_A22B,
    LLM_TYPE_300B_A47B, // Ernie MoE big

--- a/llama/llama.cpp/src/llama-quant.cpp
+++ b/llama/llama.cpp/src/llama-quant.cpp
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        });
    }
+    bool is_clip_model = false;
    for (const auto * it : tensors) {
        const struct ggml_tensor * tensor = it->tensor;
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
            qs.has_output = true;
        }
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
    }
    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
    {
        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
        // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
        ggml_type new_type;
        void * new_data;
        size_t new_size;

--- a/llama/llama.cpp/src/llama-vocab.cpp
+++ b/llama/llama.cpp/src/llama-vocab.cpp
@@ -1957,6 +1957,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                clean_spaces = false;
            } else if (
                tokenizer_pre == "bailingmoe" ||
+                tokenizer_pre == "bailingmoe2" ||
                tokenizer_pre == "llada-moe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                clean_spaces = false;

--- a/llama/llama.cpp/src/llama.cpp
+++ b/llama/llama.cpp/src/llama.cpp
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
        }
+        if (model.arch == LLM_ARCH_CLIP) {
+            throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+        }
        try {
            model.load_vocab(ml);
        } catch(const std::exception & e) {
@@ -314,6 +317,7 @@ struct llama_model * llama_model_load_from_splits(
        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
        return nullptr;
    }
+    splits.reserve(n_paths);
    for (size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]);
    }

--- a/llama/llama.cpp/tools/mtmd/clip-impl.h
+++ b/llama/llama.cpp/tools/mtmd/clip-impl.h
@@ -30,6 +30,7 @@
 #define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
 // vision-specific
+#define KEY_VISION_PROJ_TYPE    "clip.vision.projector_type" // for models with mixed modalities
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
 #define KEY_PREPROC_IMAGE_SIZE  "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
@@ -48,6 +49,7 @@
 #define KEY_MINICPMV_QUERY_NUM    "clip.minicpmv_query_num"
 // audio-specific
+#define KEY_AUDIO_PROJ_TYPE     "clip.audio.projector_type" // for models with mixed modalities
 #define KEY_A_NUM_MEL_BINS      "clip.audio.num_mel_bins"
 #define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"

--- a/llama/llama.cpp/tools/mtmd/clip.cpp
+++ b/llama/llama.cpp/tools/mtmd/clip.cpp
@@ -2234,15 +2234,27 @@ struct clip_model_loader {
        // projector type
        std::string proj_type;
        {
+            // default key
            get_string(KEY_PROJ_TYPE, proj_type, false);
-            if (!proj_type.empty()) {
-                model.proj_type = clip_projector_type_from_string(proj_type);
+            // for models with mixed modalities
+            if (proj_type.empty()) {
+                if (modality == CLIP_MODALITY_VISION) {
+                    get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
+                } else if (modality == CLIP_MODALITY_AUDIO) {
+                    get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
+                } else {
+                    GGML_ABORT("unknown modality");
+                }
            }
+            model.proj_type = clip_projector_type_from_string(proj_type);
            if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
                throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
            }
-            // correct arch for multimodal models
+            // correct arch for multimodal models (legacy method)
            if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
                model.proj_type = modality == CLIP_MODALITY_VISION
                                    ? PROJECTOR_TYPE_QWEN25VL

--- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
+++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
@@ -23,7 +23,7 @@ problem.
 8 files changed, 21 insertions(+), 2 deletions(-)
 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index ff9135fe..8ba86f82 100644
+index ff9135fe2..8ba86f824 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
 @@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -64,18 +64,18 @@ index ff9135fe..8ba86f82 100644
     /* .init_tensor     = */ NULL, // no initialization required
     /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
-index ad1adba6..7d44f74f 100755
+index 8bd5449f1..01e2df61a 100644
 --- a/ggml/src/ggml-cann/ggml-cann.cpp
 +++ b/ggml/src/ggml-cann/ggml-cann.cpp
-@@ -843,6 +843,7 @@ static void ggml_backend_cann_buffer_free_buffer(
+@@ -820,6 +820,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
-     ggml_backend_cann_buffer_context* ctx =
+ static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-         (ggml_backend_cann_buffer_context*)buffer->context;
+     ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
     delete ctx;
 +    delete buffer;
 }
 /**
-@@ -1630,6 +1631,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
+@@ -1560,6 +1561,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
  */
 static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
     ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -84,10 +84,10 @@ index ad1adba6..7d44f74f 100755
 /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 856e9de2..c0b1e4c1 100644
+index bc396b521..aefc6935e 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -567,6 +567,7 @@ struct ggml_backend_cuda_buffer_context {
+@@ -576,6 +576,7 @@ struct ggml_backend_cuda_buffer_context {
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     delete ctx;
@@ -95,7 +95,7 @@ index 856e9de2..c0b1e4c1 100644
 }
 static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-@@ -822,6 +823,7 @@ struct ggml_backend_cuda_split_buffer_context {
+@@ -831,6 +832,7 @@ struct ggml_backend_cuda_split_buffer_context {
 static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     delete ctx;
@@ -103,7 +103,7 @@ index 856e9de2..c0b1e4c1 100644
 }
 static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1103,6 +1105,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
+@@ -1112,6 +1114,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -112,7 +112,7 @@ index 856e9de2..c0b1e4c1 100644
 static void * ggml_cuda_host_malloc(size_t size) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
-index 7afc881f..bf096227 100644
+index 7afc881fa..bf0962274 100644
 --- a/ggml/src/ggml-metal/ggml-metal.cpp
 +++ b/ggml/src/ggml-metal/ggml-metal.cpp
 @@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
@@ -132,10 +132,10 @@ index 7afc881f..bf096227 100644
 static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
-index 79d21487..38c75018 100644
+index db33a4ab6..c42ee26e1 100644
 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
 +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
-@@ -3212,6 +3212,7 @@ struct ggml_backend_opencl_buffer_context {
+@@ -3266,6 +3266,7 @@ struct ggml_backend_opencl_buffer_context {
 static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
     delete ctx;
@@ -144,7 +144,7 @@ index 79d21487..38c75018 100644
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index aad48d62..a46c0f52 100644
+index a38df5a97..fd07e4a21 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
 @@ -528,6 +528,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -156,10 +156,10 @@ index aad48d62..a46c0f52 100644
 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 45b8c216..4ec9a592 100644
+index b695ba051..37e853120 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -334,6 +334,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -352,6 +352,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
     ggml_sycl_set_device(ctx->device);
     delete ctx;
@@ -167,7 +167,7 @@ index 45b8c216..4ec9a592 100644
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -795,6 +796,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -813,6 +814,7 @@ struct ggml_backend_sycl_split_buffer_context {
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
     delete ctx;
@@ -175,7 +175,7 @@ index 45b8c216..4ec9a592 100644
 }
 static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1137,6 +1139,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1155,6 +1157,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_sycl_host_free(buffer->context);
@@ -184,10 +184,10 @@ index 45b8c216..4ec9a592 100644
 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 3cd89c71..ed83236f 100644
+index b783f7805..216dc167c 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -11600,6 +11600,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -11828,6 +11828,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
@@ -195,7 +195,7 @@ index 3cd89c71..ed83236f 100644
 }
 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -11743,6 +11744,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -11971,6 +11972,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(vk_instance.devices[0], buffer->context);