Llama cpp bump (df1b612): granite docling / mamba2 optimizations / multimodal...

Llama cpp bump (df1b612): granite docling / mamba2 optimizations / multimodal encoding fixes (#12552)

* feat: Bump llama.cpp to df1b612

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(mtmd): Correctly encode text chunks during mtmd tokenization

  There can be text chunks that appear interspersed with the image embeddings that contain template delimiter tokens for some models. These need to be correctly translated to text tokens.

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* tests: Use MtmdChunk in image_test

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* style: Fix unnecessary conversion linting

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(ggml): Revert changes to ggml_hip.cpp

  These changes were done largely by our code assistant and are likely wrong

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Revert changes in mem_nvml.cpp

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update sync point to 1deee0

  This brings in several more optimization commits and model support for EmbeddingGemma

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update patches for 1deee0

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: sync for bump to 1deee0

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Bad patch updates with errant `+`

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Bump llama.cpp/ggml to 7049736

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: format-patches after latest bump

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

Llama cpp bump (df1b612): granite docling / mamba2 optimizations / multimodal...
Llama cpp bump (df1b612): granite docling / mamba2 optimizations / multimodal encoding fixes (#12552)

* feat: Bump llama.cpp to df1b612

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(mtmd): Correctly encode text chunks during mtmd tokenization

  There can be text chunks that appear interspersed with the image embeddings that contain template delimiter tokens for some models. These need to be correctly translated to text tokens.

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* tests: Use MtmdChunk in image_test

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* style: Fix unnecessary conversion linting

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(ggml): Revert changes to ggml_hip.cpp

  These changes were done largely by our code assistant and are likely wrong

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Revert changes in mem_nvml.cpp

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update sync point to 1deee0

  This brings in several more optimization commits and model support for EmbeddingGemma

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update patches for 1deee0

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: sync for bump to 1deee0

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Bad patch updates with errant `+`

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Bump llama.cpp/ggml to 7049736

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: format-patches after latest bump

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
4987f13d · Gabe Goodhart · GitHub · e638f2ac · 4987f13d · 4987f13d
Unverified Commit 4987f13d authored Oct 13, 2025 by Gabe Goodhart Committed by GitHub Oct 13, 2025
20 changed files
--- a/llama/llama.cpp/src/llama-sampling.cpp
+++ b/llama/llama.cpp/src/llama-sampling.cpp
@@ -2541,8 +2541,13 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
    if (n_non_eog == 0) {
        cur_p->size = 1;
        cur_p->data[0].id = ctx->vocab->token_eot();
+        if (cur_p->data[0].id == LLAMA_TOKEN_NULL) {
+            cur_p->data[0].id = ctx->vocab->token_eos();
+        }
        cur_p->data[0].logit = 1.0f;

+        GGML_ASSERT(cur_p->data[0].id != LLAMA_TOKEN_NULL);
+
        return;
    }


--- a/llama/llama.cpp/src/llama-vocab.cpp
+++ b/llama/llama.cpp/src/llama-vocab.cpp
@@ -347,6 +347,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
            case LLAMA_VOCAB_PRE_TYPE_OLMO:
            case LLAMA_VOCAB_PRE_TYPE_JAIS:
            case LLAMA_VOCAB_PRE_TYPE_TRILLION:
+            case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
                regex_exprs = {
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
@@ -1950,6 +1951,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "trillion") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "granite-docling") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
+                clean_spaces = false;
            } else if (
                tokenizer_pre == "bailingmoe" ||
                tokenizer_pre == "llada-moe") {
@@ -2156,6 +2161,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<|end|>"
                        || t.first == "<end_of_turn>"
                        || t.first == "<|endoftext|>"
+                        || t.first == "<|end_of_text|>" // granite
                        || t.first == "<EOT>"
                        || t.first == "_<EOT>"
                        || t.first == "<｜end▁of▁sentence｜>" // DeepSeek

--- a/llama/llama.cpp/src/llama-vocab.h
+++ b/llama/llama.cpp/src/llama-vocab.h
@@ -8,46 +8,47 @@

 // pre-tokenization types
 enum llama_vocab_pre_type {
-    LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-    LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
-    LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
-    LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
-    LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
-    LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
-    LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-    LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
-    LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
-    LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
-    LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
-    LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
-    LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
-    LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
-    LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
-    LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
-    LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
-    LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
-    LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
-    LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
-    LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
-    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
-    LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
-    LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
-    LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
-    LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
-    LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
-    LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
-    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
-    LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
-    LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
-    LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
-    LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
-    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE  = 38,
-    LLAMA_VOCAB_PRE_TYPE_GROK_2         = 39,
+    LLAMA_VOCAB_PRE_TYPE_DEFAULT         = 0,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA3          = 1,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM    = 2,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER  = 3,
+    LLAMA_VOCAB_PRE_TYPE_FALCON          = 4,
+    LLAMA_VOCAB_PRE_TYPE_MPT             = 5,
+    LLAMA_VOCAB_PRE_TYPE_STARCODER       = 6,
+    LLAMA_VOCAB_PRE_TYPE_GPT2            = 7,
+    LLAMA_VOCAB_PRE_TYPE_REFACT          = 8,
+    LLAMA_VOCAB_PRE_TYPE_COMMAND_R       = 9,
+    LLAMA_VOCAB_PRE_TYPE_STABLELM2       = 10,
+    LLAMA_VOCAB_PRE_TYPE_QWEN2           = 11,
+    LLAMA_VOCAB_PRE_TYPE_OLMO            = 12,
+    LLAMA_VOCAB_PRE_TYPE_DBRX            = 13,
+    LLAMA_VOCAB_PRE_TYPE_SMAUG           = 14,
+    LLAMA_VOCAB_PRE_TYPE_PORO            = 15,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM3        = 16,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM4        = 17,
+    LLAMA_VOCAB_PRE_TYPE_VIKING          = 18,
+    LLAMA_VOCAB_PRE_TYPE_JAIS            = 19,
+    LLAMA_VOCAB_PRE_TYPE_TEKKEN          = 20,
+    LLAMA_VOCAB_PRE_TYPE_SMOLLM          = 21,
+    LLAMA_VOCAB_PRE_TYPE_CODESHELL       = 22,
+    LLAMA_VOCAB_PRE_TYPE_BLOOM           = 23,
+    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH    = 24,
+    LLAMA_VOCAB_PRE_TYPE_EXAONE          = 25,
+    LLAMA_VOCAB_PRE_TYPE_CHAMELEON       = 26,
+    LLAMA_VOCAB_PRE_TYPE_MINERVA         = 27,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM   = 28,
+    LLAMA_VOCAB_PRE_TYPE_GPT4O           = 29,
+    LLAMA_VOCAB_PRE_TYPE_SUPERBPE        = 30,
+    LLAMA_VOCAB_PRE_TYPE_TRILLION        = 31,
+    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE      = 32,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA4          = 33,
+    LLAMA_VOCAB_PRE_TYPE_PIXTRAL         = 34,
+    LLAMA_VOCAB_PRE_TYPE_SEED_CODER      = 35,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN         = 36,
+    LLAMA_VOCAB_PRE_TYPE_KIMI_K2         = 37,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE   = 38,
+    LLAMA_VOCAB_PRE_TYPE_GROK_2          = 39,
+    LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
 };

 struct LLM_KV;

--- a/llama/llama.cpp/tools/mtmd/clip-impl.h
+++ b/llama/llama.cpp/tools/mtmd/clip-impl.h
@@ -31,6 +31,7 @@

 // vision-specific
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
+#define KEY_PREPROC_IMAGE_SIZE  "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN          "clip.vision.image_mean"
 #define KEY_IMAGE_STD           "clip.vision.image_std"

--- a/llama/llama.cpp/tools/mtmd/clip.cpp
+++ b/llama/llama.cpp/tools/mtmd/clip.cpp
@@ -183,7 +183,9 @@ struct clip_hparams {
    int32_t projection_dim;
    int32_t n_head;
    int32_t n_layer;
-    int32_t proj_scale_factor = 0; // idefics3
+    // idefics3
+    int32_t preproc_image_size = 0;
+    int32_t proj_scale_factor = 0;

    float image_mean[3];
    float image_std[3];
@@ -2263,6 +2265,7 @@ struct clip_model_loader {

            if (is_vision) {
                get_u32(KEY_IMAGE_SIZE, hparams.image_size);
+                get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false);
                get_u32(KEY_PATCH_SIZE, hparams.patch_size);
                get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
                get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
@@ -3590,10 +3593,51 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
        // res_imgs->data[0] = *res;
        res_imgs->entries.push_back(std::move(img_f32));
        return true;
-    }
-    else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
+        // The refined size has two steps:
+        // 1. Resize w/ aspect-ratio preserving such that the longer side is
+        //      the preprocessor longest size
+        // 2. Resize w/out preserving aspect ratio such that both sides are
+        //      multiples of image_size (always rounding up)
+        //
+        // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
+        const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
+            original_size, params.image_size, params.preproc_image_size);
+
+        llava_uhd::slice_instructions instructions;
+        instructions.overview_size = clip_image_size{params.image_size, params.image_size};
+        instructions.refined_size = refined_size;
+        instructions.grid_size = clip_image_size{
+            static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
+            static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
+        };
+        for (int y = 0; y < refined_size.height; y += params.image_size) {
+            for (int x = 0; x < refined_size.width; x += params.image_size) {
+                instructions.slices.push_back(llava_uhd::slice_coordinates{
+                    /* x    */x,
+                    /* y    */y,
+                    /* size */clip_image_size{
+                        std::min(params.image_size, refined_size.width - x),
+                        std::min(params.image_size, refined_size.height - y)
+                    }
+                });
+            }
+        }
+        auto imgs = llava_uhd::slice_image(img, instructions);
+
+        // cast and normalize to f32
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
+            clip_image_f32_ptr res(clip_image_f32_init());
+            normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+            res_imgs->entries.push_back(std::move(res));
+        }
+
+        res_imgs->grid_x = instructions.grid_size.width;
+        res_imgs->grid_y = instructions.grid_size.height;
+        return true;
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
            || ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
-            || ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3
            || ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
    ) {
        clip_image_u8 resized_image;

--- a/llama/llama.cpp/tools/mtmd/mtmd.cpp
+++ b/llama/llama.cpp/tools/mtmd/mtmd.cpp
@@ -76,7 +76,7 @@ enum mtmd_slice_tmpl {
    MTMD_SLICE_TMPL_MINICPMV_2_5,
    MTMD_SLICE_TMPL_MINICPMV_2_6,
    MTMD_SLICE_TMPL_LLAMA4,
-    // TODO @ngxson : add support for idefics (SmolVLM)
+    MTMD_SLICE_TMPL_IDEFICS3,
 };

 mtmd_input_text* mtmd_input_text_init(const char * text, bool add_special, bool parse_special) {
@@ -124,19 +124,22 @@ struct mtmd_context {
    // for llava-uhd style models, we need special tokens in-between slices
    // minicpmv calls them "slices", llama 4 calls them "tiles"
    mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
-    llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
-    llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
-    llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
-    llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices
-    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
-    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice end
-    llama_token tok_sli_img_mid   = LLAMA_TOKEN_NULL; // between 2 slices
-    llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
+    std::vector<llama_token> tok_ov_img_start;  // overview image
+    std::vector<llama_token> tok_ov_img_end;    // overview image
+    std::vector<llama_token> tok_slices_start;  // start of all slices
+    std::vector<llama_token> tok_slices_end;    // end of all slices
+    std::vector<llama_token> tok_sli_img_start; // single slice start
+    std::vector<llama_token> tok_sli_img_end;   // single slice end
+    std::vector<llama_token> tok_sli_img_mid;   // between 2 slices
+    std::vector<llama_token> tok_row_end;       // end of row
    bool        tok_row_end_trail = false;
    bool        ov_img_first      = false;

    bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE

+    // string template for slice image delimiters with row/col (idefics3)
+    std::string sli_img_start_tmpl;
+
    // for whisper, we pre-calculate the mel filter bank
    whisper_preprocessor::whisper_filters w_filters;

@@ -207,13 +210,13 @@ struct mtmd_context {
            // minicpmv 2.5 format:
            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
-            tok_ov_img_start  = lookup_token("<image>");
-            tok_ov_img_end    = lookup_token("</image>");
-            tok_slices_start  = lookup_token("<slice>");
-            tok_slices_end    = lookup_token("</slice>");
+            tok_ov_img_start  = {lookup_token("<image>")};
+            tok_ov_img_end    = {lookup_token("</image>")};
+            tok_slices_start  = {lookup_token("<slice>")};
+            tok_slices_end    = {lookup_token("</slice>")};
            tok_sli_img_start = tok_ov_img_start;
            tok_sli_img_end   = tok_ov_img_end;
-            tok_row_end       = lookup_token("\n");
+            tok_row_end       = {lookup_token("\n")};
            tok_row_end_trail = false; // no trailing end-of-row token
            ov_img_first      = true;

@@ -221,11 +224,11 @@ struct mtmd_context {
            // minicpmv 2.6 format:
            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
-            tok_ov_img_start  = lookup_token("<image>");
-            tok_ov_img_end    = lookup_token("</image>");
-            tok_sli_img_start = lookup_token("<slice>");
-            tok_sli_img_end   = lookup_token("</slice>");
-            tok_row_end       = lookup_token("\n");
+            tok_ov_img_start  = {lookup_token("<image>")};
+            tok_ov_img_end    = {lookup_token("</image>")};
+            tok_sli_img_start = {lookup_token("<slice>")};
+            tok_sli_img_end   = {lookup_token("</slice>")};
+            tok_row_end       = {lookup_token("\n")};
            tok_row_end_trail = false; // no trailing end-of-row token
            ov_img_first      = true;

@@ -240,9 +243,9 @@ struct mtmd_context {
            // <|image|> (overview)           <-- overview image is last
            // <|image_end|>
            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
-            tok_ov_img_start  = lookup_token("<|image|>");
-            tok_sli_img_mid   = lookup_token("<|tile_x_separator|>");
-            tok_row_end       = lookup_token("<|tile_y_separator|>");
+            tok_ov_img_start  = {lookup_token("<|image|>")};
+            tok_sli_img_mid   = {lookup_token("<|tile_x_separator|>")};
+            tok_row_end       = {lookup_token("<|tile_y_separator|>")};
            tok_row_end_trail = true; // add trailing end-of-row token
            ov_img_first      = false; // overview image is last
        }
@@ -255,8 +258,11 @@ struct mtmd_context {

        } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
            // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
-            img_beg = "<fake_token_around_image><global-img>";
-            img_end = "<fake_token_around_image>";
+            slice_tmpl         = MTMD_SLICE_TMPL_IDEFICS3;
+            tok_ov_img_start   = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
+            tok_ov_img_end     = {lookup_token("<fake_token_around_image>")};
+            tok_row_end        = {lookup_token("\n")};
+            sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";

        } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
@@ -514,6 +520,7 @@ struct mtmd_tokenizer {
                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
            ) {
                const int n_col = batch_f32.grid_x;
                const int n_row = batch_f32.grid_y;
@@ -527,53 +534,45 @@ struct mtmd_tokenizer {

                // add overview image (first)
                if (ctx->ov_img_first) {
-                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
-                        add_text({ctx->tok_ov_img_start});
-                    }
+                    add_text(ctx->tok_ov_img_start);
                    cur.entries.emplace_back(std::move(ov_chunk));
-                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
-                        add_text({ctx->tok_ov_img_end});
-                    }
+                    add_text(ctx->tok_ov_img_end);
                }

                // add slices (or tiles)
                if (!chunks.empty()) {
                    GGML_ASSERT((int)chunks.size() == n_row * n_col);
-                    if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
-                        add_text({ctx->tok_slices_start});
-                    }
+                    add_text(ctx->tok_slices_start);
                    for (int y = 0; y < n_row; y++) {
                        for (int x = 0; x < n_col; x++) {
                            const bool is_last_in_row = (x == n_col - 1);
-                            if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
-                                add_text({ctx->tok_sli_img_start});
+                            if (!ctx->tok_sli_img_start.empty()) {
+                                add_text(ctx->tok_sli_img_start);
+                            } else if (!ctx->sli_img_start_tmpl.empty()) {
+                                // If using a template to precede a slice image
+                                const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1;
+                                std::unique_ptr<char[]> buf(new char[sz]);
+                                std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
+                                add_text(std::string(buf.get(), buf.get() + sz - 1), true);
                            }
                            cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
-                            if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
-                                add_text({ctx->tok_sli_img_end});
-                            }
-                            if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
-                                add_text({ctx->tok_sli_img_mid});
+                            add_text(ctx->tok_sli_img_end);
+                            if (!is_last_in_row) {
+                                add_text(ctx->tok_sli_img_mid);
                            }
                        }
-                        if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
-                            add_text({ctx->tok_row_end});
+                        if ((y != n_row - 1 || ctx->tok_row_end_trail)) {
+                            add_text(ctx->tok_row_end);
                        }
                    }
-                    if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
-                        add_text({ctx->tok_slices_end});
-                    }
+                    add_text(ctx->tok_slices_end);
                }

                // add overview image (last)
                if (!ctx->ov_img_first) {
-                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
-                        add_text({ctx->tok_ov_img_start});
-                    }
+                    add_text(ctx->tok_ov_img_start);
                    cur.entries.emplace_back(std::move(ov_chunk));
-                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
-                        add_text({ctx->tok_ov_img_end});
-                    }
+                    add_text(ctx->tok_ov_img_end);
                }

            } else {
@@ -790,7 +789,9 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
    bool ok = false;

-    if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) {
+    if (clip_is_llava(ctx_clip)
+        || clip_is_minicpmv(ctx_clip)
+        || clip_is_glm(ctx_clip)) {
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        const auto & entries = image_tokens->batch_f32.entries;
        for (size_t i = 0; i < entries.size(); i++) {

--- a/llama/llama.go
+++ b/llama/llama.go
@@ -504,7 +504,12 @@ func (c *MtmdContext) Free() {
 	C.mtmd_free(c.c)
 }

-func (c *MtmdContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, error) {
+type MtmdChunk struct {
+	Embed  []float32
+	Tokens []int
+}
+
+func (c *MtmdContext) MultimodalTokenize(llamaContext *Context, data []byte) ([]MtmdChunk, error) {
 	// Initialize the input chunks pointer
 	ic := C.mtmd_input_chunks_init()
 	defer C.mtmd_input_chunks_free(ic)
@@ -523,35 +528,51 @@ func (c *MtmdContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
 	}
 	nChunks := C.mtmd_input_chunks_size(ic)
 	numEmbed := llamaContext.Model().NEmbd()
-	embed := make([][]float32, 0)
+	outChunks := make([]MtmdChunk, 0)
 	for i := range int(nChunks) {
 		chunk := C.mtmd_input_chunks_get(ic, C.size_t(i))
 		numTokens := int(C.mtmd_input_chunk_get_n_tokens(chunk))
 		slog.Debug("chunk tokens", "index", i, "numTokens", numTokens)

-		// Encode the chunk
-		if C.int32_t(0) != C.mtmd_encode_chunk(c.c, chunk) {
-			return nil, errors.New("unable to encode mtmd image chunk")
-		}
-
-		// Get the embeddings for this chunk
-		chunkEmbed := make([][]float32, numTokens)
-		chunkEmbd := C.mtmd_get_output_embd(c.c)
-		if nil == chunkEmbd {
-			continue
-		}
-
-		// Extend the embedding array for each token
-		s := unsafe.Slice((*float32)(chunkEmbd), numTokens*numEmbed)
-		rows := make([]float32, len(s))
-		copy(rows, s)
-		for i := range numTokens {
-			chunkEmbed[i] = rows[i*numEmbed : (i+1)*numEmbed]
+		if C.mtmd_input_chunk_get_type(chunk) == C.MTMD_INPUT_CHUNK_TYPE_TEXT {
+			// If this is a text chunk, add the tokens
+			cNumTokens := C.size_t(0)
+			cTokens := C.mtmd_input_chunk_get_tokens_text(chunk, &cNumTokens)
+			cTokensArr := unsafe.Slice(cTokens, int(cNumTokens))
+			tokens := make([]int, int(cNumTokens))
+			for j := range int(cNumTokens) {
+				tokens[j] = int(cTokensArr[j])
+			}
+			outChunks = append(outChunks, MtmdChunk{Tokens: tokens})
+		} else {
+			// Otherwise, encode the image chunk to embeddings
+
+			// Encode the chunk
+			if C.int32_t(0) != C.mtmd_encode_chunk(c.c, chunk) {
+				return nil, errors.New("unable to encode mtmd image chunk")
+			}
+
+			// Get the embeddings for this chunk
+			chunkEmbed := make([][]float32, numTokens)
+			chunkEmbd := C.mtmd_get_output_embd(c.c)
+			if nil == chunkEmbd {
+				return nil, errors.New("no mtmd image embedding")
+			}
+
+			// Extend the embedding array for each token
+			s := unsafe.Slice((*float32)(chunkEmbd), numTokens*numEmbed)
+			rows := make([]float32, len(s))
+			copy(rows, s)
+			for i := range numTokens {
+				chunkEmbed[i] = rows[i*numEmbed : (i+1)*numEmbed]
+			}
+			for _, e := range chunkEmbed {
+				outChunks = append(outChunks, MtmdChunk{Embed: e})
+			}
 		}
-		embed = append(embed, chunkEmbed...)
 	}
-	slog.Debug("image embeddings", "totalEmbeddings", len(embed))
-	return embed, nil
+	slog.Debug("image tokenization chunks", "totalChunks", len(outChunks))
+	return outChunks, nil
 }

 func (c *Context) Synchronize() {

--- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
+++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
@@ -64,7 +64,7 @@ index ff9135fe..8ba86f82 100644
     /* .init_tensor     = */ NULL, // no initialization required
     /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
-index b51b554e..3ba0f5a6 100755
+index ad1adba6..7d44f74f 100755
 --- a/ggml/src/ggml-cann/ggml-cann.cpp
 +++ b/ggml/src/ggml-cann/ggml-cann.cpp
 @@ -843,6 +843,7 @@ static void ggml_backend_cann_buffer_free_buffer(
@@ -84,7 +84,7 @@ index b51b554e..3ba0f5a6 100755
 
 /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index b7e81b21..fdf8c63d 100644
+index 856e9de2..c0b1e4c1 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -567,6 +567,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -112,7 +112,7 @@ index b7e81b21..fdf8c63d 100644
 
 static void * ggml_cuda_host_malloc(size_t size) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
-index e11555a7..909e17de 100644
+index 7afc881f..bf096227 100644
 --- a/ggml/src/ggml-metal/ggml-metal.cpp
 +++ b/ggml/src/ggml-metal/ggml-metal.cpp
 @@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
@@ -132,10 +132,10 @@ index e11555a7..909e17de 100644
 
 static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
-index 0cf3b924..09d706b5 100644
+index 79d21487..38c75018 100644
 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
 +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
-@@ -3215,6 +3215,7 @@ struct ggml_backend_opencl_buffer_context {
+@@ -3212,6 +3212,7 @@ struct ggml_backend_opencl_buffer_context {
 static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
     delete ctx;
@@ -144,10 +144,10 @@ index 0cf3b924..09d706b5 100644
 
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index f99681c8..59591770 100644
+index aad48d62..a46c0f52 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -505,6 +505,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -528,6 +528,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
     RPC_STATUS_ASSERT(status);
     delete ctx;
@@ -156,10 +156,10 @@ index f99681c8..59591770 100644
 
 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 4ac919ea..447ea3c4 100644
+index 45b8c216..4ec9a592 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -331,6 +331,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -334,6 +334,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
     ggml_sycl_set_device(ctx->device);
 
     delete ctx;
@@ -167,7 +167,7 @@ index 4ac919ea..447ea3c4 100644
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -792,6 +793,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -795,6 +796,7 @@ struct ggml_backend_sycl_split_buffer_context {
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
     delete ctx;
@@ -175,7 +175,7 @@ index 4ac919ea..447ea3c4 100644
 }
 
 static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1134,6 +1136,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1137,6 +1139,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
 
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_sycl_host_free(buffer->context);
@@ -184,10 +184,10 @@ index 4ac919ea..447ea3c4 100644
 
 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 2608cbd0..061cd078 100644
+index 3cd89c71..ed83236f 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -11603,6 +11603,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -11600,6 +11600,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
@@ -195,7 +195,7 @@ index 2608cbd0..061cd078 100644
 }
 
 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -11746,6 +11747,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -11743,6 +11744,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(vk_instance.devices[0], buffer->context);

--- a/llama/patches/0002-pretokenizer.patch
+++ b/llama/patches/0002-pretokenizer.patch
@@ -10,10 +10,10 @@ logs instead of throwing an error
 1 file changed, 3 insertions(+), 11 deletions(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index da938af0..2a38abf4 100644
+index 7fffd171..0b6edaf4 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1811,16 +1811,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1812,16 +1812,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         if (type == LLAMA_VOCAB_TYPE_BPE) {
             add_space_prefix = false;
             clean_spaces = true;
@@ -31,7 +31,7 @@ index da938af0..2a38abf4 100644
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -1987,7 +1978,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1992,7 +1983,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
                 clean_spaces = false;
             } else {

--- a/llama/patches/0003-clip-unicode.patch
+++ b/llama/patches/0003-clip-unicode.patch
@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
 1 file changed, 39 insertions(+)

 diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index 210ecc88..355219a9 100644
+index 98e68af2..6699b75a 100644
 --- a/tools/mtmd/clip.cpp
 +++ b/tools/mtmd/clip.cpp
 @@ -28,6 +28,19 @@
@@ -33,7 +33,7 @@ index 210ecc88..355219a9 100644
 struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
 
 enum ffn_op_type {
-@@ -2759,7 +2772,29 @@ struct clip_model_loader {
+@@ -2762,7 +2775,29 @@ struct clip_model_loader {
         {
             std::vector<uint8_t> read_buf;
 
@@ -63,7 +63,7 @@ index 210ecc88..355219a9 100644
             if (!fin) {
                 throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
             }
-@@ -2786,7 +2821,11 @@ struct clip_model_loader {
+@@ -2789,7 +2824,11 @@ struct clip_model_loader {
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
                 }
             }

--- a/llama/patches/0004-solar-pro.patch
+++ b/llama/patches/0004-solar-pro.patch
@@ -9,13 +9,13 @@ adds support for the Solar Pro architecture
 src/llama-arch.h           |   3 +
 src/llama-hparams.cpp      |   8 ++
 src/llama-hparams.h        |   5 +
- src/llama-model-loader.cpp |   1 +
+ src/llama-model-loader.cpp |   2 +-
 src/llama-model.cpp        | 207 +++++++++++++++++++++++++++++++++++++
 src/llama-model.h          |   3 +
- 7 files changed, 248 insertions(+)
+ 7 files changed, 248 insertions(+), 1 deletion(-)

 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 4e8d54c4..f98a3574 100644
+index 869e4dcc..9f6b6ad2 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
 @@ -81,6 +81,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -26,7 +26,7 @@ index 4e8d54c4..f98a3574 100644
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM,              "plm"              },
     { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
-@@ -177,6 +178,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -179,6 +180,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
     { LLM_KV_ATTENTION_OUTPUT_SCALE,                 "%s.attention.output_scale"                 },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,           "%s.attention.temperature_length"           },
@@ -34,7 +34,7 @@ index 4e8d54c4..f98a3574 100644
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,             "%s.attention.value_length_mla"             },
 
-@@ -1879,6 +1881,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1893,6 +1895,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
@@ -59,7 +59,7 @@ index 4e8d54c4..f98a3574 100644
     {
         LLM_ARCH_WAVTOKENIZER_DEC,
         {
-@@ -2368,6 +2388,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -2429,6 +2449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_LAUREL_POST_NORM,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,7 +68,7 @@ index 4e8d54c4..f98a3574 100644
     {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index b5c6f3d7..aa8e0e7b 100644
+index c3ae7165..dc7a362a 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
 @@ -85,6 +85,7 @@ enum llm_arch {
@@ -79,7 +79,7 @@ index b5c6f3d7..aa8e0e7b 100644
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
-@@ -181,6 +182,7 @@ enum llm_kv {
+@@ -183,6 +184,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -87,7 +87,7 @@ index b5c6f3d7..aa8e0e7b 100644
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
-@@ -417,6 +419,7 @@ enum llm_tensor {
+@@ -432,6 +434,7 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
@@ -96,10 +96,10 @@ index b5c6f3d7..aa8e0e7b 100644
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
 diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index c04ac58f..24a515a0 100644
+index db65d69e..b6bf6bbf 100644
 --- a/src/llama-hparams.cpp
 +++ b/src/llama-hparams.cpp
-@@ -147,6 +147,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
+@@ -151,6 +151,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
     return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
 }
 
@@ -115,7 +115,7 @@ index c04ac58f..24a515a0 100644
     if (il < n_layer) {
         return swa_layers[il];
 diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 0fe4b569..eb13709f 100644
+index 4e7f73ec..80582728 100644
 --- a/src/llama-hparams.h
 +++ b/src/llama-hparams.h
 @@ -64,6 +64,8 @@ struct llama_hparams {
@@ -127,7 +127,7 @@ index 0fe4b569..eb13709f 100644
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q           = 0;
     uint32_t n_lora_kv          = 0;
-@@ -236,6 +238,9 @@ struct llama_hparams {
+@@ -248,6 +250,9 @@ struct llama_hparams {
 
     uint32_t n_pos_per_embd() const;
 
@@ -138,22 +138,23 @@ index 0fe4b569..eb13709f 100644
 
     bool has_kv(uint32_t il) const;
 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index 8182a9ad..daef900c 100644
+index aa3a65f8..ee303bd5 100644
 --- a/src/llama-model-loader.cpp
 +++ b/src/llama-model-loader.cpp
-@@ -465,6 +465,7 @@ namespace GGUFMeta {
-     // TODO: this is not very clever - figure out something better
+@@ -466,7 +466,7 @@ namespace GGUFMeta {
     template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+     template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
+-
 +    template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);
 
 llama_model_loader::llama_model_loader(
         const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 2470f878..0398b553 100644
+index 36d495d6..74e1d162 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1845,6 +1845,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1865,6 +1865,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                }
             } break;
@@ -175,7 +176,7 @@ index 2470f878..0398b553 100644
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-@@ -5113,6 +5128,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -5170,6 +5185,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
@@ -210,7 +211,7 @@ index 2470f878..0398b553 100644
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-@@ -16273,6 +16316,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
+@@ -16392,6 +16435,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
     }
 };
 
@@ -376,7 +377,7 @@ index 2470f878..0398b553 100644
 // ref: https://github.com/facebookresearch/chameleon
 // based on the original build_llama() function, changes:
 //   * qk-norm
-@@ -19552,6 +19754,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
+@@ -19827,6 +20029,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params);
             } break;
@@ -387,7 +388,7 @@ index 2470f878..0398b553 100644
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
-@@ -19770,6 +19976,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -20057,6 +20263,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_GRANITE_HYBRID:
         case LLM_ARCH_CHAMELEON:
@@ -396,7 +397,7 @@ index 2470f878..0398b553 100644
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_SMOLLM3:
 diff --git a/src/llama-model.h b/src/llama-model.h
-index d73ce969..c086f94e 100644
+index 7f48662f..ec3fbd33 100644
 --- a/src/llama-model.h
 +++ b/src/llama-model.h
 @@ -76,6 +76,7 @@ enum llm_type {
@@ -407,9 +408,9 @@ index d73ce969..c086f94e 100644
     LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
-@@ -380,6 +381,8 @@ struct llama_layer {
-     // openai-moe
-     struct ggml_tensor * attn_sinks = nullptr;
+@@ -387,6 +388,8 @@ struct llama_layer {
+     struct ggml_tensor * ffn_act_beta    = nullptr;
+     struct ggml_tensor * ffn_act_eps     = nullptr;
 
 +    struct ggml_tensor * bskcn_tv = nullptr;
 +

--- a/llama/patches/0005-fix-deepseek-deseret-regex.patch
+++ b/llama/patches/0005-fix-deepseek-deseret-regex.patch
@@ -12,7 +12,7 @@ regex
 2 files changed, 22 insertions(+), 1 deletion(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 2a38abf4..26fa9fad 100644
+index 0b6edaf4..3de95c67 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {

--- a/llama/patches/0008-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
+++ b/llama/patches/0008-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
 1 file changed, 2 insertions(+)

 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index c8f3d859..ff6229a0 100644
+index 892c2331..09fdf5fc 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
-@@ -307,6 +307,7 @@ function(ggml_add_cpu_backend_variant tag_name)
+@@ -310,6 +310,7 @@ function(ggml_add_cpu_backend_variant tag_name)
     endif()
 
     ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -19,7 +19,7 @@ index c8f3d859..ff6229a0 100644
 endfunction()
 
 ggml_add_backend(CPU)
-@@ -317,6 +318,7 @@ if (GGML_CPU_ALL_VARIANTS)
+@@ -320,6 +321,7 @@ if (GGML_CPU_ALL_VARIANTS)
     elseif (GGML_CPU_ARM_ARCH)
         message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
     endif()

--- a/llama/patches/0009-remove-amx.patch
+++ b/llama/patches/0009-remove-amx.patch
@@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
 1 file changed, 4 deletions(-)

 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index ff6229a0..33b3a15f 100644
+index 09fdf5fc..0609c650 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
-@@ -327,10 +327,6 @@ if (GGML_CPU_ALL_VARIANTS)
+@@ -330,10 +330,6 @@ if (GGML_CPU_ALL_VARIANTS)
         ggml_add_cpu_backend_variant(skylakex     SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
         ggml_add_cpu_backend_variant(icelake      SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
         ggml_add_cpu_backend_variant(alderlake    SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)

--- a/llama/patches/0010-fix-string-arr-kv-loading.patch
+++ b/llama/patches/0010-fix-string-arr-kv-loading.patch
@@ -53,10 +53,10 @@ index 8cc4ef1c..d950dbdf 100644
 }
 
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 26fa9fad..64c78a16 100644
+index 3de95c67..217ede47 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1767,9 +1767,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1768,9 +1768,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
             if (precompiled_charsmap_keyidx != -1) {
                 const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);

--- a/llama/patches/0011-ollama-debug-tensor.patch
+++ b/llama/patches/0011-ollama-debug-tensor.patch
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
 1 file changed, 6 insertions(+)

 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index dbc07301..f8574d01 100644
+index ba2a36d9..99509b0c 100644
 --- a/ggml/src/ggml-cpu/ggml-cpu.c
 +++ b/ggml/src/ggml-cpu/ggml-cpu.c
 @@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index dbc07301..f8574d01 100644
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-@@ -2881,6 +2883,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+@@ -2887,6 +2889,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         ggml_compute_forward(&params, node);
 

--- a/llama/patches/0012-add-ollama-vocab-for-grammar-support.patch
+++ b/llama/patches/0012-add-ollama-vocab-for-grammar-support.patch
@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644
                       const char * grammar_root,
                               bool lazy,
 diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
-index 2186f827..8fb86009 100644
+index 55d2e355..da34526b 100644
 --- a/src/llama-sampling.cpp
 +++ b/src/llama-sampling.cpp
 @@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {

--- a/llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch
+++ b/llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch
@@ -12,10 +12,10 @@ Subject: [PATCH] add argsort and cuda copy for i32
 5 files changed, 256 insertions(+), 2 deletions(-)

 diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
-index 14f7dcf4..f7f8da35 100644
+index 1c43865f..31478dd8 100644
 --- a/ggml/src/ggml-cpu/ops.cpp
 +++ b/ggml/src/ggml-cpu/ops.cpp
-@@ -7893,6 +7893,45 @@ static void ggml_compute_forward_argsort_f32(
+@@ -7889,6 +7889,45 @@ static void ggml_compute_forward_argsort_f32(
     }
 }
 
@@ -61,7 +61,7 @@ index 14f7dcf4..f7f8da35 100644
 void ggml_compute_forward_argsort(
     const ggml_compute_params * params,
     ggml_tensor * dst) {
-@@ -7904,6 +7943,10 @@ void ggml_compute_forward_argsort(
+@@ -7900,6 +7939,10 @@ void ggml_compute_forward_argsort(
             {
                 ggml_compute_forward_argsort_f32(params, dst);
             } break;
@@ -272,10 +272,10 @@ index 746f4396..911220e9 100644
         ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index 96df6f0c..44dc31c0 100644
+index 74a9aa99..375a0c7f 100644
 --- a/ggml/src/ggml-metal/ggml-metal.metal
 +++ b/ggml/src/ggml-metal/ggml-metal.metal
-@@ -4428,8 +4428,72 @@ kernel void kernel_argsort_f32_i32(
+@@ -4346,8 +4346,72 @@ kernel void kernel_argsort_f32_i32(
     }
 }
 

--- a/llama/patches/0014-graph-memory-reporting-on-failure.patch
+++ b/llama/patches/0014-graph-memory-reporting-on-failure.patch
@@ -23,10 +23,10 @@ index 2cb150fd..7ab3f019 100644
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 62b6d65e..fe20dca3 100644
+index f1b74078..c54ff98b 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
-@@ -316,6 +316,7 @@ extern "C" {
+@@ -318,6 +318,7 @@ extern "C" {
 
     GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
     GGML_API size_t                     ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
@@ -35,10 +35,10 @@ index 62b6d65e..fe20dca3 100644
     GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
-index fa46f3b4..421ff7c7 100644
+index 929bc448..eee9d3b1 100644
 --- a/ggml/src/ggml-alloc.c
 +++ b/ggml/src/ggml-alloc.c
-@@ -492,6 +492,7 @@ struct node_alloc {
+@@ -486,6 +486,7 @@ struct node_alloc {
 struct ggml_gallocr {
     ggml_backend_buffer_type_t * bufts; // [n_buffers]
     struct vbuffer ** buffers; // [n_buffers]
@@ -46,7 +46,7 @@ index fa46f3b4..421ff7c7 100644
     struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
     int n_buffers;
 
-@@ -515,6 +516,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
+@@ -509,6 +510,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
     GGML_ASSERT(galloc->buffers != NULL);
 
@@ -56,7 +56,7 @@ index fa46f3b4..421ff7c7 100644
     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
     GGML_ASSERT(galloc->buf_tallocs != NULL);
 
-@@ -582,6 +586,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
+@@ -576,6 +580,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
     ggml_hash_set_free(&galloc->hash_set);
     free(galloc->hash_values);
     free(galloc->bufts);
@@ -64,7 +64,7 @@ index fa46f3b4..421ff7c7 100644
     free(galloc->buffers);
     free(galloc->buf_tallocs);
     free(galloc->node_allocs);
-@@ -875,6 +880,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
+@@ -869,6 +874,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         }
     }
 
@@ -73,7 +73,7 @@ index fa46f3b4..421ff7c7 100644
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
         // if the buffer type is used multiple times, we reuse the same buffer
-@@ -896,14 +903,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
+@@ -898,14 +905,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 
             ggml_vbuffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -96,7 +96,7 @@ index fa46f3b4..421ff7c7 100644
 }
 
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-@@ -1058,6 +1070,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+@@ -1060,6 +1072,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     return ggml_vbuffer_size(galloc->buffers[buffer_id]);
 }
 

--- a/llama/patches/0015-ggml-Export-GPU-UUIDs.patch
+++ b/llama/patches/0015-ggml-Export-GPU-UUIDs.patch
@@ -12,7 +12,7 @@ with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
 3 files changed, 63 insertions(+), 6 deletions(-)

 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index fe20dca3..48777212 100644
+index c54ff98b..229bf387 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
 @@ -158,6 +158,7 @@ extern "C" {
@@ -24,7 +24,7 @@ index fe20dca3..48777212 100644
         size_t memory_total;
         // device type
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index fdf8c63d..ad389ece 100644
+index c0b1e4c1..5b852f69 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
@@ -110,7 +110,7 @@ index fdf8c63d..ad389ece 100644
         std::string device_name(prop.name);
         if (device_name == "NVIDIA GeForce MX450") {
             turing_devices_without_mma.push_back({ id, device_name });
-@@ -3273,6 +3320,7 @@ struct ggml_backend_cuda_device_context {
+@@ -3276,6 +3323,7 @@ struct ggml_backend_cuda_device_context {
     std::string name;
     std::string description;
     std::string pci_bus_id;
@@ -118,7 +118,7 @@ index fdf8c63d..ad389ece 100644
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -3285,6 +3333,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
+@@ -3288,6 +3336,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
     return ctx->description.c_str();
 }
 
@@ -130,7 +130,7 @@ index fdf8c63d..ad389ece 100644
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
     ggml_cuda_set_device(ctx->device);
-@@ -3301,6 +3354,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -3304,6 +3357,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
 
     props->name        = ggml_backend_cuda_device_get_name(dev);
     props->description = ggml_backend_cuda_device_get_description(dev);
@@ -138,7 +138,7 @@ index fdf8c63d..ad389ece 100644
     props->type        = ggml_backend_cuda_device_get_type(dev);
     props->device_id   = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
     ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
-@@ -3871,6 +3925,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -3873,6 +3927,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                 cudaDeviceProp prop;
                 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
                 dev_ctx->description = prop.name;
@@ -147,7 +147,7 @@ index fdf8c63d..ad389ece 100644
                 char pci_bus_id[16] = {};
                 snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
 diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
-index 909e17de..08ab4fc9 100644
+index bf096227..f2ff9f32 100644
 --- a/ggml/src/ggml-metal/ggml-metal.cpp
 +++ b/ggml/src/ggml-metal/ggml-metal.cpp
 @@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen