Unverified commit 0cefd46f authored by Jeffrey Morgan, committed by GitHub

llama: update to commit de4c07f93 (#10655)

parent ad035ad5
...@@ -6,16 +6,16 @@ Subject: [PATCH] clip-unicode
fixes loading vision models in llama.cpp on windows
filesystems for paths that include wide characters
---
-examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
+tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
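
The clip-unicode patch works by converting the UTF-8 model path to UTF-16 before opening the file, so paths containing wide characters survive on Windows filesystems. A minimal sketch of that technique follows; it is not the exact code in tools/mtmd/clip.cpp, and the helper names are illustrative only.

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <fstream>
#include <stdexcept>
#include <string>

// sketch: convert a UTF-8 path to UTF-16 so the stream can open files whose
// names contain characters outside the current ANSI code page
static std::wstring utf8_to_wide(const std::string & s) {
    const int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, nullptr, 0);
    if (n <= 0) {
        throw std::runtime_error("MultiByteToWideChar failed");
    }
    std::wstring w(n, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, &w[0], n);
    w.resize(n - 1); // drop the terminating null the API writes
    return w;
}

static std::ifstream open_wide(const std::string & fname) {
    // MSVC's std::ifstream has a wide-path overload, which avoids the lossy
    // narrow-path conversion that breaks these filenames
    return std::ifstream(utf8_to_wide(fname), std::ios::binary);
}
#endif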
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index ad3e7df1..b3218c78 100644 index 41ba45a7..cdd8ca44 100644
--- a/examples/llava/clip.cpp --- a/tools/mtmd/clip.cpp
+++ b/examples/llava/clip.cpp +++ b/tools/mtmd/clip.cpp
@@ -30,6 +30,19 @@ @@ -31,6 +31,19 @@
#include <array>
#include <numeric> #include <numeric>
#include <functional>
+#if defined(_WIN32) +#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN
...@@ -32,8 +32,8 @@ index ad3e7df1..b3218c78 100644 ...@@ -32,8 +32,8 @@ index ad3e7df1..b3218c78 100644
+ +
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS enum ffn_op_type {
@@ -1971,7 +1984,29 @@ struct clip_model_loader { @@ -2190,7 +2203,29 @@ struct clip_model_loader {
{ {
std::vector<uint8_t> read_buf; std::vector<uint8_t> read_buf;
...@@ -63,7 +63,7 @@ index ad3e7df1..b3218c78 100644 ...@@ -63,7 +63,7 @@ index ad3e7df1..b3218c78 100644
if (!fin) { if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
} }
@@ -1998,7 +2033,11 @@ struct clip_model_loader { @@ -2217,7 +2252,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }
......
...@@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644 ...@@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644
}; };
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index ea73a8a7..a012aeae 100644 index 4cce5166..7f6617fa 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -439,6 +439,7 @@ namespace GGUFMeta { @@ -439,6 +439,7 @@ namespace GGUFMeta {
...@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644 ...@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader( llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 822e2bb2..572378c9 100644 index 3a4e72a3..831b68c0 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
...@@ -175,7 +175,7 @@ index 822e2bb2..572378c9 100644 ...@@ -175,7 +175,7 @@ index 822e2bb2..572378c9 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -3774,6 +3789,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...@@ -210,7 +210,7 @@ index 822e2bb2..572378c9 100644 ...@@ -210,7 +210,7 @@ index 822e2bb2..572378c9 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context { @@ -12397,6 +12440,165 @@ struct llm_build_chameleon : public llm_graph_context {
} }
}; };
...@@ -376,7 +376,7 @@ index 822e2bb2..572378c9 100644 ...@@ -376,7 +376,7 @@ index 822e2bb2..572378c9 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context { struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur; ggml_tensor * cur;
@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph( @@ -13157,6 +13359,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
llm = std::make_unique<llm_build_chameleon>(*this, params, gf); llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break; } break;
...@@ -387,7 +387,7 @@ index 822e2bb2..572378c9 100644 ...@@ -387,7 +387,7 @@ index 822e2bb2..572378c9 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf); llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -13301,6 +13507,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
...@@ -396,10 +396,10 @@ index 822e2bb2..572378c9 100644 ...@@ -396,10 +396,10 @@ index 822e2bb2..572378c9 100644
return LLAMA_ROPE_TYPE_NORM; return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index 95eca002..856e6042 100644 index 6bdec263..43746c7d 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -64,6 +64,7 @@ enum llm_type { @@ -65,6 +65,7 @@ enum llm_type {
LLM_TYPE_15B, LLM_TYPE_15B,
LLM_TYPE_16B, LLM_TYPE_16B,
LLM_TYPE_20B, LLM_TYPE_20B,
...@@ -407,7 +407,7 @@ index 95eca002..856e6042 100644 ...@@ -407,7 +407,7 @@ index 95eca002..856e6042 100644
LLM_TYPE_27B, LLM_TYPE_27B,
LLM_TYPE_30B, LLM_TYPE_30B,
LLM_TYPE_32B, LLM_TYPE_32B,
@@ -311,6 +312,8 @@ struct llama_layer { @@ -315,6 +316,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr; struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr; struct ggml_tensor * ffn_down_scale = nullptr;
......
...@@ -5,88 +5,27 @@ Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture
---
-examples/llava/llava.cpp | 5 +-
-examples/llava/mtmd.cpp | 6 +-
 ggml/src/ggml-backend-reg.cpp | 6 +-
 include/llama.h | 6 +
 src/llama-arch.cpp | 44 +++++
 src/llama-arch.h | 10 ++
 src/llama-batch.cpp | 3 +
-src/llama-context.cpp | 25 ++-
+src/llama-context.cpp | 23 ++-
 src/llama-context.h | 1 +
 src/llama-cparams.h | 1 +
 src/llama-graph.cpp | 25 +++
 src/llama-graph.h | 12 ++
 src/llama-hparams.cpp | 4 +
 src/llama-hparams.h | 7 +
-src/llama-kv-cache.cpp | 12 +-
+src/llama-kv-cache.cpp | 14 +-
 src/llama-model-loader.cpp | 2 +
-src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
+src/llama-model.cpp | 311 +++++++++++++++++++++++++++++++++-
 src/llama-model.h | 12 ++
 src/llama-quant.cpp | 4 +-
+tools/mtmd/llava.cpp | 5 +-
+tools/mtmd/mtmd-helper.cpp | 7 +-
-19 files changed, 473 insertions(+), 21 deletions(-)
+19 files changed, 475 insertions(+), 22 deletions(-)
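
Much of the mllama patch threads a new n_embd field through llama_batch so that image-embedding batches can carry an embedding width that differs from the text model's hidden size. Below is a sketch of how such a batch is assembled, assuming the field layout this patch adds; the helper itself is illustrative and not part of the patch.

#include <vector>
#include "llama.h"

// sketch: decode n_tokens embedding vectors of width n_embd starting at pos_0
// on sequence 0, using the llama_batch::n_embd field introduced by this patch
static int decode_embeddings(llama_context * lctx, float * embd,
                             int32_t n_embd, int32_t n_tokens, llama_pos pos_0) {
    std::vector<llama_pos>      pos(n_tokens);
    std::vector<int32_t>        n_seq_id(n_tokens, 1);
    std::vector<int8_t>         logits(n_tokens, 0);
    llama_seq_id                seq_id_0 = 0;
    std::vector<llama_seq_id *> seq_ids(n_tokens + 1, nullptr);
    for (int32_t i = 0; i < n_tokens; ++i) {
        pos[i]     = pos_0 + i;
        seq_ids[i] = &seq_id_0;
    }
    llama_batch batch = {
        /*n_tokens =*/ n_tokens,
        /*tokens   =*/ nullptr,
        /*embd     =*/ embd,
        /*n_embd   =*/ n_embd,   // new field added by this patch
        /*pos      =*/ pos.data(),
        /*n_seq_id =*/ n_seq_id.data(),
        /*seq_id   =*/ seq_ids.data(),
        /*logits   =*/ logits.data(),
    };
    return llama_decode(lctx, batch);
}

This mirrors what the llava_embd_batch and decode_embd_batch helpers in the hunks below do once they are passed the extra n_embd argument.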
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index c00d16ae..bab027b5 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -457,7 +457,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -469,6 +469,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 7081fd73..c14ac501 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -476,7 +476,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -487,6 +487,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..82ae1b5b 100644 index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp --- a/ggml/src/ggml-backend-reg.cpp
...@@ -105,10 +44,10 @@ index 405d8e31..82ae1b5b 100644 ...@@ -105,10 +44,10 @@ index 405d8e31..82ae1b5b 100644
register_backend(ggml_backend_rpc_reg()); register_backend(ggml_backend_rpc_reg());
#endif #endif
diff --git a/include/llama.h b/include/llama.h diff --git a/include/llama.h b/include/llama.h
index 06c56395..f1628e88 100644 index abedebdb..41beef21 100644
--- a/include/llama.h --- a/include/llama.h
+++ b/include/llama.h +++ b/include/llama.h
@@ -256,6 +256,7 @@ extern "C" { @@ -258,6 +258,7 @@ extern "C" {
llama_token * token; llama_token * token;
float * embd; float * embd;
...@@ -116,15 +55,15 @@ index 06c56395..f1628e88 100644 ...@@ -116,15 +55,15 @@ index 06c56395..f1628e88 100644
llama_pos * pos; llama_pos * pos;
int32_t * n_seq_id; int32_t * n_seq_id;
llama_seq_id ** seq_id; llama_seq_id ** seq_id;
@@ -358,6 +359,7 @@ extern "C" { @@ -365,6 +366,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings bool no_perf; // whether to measure performance timings
bool op_offload; // whether to offload host tensor operations to device
+ bool cross_attn; // whether to use cross attention + bool cross_attn; // whether to use cross attention
};
// Abort callback // model quantization parameters
// if it returns true, execution of llama_decode() will be aborted @@ -464,6 +466,10 @@ extern "C" {
@@ -459,6 +461,10 @@ extern "C" {
struct llama_context_params params), struct llama_context_params params),
"use llama_init_from_model instead"); "use llama_init_from_model instead");
...@@ -247,10 +186,10 @@ index 525c1b7d..bc8a4f0b 100644 ...@@ -247,10 +186,10 @@ index 525c1b7d..bc8a4f0b 100644
LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM, LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 01d5ca57..8682b0e6 100644 index a88b2fe3..241b316e 100644
--- a/src/llama-batch.cpp --- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp +++ b/src/llama-batch.cpp
@@ -316,6 +316,7 @@ struct llama_batch llama_batch_get_one( @@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens, /*n_tokens =*/ n_tokens,
/*tokens =*/ tokens, /*tokens =*/ tokens,
/*embd =*/ nullptr, /*embd =*/ nullptr,
...@@ -258,7 +197,7 @@ index 01d5ca57..8682b0e6 100644 ...@@ -258,7 +197,7 @@ index 01d5ca57..8682b0e6 100644
/*pos =*/ nullptr, /*pos =*/ nullptr,
/*n_seq_id =*/ nullptr, /*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr, /*seq_id =*/ nullptr,
@@ -328,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ @@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0, /*n_tokens =*/ 0,
/*tokens =*/ nullptr, /*tokens =*/ nullptr,
/*embd =*/ nullptr, /*embd =*/ nullptr,
...@@ -266,7 +205,7 @@ index 01d5ca57..8682b0e6 100644 ...@@ -266,7 +205,7 @@ index 01d5ca57..8682b0e6 100644
/*pos =*/ nullptr, /*pos =*/ nullptr,
/*n_seq_id =*/ nullptr, /*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr, /*seq_id =*/ nullptr,
@@ -336,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ @@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) { if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
...@@ -275,10 +214,10 @@ index 01d5ca57..8682b0e6 100644 ...@@ -275,10 +214,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
} }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 9c1fe93f..cd06ad91 100644 index dca22d8b..c22687e4 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) { @@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
} }
...@@ -287,7 +226,7 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -287,7 +226,7 @@ index 9c1fe93f..cd06ad91 100644
} catch (const std::exception & err) { } catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG #ifndef NDEBUG
@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) { @@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value; cparams.warmup = value;
} }
...@@ -298,16 +237,16 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -298,16 +237,16 @@ index 9c1fe93f..cd06ad91 100644
void llama_context::set_adapter_lora( void llama_context::set_adapter_lora(
llama_adapter_lora * adapter, llama_adapter_lora * adapter,
float scale) { float scale) {
@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) { @@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
- sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+ sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); + llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
const llama_ubatch ubatch = sbatch.split_simple(n_tokens); const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -863,10 +867,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch; const llama_batch & batch = batch_allocr.batch;
...@@ -319,16 +258,16 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -319,16 +258,16 @@ index 9c1fe93f..cd06ad91 100644
const int64_t n_tokens_all = batch.n_tokens; const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1087,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// make the outputs have the same order they had in the user-provided batch
const bool logits_all = n_outputs_all == n_tokens_all; // note: this is mostly relevant for recurrent models atm
if (!sorted_output) {
- sbatch.from_batch(batch, n_embd, - const uint32_t n_vocab = model.vocab.n_tokens();
+ sbatch.from_batch(batch, batch.n_embd, + const uint32_t n_vocab = model.hparams.n_vocab;
/* simple_split */ !kv_self->recurrent, const uint32_t n_embd = model.hparams.n_embd;
/* logits_all */ logits_all);
@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) { GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -1142,12 +1145,11 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) { int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
...@@ -342,16 +281,7 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -342,16 +281,7 @@ index 9c1fe93f..cd06ad91 100644
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead // TODO: use a per-batch flag for logits presence instead
@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { @@ -1682,7 +1684,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
- const uint32_t n_vocab = model.vocab.n_tokens();
+ const uint32_t n_vocab = model.hparams.n_vocab;
const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{ {
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
...@@ -360,15 +290,15 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -360,15 +290,15 @@ index 9c1fe93f..cd06ad91 100644
io.write(&logits_size, sizeof(logits_size)); io.write(&logits_size, sizeof(logits_size));
@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() { @@ -2091,6 +2093,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false, /*.flash_attn =*/ false,
/*.no_perf =*/ true, /*.no_perf =*/ true,
/*.op_offload =*/ true,
+ /*.cross_attn =*/ false, + /*.cross_attn =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
}; };
@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
return result;
@@ -2216,6 +2219,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup); ctx->set_warmup(warmup);
} }
...@@ -380,10 +310,10 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -380,10 +310,10 @@ index 9c1fe93f..cd06ad91 100644
ctx->synchronize(); ctx->synchronize();
} }
diff --git a/src/llama-context.h b/src/llama-context.h diff --git a/src/llama-context.h b/src/llama-context.h
index 5457f077..a50c4afa 100644 index c0ceacb1..c4ab242a 100644
--- a/src/llama-context.h --- a/src/llama-context.h
+++ b/src/llama-context.h +++ b/src/llama-context.h
@@ -65,6 +65,7 @@ struct llama_context { @@ -71,6 +71,7 @@ struct llama_context {
void set_embeddings (bool value); void set_embeddings (bool value);
void set_causal_attn(bool value); void set_causal_attn(bool value);
void set_warmup(bool value); void set_warmup(bool value);
...@@ -392,22 +322,22 @@ index 5457f077..a50c4afa 100644 ...@@ -392,22 +322,22 @@ index 5457f077..a50c4afa 100644
void set_adapter_lora( void set_adapter_lora(
llama_adapter_lora * adapter, llama_adapter_lora * adapter,
diff --git a/src/llama-cparams.h b/src/llama-cparams.h diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 30e550f0..85ad91b9 100644 index 246fa577..7a6156ce 100644
--- a/src/llama-cparams.h --- a/src/llama-cparams.h
+++ b/src/llama-cparams.h +++ b/src/llama-cparams.h
@@ -29,6 +29,7 @@ struct llama_cparams { @@ -31,6 +31,7 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf; bool no_perf;
+ bool cross_attn;
bool warmup; bool warmup;
bool op_offload;
+ bool cross_attn;
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index fabb9ca2..b67216a4 100644 index b0e3f635..f14869cf 100644
--- a/src/llama-graph.cpp --- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp +++ b/src/llama-graph.cpp
@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { @@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
} }
} }
...@@ -420,7 +350,7 @@ index fabb9ca2..b67216a4 100644 ...@@ -420,7 +350,7 @@ index fabb9ca2..b67216a4 100644
// //
// llm_graph_context // llm_graph_context
// //
@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { @@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
} }
...@@ -447,10 +377,10 @@ index fabb9ca2..b67216a4 100644 ...@@ -447,10 +377,10 @@ index fabb9ca2..b67216a4 100644
llm_graph_input_attn_cross * inp, llm_graph_input_attn_cross * inp,
ggml_cgraph * gf, ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h diff --git a/src/llama-graph.h b/src/llama-graph.h
index d0c8d321..0fe18150 100644 index 832a8c09..5a322785 100644
--- a/src/llama-graph.h --- a/src/llama-graph.h
+++ b/src/llama-graph.h +++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public: @@ -87,6 +87,7 @@ public:
ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
...@@ -458,7 +388,7 @@ index d0c8d321..0fe18150 100644 ...@@ -458,7 +388,7 @@ index d0c8d321..0fe18150 100644
}; };
class llm_graph_input_pos : public llm_graph_input_i { class llm_graph_input_pos : public llm_graph_input_i {
@@ -283,6 +284,16 @@ public: @@ -284,6 +285,16 @@ public:
const llama_cross * cross = nullptr; const llama_cross * cross = nullptr;
}; };
...@@ -475,7 +405,7 @@ index d0c8d321..0fe18150 100644 ...@@ -475,7 +405,7 @@ index d0c8d321..0fe18150 100644
// //
// llm_graph_result // llm_graph_result
// //
@@ -491,6 +502,7 @@ struct llm_graph_context { @@ -495,6 +506,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const; ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const; ggml_tensor * build_inp_s_mask() const;
...@@ -535,11 +465,11 @@ index 48dce407..b6fc7e6d 100644 ...@@ -535,11 +465,11 @@ index 48dce407..b6fc7e6d 100644
}; };
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 7c9d46d8..69f8d35a 100644 index 3dcad65b..a7b0a7eb 100644
--- a/src/llama-kv-cache.cpp --- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp
@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init( @@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified(
return false; throw std::runtime_error("failed to create ggml context for kv cache");
} }
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
...@@ -557,8 +487,17 @@ index 7c9d46d8..69f8d35a 100644 ...@@ -557,8 +487,17 @@ index 7c9d46d8..69f8d35a 100644
ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i); ggml_format_name(v, "cache_v_l%d", i);
k_l.push_back(k); k_l.push_back(k);
@@ -446,7 +454,7 @@ void llama_kv_cache_unified::set_full() {
llama_sbatch llama_kv_cache_unified::sbatch_init(
const llama_batch & batch,
bool logits_all) {
- return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+ return llama_sbatch(batch, batch.n_embd, true, logits_all);
}
llama_ubatch llama_kv_cache_unified::ubatch_next(
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index a012aeae..2e11507d 100644 index 7f6617fa..2acfd4a8 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -315,6 +315,8 @@ namespace GGUFMeta { @@ -315,6 +315,8 @@ namespace GGUFMeta {
...@@ -571,10 +510,10 @@ index a012aeae..2e11507d 100644 ...@@ -571,10 +510,10 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str()); const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 572378c9..9d099f11 100644 index 831b68c0..e8298f56 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv // get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false); ml.get_key(LLM_KV_GENERAL_NAME, name, false);
...@@ -582,7 +521,7 @@ index 572378c9..9d099f11 100644 ...@@ -582,7 +521,7 @@ index 572378c9..9d099f11 100644
// everything past this point is not vocab-related // everything past this point is not vocab-related
if (hparams.vocab_only) { if (hparams.vocab_only) {
@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
...@@ -590,7 +529,7 @@ index 572378c9..9d099f11 100644 ...@@ -590,7 +529,7 @@ index 572378c9..9d099f11 100644
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
...@@ -602,7 +541,7 @@ index 572378c9..9d099f11 100644 ...@@ -602,7 +541,7 @@ index 572378c9..9d099f11 100644
// n_head_kv is optional, default to n_head // n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr; hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
...@@ -611,7 +550,7 @@ index 572378c9..9d099f11 100644 ...@@ -611,7 +550,7 @@ index 572378c9..9d099f11 100644
if (hparams.n_rot != hparams.n_embd_head_k) { if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
} }
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.use_kq_norm = false; hparams.use_kq_norm = false;
} }
} break; } break;
...@@ -628,7 +567,7 @@ index 572378c9..9d099f11 100644 ...@@ -628,7 +567,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff(); const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_embd_gqa = n_embd_v_gqa;
...@@ -637,7 +576,7 @@ index 572378c9..9d099f11 100644 ...@@ -637,7 +576,7 @@ index 572378c9..9d099f11 100644
const int64_t n_token_types = vocab.n_token_types(); const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot; const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert; const int64_t n_expert = hparams.n_expert;
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} }
} }
} break; } break;
...@@ -690,7 +629,7 @@ index 572378c9..9d099f11 100644 ...@@ -690,7 +629,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context { @@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context {
} }
}; };
...@@ -832,7 +771,7 @@ index 572378c9..9d099f11 100644 ...@@ -832,7 +771,7 @@ index 572378c9..9d099f11 100644
+ // self attention layer + // self attention layer
+ +
+ // rope freq factors for llama3; may return nullptr for llama2 and other models + // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ +
+ // compute Q and K and RoPE them + // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
...@@ -937,7 +876,16 @@ index 572378c9..9d099f11 100644 ...@@ -937,7 +876,16 @@ index 572378c9..9d099f11 100644
struct llm_build_deci : public llm_graph_context { struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph( @@ -12496,7 +12796,7 @@ struct llm_build_solar : public llm_graph_context {
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -13128,6 +13428,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
llm = std::make_unique<llm_build_llama>(*this, params, gf); llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break; } break;
...@@ -948,7 +896,7 @@ index 572378c9..9d099f11 100644 ...@@ -948,7 +896,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
llm = std::make_unique<llm_build_deci>(*this, params, gf); llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values // use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4: case LLM_ARCH_LLAMA4:
...@@ -957,7 +905,7 @@ index 572378c9..9d099f11 100644 ...@@ -957,7 +905,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_BAICHUAN: case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER: case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index 856e6042..6be91282 100644 index 43746c7d..9281e629 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -11,6 +11,7 @@ @@ -11,6 +11,7 @@
...@@ -968,7 +916,7 @@ index 856e6042..6be91282 100644 ...@@ -968,7 +916,7 @@ index 856e6042..6be91282 100644
struct llama_cparams; struct llama_cparams;
struct llama_ubatch; struct llama_ubatch;
@@ -73,6 +74,7 @@ enum llm_type { @@ -74,6 +75,7 @@ enum llm_type {
LLM_TYPE_40B, LLM_TYPE_40B,
LLM_TYPE_65B, LLM_TYPE_65B,
LLM_TYPE_70B, LLM_TYPE_70B,
...@@ -976,7 +924,7 @@ index 856e6042..6be91282 100644 ...@@ -976,7 +924,7 @@ index 856e6042..6be91282 100644
LLM_TYPE_236B, LLM_TYPE_236B,
LLM_TYPE_290B, LLM_TYPE_290B,
LLM_TYPE_314B, LLM_TYPE_314B,
@@ -314,6 +316,16 @@ struct llama_layer { @@ -318,6 +320,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr; struct ggml_tensor * bskcn_tv = nullptr;
...@@ -994,7 +942,7 @@ index 856e6042..6be91282 100644 ...@@ -994,7 +942,7 @@ index 856e6042..6be91282 100644
struct llama_layer_convnext convnext; struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7dc54227..223e1f3f 100644 index 820d5128..56531980 100644
--- a/src/llama-quant.cpp --- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp +++ b/src/llama-quant.cpp
@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: @@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
...@@ -1008,3 +956,72 @@ index 7dc54227..223e1f3f 100644 ...@@ -1008,3 +956,72 @@ index 7dc54227..223e1f3f 100644
} }
size_t total_size_org = 0; size_t total_size_org = 0;
diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp
index ebef8b3c..b0eb79bb 100644
--- a/tools/mtmd/llava.cpp
+++ b/tools/mtmd/llava.cpp
@@ -462,7 +462,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -474,6 +474,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -497,7 +498,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 7a328867..61ebdd43 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -58,7 +58,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -69,6 +69,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -131,6 +132,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ batch.embd + offset * n_mmproj_embd,
+ /*n_embd =*/ batch.n_embd,
/*pos =*/ pos_ptr,
/*n_seq_id =*/ batch.n_seq_id + offset,
/*seq_id =*/ batch.seq_id + offset,
@@ -166,7 +168,8 @@ int32_t mtmd_helper_decode_image_chunk(
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
- decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_embd(encoded_embd, n_embd, n_tokens, n_past, seq_id);
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
...@@ -18,7 +18,7 @@ adds the unpad operator to GGML
 10 files changed, 223 insertions(+), 2 deletions(-)
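
The unpad operator added by this patch is the inverse of GGML_OP_PAD: it drops trailing elements from each dimension instead of appending zeros. The exact ggml_unpad declaration is not visible in this excerpt; the sketch below assumes it mirrors ggml_pad with one trailing amount per dimension, so treat the call as an assumption and check ggml.h in the patched tree.

#include "ggml.h"

// sketch: pad a tensor for an op that needs the larger shape, then use the
// new unpad op to crop back to the original extent (signature assumed)
static struct ggml_tensor * pad_roundtrip(struct ggml_context * ctx, struct ggml_tensor * x) {
    struct ggml_tensor * padded  = ggml_pad  (ctx, x,      2, 0, 0, 0); // ne[0] grows by 2, zero-filled
    struct ggml_tensor * cropped = ggml_unpad(ctx, padded, 2, 0, 0, 0); // ne[0] shrinks by 2 again
    return cropped; // same shape as x
}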
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1b8603e7..53ef31b2 100644 index e91dedf1..8dc107ba 100644
--- a/ggml/include/ggml.h --- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h +++ b/ggml/include/ggml.h
@@ -489,6 +489,7 @@ extern "C" { @@ -489,6 +489,7 @@ extern "C" {
...@@ -29,7 +29,7 @@ index 1b8603e7..53ef31b2 100644 ...@@ -29,7 +29,7 @@ index 1b8603e7..53ef31b2 100644
GGML_OP_ARANGE, GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
@@ -1777,6 +1778,15 @@ extern "C" { @@ -1781,6 +1782,15 @@ extern "C" {
int p0, int p0,
int p1); int p1);
...@@ -46,10 +46,10 @@ index 1b8603e7..53ef31b2 100644 ...@@ -46,10 +46,10 @@ index 1b8603e7..53ef31b2 100644
// timesteps: [N,] // timesteps: [N,]
// return: [N, dim] // return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 64405449..34624cca 100644 index a30e67f2..835e6495 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm @@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_pad_reflect_1d(params, tensor); ggml_compute_forward_pad_reflect_1d(params, tensor);
} break; } break;
...@@ -60,7 +60,7 @@ index 64405449..34624cca 100644 ...@@ -60,7 +60,7 @@ index 64405449..34624cca 100644
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
{ {
ggml_compute_forward_arange(params, tensor); ggml_compute_forward_arange(params, tensor);
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { @@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D: case GGML_OP_PAD_REFLECT_1D:
...@@ -69,10 +69,10 @@ index 64405449..34624cca 100644 ...@@ -69,10 +69,10 @@ index 64405449..34624cca 100644
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 7413192b..becdae07 100644 index 955fec59..1868a10c 100644
--- a/ggml/src/ggml-cpu/ops.cpp --- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d( @@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
} }
} }
...@@ -147,10 +147,10 @@ index dc081b9e..a7125555 100644 ...@@ -147,10 +147,10 @@ index dc081b9e..a7125555 100644
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 04ce764e..491acccb 100644 index cb0d8528..6fe86674 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg @@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD: case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst); ggml_cuda_op_pad(ctx, dst);
break; break;
...@@ -160,7 +160,7 @@ index 04ce764e..491acccb 100644 ...@@ -160,7 +160,7 @@ index 04ce764e..491acccb 100644
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst); ggml_cuda_op_arange(ctx, dst);
break; break;
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g @@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD: case GGML_OP_PAD:
...@@ -233,10 +233,10 @@ index 8fd386b0..e2ededc3 100644 ...@@ -233,10 +233,10 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 425524d0..112abef6 100644 index 1b56f858..7641247e 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte @@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
...@@ -244,7 +244,7 @@ index 425524d0..112abef6 100644 ...@@ -244,7 +244,7 @@ index 425524d0..112abef6 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass @@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
...@@ -252,7 +252,7 @@ index 425524d0..112abef6 100644 ...@@ -252,7 +252,7 @@ index 425524d0..112abef6 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex @@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D: case GGML_OP_PAD_REFLECT_1D:
...@@ -260,7 +260,7 @@ index 425524d0..112abef6 100644 ...@@ -260,7 +260,7 @@ index 425524d0..112abef6 100644
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node( @@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(
const int nth = MIN(1024, ne0); const int nth = MIN(1024, ne0);
...@@ -298,10 +298,10 @@ index 425524d0..112abef6 100644 ...@@ -298,10 +298,10 @@ index 425524d0..112abef6 100644
} break; } break;
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 9f4147e9..6ceb3cef 100644 index 9cfddf45..080a943b 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal --- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32( @@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
} }
} }
...@@ -354,7 +354,7 @@ index 9f4147e9..6ceb3cef 100644 ...@@ -354,7 +354,7 @@ index 9f4147e9..6ceb3cef 100644
device char * dst, device char * dst,
constant ggml_metal_kargs_arange & args, constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7654ae17..3c57aff8 100644 index 8a654624..6b034d35 100644
--- a/ggml/src/ggml.c --- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c +++ b/ggml/src/ggml.c
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { @@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
...@@ -391,7 +391,7 @@ index 7654ae17..3c57aff8 100644 ...@@ -391,7 +391,7 @@ index 7654ae17..3c57aff8 100644
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( @@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result; return result;
} }
......
...@@ -12,10 +12,10 @@ regex
 2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a9ee9f03..1306864e 100644 index 806c1b3d..10f34d33 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { @@ -298,7 +298,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = { regex_exprs = {
"[\r\n]", "[\r\n]",
......
...@@ -15,33 +15,102 @@ but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
-src/llama-context.cpp | 105 +++++++++++++----------------------------
-src/llama-context.h | 4 +-
-src/llama-kv-cache.cpp | 39 +++------------
-src/llama-kv-cache.h | 9 +++-
-4 files changed, 51 insertions(+), 106 deletions(-)
+src/llama-context.h | 1 +
+src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
+src/llama-kv-cache.h | 12 ++++-
+3 files changed, 47 insertions(+), 73 deletions(-)
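
The fix this description refers to appears later in the diff as a loop in llama_kv_cache_unified::update() that splits defrag_info.moves into chunks sized to the scheduler's node budget. A self-contained sketch of that batching idea follows; the node-cost constants and names here are illustrative, not the exact code.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct kv_move { uint32_t src, dst, len; };

// run all pending KV-cache moves in batches small enough that a single
// compute graph never exceeds max_nodes
template <typename RunBatch>
static void defrag_in_batches(const std::vector<kv_move> & moves,
                              uint32_t max_nodes, uint32_t n_layer, RunBatch run_batch) {
    // roughly 6 graph nodes per move per layer, plus ~2 nodes per layer of overhead
    const std::size_t max_moves = std::max<uint32_t>(1u, (max_nodes - 2*n_layer) / (6*n_layer));
    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
        const std::size_t end = std::min(i + max_moves, moves.size());
        run_batch(std::vector<kv_move>(moves.begin() + i, moves.begin() + end)); // one graph per chunk
    }
}

In the patch itself, max_moves is derived from lctx.graph_max_nodes() and model.hparams.n_layer, and each chunk gets its own graph via build_graph_defrag, as shown in the hunks below.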
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.h b/src/llama-context.h
index cd06ad91..77177c5e 100644 index c4ab242a..9970dfc6 100644
--- a/src/llama-context.cpp --- a/src/llama-context.h
+++ b/src/llama-context.cpp +++ b/src/llama-context.h
@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( @@ -5,6 +5,7 @@
#include "llama-cparams.h"
llm_graph_result_ptr llama_context::build_kv_self_defrag( #include "llama-graph.h"
ggml_context * ctx0, #include "llama-adapter.h"
- ggml_cgraph * gf) const { +#include "llama-kv-cache.h"
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const { #include "ggml-cpp.h"
auto res = std::make_unique<llm_graph_result>(); #include "ggml-opt.h"
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a7b0a7eb..1a50c034 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() {
}
bool llama_kv_cache_unified::update(llama_context & lctx) {
- bool need_reserve = false;
-
auto * sched = lctx.get_sched();
if (has_shift) {
@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
res->set_inputs(nullptr);
lctx.graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
if (do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = lctx.graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return false;
+ }
+
+ for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, defrag_info.moves.size());
+ chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end);
- if (defrag_prepare(lctx.graph_max_nodes())) {
ggml_backend_sched_reset(sched);
auto * gf = lctx.graph_init();
- auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
+ auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk);
ggml_backend_sched_alloc_graph(sched, gf);
res->set_inputs(nullptr);
lctx.graph_compute(gf, false);
-
- need_reserve = true;
}
do_defrag = false;
}
const auto & hparams = model.hparams; - return need_reserve;
+ // we never need to reserve a worst case graph
+ return false;
}
- const auto & ids = kv_self->defrag_info.ids; void llama_kv_cache_unified::defrag_sched(float thold) {
@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
- ggml_cgraph * gf) const {
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
auto res = std::make_unique<llm_graph_result>();
- const auto & ids = defrag_info.ids;
- -
#if 0 #if 0
// CPU defrag // CPU defrag
// //
@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( @@ -791,32 +795,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
} }
#else #else
...@@ -63,188 +132,63 @@ index cd06ad91..77177c5e 100644 ...@@ -63,188 +132,63 @@ index cd06ad91..77177c5e 100644
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il], ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm, - n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len, + n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i)); - ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src)); + ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il], ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm, - n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len, + n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id)); - ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst)); + ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src; ggml_tensor * view_v_src;
ggml_tensor * view_v_dst; ggml_tensor * view_v_dst;
@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( @@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
if (cparams.flash_attn) { if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention // NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], view_v_src = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm, - n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len, + n_embd_v_gqa, move.len,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i)); - ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src)); + ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], view_v_dst = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm, - n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len, + move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id)); - ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst)); + ggml_row_size(v_l[il]->type, move.src));
} else { } else {
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], view_v_src = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa, - nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa, + move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size), ggml_row_size(v_l[il]->type, size),
- ggml_row_size(kv_self->v_l[il]->type, i)); - ggml_row_size(v_l[il]->type, i));
+ ggml_row_size(kv_self->v_l[il]->type, move.src)); + ggml_row_size(v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], view_v_dst = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa, - nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa, + move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size), ggml_row_size(v_l[il]->type, size),
- ggml_row_size(kv_self->v_l[il]->type, id)); - ggml_row_size(v_l[il]->type, id));
+ ggml_row_size(kv_self->v_l[il]->type, move.dst)); + ggml_row_size(v_l[il]->type, move.dst));
} }
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
} }
- -
- i += nm - 1; - i += nm - 1;
} }
-
- //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif
return res;
@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
- bool need_reserve = false;
-
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!kv->defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return;
+ }
- if (kv->defrag_prepare(graph_max_nodes())) {
- ggml_backend_sched_reset(sched.get());
+ for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
+ chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
+ ggml_backend_sched_reset(sched.get());
auto * gf = graph_init();
-
- auto res = build_kv_self_defrag(ctx_compute.get(), gf);
-
+ auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
ggml_backend_sched_alloc_graph(sched.get(), gf);
-
res->set_inputs(nullptr);
-
graph_compute(gf, false);
-
- need_reserve = true;
}
kv->do_defrag = false;
}
-
- // reserve a worst case graph if needed
- if (need_reserve) {
- LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
-
- // build worst-case graph
- uint32_t n_seqs = 1; // TODO: worst-case number of sequences
- uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
- // simulate full KV cache
- kv_self->n = kv_self->size;
-
- llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
- llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
- auto * gf = graph_init();
- graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
-
- // initialize scheduler with the worst-case graph
- ggml_backend_sched_reset(sched.get());
- if (!ggml_backend_sched_reserve(sched.get(), gf)) {
- LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
- }
- }
}
enum llama_pooling_type llama_context::pooling_type() const {
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
+ kv_self->defrag();
+ kv_self_update();
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ return 1;
+ }
}
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index a50c4afa..30f84bfd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
+#include "llama-kv-cache.h"
#include "ggml-cpp.h"
@@ -179,7 +180,8 @@ private: //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const;
// TODO: read/write lora adapters and cvec
size_t state_write_data(llama_io_write_i & io);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 69f8d35a..35a750d3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
assert(n_used <= n_kv); assert(n_used <= n_kv);
...@@ -263,7 +207,7 @@ index 69f8d35a..35a750d3 100644 ...@@ -263,7 +207,7 @@ index 69f8d35a..35a750d3 100644
// determine which KV cells to move where // determine which KV cells to move where
// //
@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// //
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
// //
...@@ -275,7 +219,7 @@ index 69f8d35a..35a750d3 100644 ...@@ -275,7 +219,7 @@ index 69f8d35a..35a750d3 100644
for (uint32_t i0 = 0; i0 < n_used; ++i0) { for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0]; const auto & cell0 = cells[i0];
@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory? // are we moving a continuous block of memory?
bool cont = false; bool cont = false;
...@@ -295,7 +239,7 @@ index 69f8d35a..35a750d3 100644 ...@@ -295,7 +239,7 @@ index 69f8d35a..35a750d3 100644
cont = false; cont = false;
continue; continue;
} }
@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used; head = n_used;
if (!cont) { if (!cont) {
...@@ -307,7 +251,7 @@ index 69f8d35a..35a750d3 100644 ...@@ -307,7 +251,7 @@ index 69f8d35a..35a750d3 100644
} }
nf++; nf++;
@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
} }
} }
...@@ -325,37 +269,47 @@ index 69f8d35a..35a750d3 100644 ...@@ -325,37 +269,47 @@ index 69f8d35a..35a750d3 100644
return false; return false;
} }
- LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves); - LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
- -
- LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer); - LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
+ // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves); + // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
return true; return true;
} }
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 56c74035..25cbcb56 100644 index bf3b4b6a..928b9712 100644
--- a/src/llama-kv-cache.h --- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h
@@ -43,6 +43,13 @@ private: @@ -82,6 +82,13 @@ struct llama_kv_cache_guard {
private:
llama_kv_cache * kv; llama_kv_cache * kv;
}; };
+
+// block of KV slots to move when defragging +// block of KV slots to move when defragging
+struct llama_kv_defrag_move { +struct llama_kv_defrag_move {
+ uint32_t src; + uint32_t src;
+ uint32_t dst; + uint32_t dst;
+ uint32_t len; + uint32_t len;
+}; +};
+
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
@@ -131,7 +138,7 @@ public:
// defrag
//
// llama_kv_cache_unified
@@ -207,7 +214,7 @@ private:
// defrag
struct { struct {
- std::vector<uint32_t> ids; - std::vector<uint32_t> ids;
+ std::vector<llama_kv_defrag_move> moves; + std::vector<llama_kv_defrag_move> moves;
} defrag_info; } defrag_info;
// return true if cells have been moved // return true if cells have been moved
@@ -249,7 +256,8 @@ private:
llm_graph_result_ptr build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<llama_kv_defrag_move> & moves) const;
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
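As a rough worked example of the node budget behind the move chunking in this patch (numbers are illustrative, not taken from the patch): each queued llama_kv_defrag_move expands into a handful of view/copy nodes per layer, so the full move list is split into chunks small enough that every defrag graph stays under graph_max_nodes().
    // illustrative only: how many moves fit into one defrag graph
    const uint32_t n_max_nodes = 8192;                                   // assumed graph_max_nodes() value
    const uint32_t n_layer     = 32;                                     // assumed model depth
    const uint32_t max_moves   = (n_max_nodes - 2*n_layer)/(6*n_layer);  // (8192 - 64)/192 = 42 moves per graph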
...@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants ...@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+) 1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 43d9fc4f..4c0d3824 100644 index ddea5ad3..45918bf6 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name) @@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
......
...@@ -9,7 +9,7 @@ disable amx as it reduces performance on some systems ...@@ -9,7 +9,7 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-) 1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 4c0d3824..79c26312 100644 index 45918bf6..0beaed86 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS) @@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
......
...@@ -53,15 +53,15 @@ index 381a9c7d..e45b453d 100644 ...@@ -53,15 +53,15 @@ index 381a9c7d..e45b453d 100644
} }
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 1306864e..d6515ff6 100644 index 10f34d33..b098bb25 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1471,7 +1471,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); - const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
if (precompiled_charsmap_keyidx != -1) { + const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
- size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN #ifdef IS_BIG_ENDIAN
...@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor ...@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+) 1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 34624cca..59bd3c62 100644 index 835e6495..3902894b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@ @@ -15,6 +15,8 @@
...@@ -20,7 +20,7 @@ index 34624cca..59bd3c62 100644 ...@@ -20,7 +20,7 @@ index 34624cca..59bd3c62 100644
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW #include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { @@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
......
...@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644 ...@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644
const char * grammar_root, const char * grammar_root,
bool lazy, bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index c0a5f934..75731053 100644 index 804b11e0..15a10ca8 100644
--- a/src/llama-sampling.cpp --- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { @@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)
The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
pointer
- Build and allocate a new graph where that data is NULL
Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed
This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
ggml/src/ggml-alloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+ // If we previously had data but don't now then reallocate
+ if (talloc->buffer_id < 0) {
+ return false;
+ }
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
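A minimal sketch of the two-pass sequence described in the commit message above, using the public gallocr API. The graph contents, sizes, and the ggml_scale op are illustrative assumptions; this may not reproduce the exact assertion, it only shows how a tensor's data pointer can go from non-NULL to NULL between allocations on the same allocator.
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main(void) {
        ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

        // first graph: the leaf tensor gets its data allocated inside the context (data != NULL)
        struct ggml_init_params p1 = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx1 = ggml_init(p1);
        struct ggml_tensor  * a    = ggml_new_tensor_1d(ctx1, GGML_TYPE_F32, 8);
        struct ggml_cgraph  * g1   = ggml_new_graph(ctx1);
        ggml_build_forward_expand(g1, ggml_scale(ctx1, a, 2.0f));
        ggml_gallocr_alloc_graph(galloc, g1);   // "a" is already backed by ctx1, so nothing is planned for it

        // second graph: same shape of graph, but now the leaf has data == NULL (no_alloc = true)
        struct ggml_init_params p2 = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true };
        struct ggml_context * ctx2 = ggml_init(p2);
        struct ggml_tensor  * b    = ggml_new_tensor_1d(ctx2, GGML_TYPE_F32, 8);
        struct ggml_cgraph  * g2   = ggml_new_graph(ctx2);
        ggml_build_forward_expand(g2, ggml_scale(ctx2, b, 2.0f));
        ggml_gallocr_alloc_graph(galloc, g2);   // pre-patch this revalidation could assert; now it falls back to a full reallocation

        ggml_gallocr_free(galloc);
        ggml_free(ctx2);
        ggml_free(ctx1);
        return 0;
    }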
...@@ -406,6 +406,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -406,6 +406,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
C.int(len(schedBackends)), C.int(len(schedBackends)),
C.size_t(maxGraphNodes), C.size_t(maxGraphNodes),
C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)), C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
C._Bool(false),
), ),
schedBackends: schedBackends, schedBackends: schedBackends,
schedBufts: schedBufts, schedBufts: schedBufts,
......
...@@ -38,7 +38,7 @@ extern "C" { ...@@ -38,7 +38,7 @@ extern "C" {
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
...@@ -59,7 +59,7 @@ extern "C" { ...@@ -59,7 +59,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
...@@ -248,7 +248,7 @@ extern "C" { ...@@ -248,7 +248,7 @@ extern "C" {
// preferrably to run on the same backend as the buffer // preferrably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false); sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
// initialize buffers from a max size graph (optional) // initialize buffers from a max size graph (optional)
reserve_graph = build_graph(sched, max_batch_size); reserve_graph = build_graph(sched, max_batch_size);
...@@ -289,7 +289,7 @@ extern "C" { ...@@ -289,7 +289,7 @@ extern "C" {
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler, backends with low index are given priority over backends with high index // Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel); GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
......
...@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr; ...@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } }; struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr; typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend // ggml-backend
......
...@@ -37,13 +37,16 @@ extern "C" { ...@@ -37,13 +37,16 @@ extern "C" {
// ====== Dataset ====== // ====== Dataset ======
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init( GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
int64_t ne_datapoint, // number of elements per datapoint enum ggml_type type_data, // the type for the internal data tensor
int64_t ne_label, // number of elements per label enum ggml_type type_label, // the type for the internal labels tensor
int64_t ndata, // total number of datapoints/labels int64_t ne_datapoint, // number of elements per datapoint
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied) int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset); GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
// get underlying tensors that store the data // get underlying tensors that store the data
GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata] GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata] GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]
...@@ -56,13 +59,19 @@ extern "C" { ...@@ -56,13 +59,19 @@ extern "C" {
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch] struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch] struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
int64_t ibatch); int64_t ibatch);
GGML_API void ggml_opt_dataset_get_batch_host(
ggml_opt_dataset_t dataset,
void * data_batch,
size_t nb_data_batch,
void * labels_batch,
int64_t ibatch);
// ====== Model / Context ====== // ====== Model / Context ======
enum ggml_opt_build_type { enum ggml_opt_build_type {
GGML_OPT_BUILD_TYPE_FORWARD, GGML_OPT_BUILD_TYPE_FORWARD = 10,
GGML_OPT_BUILD_TYPE_GRAD, GGML_OPT_BUILD_TYPE_GRAD = 20,
GGML_OPT_BUILD_TYPE_OPT, GGML_OPT_BUILD_TYPE_OPT = 30,
}; };
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
...@@ -81,20 +90,22 @@ extern "C" { ...@@ -81,20 +90,22 @@ extern "C" {
// userdata can be used to pass arbitrary data // userdata can be used to pass arbitrary data
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata); typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
// returns the default optimizer params (constant) // returns the default optimizer params (constant, hard-coded values)
// userdata is not used // userdata is not used
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata); GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
// casts userdata to ggml_opt_optimizer_params and returns it
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
// parameters for initializing a new optimization context // parameters for initializing a new optimization context
struct ggml_opt_params { struct ggml_opt_params {
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
struct ggml_context * ctx_compute; // created in user code, holds non-static tensors // by default the forward graph needs to be reconstructed for each eval
// if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
// the forward graph is defined by inputs and outputs struct ggml_context * ctx_compute;
// those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts struct ggml_tensor * inputs;
struct ggml_tensor * inputs; struct ggml_tensor * outputs;
struct ggml_tensor * outputs;
enum ggml_opt_loss_type loss_type; enum ggml_opt_loss_type loss_type;
enum ggml_opt_build_type build_type; enum ggml_opt_build_type build_type;
...@@ -107,12 +118,9 @@ extern "C" { ...@@ -107,12 +118,9 @@ extern "C" {
// get parameters for an optimization context with defaults set where possible // get parameters for an optimization context with defaults set where possible
// parameters for which no sensible defaults exist are supplied as arguments to this function // parameters for which no sensible defaults exist are supplied as arguments to this function
GGML_API ggml_opt_params ggml_opt_default_params( GGML_API struct ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched, ggml_backend_sched_t backend_sched,
struct ggml_context * ctx_compute, enum ggml_opt_loss_type loss_type);
struct ggml_tensor * inputs,
struct ggml_tensor * outputs,
enum ggml_opt_loss_type loss_type);
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params); GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx); GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
...@@ -121,6 +129,7 @@ extern "C" { ...@@ -121,6 +129,7 @@ extern "C" {
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer); GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
// get underlying tensors that store data // get underlying tensors that store data
// if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
...@@ -128,11 +137,12 @@ extern "C" { ...@@ -128,11 +137,12 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
// get the gradient accumulator for a node from the forward graph
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
// ====== Optimization Result ====== // ====== Optimization Result ======
GGML_API ggml_opt_result_t ggml_opt_result_init(); GGML_API ggml_opt_result_t ggml_opt_result_init(void);
GGML_API void ggml_opt_result_free(ggml_opt_result_t result); GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result); GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
...@@ -144,11 +154,20 @@ extern "C" { ...@@ -144,11 +154,20 @@ extern "C" {
// ====== Computation ====== // ====== Computation ======
// do forward pass, increment result if not NULL // if not using static graphs, this function must be called prior to ggml_opt_alloc
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); GGML_API void ggml_opt_prepare_alloc(
ggml_opt_context_t opt_ctx,
struct ggml_context * ctx_compute,
struct ggml_cgraph * gf,
struct ggml_tensor * inputs,
struct ggml_tensor * outputs);
// allocate the next graph for evaluation, either forward or forward + backward
// must be called exactly once prior to calling ggml_opt_eval
GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
// do forward pass, increment result if not NULL, do backward pass // do forward pass, increment result if not NULL, do backward pass if allocated
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// ############################################################################ // ############################################################################
// ## The high-level functions start here. They do not depend on any private ## // ## The high-level functions start here. They do not depend on any private ##
...@@ -200,9 +219,9 @@ extern "C" { ...@@ -200,9 +219,9 @@ extern "C" {
// fit model defined by inputs and outputs to dataset // fit model defined by inputs and outputs to dataset
GGML_API void ggml_opt_fit( GGML_API void ggml_opt_fit(
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch] struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize enum ggml_opt_loss_type loss_type, // loss to minimize
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t) ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
......
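A hedged sketch of the per-batch flow implied by the comments in the header diff above, for the case where static graphs are not used. opt_ctx, ctx_compute, gf, inputs, outputs and result are assumed to already exist, and the batch-copy step is elided.
    // rebuild the forward graph for this step, then allocate and evaluate it
    ggml_opt_prepare_alloc(opt_ctx, ctx_compute, gf, inputs, outputs);
    ggml_opt_alloc(opt_ctx, /*backward=*/ true);          // exactly once before ggml_opt_eval
    // ... fill ggml_opt_inputs(opt_ctx) and ggml_opt_labels(opt_ctx) with the current batch ...
    ggml_opt_eval(opt_ctx, result);                        // forward pass, accumulate into result, backward pass if allocated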
...@@ -674,11 +674,15 @@ extern "C" { ...@@ -674,11 +674,15 @@ extern "C" {
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
// returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
...@@ -765,7 +769,7 @@ extern "C" { ...@@ -765,7 +769,7 @@ extern "C" {
// Tensor flags // Tensor flags
GGML_API void ggml_set_input(struct ggml_tensor * tensor); GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor); GGML_API void ggml_set_output(struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API void ggml_set_param(struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor); GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
// //
...@@ -935,7 +939,7 @@ extern "C" { ...@@ -935,7 +939,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_repeat_back( GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
// concat a and b along dim // concat a and b along dim
// used in stable-diffusion // used in stable-diffusion
...@@ -2055,15 +2059,14 @@ extern "C" { ...@@ -2055,15 +2059,14 @@ extern "C" {
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand( GGML_API void ggml_build_backward_expand(
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation) struct ggml_context * ctx, // context for gradient computation
struct ggml_context * ctx_compute, // context for gradient computation struct ggml_cgraph * cgraph,
struct ggml_cgraph * cgraph, struct ggml_tensor ** grad_accs);
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
// graph allocation in a context // graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1 GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
......
...@@ -214,7 +214,7 @@ add_library(ggml ...@@ -214,7 +214,7 @@ add_library(ggml
target_link_libraries(ggml PUBLIC ggml-base) target_link_libraries(ggml PUBLIC ggml-base)
if (CMAKE_SYSTEM_NAME MATCHES "Linux") if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_libraries(ggml PRIVATE dl stdc++fs) target_link_libraries(ggml PRIVATE dl)
endif() endif()
function(ggml_add_backend_library backend) function(ggml_add_backend_library backend)
......
...@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) { ...@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
return SIZE_MAX; return SIZE_MAX;
} }
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes // get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) { if (buft->iface.get_alloc_size) {
size_t size = buft->iface.get_alloc_size(buft, tensor); size_t size = buft->iface.get_alloc_size(buft, tensor);
...@@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) { ...@@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer)); return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
} }
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
} }
...@@ -674,6 +674,8 @@ struct ggml_backend_sched { ...@@ -674,6 +674,8 @@ struct ggml_backend_sched {
char * context_buffer; char * context_buffer;
size_t context_buffer_size; size_t context_buffer_size;
bool op_offload;
int debug; int debug;
}; };
...@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st ...@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op // check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) { for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off"); SET_CAUSE(tensor, "1.off");
...@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ...@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
const int node_backend_id = tensor_backend_id(node); const int node_backend_id = tensor_backend_id(node);
assert(node_backend_id != -1); // all nodes should be assigned by now assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
// check if we should start a new split based on the sources of the current node // check if we should start a new split based on the sources of the current node
bool need_new_split = false; bool need_new_split = false;
...@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new( ...@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
ggml_backend_buffer_type_t * bufts, ggml_backend_buffer_type_t * bufts,
int n_backends, int n_backends,
size_t graph_size, size_t graph_size,
bool parallel) { bool parallel,
bool op_offload) {
GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
...@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new( ...@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
} }
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
......
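A minimal hedged example of the extended scheduler constructor shown above; the backends array and n_backends are assumed to exist. Passing false for the new op_offload argument gates the check that previously always considered offloading ops with host-resident weights to a higher-priority backend.
    ggml_backend_sched_t sched = ggml_backend_sched_new(
        backends, /*bufts=*/ NULL, n_backends,
        /*graph_size=*/ GGML_DEFAULT_GRAPH_SIZE,
        /*parallel=*/ false,
        /*op_offload=*/ false);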
...@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/ ${KLEIDIAI_SRC}/kai/ukernels/
${KLEIDIAI_SRC}/kai/ukernels/matmul/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}") set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
...@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS}) set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
if (NOT DOTPROD_ENABLED MATCHES -1) if (NOT DOTPROD_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
endif() endif()
if (NOT I8MM_ENABLED MATCHES -1) if (NOT I8MM_ENABLED MATCHES -1)
...@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif() endif()
if (NOT SME_ENABLED MATCHES -1) if (NOT SME_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2") ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
endif() endif()
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}") set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
......