llama: update to commit 71e90e88 (#10192)

943464cc · Jeffrey Morgan · GitHub · 369de832 · 943464cc · 943464cc
Unverified commit 943464cc, authored Apr 16, 2025 by Jeffrey Morgan; committed by GitHub on Apr 16, 2025
20 changed files
--- a/llama/llama.cpp/src/llama-model.cpp
+++ b/llama/llama.cpp/src/llama-model.cpp
--- a/llama/llama.cpp/src/llama-model.h
+++ b/llama/llama.cpp/src/llama-model.h
@@ -2,7 +2,9 @@

 #include "llama.h"
 #include "llama-arch.h"
+#include "llama-graph.h"
 #include "llama-hparams.h"
+#include "llama-memory.h"
 #include "llama-vocab.h"

 #include <memory>
@@ -11,6 +13,8 @@
 #include <vector>
 #include <stdexcept>

+struct llama_cparams;
+struct llama_ubatch;
 struct llama_model_loader;

 // available models
@@ -26,6 +30,7 @@ enum llm_type {
    LLM_TYPE_109M,
    LLM_TYPE_137M,
    LLM_TYPE_160M,
+    LLM_TYPE_190M,
    LLM_TYPE_220M,
    LLM_TYPE_250M,
    LLM_TYPE_270M,
@@ -40,8 +45,10 @@ enum llm_type {
    LLM_TYPE_1_4B,
    LLM_TYPE_1_5B,
    LLM_TYPE_1_6B,
+    LLM_TYPE_1_8B,
    LLM_TYPE_2B,
    LLM_TYPE_2_8B,
+    LLM_TYPE_2_9B,
    LLM_TYPE_3B,
    LLM_TYPE_4B,
    LLM_TYPE_6B,
@@ -81,6 +88,9 @@ enum llm_type {
    LLM_TYPE_10B_128x3_66B,
    LLM_TYPE_57B_A14B,
    LLM_TYPE_27B,
+    LLM_TYPE_290B,
+    LLM_TYPE_17B_16E, // llama4 Scout
+    LLM_TYPE_17B_128E, // llama4 Maverick
 };

 struct llama_layer_posnet {
@@ -259,6 +269,20 @@ struct llama_layer {
    struct ggml_tensor * time_mix_receptance_b = nullptr;
    struct ggml_tensor * time_mix_gate         = nullptr;

+    // rwkv7
+    struct ggml_tensor * time_mix_w0         = nullptr;
+    struct ggml_tensor * time_mix_a0         = nullptr;
+    struct ggml_tensor * time_mix_a1         = nullptr;
+    struct ggml_tensor * time_mix_a2         = nullptr;
+    struct ggml_tensor * time_mix_v0         = nullptr;
+    struct ggml_tensor * time_mix_v1         = nullptr;
+    struct ggml_tensor * time_mix_v2         = nullptr;
+    struct ggml_tensor * time_mix_g1         = nullptr;
+    struct ggml_tensor * time_mix_g2         = nullptr;
+    struct ggml_tensor * time_mix_k_k        = nullptr;
+    struct ggml_tensor * time_mix_k_a        = nullptr;
+    struct ggml_tensor * time_mix_r_k        = nullptr;
+
    struct ggml_tensor * time_mix_ln     = nullptr;
    struct ggml_tensor * time_mix_ln_b   = nullptr;
    struct ggml_tensor * time_mix_output = nullptr;
@@ -362,7 +386,7 @@ struct llama_model {
    std::string desc() const;

    size_t size() const;
-    size_t max_nodes() const;
+    size_t n_tensors() const;
    size_t n_devices() const;

    // total number of parameters in the model
@@ -375,11 +399,26 @@ struct llama_model {

    ggml_backend_buffer_type_t select_buft(int il) const;

+    bool has_tensor_overrides() const;
+
    const struct ggml_tensor * get_tensor(const char * name) const;

+    // TODO: move this to new llm_arch_model_i interface
+    llama_memory_i * create_memory() const; // TODO: params
+
+    // TODO: move this to new llm_arch_model_i interface
+    llm_graph_result_ptr build_graph(
+            const llm_graph_params & params,
+                       ggml_cgraph * gf,
+                    llm_graph_type   type) const;
+
 private:
    struct impl;
    std::unique_ptr<impl> pimpl;
 };

 const char * llm_type_name(llm_type type);
+
+// For internal test use
+// TODO: remove
+const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
--- a/llama/llama.cpp/src/llama-quant.cpp
+++ b/llama/llama.cpp/src/llama-quant.cpp
@@ -10,6 +10,7 @@
 #include <cinttypes>
 #include <fstream>
 #include <mutex>
+#include <regex>
 #include <thread>
 #include <unordered_map>

@@ -47,8 +48,14 @@ struct quantize_state_impl {
        {}
 };

+// changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void llama_tensor_dequantize_impl(
-    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
 ) {
    if (output.size() < nelements) {
@@ -527,7 +534,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    }

    std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());
@@ -536,7 +543,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    model.load_hparams(ml);
    model.load_stats  (ml);

-    struct quantize_state_impl qs(model, params);
+    quantize_state_impl qs(model, params);

    if (params->only_copy) {
        ftype = ml.ftype;
@@ -663,7 +670,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    // populate the original tensors so we get an initial meta data
    for (const auto * it : tensors) {
        uint16_t i_split = params->keep_split ? it->idx : 0;
-        struct ggml_tensor * tensor = it->tensor;
+        ggml_tensor * tensor = it->tensor;
        if (!ctx_outs[i_split]) {
            ctx_outs[i_split].reset(gguf_init_empty());
        }
@@ -712,7 +719,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    new_ofstream(0);
    for (const auto * it : tensors) {
        const auto & weight = *it;
-        struct ggml_tensor * tensor = weight.tensor;
+        ggml_tensor * tensor = weight.tensor;
        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
            new_ofstream(weight.idx);
@@ -762,10 +769,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;

-        // do not quantize RWKV's time_mix_first tensors
+        // do not quantize RWKV's small yet 2D weights
        quantize &= name.find("time_mix_first.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w0.weight") == std::string::npos;
        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v0.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a0.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_g1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_g2.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
@@ -773,7 +789,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;

-        enum ggml_type new_type;
+        ggml_type new_type;
        void * new_data;
        size_t new_size;

@@ -783,6 +799,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
            // get more optimal quantization type based on the tensor shape, layer, etc.
            if (!params->pure && ggml_is_quantized(default_type)) {
                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+                // unless the user specifies a type
+                if (params->tensor_types) {
+                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                    for (const auto & [tname, qtype] : tensor_types) {
+                        if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
+                            if (qtype != new_type) {
+                                LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                            }
+                            new_type = qtype;
+                            break;
+                        }
+                    }
+                }
            }
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
@@ -907,8 +936,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 // interface implementation
 //

-struct llama_model_quantize_params llama_model_quantize_default_params() {
-    struct llama_model_quantize_params result = {
+llama_model_quantize_params llama_model_quantize_default_params() {
+    llama_model_quantize_params result = {
        /*.nthread                     =*/ 0,
        /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
@@ -920,6 +949,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
        /*.keep_split                  =*/ false,
        /*.imatrix                     =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
+        /*.tensor_types                =*/ nullptr,
    };

    return result;

--- a/llama/llama.cpp/src/llama-sampling.cpp
+++ b/llama/llama.cpp/src/llama-sampling.cpp
@@ -1449,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
                     const char ** trigger_words,
                            size_t num_trigger_words,
               const llama_token * trigger_tokens,
-                            size_t num_trigger_tokens);
+                            size_t num_trigger_tokens,
+                     const char ** trigger_patterns,
+                            size_t num_trigger_patterns);

 static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
    auto * ctx = (llama_sampler_grammar *) smpl->ctx;
@@ -1457,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
        return;
    }

-    std::vector<const char *>  trigger_words;
-    for (auto & word : ctx->grammar->trigger_words) {
-        trigger_words.push_back(word.c_str());
+    std::vector<const char *>  trigger_patterns_c;
+    trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
+    for (auto & trigger_pattern : ctx->grammar->trigger_patterns) {
+        trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
    }
+
    auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
-                                                 ctx->grammar->lazy, trigger_words.data(), trigger_words.size(),
+                                                 ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
                                                 ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());

    llama_grammar_free_impl(ctx->grammar);
@@ -1472,7 +1476,8 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
    const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;

-    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
+    GGML_ASSERT(result);

    // copy the state
    {
@@ -1516,16 +1521,38 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
                     const char ** trigger_words,
                            size_t num_trigger_words,
               const llama_token * trigger_tokens,
-                            size_t num_trigger_tokens) {
+                            size_t num_trigger_tokens,
+                     const char ** trigger_patterns,
+                            size_t num_trigger_patterns) {
    auto * ctx = new llama_sampler_grammar;

    if (grammar_str != nullptr && grammar_str[0] != '\0') {
+        // TODO: remove trigger_words support.
+        if (trigger_words != nullptr && num_trigger_words > 0) {
+            GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
+            std::string trigger_pattern("[\\s\\S]*?(");
+            for (size_t i = 0; i < num_trigger_words; ++i) {
+                static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+                if (i > 0) {
+                    trigger_pattern += "|";
+                }
+                trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
+            }
+            trigger_pattern += ")[\\s\\S]*";
+            auto trigger_pattern_c = trigger_pattern.c_str();
+            trigger_patterns = &trigger_pattern_c;
+            num_trigger_patterns = 1;
+        }
        *ctx = {
            /* .vocab        = */ vocab,
            /* .grammar_str  = */ grammar_str,
            /* .grammar_root = */ grammar_root,
-            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
+            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
        };
+        if (!ctx->grammar) {
+            delete ctx;
+            return nullptr;
+        }
    } else {
        *ctx = {
            /* .vocab        = */ vocab,
@@ -1545,7 +1572,7 @@ struct llama_sampler * llama_sampler_init_grammar(
        const struct llama_vocab * vocab,
                      const char * grammar_str,
                      const char * grammar_root) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
 }

 struct llama_sampler * llama_sampler_init_grammar_lazy(
@@ -1556,7 +1583,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy(
                            size_t num_trigger_words,
               const llama_token * trigger_tokens,
                            size_t num_trigger_tokens) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+        const struct llama_vocab * vocab,
+                      const char * grammar_str,
+                      const char * grammar_root,
+                     const char ** trigger_patterns,
+                            size_t num_trigger_patterns,
+               const llama_token * trigger_tokens,
+                            size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns);
 }

 // penalties

--- a/llama/llama.cpp/src/llama-vocab.cpp
+++ b/llama/llama.cpp/src/llama-vocab.cpp
@@ -16,6 +16,7 @@
 #include <queue>
 #include <set>
 #include <unordered_map>
+#include <cctype>

 //
 // helpers
@@ -341,6 +342,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
            case LLAMA_VOCAB_PRE_TYPE_MPT:
            case LLAMA_VOCAB_PRE_TYPE_OLMO:
            case LLAMA_VOCAB_PRE_TYPE_JAIS:
+            case LLAMA_VOCAB_PRE_TYPE_TRILLION:
                regex_exprs = {
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
@@ -393,10 +395,24 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
-                // original regex from tokenizer.json
-                // [^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+
                regex_exprs = {
-                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    // original regex from tokenizer.json
+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
+                regex_exprs = {
+                    "\\p{N}+",
+                    "(?=(\\d{3})+(?!\\d))",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
+                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
+                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
            default:
@@ -1547,6 +1563,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
                clean_spaces = false;
            } else if (
+                tokenizer_pre == "glm4" ||
                tokenizer_pre == "chatglm-bpe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
                special_bos_id = LLAMA_TOKEN_NULL;
@@ -1591,9 +1608,22 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "megrez") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            } else if (
-                tokenizer_pre == "gpt-4o") {
+                    tokenizer_pre == "gpt-4o" ||
+                    tokenizer_pre == "llama4") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "superbpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "trillion") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "bailingmoe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+                clean_spaces = false;
            } else {
                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -1772,6 +1802,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<end_of_turn>"
                        || t.first == "<|endoftext|>"
                        || t.first == "<EOT>"
+                        || t.first == "_<EOT>"
                        || t.first == "<｜end▁of▁sentence｜>" // DeepSeek
                   ) {
                    special_eot_id = t.second;
@@ -1804,6 +1835,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<fim-prefix>"
                        || t.first == "<｜fim▁begin｜>" // DeepSeek
                        || t.first == "<PRE>"
+                        || t.first == "▁<PRE>"          // CodeLlama
                        ) {
                    special_fim_pre_id = t.second;
                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1821,6 +1853,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<fim-suffix>"
                        || t.first == "<｜fim▁hole｜>" // DeepSeek
                        || t.first == "<SUF>"
+                        || t.first == "▁<SUF>"         // CodeLlama
                        ) {
                    special_fim_suf_id = t.second;
                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1838,6 +1871,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<fim-middle>"
                        || t.first == "<｜fim▁end｜>"  // DeepSeek
                        || t.first == "<MID>"
+                        || t.first == "▁<MID>"         // CodeLlama
                        ) {
                    special_fim_mid_id = t.second;
                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1922,6 +1956,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<|endoftext|>"
                    || t.first == "<|eom_id|>"
                    || t.first == "<EOT>"
+                    || t.first == "_<EOT>"
               ) {
                special_eog_ids.insert(t.second);
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2180,14 +2215,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
                    // find the first occurrence of a given special token in this fragment
                    //  passing offset argument only limit the "search area" but match coordinates
                    //  are still relative to the source full raw_text
-                    auto match = raw_text.find(text, raw_text_base_offset);
+                    //  string_view begins at pos 0 for the same reason
+                    auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);

                    // no occurrences found, stop processing this fragment for a given special token
                    if (match == std::string::npos) break;

-                    // check if match is within bounds of offset <-> length
-                    if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
-
 #ifdef PRETOKENIZERDEBUG
                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif

--- a/llama/llama.cpp/src/llama.cpp
+++ b/llama/llama.cpp/src/llama.cpp
--- a/llama/llama.cpp/src/unicode.cpp
+++ b/llama/llama.cpp/src/unicode.cpp
@@ -220,7 +220,6 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
    free(wbuf);
    return ret;
 #else
-
 #if defined(__clang__)
    // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push

--- a/llama/llama.go
+++ b/llama/llama.go
@@ -147,27 +147,27 @@ func (c *Context) Model() *Model {
 }

 func (c *Context) KvCacheSeqAdd(seqId int, p0 int, p1 int, delta int) {
-	C.llama_kv_cache_seq_add(c.c, C.int(seqId), C.int(p0), C.int(p1), C.int(delta))
+	C.llama_kv_self_seq_add(c.c, C.int(seqId), C.int(p0), C.int(p1), C.int(delta))
 }

 func (c *Context) KvCacheSeqRm(seqId int, p0 int, p1 int) bool {
-	return bool(C.llama_kv_cache_seq_rm(c.c, C.int(seqId), C.int(p0), C.int(p1)))
+	return bool(C.llama_kv_self_seq_rm(c.c, C.int(seqId), C.int(p0), C.int(p1)))
 }

 func (c *Context) KvCacheSeqCp(srcSeqId int, dstSeqId int, p0 int, p1 int) {
-	C.llama_kv_cache_seq_cp(c.c, C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1))
+	C.llama_kv_self_seq_cp(c.c, C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1))
 }

 func (c *Context) KvCacheClear() {
-	C.llama_kv_cache_clear(c.c)
+	C.llama_kv_self_clear(c.c)
 }

 func (c *Context) KvCacheDefrag() {
-	C.llama_kv_cache_defrag(c.c)
+	C.llama_kv_self_defrag(c.c)
 }

 func (c *Context) KvCacheCanShift() bool {
-	return bool(C.llama_kv_cache_can_shift(c.c))
+	return bool(C.llama_kv_self_can_shift(c.c))
 }

 // Get the embeddings for a sequence id

--- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
+++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
@@ -24,10 +24,10 @@ problem.
 9 files changed, 21 insertions(+), 2 deletions(-)

 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index dba7be33..65e150d6 100644
+index 273075f4..dd11f304 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
-@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+@@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
@@ -35,7 +35,7 @@ index dba7be33..65e150d6 100644
 }
 
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
-@@ -542,6 +541,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -544,6 +543,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
 
     free(ctx->buffers);
     free(ctx);
@@ -43,7 +43,7 @@ index dba7be33..65e150d6 100644
 }
 
 static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-@@ -1865,6 +1865,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+@@ -1867,6 +1867,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_aligned_free(buffer->context, buffer->size);
@@ -55,7 +55,7 @@ index dba7be33..65e150d6 100644
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-@@ -1912,7 +1917,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+@@ -1914,7 +1919,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
 };
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -65,7 +65,7 @@ index dba7be33..65e150d6 100644
     /* .init_tensor     = */ NULL, // no initialization required
     /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
-index d410c024..a207ab1e 100644
+index cec36b36..4b057973 100644
 --- a/ggml/src/ggml-cann/ggml-cann.cpp
 +++ b/ggml/src/ggml-cann/ggml-cann.cpp
 @@ -530,6 +530,7 @@ static void ggml_backend_cann_buffer_free_buffer(
@@ -76,7 +76,7 @@ index d410c024..a207ab1e 100644
 }
 
 /**
-@@ -1198,6 +1199,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
+@@ -1199,6 +1200,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
  */
 static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
     ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -85,10 +85,10 @@ index d410c024..a207ab1e 100644
 
 /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index ebb2ccae..dfff21a2 100644
+index fafe9633..59a49560 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context {
+@@ -533,6 +533,7 @@ struct ggml_backend_cuda_buffer_context {
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     delete ctx;
@@ -96,7 +96,7 @@ index ebb2ccae..dfff21a2 100644
 }
 
 static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-@@ -783,6 +784,7 @@ struct ggml_backend_cuda_split_buffer_context {
+@@ -788,6 +789,7 @@ struct ggml_backend_cuda_split_buffer_context {
 static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     delete ctx;
@@ -104,7 +104,7 @@ index ebb2ccae..dfff21a2 100644
 }
 
 static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1055,6 +1057,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
+@@ -1061,6 +1063,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
 
 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index c550142a..fd9a4e77 100644
+index 9f1c6c6c..310afe8a 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -4641,6 +4641,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     }
 
     free(ctx);
@@ -137,10 +137,10 @@ index c550142a..fd9a4e77 100644
 
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
-index f5906246..062e93b8 100644
+index b8b5cbd3..14d4561b 100644
 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
 +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
-@@ -1203,6 +1203,7 @@ static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
+@@ -1443,6 +1443,7 @@ struct ggml_backend_opencl_buffer_context {
 static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
     delete ctx;
@@ -149,10 +149,10 @@ index f5906246..062e93b8 100644
 
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index 97873acc..893ee0b9 100644
+index 862b9b66..34536681 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -419,6 +419,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -443,6 +443,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
     GGML_ASSERT(status);
     delete ctx;
@@ -161,10 +161,10 @@ index 97873acc..893ee0b9 100644
 
 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 792e0569..5e233e8b 100644
+index 3e48a924..a3d182fc 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -311,6 +311,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
     ggml_sycl_set_device(ctx->device);
 
     delete ctx;
@@ -172,7 +172,7 @@ index 792e0569..5e233e8b 100644
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -720,6 +721,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
     delete ctx;
@@ -180,7 +180,7 @@ index 792e0569..5e233e8b 100644
 }
 
 static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1053,6 +1055,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
 
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_sycl_host_free(buffer->context);
@@ -189,10 +189,10 @@ index 792e0569..5e233e8b 100644
 
 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index abe3e790..1dad714b 100644
+index 783a0ff8..8ac1e07e 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -7914,6 +7914,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -8639,6 +8639,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
@@ -200,7 +200,7 @@ index abe3e790..1dad714b 100644
 }
 
 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -8056,6 +8057,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -8782,6 +8783,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(vk_instance.devices[0], buffer->context);

--- a/llama/patches/0002-pretokenizer.patch
+++ b/llama/patches/0002-pretokenizer.patch
@@ -3,15 +3,17 @@ From: Michael Yang <mxyng@pm.me>
 Date: Mon, 16 Sep 2024 15:53:13 -0700
 Subject: [PATCH] pretokenizer

+allow for an unset pretokenizer with a warning in the
+logs instead of throwing an error
 ---
 src/llama-vocab.cpp | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index ad9ffe66..a4eee9b8 100644
+index 464ff01e..0125ee53 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1468,16 +1468,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         if (type == LLAMA_VOCAB_TYPE_BPE) {
             add_space_prefix = false;
             clean_spaces = true;
@@ -29,9 +31,9 @@ index ad9ffe66..a4eee9b8 100644
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -1593,7 +1584,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
-                 tokenizer_pre == "megrez") {
-                 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+                 clean_spaces = false;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);

--- a/llama/patches/0003-embeddings.patch
+++ b/llama/patches/0003-embeddings.patch
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:14 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 15:28:34 -0700
 Subject: [PATCH] embeddings

+allow a loaded model in llama.cpp to be used for
+both embeddings and causal attention text generation
+instead of forcing one or the other
 ---
- src/llama-context.cpp | 2 +-
- src/llama.cpp         | 6 ++++--
- 2 files changed, 5 insertions(+), 3 deletions(-)
+ src/llama-context.cpp | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)

 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 671d2a81..47e79ed4 100644
+index 4735e98e..65135172 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
-     const auto n_embd  = hparams.n_embd;
- 
-     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = !cparams.embeddings;
-+    const bool has_logits =  cparams.causal_attn;
-     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+@@ -1232,7 +1232,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+     int64_t n_outputs_all = 0;
 
-     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 607f2786..ac85bfed 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -8652,7 +8652,6 @@ static int llama_decode_impl(
-             res  = nullptr;
-             embd = nullptr;
-         } else if (cparams.embeddings) {
-            res  = nullptr; // do not extract logits for embedding case
-             embd = nullptr;
-             for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
-                 if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
-@@ -8660,12 +8659,15 @@ static int llama_decode_impl(
-                     break;
-                 }
-             }
-            GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
-         } else {
-             embd = nullptr; // do not extract embeddings when not needed
-             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
+     // count outputs
+-    if (batch.logits && !embd_pooled) {
+    if (batch.logits) {
+         for (uint32_t i = 0; i < n_tokens_all; ++i) {
+             n_outputs_all += batch.logits[i] != 0;
         }
+@@ -1344,7 +1344,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+         //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
+         //}
+ 
+-        auto * t_logits = cparams.embeddings ? nullptr         : res->get_logits();
+        auto * t_logits = cparams.causal_attn ? res->get_logits() : nullptr;
+         auto * t_embd   = cparams.embeddings ? res->get_embd() : nullptr;
 
-+        if (!cparams.causal_attn) {
-+            res = nullptr; // do not extract logits when not needed
-+        }
-+
-         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+         if (t_embd && res->get_embd_pooled()) {
+@@ -1488,7 +1488,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+     const auto n_embd  = hparams.n_embd;
+ 
+     // TODO: use a per-batch flag for logits presence instead
+-    bool has_logits = !cparams.embeddings;
+    bool has_logits =  cparams.causal_attn;
+     bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
-         ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
+     // TODO: hacky enc-dec support
--- a/llama/patches/0004-clip-unicode.patch
+++ b/llama/patches/0004-clip-unicode.patch
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:15 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 15:34:37 -0700
 Subject: [PATCH] clip-unicode

+fixes loading vision models in llama.cpp on windows
+filesystems for paths that include wide characters
 ---
- examples/llava/clip.cpp | 40 +++++++++++++++++++++++++++++++++++++++-
- 1 file changed, 39 insertions(+), 1 deletion(-)
+ examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 39 insertions(+)

 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 76d4a785..205af1eb 100644
+index 49c90b75..4b72ea9f 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -58,6 +58,19 @@
- #   define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- #endif // defined(LLAVA_LOG_OFF)
+@@ -28,6 +28,19 @@
+ #include <cinttypes>
+ #include <limits>
 
 +#if defined(_WIN32)
 +#define WIN32_LEAN_AND_MEAN
@@ -28,49 +30,48 @@ index 76d4a785..205af1eb 100644
 +#endif
 +#endif
 +
+ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
+ 
 //#define CLIP_DEBUG_FUNCTIONS
+@@ -1429,7 +1442,29 @@ struct clip_model_loader {
+         {
+             std::vector<uint8_t> read_buf;
 
- // RGB uint8 image
-@@ -1402,8 +1415,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
-             gguf_free(ctx);
-             return nullptr;
-         }
-
 +#ifdef _WIN32
-+        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
-+        if (!wlen) {
-+            return NULL;
-+        }
-+        wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
-+        wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
-+        if (!wlen) {
-+            free(wbuf);
-+            return NULL;
-+        }
+            int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
+            if (!wlen) {
+                throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__));
+            }
+            wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+            wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wbuf, wlen);
+            if (!wlen) {
+                free(wbuf);
+                throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__));
+            }
 +#if __GLIBCXX__
-+        int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
-+        __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
-+        std::istream fin(&buffer);
+            int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
+            __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
+            std::istream fin(&buffer);
 +#else // MSVC
-+        // unused in our current build
-+        auto fin = std::ifstream(wbuf, std::ios::binary);
+            // unused in our current build
+            auto fin = std::ifstream(wbuf, std::ios::binary);
 +#endif
-+        free(wbuf);
+            free(wbuf);
 +#else
-         auto fin = std::ifstream(fname, std::ios::binary);
+             auto fin = std::ifstream(fname, std::ios::binary);
 +#endif
-         if (!fin) {
-             LOG_ERR("cannot open model file for loading tensors\n");
-             clip_free(new_clip);
-@@ -1443,7 +1477,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
-                 ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+             if (!fin) {
+                 throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
+             }
+@@ -1456,7 +1491,11 @@ struct clip_model_loader {
+                     ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+                 }
             }
-         }
 +#if defined(_WIN32) && defined(__GLIBCXX__)
-+        close(fd);
+            close(fd);
 +#else
-         fin.close();
+             fin.close();
 +#endif
-     }
 
-     // vision model
+             LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
+         }
--- a/llama/patches/0005-solar-pro.patch
+++ b/llama/patches/0005-solar-pro.patch
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:16 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 16:03:51 -0700
 Subject: [PATCH] solar-pro

-solar-pro introduces block skip connections where blocks are connected
-to other, non-sequential blocks with a scale multiple
-
-this change adds 4 new keys to store the skip connections and one new
-tensor to store the scalar. the scalar is implemented a 1-dimensional
-tensor with 2 elements dervied from the model's bskcn_tv configuration.
-in general, the values are (bskcn_tv, 1 - bskcn_tv)
+adds support for the Solar Pro architecture
 ---
- src/llama-arch.cpp         |  21 +++++
+ src/llama-arch.cpp         |  21 ++++
 src/llama-arch.h           |   3 +
 src/llama-hparams.cpp      |   8 ++
- src/llama-hparams.h        |   5 ++
+ src/llama-hparams.h        |   5 +
 src/llama-model-loader.cpp |   1 +
- src/llama-model.cpp        |  44 +++++++++++
+ src/llama-model.cpp        | 207 +++++++++++++++++++++++++++++++++++++
 src/llama-model.h          |   3 +
- src/llama.cpp              | 152 ++++++++++++++++++++++++++++++++++++-
- 8 files changed, 236 insertions(+), 1 deletion(-)
+ 7 files changed, 248 insertions(+)

 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 97a1e7e5..a1e0ebcc 100644
+index a6fddc7f..0b0fedcd 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
-@@ -61,6 +61,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE,          "granite"          },
     { LLM_ARCH_GRANITE_MOE,      "granitemoe"       },
     { LLM_ARCH_CHAMELEON,        "chameleon"        },
 +    { LLM_ARCH_SOLAR,            "solar"            },
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
-     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
- };
-@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
-     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
-     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
-+    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection"  },
+     { LLM_ARCH_PLM,              "plm"              },
+     { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
+@@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,       "%s.attention.relative_buckets_count"       },
+     { LLM_KV_ATTENTION_SLIDING_WINDOW,               "%s.attention.sliding_window"               },
+     { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
+    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,        "%s.attention.block_skip_connection"        },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_DIMENSION_SECTIONS,   "%s.rope.dimension_sections"              },
-@@ -1271,6 +1273,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1478,6 +1480,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
@@ -66,7 +59,7 @@ index 97a1e7e5..a1e0ebcc 100644
     {
         LLM_ARCH_WAVTOKENIZER_DEC,
         {
-@@ -1429,6 +1449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1671,6 +1691,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_EXP_PROBS_B,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -75,18 +68,18 @@ index 97a1e7e5..a1e0ebcc 100644
     {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 122fdceb..77919578 100644
+index 2c2099b3..74aa3dd0 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
-@@ -65,6 +65,7 @@ enum llm_arch {
+@@ -72,6 +72,7 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
 +    LLM_ARCH_SOLAR,
     LLM_ARCH_WAVTOKENIZER_DEC,
-     LLM_ARCH_UNKNOWN,
- };
-@@ -129,6 +130,7 @@ enum llm_kv {
+     LLM_ARCH_PLM,
+     LLM_ARCH_BAILINGMOE,
+@@ -144,6 +145,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
@@ -94,7 +87,7 @@ index 122fdceb..77919578 100644
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
-@@ -311,6 +313,7 @@ enum llm_tensor {
+@@ -340,6 +342,7 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
@@ -103,14 +96,13 @@ index 122fdceb..77919578 100644
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
 diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index ea87b295..f3955de9 100644
+index 90dfe7a7..8a667960 100644
 --- a/src/llama-hparams.cpp
 +++ b/src/llama-hparams.cpp
-@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
-     // corresponds to Mamba's ssm_states size
+@@ -70,6 +70,14 @@ uint32_t llama_hparams::n_embd_v_s() const {
     return ssm_d_state * ssm_d_inner;
 }
-+
+ 
 +bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
 +    if (il < n_layer) {
 +        return n_bskcn_arr[n][il] > 0;
@@ -118,12 +110,15 @@ index ea87b295..f3955de9 100644
 +
 +    GGML_ABORT("fatal error");
 +}
-\ No newline at end of file
+
+ bool llama_hparams::is_swa(uint32_t il) const {
+     if (il < n_layer) {
+         return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
 diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 1fe45410..1bdcdfd5 100644
+index 4e0b5719..c3147cbc 100644
 --- a/src/llama-hparams.h
 +++ b/src/llama-hparams.h
-@@ -50,6 +50,8 @@ struct llama_hparams {
+@@ -51,6 +51,8 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
 
@@ -132,18 +127,18 @@ index 1fe45410..1bdcdfd5 100644
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q           = 0;
     uint32_t n_lora_kv          = 0;
-@@ -133,6 +135,9 @@ struct llama_hparams {
- 
+@@ -149,6 +151,9 @@ struct llama_hparams {
     // dimension of the recurrent state embeddings
     uint32_t n_embd_v_s() const;
-+
+ 
 +    // Block skip connection
 +    bool n_bskcn(uint32_t n, uint32_t il) const;
+
+     bool is_swa(uint32_t il) const;
 };
 
- static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index 05d58ad9..1252aca1 100644
+index ea73a8a7..a012aeae 100644
 --- a/src/llama-model-loader.cpp
 +++ b/src/llama-model-loader.cpp
 @@ -439,6 +439,7 @@ namespace GGUFMeta {
@@ -155,10 +150,10 @@ index 05d58ad9..1252aca1 100644
 llama_model_loader::llama_model_loader(
         const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 36a0a009..ad1315c6 100644
+index b74dd72c..5fbd0055 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1238,6 +1238,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1372,6 +1372,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                }
             } break;
@@ -180,7 +175,7 @@ index 36a0a009..ad1315c6 100644
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-@@ -3316,6 +3331,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -3701,6 +3716,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
@@ -215,54 +210,12 @@ index 36a0a009..ad1315c6 100644
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-@@ -3900,6 +3943,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
-         case LLM_ARCH_GRANITE:
-         case LLM_ARCH_GRANITE_MOE:
-         case LLM_ARCH_CHAMELEON:
-+        case LLM_ARCH_SOLAR:
-             return LLAMA_ROPE_TYPE_NORM;
- 
-         // the pairs of head values are offset by n_rot/2
-diff --git a/src/llama-model.h b/src/llama-model.h
-index a7c30444..1afb0024 100644
--- a/src/llama-model.h
-+++ b/src/llama-model.h
-@@ -55,6 +55,7 @@ enum llm_type {
-     LLM_TYPE_15B,
-     LLM_TYPE_16B,
-     LLM_TYPE_20B,
-+    LLM_TYPE_22B,
-     LLM_TYPE_30B,
-     LLM_TYPE_32B,
-     LLM_TYPE_34B,
-@@ -281,6 +282,8 @@ struct llama_layer {
-     struct ggml_tensor * ffn_up_scale   = nullptr;
-     struct ggml_tensor * ffn_down_scale = nullptr;
- 
-+    struct ggml_tensor * bskcn_tv = nullptr;
-+
-     struct llama_layer_posnet posnet;
+@@ -12244,6 +12287,165 @@ struct llm_build_chameleon : public llm_graph_context {
+     }
+ };
 
-     struct llama_layer_convnext convnext;
-diff --git a/src/llama.cpp b/src/llama.cpp
-index ac85bfed..6d320ea4 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -7953,9 +7953,155 @@ struct llm_build_context {
-         cb(img_logits, "img_logits", -1);
-         cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
-         cb(cur, "result_output", -1);
-
-         ggml_build_forward_expand(gf, cur);
-+        return gf;
-+   }
-+
-+   ggml_cgraph * build_solar() {
-+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
-+
-+        // mutable variable, needed during the last layer of the computation to skip unused tokens
-+        int32_t n_tokens = this->n_tokens;
-+
+struct llm_build_solar : public llm_graph_context {
+    llm_build_solar(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
 +        const int64_t n_embd_head = hparams.n_embd_head_v;
 +        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 +        GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -270,13 +223,15 @@ index ac85bfed..6d320ea4 100644
 +        struct ggml_tensor * cur;
 +        struct ggml_tensor * inpL;
 +
-+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+        inpL = build_inp_embd(model.tok_embd);
 +
 +        // inp_pos - contains the positions
 +        struct ggml_tensor * inp_pos = build_inp_pos();
 +
 +        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 +
 +        struct ggml_tensor * bskcn_1;
 +        struct ggml_tensor * bskcn_2;
@@ -305,88 +260,94 @@ index ac85bfed..6d320ea4 100644
 +                   ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
 +                   ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
 +            }
- 
+
 +            // norm
-+            cur = llm_build_norm(ctx0, inpL, hparams,
+            cur = build_norm(inpL,
 +                    model.layers[il].attn_norm, NULL,
-+                    LLM_NORM_RMS, cb, il);
+                    LLM_NORM_RMS, il);
 +            cb(cur, "attn_norm", il);
 +
 +            // self-attention
 +            {
 +                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 +
 +                // compute Q and K and RoPE them
-+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
 +                cb(Qcur, "Qcur", il);
 +                if (model.layers[il].bq) {
 +                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 +                    cb(Qcur, "Qcur", il);
 +                }
 +
-+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
 +                cb(Kcur, "Kcur", il);
 +                if (model.layers[il].bk) {
 +                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 +                    cb(Kcur, "Kcur", il);
 +                }
 +
-+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
 +                cb(Vcur, "Vcur", il);
 +                if (model.layers[il].bv) {
 +                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
 +                    cb(Vcur, "Vcur", il);
 +                }
 +
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
 +                Qcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
-+                cb(Qcur, "Qcur", il);
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
 +
 +                Kcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+                
+                cb(Qcur, "Qcur", il);
 +                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
 +
-+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                cur = build_attn(inp_attn, gf,
 +                        model.layers[il].wo, model.layers[il].bo,
-+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
 +            }
 +
 +            if (il == n_layer - 1) {
 +                // skip computing output for unused tokens
-+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-+                n_tokens = n_outputs;
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
 +                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
 +                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
 +            }
 +
-+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
 +            cb(ffn_inp, "ffn_inp", il);
 +
 +            // feed-forward network
-+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+            cur = build_norm(ffn_inp,
 +                    model.layers[il].ffn_norm, NULL,
-+                    LLM_NORM_RMS, cb, il);
+                    LLM_NORM_RMS, il);
 +            cb(cur, "ffn_norm", il);
 +
-+            cur = llm_build_ffn(ctx0, lctx, cur,
+            cur = build_ffn(cur,
 +                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
 +                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
 +                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
 +                    NULL,
-+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
 +            cb(cur, "ffn_out", il);
 +
 +            cur = ggml_add(ctx0, cur, ffn_inp);
 +            cb(cur, "ffn_out", il);
 +
-+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cur = build_cvec(cur, il);
 +            cb(cur, "l_out", il);
 +
 +            // input for next layer
@@ -394,25 +355,64 @@ index ac85bfed..6d320ea4 100644
 +        }
 +
 +        cur = inpL;
-+        cur = llm_build_norm(ctx0, cur, hparams,
+
+        cur = build_norm(cur,
 +                model.output_norm, NULL,
-+                LLM_NORM_RMS, cb, -1);
+                LLM_NORM_RMS, -1);
+
 +        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
 +        // lm_head
-+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cur = build_lora_mm(model.output, cur);
+
 +        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
 +        ggml_build_forward_expand(gf, cur);
-         return gf;
-     }
- 
-@@ -8398,6 +8544,10 @@ static struct ggml_cgraph * llama_build_graph(
+    }
+};
+
+ struct llm_build_wavtokenizer_dec : public llm_graph_context {
+     llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+         ggml_tensor * cur;
+@@ -12993,6 +13195,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
-                 result = llm.build_chameleon();
+                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
             } break;
 +        case LLM_ARCH_SOLAR:
 +            {
-+                result = llm.build_solar();
+                llm = std::make_unique<llm_build_solar>(*this, params, gf);
 +            } break;
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
-                 result = llm.build_wavtokenizer_dec();
+                 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
+@@ -13139,6 +13345,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+         case LLM_ARCH_GRANITE:
+         case LLM_ARCH_GRANITE_MOE:
+         case LLM_ARCH_CHAMELEON:
+        case LLM_ARCH_SOLAR:
+         case LLM_ARCH_BAILINGMOE:
+             return LLAMA_ROPE_TYPE_NORM;
+ 
+diff --git a/src/llama-model.h b/src/llama-model.h
+index 0f18dac1..e08d4ae4 100644
+--- a/src/llama-model.h
+++ b/src/llama-model.h
+@@ -62,6 +62,7 @@ enum llm_type {
+     LLM_TYPE_15B,
+     LLM_TYPE_16B,
+     LLM_TYPE_20B,
+    LLM_TYPE_22B,
+     LLM_TYPE_30B,
+     LLM_TYPE_32B,
+     LLM_TYPE_34B,
+@@ -305,6 +306,8 @@ struct llama_layer {
+     struct ggml_tensor * ffn_up_scale   = nullptr;
+     struct ggml_tensor * ffn_down_scale = nullptr;
+ 
+    struct ggml_tensor * bskcn_tv = nullptr;
+
+     struct llama_layer_posnet posnet;
+ 
+     struct llama_layer_convnext convnext;
--- a/llama/patches/0006-conditional-fattn.patch
+++ b/llama/patches/0006-conditional-fattn.patch
@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
 1 file changed, 2 insertions(+)

 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index dfff21a2..1b0d074b 100644
+index 59a49560..b70c6a32 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2284,9 +2284,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2338,9 +2338,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;

--- a/llama/patches/0007-add-mllama-support.patch
+++ b/llama/patches/0007-add-mllama-support.patch
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: jmorganca <jmorganca@gmail.com>
-Date: Thu, 17 Oct 2024 15:18:22 -0700
+Date: Tue, 8 Apr 2025 19:27:12 -0700
 Subject: [PATCH] add mllama support

-mllama adds cross-attention layers to the standard llama architecture
-it also requires a way to input a new tensor: cross_attention_state
-once per generation
-
-cross-attention layers don't change and so they are cached in the
-kv cache once per run
-
-remaining is to implement the cross attention mask
+adds support for the llama 3.2 vision architecture
 ---
+ examples/llava/gemma3-cli.cpp |   3 +-
 examples/llava/llava.cpp      |   5 +-
+ examples/llava/mtmd.cpp       |   6 +-
 ggml/src/ggml-backend-reg.cpp |   6 +-
 include/llama.h               |   6 +
- src/llama-arch.cpp            |  44 ++++++
+ src/llama-arch.cpp            |  44 +++++
 src/llama-arch.h              |  10 ++
 src/llama-batch.cpp           |   3 +
- src/llama-context.cpp         |  28 ++--
- src/llama-context.h           |   2 +
+ src/llama-context.cpp         |  25 ++-
+ src/llama-context.h           |   1 +
 src/llama-cparams.h           |   1 +
- src/llama-hparams.cpp         |   6 +
- src/llama-hparams.h           |   5 +
- src/llama-kv-cache.cpp        |  13 +-
+ src/llama-graph.cpp           |  25 +++
+ src/llama-graph.h             |  12 ++
+ src/llama-hparams.cpp         |   4 +
+ src/llama-hparams.h           |   7 +
+ src/llama-kv-cache.cpp        |  12 +-
 src/llama-model-loader.cpp    |   2 +
- src/llama-model.cpp           |  65 ++++++++-
+ src/llama-model.cpp           | 309 +++++++++++++++++++++++++++++++++-
 src/llama-model.h             |  12 ++
 src/llama-quant.cpp           |   4 +-
- src/llama.cpp                 | 262 +++++++++++++++++++++++++++++++++-
- 17 files changed, 452 insertions(+), 22 deletions(-)
+ 20 files changed, 475 insertions(+), 22 deletions(-)

+diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
+index 91a07e2a..13127c7b 100644
+--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
+@@ -106,7 +106,7 @@ struct decode_embd_batch {
+     std::vector<llama_seq_id *> seq_ids;
+     std::vector<int8_t>         logits;
+     llama_batch batch;
+-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+    decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+         pos     .resize(n_tokens);
+         n_seq_id.resize(n_tokens);
+         seq_ids .resize(n_tokens + 1);
+@@ -118,6 +118,7 @@ struct decode_embd_batch {
+             /*n_tokens       =*/ n_tokens,
+             /*tokens         =*/ nullptr,
+             /*embd           =*/ embd,
+            /*n_embd         =*/ n_embd,
+             /*pos            =*/ pos.data(),
+             /*n_seq_id       =*/ n_seq_id.data(),
+             /*seq_id         =*/ seq_ids.data(),
 diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
-index 518aad3f..f0e484a1 100644
+index 03a22cbb..5eb40bcd 100644
 --- a/examples/llava/llava.cpp
 +++ b/examples/llava/llava.cpp
-@@ -445,7 +445,7 @@ struct llava_embd_batch {
+@@ -456,7 +456,7 @@ struct llava_embd_batch {
     std::vector<llama_seq_id *> seq_ids;
     std::vector<int8_t>         logits;
     llama_batch batch;
@@ -44,7 +61,7 @@ index 518aad3f..f0e484a1 100644
         pos     .resize(n_tokens);
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
-@@ -457,6 +457,7 @@ struct llava_embd_batch {
+@@ -468,6 +468,7 @@ struct llava_embd_batch {
             /*n_tokens       =*/ n_tokens,
             /*tokens         =*/ nullptr,
             /*embd           =*/ embd,
@@ -52,7 +69,7 @@ index 518aad3f..f0e484a1 100644
             /*pos            =*/ pos.data(),
             /*n_seq_id       =*/ n_seq_id.data(),
             /*seq_id         =*/ seq_ids.data(),
-@@ -480,7 +481,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
+@@ -491,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
             n_eval = n_batch;
         }
         float * embd = image_embed->embed+i*n_embd;
@@ -61,11 +78,42 @@ index 518aad3f..f0e484a1 100644
         if (llama_decode(ctx_llama, llava_batch.batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
+diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
+index 114c274b..a0e649ad 100644
+--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
+@@ -213,7 +213,7 @@ struct decode_embd_batch {
+     std::vector<llama_seq_id *> seq_ids;
+     std::vector<int8_t>         logits;
+     llama_batch batch;
+-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+    decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+         pos     .resize(n_tokens);
+         n_seq_id.resize(n_tokens);
+         seq_ids .resize(n_tokens + 1);
+@@ -225,6 +225,7 @@ struct decode_embd_batch {
+             /*n_tokens       =*/ n_tokens,
+             /*tokens         =*/ nullptr,
+             /*embd           =*/ embd,
+            /*n_embd         =*/ n_embd,
+             /*pos            =*/ pos.data(),
+             /*n_seq_id       =*/ n_seq_id.data(),
+             /*seq_id         =*/ seq_ids.data(),
+@@ -291,7 +292,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
+ 
+             int32_t n_tokens = chunk.tokens_image->n_tokens();
+             float * embd = mtmd_get_output_embd(ctx);
+-            decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+            int n_embd  = llama_model_n_embd(llama_get_model(lctx));
+            decode_embd_batch batch_img(embd, n_embd, n_tokens, n_past, 0);
+             int64_t t1 = ggml_time_ms();
+             ret = llama_decode(lctx, batch_img.batch);
+             if (ret != 0) {
 diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 955ed505..95036ef8 100644
+index 405d8e31..82ae1b5b 100644
 --- a/ggml/src/ggml-backend-reg.cpp
 +++ b/ggml/src/ggml-backend-reg.cpp
-@@ -171,9 +171,9 @@ struct ggml_backend_registry {
+@@ -178,9 +178,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_CANN
         register_backend(ggml_backend_cann_reg());
 #endif
@@ -79,10 +127,10 @@ index 955ed505..95036ef8 100644
         register_backend(ggml_backend_rpc_reg());
 #endif
 diff --git a/include/llama.h b/include/llama.h
-index 47919602..cc948005 100644
+index 5657fbf0..f91896e4 100644
 --- a/include/llama.h
 +++ b/include/llama.h
-@@ -249,6 +249,7 @@ extern "C" {
+@@ -255,6 +255,7 @@ extern "C" {
 
         llama_token  *  token;
         float        *  embd;
@@ -90,7 +138,7 @@ index 47919602..cc948005 100644
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
-@@ -343,6 +344,7 @@ extern "C" {
+@@ -357,6 +358,7 @@ extern "C" {
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
         bool no_perf;     // whether to measure performance timings
@@ -98,7 +146,7 @@ index 47919602..cc948005 100644
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
-@@ -443,6 +445,10 @@ extern "C" {
+@@ -458,6 +460,10 @@ extern "C" {
             struct llama_context_params   params),
             "use llama_init_from_model instead");
 
@@ -110,7 +158,7 @@ index 47919602..cc948005 100644
     LLAMA_API void llama_free(struct llama_context * ctx);
 
 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index a1e0ebcc..b6f20286 100644
+index 0b0fedcd..c1f78618 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
 @@ -6,6 +6,7 @@
@@ -118,19 +166,19 @@ index a1e0ebcc..b6f20286 100644
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA,            "llama"            },
 +    { LLM_ARCH_MLLAMA,           "mllama"           },
+     { LLM_ARCH_LLAMA4,           "llama4"           },
     { LLM_ARCH_DECI,             "deci"             },
     { LLM_ARCH_FALCON,           "falcon"           },
-     { LLM_ARCH_GROK,             "grok"             },
-@@ -127,6 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
-     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
-     { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection"  },
-+    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
+@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+     { LLM_KV_ATTENTION_SLIDING_WINDOW,               "%s.attention.sliding_window"               },
+     { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
+     { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,        "%s.attention.block_skip_connection"        },
+    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,       "%s.attention.cross_attention_layers"       },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_DIMENSION_SECTIONS,   "%s.rope.dimension_sections"              },
-@@ -225,6 +227,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
-             { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+@@ -269,6 +271,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+             { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
         },
     },
 +    {
@@ -170,7 +218,7 @@ index a1e0ebcc..b6f20286 100644
     {
         LLM_ARCH_DECI,
         {
-@@ -1450,6 +1486,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1692,6 +1728,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
     {LLM_TENSOR_BSKCN_TV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -186,18 +234,18 @@ index a1e0ebcc..b6f20286 100644
     {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 77919578..ec742224 100644
+index 74aa3dd0..f987844d 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
-@@ -10,6 +10,7 @@
- 
+@@ -11,6 +11,7 @@
 enum llm_arch {
     LLM_ARCH_LLAMA,
+     LLM_ARCH_LLAMA4,
 +    LLM_ARCH_MLLAMA,
     LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
-@@ -131,6 +132,7 @@ enum llm_kv {
+@@ -146,6 +147,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
@@ -205,7 +253,7 @@ index 77919578..ec742224 100644
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
-@@ -314,6 +316,14 @@ enum llm_tensor {
+@@ -343,6 +345,14 @@ enum llm_tensor {
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
     LLM_TENSOR_BSKCN_TV,
@@ -249,39 +297,66 @@ index 01d5ca57..8682b0e6 100644
         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
     }
 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 47e79ed4..7b22fe13 100644
+index 65135172..afe6f552 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -74,10 +74,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
-     }
+@@ -858,7 +858,7 @@ float * llama_context::get_logits_ith(int32_t i) {
+             throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
+         }
 
-     if (ubatch.embd) {
-        const int64_t n_embd   = hparams.n_embd;
-        const int64_t n_tokens = ubatch.n_tokens;
-+        if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
-+            ggml_backend_tensor_set(lctx.inp_cross_attn_state, ubatch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
-+            // zero out inp_embd since it's not used
-+            float * inp_embd_data = (float *)lctx.inp_embd->data;
-+            for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
-+                inp_embd_data[i] = 0.0f;
-+            }
-+        } else {
-+            const int64_t n_embd   = hparams.n_embd;
-+            const int64_t n_tokens = ubatch.n_tokens;
+-        return logits + j*model.vocab.n_tokens();
+        return logits + j*model.hparams.n_vocab;
+     } catch (const std::exception & err) {
+         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
+ #ifndef NDEBUG
+@@ -979,6 +979,10 @@ void llama_context::set_warmup(bool value) {
+     cparams.warmup = value;
+ }
 
-        ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
-+            ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
-+        }
-     }
+void llama_context::set_cross_attn(bool value) {
+    cparams.cross_attn = value;
+}
+
+ void llama_context::set_adapter_lora(
+             llama_adapter_lora * adapter,
+             float scale) {
+@@ -1054,7 +1058,7 @@ int llama_context::encode(llama_batch & inp_batch) {
 
-     if (ubatch.pos && lctx.inp_pos) {
-@@ -470,12 +479,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
- size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
-     const auto & cparams = lctx.cparams;
-     const auto & hparams = lctx.model.hparams;
-    const auto & vocab   = lctx.model.vocab;
+     const int64_t n_embd = hparams.n_embd;
+ 
+-    sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+    sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
+ 
+     const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
+ 
+@@ -1194,10 +1198,9 @@ int llama_context::decode(llama_batch & inp_batch) {
 
-     const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
+     const llama_batch & batch = batch_allocr.batch;
+ 
+-    const auto & vocab   = model.vocab;
+     const auto & hparams = model.hparams;
+ 
+-    const int32_t n_vocab = vocab.n_tokens();
+    const int32_t n_vocab = hparams.n_vocab;
+ 
+     const int64_t n_tokens_all = batch.n_tokens;
+     const int64_t n_embd       = hparams.n_embd;
+@@ -1245,7 +1248,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+ 
+     const bool logits_all = n_outputs_all == n_tokens_all;
+ 
+-    sbatch.from_batch(batch, n_embd,
+    sbatch.from_batch(batch, batch.n_embd,
+             /* simple_split */ !kv_self->recurrent,
+             /* logits_all   */ logits_all);
+ 
+@@ -1479,12 +1482,11 @@ int llama_context::decode(llama_batch & inp_batch) {
+ 
+ int32_t llama_context::output_reserve(int32_t n_outputs) {
+     const auto & hparams = model.hparams;
+-    const auto & vocab   = model.vocab;
+ 
+     const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
 
     const auto n_batch = cparams.n_batch;
 -    const auto n_vocab = vocab.n_tokens();
@@ -289,59 +364,57 @@ index 47e79ed4..7b22fe13 100644
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
-@@ -542,7 +550,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
- void llama_output_reorder(struct llama_context & ctx) {
-     std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
+@@ -1554,7 +1556,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+ void llama_context::output_reorder() {
+     auto & out_ids = sbatch.out_ids;
     if (!out_ids.empty()) {
-        const uint32_t n_vocab = ctx.model.vocab.n_tokens();
-+        const uint32_t n_vocab = ctx.model.hparams.n_vocab;
-         const uint32_t n_embd  = ctx.model.hparams.n_embd;
+-        const uint32_t n_vocab = model.vocab.n_tokens();
+        const uint32_t n_vocab = model.hparams.n_vocab;
+         const uint32_t n_embd  = model.hparams.n_embd;
+ 
+         GGML_ASSERT((size_t) n_outputs == out_ids.size());
+@@ -2061,7 +2063,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
+     {
+         LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
+ 
+-        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
+        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab);
 
-         const int32_t n_outputs = ctx.n_outputs;
-@@ -657,6 +665,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
-     ctx->cparams.causal_attn = causal_attn;
+         io.write(&logits_size, sizeof(logits_size));
+ 
+@@ -2244,6 +2246,7 @@ llama_context_params llama_context_default_params() {
+         /*.offload_kqv                 =*/ true,
+         /*.flash_attn                  =*/ false,
+         /*.no_perf                     =*/ true,
+        /*.cross_attn                  =*/ false,
+         /*.abort_callback              =*/ nullptr,
+         /*.abort_callback_data         =*/ nullptr,
+     };
+@@ -2371,6 +2374,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
+     ctx->set_warmup(warmup);
 }
 
 +void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
-+    ctx->cparams.cross_attn = cross_attention;
+    ctx->set_cross_attn(cross_attention);
 +}
 +
- void llama_synchronize(struct llama_context * ctx) {
-     ggml_backend_sched_synchronize(ctx->sched.get());
- 
-@@ -726,7 +738,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
-             throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
-         }
- 
-        return ctx->logits + j*ctx->model.vocab.n_tokens();
-+        return ctx->logits + j*ctx->model.hparams.n_vocab;
-     } catch (const std::exception & err) {
-         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
- #ifndef NDEBUG
-@@ -886,7 +898,7 @@ struct llama_data_write {
-     }
- 
-     void write_logits(const struct llama_context * ctx) {
-        const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
-+        const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
- 
-         write(&logits_size, sizeof(logits_size));
- 
+ void llama_synchronize(llama_context * ctx) {
+     ctx->synchronize();
+ }
 diff --git a/src/llama-context.h b/src/llama-context.h
-index a9268b29..cf12c9d7 100644
+index 04facb54..baa03276 100644
 --- a/src/llama-context.h
 +++ b/src/llama-context.h
-@@ -107,6 +107,8 @@ struct llama_context {
-     struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
-     struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
-     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
-+
-+    struct ggml_tensor * inp_cross_attn_state; // F32 [4, n_embd, 1061]
- };
+@@ -65,6 +65,7 @@ struct llama_context {
+     void set_embeddings (bool value);
+     void set_causal_attn(bool value);
+     void set_warmup(bool value);
+    void set_cross_attn(bool value);
 
- // TODO: make these methods of llama_context
+     void set_adapter_lora(
+             llama_adapter_lora * adapter,
 diff --git a/src/llama-cparams.h b/src/llama-cparams.h
-index 252012f3..9681e5a0 100644
+index 30e550f0..85ad91b9 100644
 --- a/src/llama-cparams.h
 +++ b/src/llama-cparams.h
 @@ -29,6 +29,7 @@ struct llama_cparams {
@@ -349,37 +422,115 @@ index 252012f3..9681e5a0 100644
     bool flash_attn;
     bool no_perf;
 +    bool cross_attn;
+     bool warmup;
 
     enum llama_pooling_type pooling_type;
+diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
+index cd955d63..83f3c5a8 100644
+--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
+@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
+     }
+ }
+ 
+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
+    if (ubatch->embd) {
+        ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
+    }
+}
+
+ //
+ // llm_graph_context
+ //
+@@ -1495,6 +1501,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
+     return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
+ }
 
+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
+    const int64_t n_embd = hparams.n_embd;
+
+    auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
+
+    ggml_tensor * cur = nullptr;
+
+    inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
+    ggml_set_input(inp->cross_attn_state);
+
+    cur = inp->cross_attn_state;
+
+    cb(cur, "inp_cross_attn_state", -1);
+
+    res->add_input(std::move(inp));
+
+    return cur;
+}
+
+ ggml_tensor * llm_graph_context::build_attn(
+         llm_graph_input_attn_cross * inp,
+         ggml_cgraph * gf,
+diff --git a/src/llama-graph.h b/src/llama-graph.h
+index 5b6618f9..51993998 100644
+--- a/src/llama-graph.h
+++ b/src/llama-graph.h
+@@ -86,6 +86,7 @@ public:
+ 
+     ggml_tensor * tokens = nullptr; // I32 [n_batch]
+     ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
+    ggml_tensor * cross_attn_state; // F32 [n_embd, 1601, 4]
+ };
+ 
+ class llm_graph_input_pos : public llm_graph_input_i {
+@@ -285,6 +286,16 @@ public:
+     const llama_cross * cross = nullptr;
+ };
+ 
+class llm_graph_input_cross_attn_state : public llm_graph_input_i {
+public:
+    llm_graph_input_cross_attn_state()          = default;
+    virtual ~llm_graph_input_cross_attn_state() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cross_attn_state; // F32 [n_embd, 1601, 4]
+};
+
+ //
+ // llm_graph_result
+ //
+@@ -493,6 +504,7 @@ struct llm_graph_context {
+     ggml_tensor * build_inp_cls() const;
+     ggml_tensor * build_inp_s_copy() const;
+     ggml_tensor * build_inp_s_mask() const;
+    ggml_tensor * build_inp_cross_attn_state() const;
+ 
+     ggml_tensor * build_inp_cross_embd() const;
+     ggml_tensor * build_inp_pos_bucket_enc() const;
 diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index f3955de9..0b841028 100644
+index 8a667960..6a02de03 100644
 --- a/src/llama-hparams.cpp
 +++ b/src/llama-hparams.cpp
-@@ -2,6 +2,8 @@
- 
- #include "ggml.h"
- 
-+#include <algorithm>
-+
- uint32_t llama_hparams::n_head(uint32_t il) const {
-     if (il < n_layer) {
-         return n_head_arr[il];
-@@ -76,4 +78,8 @@ bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
-     }
+@@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const {
 
     GGML_ABORT("fatal error");
-+}
+ }
 +
 +bool llama_hparams::cross_attention_layers(uint32_t il) const {
 +    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
- }
-\ No newline at end of file
+}
 diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 1bdcdfd5..05383046 100644
+index c3147cbc..4567a0e9 100644
 --- a/src/llama-hparams.h
 +++ b/src/llama-hparams.h
-@@ -41,6 +41,7 @@ struct llama_hparams {
+@@ -2,6 +2,8 @@
+ 
+ #include "llama.h"
+ 
+#include <algorithm>
+
+ #include <array>
+ 
+ // bump if necessary
+@@ -42,6 +44,7 @@ struct llama_hparams {
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
     uint32_t n_rel_attn_bkts = 0;
@@ -387,7 +538,7 @@ index 1bdcdfd5..05383046 100644
 
     // for WavTokenizer
     struct llama_hparams_posnet   posnet;
-@@ -51,6 +52,7 @@ struct llama_hparams {
+@@ -52,6 +55,7 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
 
     std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
@@ -395,21 +546,21 @@ index 1bdcdfd5..05383046 100644
 
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q           = 0;
-@@ -138,6 +140,9 @@ struct llama_hparams {
- 
+@@ -154,6 +158,9 @@ struct llama_hparams {
     // Block skip connection
     bool n_bskcn(uint32_t n, uint32_t il) const;
-+
+ 
 +    // cross attention layers
 +    bool cross_attention_layers(uint32_t il) const;
+
+     bool is_swa(uint32_t il) const;
 };
 
- static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
 diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
-index feffdf0d..b541c5a3 100644
+index dbf5f118..9310f262 100644
 --- a/src/llama-kv-cache.cpp
 +++ b/src/llama-kv-cache.cpp
-@@ -91,8 +91,17 @@ bool llama_kv_cache_init(
+@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
             return false;
         }
 
@@ -425,12 +576,11 @@ index feffdf0d..b541c5a3 100644
 +            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
 +            v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
 +        }
-+
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
-         cache.k_l.push_back(k);
+         k_l.push_back(k);
 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index 1252aca1..45d08721 100644
+index a012aeae..2e11507d 100644
 --- a/src/llama-model-loader.cpp
 +++ b/src/llama-model-loader.cpp
 @@ -315,6 +315,8 @@ namespace GGUFMeta {
@@ -443,10 +593,10 @@ index 1252aca1..45d08721 100644
     bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
         const int kid = gguf_find_key(meta.get(), key.c_str());
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index ad1315c6..21819080 100644
+index 5fbd0055..d5ad466e 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -401,6 +401,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
@@ -454,7 +604,7 @@ index ad1315c6..21819080 100644
 
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
-@@ -412,6 +413,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -430,6 +431,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -462,7 +612,7 @@ index ad1315c6..21819080 100644
 
     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
-@@ -435,9 +437,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
@@ -474,7 +624,7 @@ index ad1315c6..21819080 100644
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
-@@ -486,7 +490,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
         ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
@@ -483,8 +633,8 @@ index ad1315c6..21819080 100644
             if (hparams.n_rot != hparams.n_embd_head_k) {
                 throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
             }
-@@ -530,6 +534,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
-                     }
+@@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+                     hparams.use_kq_norm = false;
                 }
             } break;
 +        case LLM_ARCH_MLLAMA:
@@ -500,7 +650,7 @@ index ad1315c6..21819080 100644
         case LLM_ARCH_DECI:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-@@ -1398,7 +1412,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -1548,7 +1562,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         const int64_t n_embd_head_v = hparams.n_embd_head_v;
         const int64_t n_ff          = hparams.n_ff();
         const int64_t n_embd_gqa    = n_embd_v_gqa;
@@ -509,7 +659,7 @@ index ad1315c6..21819080 100644
         const int64_t n_token_types = vocab.n_token_types();
         const int64_t n_rot         = hparams.n_rot;
         const int64_t n_expert      = hparams.n_expert;
-@@ -1581,6 +1595,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -1801,6 +1815,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         }
                     }
                 } break;
@@ -562,107 +712,12 @@ index ad1315c6..21819080 100644
             case LLM_ARCH_DECI:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-@@ -3925,6 +3985,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
- 
-         // use what we call a normal RoPE, operating on pairs of consecutive head values
-         case LLM_ARCH_LLAMA:
-+        case LLM_ARCH_MLLAMA:
-         case LLM_ARCH_DECI:
-         case LLM_ARCH_BAICHUAN:
-         case LLM_ARCH_STARCODER:
-diff --git a/src/llama-model.h b/src/llama-model.h
-index 1afb0024..7cf57587 100644
--- a/src/llama-model.h
-+++ b/src/llama-model.h
-@@ -9,6 +9,7 @@
- #include <string>
- #include <unordered_map>
- #include <vector>
-+#include <stdexcept>
- 
- struct llama_model_loader;
- 
-@@ -63,6 +64,7 @@ enum llm_type {
-     LLM_TYPE_40B,
-     LLM_TYPE_65B,
-     LLM_TYPE_70B,
-+    LLM_TYPE_90B,
-     LLM_TYPE_236B,
-     LLM_TYPE_314B,
-     LLM_TYPE_671B,
-@@ -284,6 +286,16 @@ struct llama_layer {
- 
-     struct ggml_tensor * bskcn_tv = nullptr;
- 
-+    // cross attention
-+    struct ggml_tensor * cross_attn_k_norm = nullptr;
-+    struct ggml_tensor * cross_attn_k_proj = nullptr;
-+    struct ggml_tensor * cross_attn_o_proj = nullptr;
-+    struct ggml_tensor * cross_attn_q_norm = nullptr;
-+    struct ggml_tensor * cross_attn_q_proj = nullptr;
-+    struct ggml_tensor * cross_attn_v_proj = nullptr;
-+    struct ggml_tensor * cross_attn_attn_gate = nullptr;
-+    struct ggml_tensor * cross_attn_mlp_gate = nullptr;
-+
-     struct llama_layer_posnet posnet;
- 
-     struct llama_layer_convnext convnext;
-diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
-index fb798265..6eb1da08 100644
--- a/src/llama-quant.cpp
-+++ b/src/llama-quant.cpp
-@@ -632,7 +632,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
-         if (llama_model_has_encoder(&model)) {
-             n_attn_layer *= 3;
-         }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
-+        if (qs.n_attention_wv != n_attn_layer) {
-+            LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
-+        }
-     }
- 
-     size_t total_size_org = 0;
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 6d320ea4..8f7902df 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -154,6 +154,21 @@ static struct ggml_tensor * llm_build_inp_embd(
-     return inpL;
- }
- 
-+static struct ggml_tensor * llm_build_inp_cross_attn_state(
-+        struct ggml_context * ctx,
-+       struct llama_context & lctx,
-+        const llama_hparams & hparams,
-+         const llm_build_cb & cb) {
-+    const int64_t n_embd = hparams.n_embd;
-+
-+    struct ggml_tensor * inpCAS = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
-+    cb(inpCAS, "inp_cross_attn_state", -1);
-+    ggml_set_input(inpCAS);
-+    lctx.inp_cross_attn_state = inpCAS;
-+
-+    return inpCAS;
-+}
-+
- static void llm_build_kv_store(
-         struct ggml_context * ctx,
-         const llama_hparams & hparams,
-@@ -1157,6 +1172,7 @@ struct llm_build_context {
-         lctx.inp_pos_bucket    = nullptr;
-         lctx.inp_embd_enc      = nullptr;
-         lctx.inp_KQ_mask_cross = nullptr;
-+        lctx.inp_cross_attn_state = nullptr;
-     }
- 
-     void free() {
-@@ -1639,6 +1655,240 @@ struct llm_build_context {
-         return gf;
+@@ -4665,6 +4725,246 @@ struct llm_build_llama : public llm_graph_context {
     }
+ };
 
-+    struct ggml_cgraph * build_mllama() {
-+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
-+
+struct llm_build_mllama: public llm_graph_context {
+    llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
 +        // mutable variable, needed during the last layer of the computation to skip unused tokens
 +        int32_t n_tokens = this->n_tokens;
 +
@@ -670,26 +725,26 @@ index 6d320ea4..8f7902df 100644
 +        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 +        GGML_ASSERT(n_embd_head == hparams.n_rot);
 +
-+        struct ggml_tensor * cur;
-+        struct ggml_tensor * inpL;
-+        struct ggml_tensor * inpCAS;
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+        ggml_tensor * inpCAS;
 +
-+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
-+        inpCAS = llm_build_inp_cross_attn_state(ctx0, lctx, hparams, cb);
+        inpL = build_inp_embd(model.tok_embd);
+        inpCAS = build_inp_cross_attn_state();
 +
-+        // inp_pos - contains the positions
-+        struct ggml_tensor * inp_pos = build_inp_pos();
+          // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
 +
-+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        auto * inp_attn = build_attn_inp_kv_unified();
+        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
 +
 +        for (int il = 0; il < n_layer; ++il) {
-+            struct ggml_tensor * inpSA = inpL;
+            ggml_tensor * inpSA = inpL;
 +
 +            // norm
-+            cur = llm_build_norm(ctx0, inpL, hparams,
+            cur = build_norm(inpL,
 +                    model.layers[il].attn_norm, NULL,
-+                    LLM_NORM_RMS, cb, il);
+                    LLM_NORM_RMS, il);
 +            cb(cur, "attn_norm", il);
 +
 +            if (hparams.cross_attention_layers(il)) {
@@ -698,7 +753,7 @@ index 6d320ea4..8f7902df 100644
 +                }
 +
 +                // cross attention layer
-+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
+                ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
 +                cb(Qcur, "Qcur", il);
 +
 +                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -707,10 +762,10 @@ index 6d320ea4..8f7902df 100644
 +                Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
 +                cb(Qcur, "Qcur", il);
 +
-+                Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
+                Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
 +                cb(Qcur, "Qcur", il);
 +
-+                struct ggml_tensor * Kcur, * Vcur;
+                ggml_tensor * Kcur, * Vcur;
 +                if (ubatch.embd) {
 +                    Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
 +                    cb(Kcur, "Kcur", il);
@@ -721,10 +776,10 @@ index 6d320ea4..8f7902df 100644
 +                    Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
 +                    cb(Kcur, "Kcur", il);
 +
-+                    Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
+                    Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
 +                    cb(Kcur, "Kcur", il);
 +
-+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il]));
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
 +
 +                    Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
 +                    cb(Vcur, "Vcur", il);
@@ -735,12 +790,12 @@ index 6d320ea4..8f7902df 100644
 +                    Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
 +                    cb(Vcur, "Vcur", il);
 +
-+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il]));
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
 +                } else {
-+                    Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
+                    Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
 +                    cb(Kcur, "Kcur (view)", il);
 +
-+                    Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]);
+                    Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
 +                    cb(Vcur, "Vcur (view)", il);
 +                }
 +
@@ -773,24 +828,24 @@ index 6d320ea4..8f7902df 100644
 +                cb(ffn_inp, "ffn_inp", il);
 +
 +                // feed-forward network
-+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                cur = build_norm(ffn_inp,
 +                        model.layers[il].ffn_norm, NULL,
-+                        LLM_NORM_RMS, cb, il);
+                        LLM_NORM_RMS, il);
 +                cb(cur, "ffn_norm", il);
 +
-+                cur = llm_build_ffn(ctx0, lctx, cur,
+                cur = build_ffn(cur,
 +                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
 +                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
 +                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
 +                        NULL,
-+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
 +                cb(cur, "ffn_out", il);
 +
 +                // TODO: do this inplace once?
 +                cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
 +                cb(cur, "ffn_out", il);
 +
-+                cur = lctx.cvec.apply_to(ctx0, cur, il);
+                cur = build_cvec(cur, il);
 +                cb(cur, "l_out", il);
 +
 +                // input for next layer
@@ -799,48 +854,53 @@ index 6d320ea4..8f7902df 100644
 +                // self attention layer
 +
 +                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 +
 +                // compute Q and K and RoPE them
-+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
 +                cb(Qcur, "Qcur", il);
 +                if (model.layers[il].bq) {
 +                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 +                    cb(Qcur, "Qcur", il);
 +                }
 +
-+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
 +                cb(Kcur, "Kcur", il);
 +                if (model.layers[il].bk) {
 +                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 +                    cb(Kcur, "Kcur", il);
 +                }
 +
-+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
 +                cb(Vcur, "Vcur", il);
 +                if (model.layers[il].bv) {
 +                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
 +                    cb(Vcur, "Vcur", il);
 +                }
 +
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
 +                Qcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
-+                cb(Qcur, "Qcur", il);
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
 +
 +                Kcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+                
+                cb(Qcur, "Qcur", il);
 +                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
 +
-+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                cur = build_attn(inp_attn, gf,
 +                    model.layers[il].wo, model.layers[il].bo,
-+                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-+
+                    Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 +
 +                if (il == n_layer - 1) {
 +                    // skip computing output for unused tokens
@@ -854,23 +914,23 @@ index 6d320ea4..8f7902df 100644
 +                cb(ffn_inp, "ffn_inp", il);
 +
 +                // feed-forward network
-+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                cur = build_norm(ffn_inp,
 +                        model.layers[il].ffn_norm, NULL,
-+                        LLM_NORM_RMS, cb, il);
+                        LLM_NORM_RMS, il);
 +                cb(cur, "ffn_norm", il);
 +
-+                cur = llm_build_ffn(ctx0, lctx, cur,
+                cur = build_ffn(cur,
 +                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
 +                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
 +                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
 +                        NULL,
-+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
 +                cb(cur, "ffn_out", il);
 +
 +                cur = ggml_add(ctx0, cur, ffn_inp);
 +                cb(cur, "ffn_out", il);
 +
-+                cur = lctx.cvec.apply_to(ctx0, cur, il);
+                cur = build_cvec(cur, il);
 +                cb(cur, "l_out", il);
 +
 +                // input for next layer
@@ -880,74 +940,93 @@ index 6d320ea4..8f7902df 100644
 +
 +        cur = inpL;
 +
-+        cur = llm_build_norm(ctx0, cur, hparams,
+        cur = build_norm(cur,
 +                model.output_norm, NULL,
-+                LLM_NORM_RMS, cb, -1);
+                LLM_NORM_RMS, -1);
 +        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
 +
 +        // lm_head
-+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cur = build_lora_mm(model.output, cur);
+
 +        cb(cur, "result_output", -1);
+        res->t_logits = cur;
 +
 +        ggml_build_forward_expand(gf, cur);
-+
-+        return gf;
 +    }
+};
 +
-     struct ggml_cgraph * build_deci() {
-         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
- 
-@@ -8344,6 +8594,10 @@ static struct ggml_cgraph * llama_build_graph(
+ struct llm_build_deci : public llm_graph_context {
+     llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+         const int64_t n_embd_head = hparams.n_embd_head_v;
+@@ -12965,6 +13265,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
-                 result = llm.build_llama();
+                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
 +        case LLM_ARCH_MLLAMA:
 +            {
-+                result = llm.build_mllama();
+                llm = std::make_unique<llm_build_mllama>(*this, params, gf);
 +            } break;
         case LLM_ARCH_DECI:
             {
-                 result = llm.build_deci();
-@@ -8634,7 +8888,7 @@ static int llama_prepare_sbatch(
-         n_outputs = 1;
-     }
- 
-    lctx.sbatch.from_batch(batch, n_embd,
-+    lctx.sbatch.from_batch(batch, batch.n_embd,
-         /* simple_split */ !lctx.kv_self.recurrent,
-         /* logits_all   */ n_outputs == n_tokens_all);
- 
-@@ -8749,7 +9003,6 @@ static int llama_decode_impl(
-     const llama_batch & batch = batch_allocr.batch;
- 
-     const auto & model   = lctx.model;
-    const auto & vocab   = model.vocab;
-     const auto & hparams = model.hparams;
-     const auto & cparams = lctx.cparams;
- 
-@@ -8760,7 +9013,7 @@ static int llama_decode_impl(
-     llama_kv_slot_restorer kv_slot_restorer(kv_self);
- 
-     const int64_t n_embd  = hparams.n_embd;
-    const int64_t n_vocab = vocab.n_tokens();
-+    const int64_t n_vocab = hparams.n_vocab;
+                 llm = std::make_unique<llm_build_deci>(*this, params, gf);
+@@ -13325,6 +13629,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+         // use what we call a normal RoPE, operating on pairs of consecutive head values
+         case LLM_ARCH_LLAMA:
+         case LLM_ARCH_LLAMA4:
+        case LLM_ARCH_MLLAMA:
+         case LLM_ARCH_DECI:
+         case LLM_ARCH_BAICHUAN:
+         case LLM_ARCH_STARCODER:
+diff --git a/src/llama-model.h b/src/llama-model.h
+index e08d4ae4..21c4617b 100644
+--- a/src/llama-model.h
+++ b/src/llama-model.h
+@@ -11,6 +11,7 @@
+ #include <string>
+ #include <unordered_map>
+ #include <vector>
+#include <stdexcept>
 
-     uint32_t n_outputs = 0;
-     uint32_t n_outputs_prev = 0;
-@@ -9025,7 +9278,7 @@ static int llama_encode_impl(
+ struct llama_cparams;
+ struct llama_ubatch;
+@@ -70,6 +71,7 @@ enum llm_type {
+     LLM_TYPE_40B,
+     LLM_TYPE_65B,
+     LLM_TYPE_70B,
+    LLM_TYPE_90B,
+     LLM_TYPE_236B,
+     LLM_TYPE_314B,
+     LLM_TYPE_671B,
+@@ -308,6 +310,16 @@ struct llama_layer {
 
-     const int64_t n_embd = hparams.n_embd;
+     struct ggml_tensor * bskcn_tv = nullptr;
 
-    lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
-+    lctx.sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
+    // cross attention
+    struct ggml_tensor * cross_attn_k_norm = nullptr;
+    struct ggml_tensor * cross_attn_k_proj = nullptr;
+    struct ggml_tensor * cross_attn_o_proj = nullptr;
+    struct ggml_tensor * cross_attn_q_norm = nullptr;
+    struct ggml_tensor * cross_attn_q_proj = nullptr;
+    struct ggml_tensor * cross_attn_v_proj = nullptr;
+    struct ggml_tensor * cross_attn_attn_gate = nullptr;
+    struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
+     struct llama_layer_posnet posnet;
 
-     const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
+     struct llama_layer_convnext convnext;
+diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
+index 7dc54227..223e1f3f 100644
+--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
+@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
+         if (llama_model_has_encoder(&model)) {
+             n_attn_layer *= 3;
+         }
+-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+        if (qs.n_attention_wv != n_attn_layer) {
+            LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+        }
+     }
 
-@@ -9511,6 +9764,7 @@ struct llama_context_params llama_context_default_params() {
-         /*.offload_kqv                 =*/ true,
-         /*.flash_attn                  =*/ false,
-         /*.no_perf                     =*/ true,
-+        /*.cross_attn                  =*/ false,
-         /*.abort_callback              =*/ nullptr,
-         /*.abort_callback_data         =*/ nullptr,
-     };
+     size_t total_size_org = 0;
--- a/llama/patches/0008-add-unpad-operator.patch
+++ b/llama/patches/0008-add-unpad-operator.patch
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Thu, 17 Oct 2024 17:19:25 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Sun, 13 Apr 2025 22:10:06 -0400
 Subject: [PATCH] add unpad operator

+adds the unpad operator to GGML
 ---
 ggml/include/ggml.h                  | 10 +++++
- ggml/src/ggml-cpu/ggml-cpu.c         | 58 ++++++++++++++++++++++++++++
+ ggml/src/ggml-cpu/ggml-cpu.c         |  5 +++
+ ggml/src/ggml-cpu/ops.cpp            | 55 ++++++++++++++++++++++++++++
+ ggml/src/ggml-cpu/ops.h              |  1 +
 ggml/src/ggml-cuda/ggml-cuda.cu      |  4 ++
- ggml/src/ggml-cuda/pad.cu            | 46 ++++++++++++++++++++++
+ ggml/src/ggml-cuda/pad.cu            | 46 +++++++++++++++++++++++
 ggml/src/ggml-cuda/pad.cuh           |  1 +
- ggml/src/ggml-metal/ggml-metal.m     | 33 ++++++++++++++++
- ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++
- ggml/src/ggml.c                      | 25 +++++++++++-
- 8 files changed, 220 insertions(+), 2 deletions(-)
+ ggml/src/ggml-metal/ggml-metal.m     | 33 +++++++++++++++++
+ ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++
+ ggml/src/ggml.c                      | 25 ++++++++++++-
+ 10 files changed, 223 insertions(+), 2 deletions(-)

 diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
-index dd0c6a96..8d269a9c 100644
+index 8fcc16df..d19fc167 100644
 --- a/ggml/include/ggml.h
 +++ b/ggml/include/ggml.h
-@@ -487,6 +487,7 @@ extern "C" {
+@@ -488,6 +488,7 @@ extern "C" {
         GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
         GGML_OP_PAD_REFLECT_1D,
@@ -26,7 +29,7 @@ index dd0c6a96..8d269a9c 100644
         GGML_OP_ARANGE,
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
-@@ -1743,6 +1744,15 @@ extern "C" {
+@@ -1757,6 +1758,15 @@ extern "C" {
             int                   p0,
             int                   p1);
 
@@ -43,13 +46,38 @@ index dd0c6a96..8d269a9c 100644
     // timesteps: [N,]
     // return: [N, dim]
 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 72325349..2f606d82 100644
+index 50400328..432942bf 100644
 --- a/ggml/src/ggml-cpu/ggml-cpu.c
 +++ b/ggml/src/ggml-cpu/ggml-cpu.c
-@@ -10844,6 +10844,59 @@ static void ggml_compute_forward_pad_reflect_1d(
+@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
+             {
+                 ggml_compute_forward_pad_reflect_1d(params, tensor);
+             } break;
+        case GGML_OP_UNPAD:
+            {
+                ggml_compute_forward_unpad(params, tensor);
+            } break;
+         case GGML_OP_ARANGE:
+             {
+                 ggml_compute_forward_arange(params, tensor);
+@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+         case GGML_OP_UPSCALE:
+         case GGML_OP_PAD:
+         case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_UNPAD:
+         case GGML_OP_ARANGE:
+         case GGML_OP_TIMESTEP_EMBEDDING:
+         case GGML_OP_ARGSORT:
+diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
+index 6050147b..66b8da68 100644
+--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
+@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
     }
 }
 
+// ggml_compute_forward_unpad
+
 +static void ggml_compute_forward_unpad_f32(
 +    const struct ggml_compute_params *params,
 +    struct ggml_tensor *dst) {
@@ -85,7 +113,7 @@ index 72325349..2f606d82 100644
 +    }
 +}
 +
-+static void ggml_compute_forward_unpad(
+void ggml_compute_forward_unpad(
 +    const struct ggml_compute_params * params,
 +    struct ggml_tensor * dst) {
 +
@@ -106,30 +134,23 @@ index 72325349..2f606d82 100644
 // ggml_compute_forward_arange
 
 static void ggml_compute_forward_arange_f32(
-@@ -13137,6 +13190,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
-             {
-                 ggml_compute_forward_pad_reflect_1d(params, tensor);
-             } break;
-+        case GGML_OP_UNPAD:
-+            {
-+                ggml_compute_forward_unpad(params, tensor);
-+            } break;
-         case GGML_OP_ARANGE:
-             {
-                 ggml_compute_forward_arange(params, tensor);
-@@ -13484,6 +13541,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
-         case GGML_OP_UPSCALE:
-         case GGML_OP_PAD:
-         case GGML_OP_PAD_REFLECT_1D:
-+        case GGML_OP_UNPAD:
-         case GGML_OP_ARANGE:
-         case GGML_OP_TIMESTEP_EMBEDDING:
-         case GGML_OP_ARGSORT:
+diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
+index 410a3720..3eca1cf8 100644
+--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
+@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
+ void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 1b0d074b..c7a957c8 100644
+index b70c6a32..67208cba 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2200,6 +2200,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2245,6 +2245,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_PAD:
             ggml_cuda_op_pad(ctx, dst);
             break;
@@ -139,16 +160,16 @@ index 1b0d074b..c7a957c8 100644
         case GGML_OP_ARANGE:
             ggml_cuda_op_arange(ctx, dst);
             break;
-@@ -3199,6 +3202,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
-             return ggml_is_contiguous(op->src[0]);
+@@ -3223,6 +3226,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_UPSCALE:
+             return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
         case GGML_OP_PAD:
 +        case GGML_OP_UNPAD:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
 diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
-index aba539e8..b4b87409 100644
+index 77432b04..7d45a7e1 100644
 --- a/ggml/src/ggml-cuda/pad.cu
 +++ b/ggml/src/ggml-cuda/pad.cu
 @@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -212,10 +233,10 @@ index 8fd386b0..e2ededc3 100644
 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index fd9a4e77..e4c093f9 100644
+index 310afe8a..b121ab9e 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -331,6 +331,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
+@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
     GGML_METAL_KERNEL_TYPE_PAD_F32,
     GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
@@ -223,23 +244,23 @@ index fd9a4e77..e4c093f9 100644
     GGML_METAL_KERNEL_TYPE_ARANGE_F32,
     GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
-@@ -946,6 +947,7 @@ @implementation GGMLMetalClass
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                   upscale_f32,                    true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                       pad_f32,                        true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,            pad_reflect_1d_f32,             true);
-+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32,                     unpad_f32,                      true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,        timestep_embedding_f32,         true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32,                    arange_f32,                     true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,           argsort_f32_i32_asc,            true);
-@@ -1254,6 +1256,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
-         case GGML_OP_UPSCALE:
+@@ -998,6 +999,7 @@ @implementation GGMLMetalClass
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                     upscale_f32,                     true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                         pad_f32,                         true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,              pad_reflect_1d_f32,              true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32,                       unpad_f32,                       true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,          timestep_embedding_f32,          true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32,                      arange_f32,                      true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,             argsort_f32_i32_asc,             true);
+@@ -1339,6 +1341,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
+         case GGML_OP_POOL_2D:
         case GGML_OP_PAD:
         case GGML_OP_PAD_REFLECT_1D:
 +        case GGML_OP_UNPAD:
-         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
-@@ -3469,6 +3472,36 @@ static void ggml_metal_encode_node(
+         case GGML_OP_LEAKY_RELU:
+@@ -3669,6 +3672,36 @@ static void ggml_metal_encode_node(
 
                 const int nth = MIN(1024, ne0);
 
@@ -277,10 +298,10 @@ index fd9a4e77..e4c093f9 100644
             } break;
         case GGML_OP_ARANGE:
 diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index d092a169..f38909d0 100644
+index b08666e2..e3185e5b 100644
 --- a/ggml/src/ggml-metal/ggml-metal.metal
 +++ b/ggml/src/ggml-metal/ggml-metal.metal
-@@ -2953,6 +2953,51 @@ kernel void kernel_pad_reflect_1d_f32(
+@@ -2968,6 +2968,51 @@ kernel void kernel_pad_reflect_1d_f32(
     }
 }
 
@@ -331,12 +352,12 @@ index d092a169..f38909d0 100644
 +
 kernel void kernel_arange_f32(
     device        char * dst,
-     constant   int64_t & ne0,
+     constant   ggml_metal_kargs_arange & args,
 diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
-index 7fc06724..635aa299 100644
+index 950772c7..2276b631 100644
 --- a/ggml/src/ggml.c
 +++ b/ggml/src/ggml.c
-@@ -962,6 +962,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "UPSCALE",
     "PAD",
     "PAD_REFLECT_1D",
@@ -344,16 +365,16 @@ index 7fc06724..635aa299 100644
     "ARANGE",
     "TIMESTEP_EMBEDDING",
     "ARGSORT",
-@@ -996,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
+-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
-@@ -1059,6 +1060,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "upscale(x)",
     "pad(x)",
     "pad_reflect_1d(x)",
@@ -361,16 +382,16 @@ index 7fc06724..635aa299 100644
     "arange(start, stop, step)",
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
-@@ -1093,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
+-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
-@@ -4225,6 +4227,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
+@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
     return result;
 }
 

--- a/llama/patches/0009-fix-deepseek-deseret-regex.patch
+++ b/llama/patches/0009-fix-deepseek-deseret-regex.patch
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Daniel Hiltgen <daniel@ollama.com>
-Date: Fri, 25 Oct 2024 16:25:18 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 19:43:06 -0700
 Subject: [PATCH] fix deepseek deseret regex

-On windows compiled with gcc the c++ regex library failed to handle
-the characters
+on some Windows systems, deepseek's regex would throw
+an error due to the Deseret characters in the matching
+regex
 ---
 src/llama-vocab.cpp |  2 +-
- src/unicode.cpp     | 22 ++++++++++++++++++++++
- 2 files changed, 23 insertions(+), 1 deletion(-)
+ src/unicode.cpp     | 21 +++++++++++++++++++++
+ 2 files changed, 22 insertions(+), 1 deletion(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index a4eee9b8..1ca827eb 100644
+index 0125ee53..d74919d2 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -295,7 +295,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
+@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                 regex_exprs = {
                     "[\r\n]",
@@ -24,7 +25,7 @@ index a4eee9b8..1ca827eb 100644
                     "\\s+$",
                     "[一-龥ࠀ-一가-퟿]+",
 diff --git a/src/unicode.cpp b/src/unicode.cpp
-index e63bb4ab..9dd53b9a 100644
+index e63bb4ab..73cb2b1a 100644
 --- a/src/unicode.cpp
 +++ b/src/unicode.cpp
 @@ -2,6 +2,11 @@
@@ -39,7 +40,7 @@ index e63bb4ab..9dd53b9a 100644
 #include "unicode.h"
 #include "unicode-data.h"
 
-@@ -200,6 +205,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
+@@ -200,6 +205,21 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
 }
 
 static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -58,11 +59,10 @@ index e63bb4ab..9dd53b9a 100644
 +    free(wbuf);
 +    return ret;
 +#else
-+
 #if defined(__clang__)
     // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
-@@ -213,6 +234,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+@@ -213,6 +233,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
 #endif
 
     return conv.from_bytes(s);

--- a/llama/patches/0010-Maintain-ordering-for-rules-for-grammar.patch
+++ b/llama/patches/0010-Maintain-ordering-for-rules-for-grammar.patch
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: ParthSareen <parth.sareen@ollama.com>
-Date: Wed, 11 Dec 2024 15:37:32 -0800
-Subject: [PATCH] Maintain ordering for rules for grammar
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 19:43:40 -0700
+Subject: [PATCH] maintain ordering for rules for grammar

 ---
 common/json-schema-to-grammar.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
-index 3ebcc3d9..30c28808 100644
+index 90679822..56043678 100644
 --- a/common/json-schema-to-grammar.cpp
 +++ b/common/json-schema-to-grammar.cpp
 @@ -346,7 +346,7 @@ private:

--- a/llama/patches/0011-ensure-KV-cache-is-fully-defragmented.patch
+++ b/llama/patches/0011-ensure-KV-cache-is-fully-defragmented.patch
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 15 Apr 2025 14:27:40 -0400
+Subject: [PATCH] ensure KV cache is fully defragmented
+
+Sometimes the KV cache requires defragmentation even without
+triggering the threshold heuristic. In this case, decoding
+will not be able to find a KV cache slot. This is particularly
+difficult for the caller to handle if it happens in between
+ubatches. To avoid this, we should immediately trigger a defrag.
+
+In addition, a heavily fragmented cache can require more than
+max_moves to defragment. Currently, we stop when we hit the limit
+but this can leave a cache that still does not have adequate space
+even after defragmentation is triggered. Instead, we should do
+multiple batches of processing until everything is complete.
+---
+ src/llama-context.cpp  | 105 +++++++++++++----------------------------
+ src/llama-context.h    |   4 +-
+ src/llama-kv-cache.cpp |  39 +++------------
+ src/llama-kv-cache.h   |   9 +++-
+ 4 files changed, 51 insertions(+), 106 deletions(-)
+
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index afe6f552..d6e7b3af 100644
+--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
+@@ -590,13 +590,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
+ 
+ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+         ggml_context * ctx0,
+-        ggml_cgraph * gf) const {
+        ggml_cgraph * gf,
+        const std::vector<struct llama_kv_defrag_move> & moves) const {
+     auto res = std::make_unique<llm_graph_result>();
+ 
+     const auto & hparams = model.hparams;
+ 
+-    const auto & ids = kv_self->defrag_info.ids;
+-
+ #if 0
+     // CPU defrag
+     //
+@@ -668,32 +667,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+         ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
+     }
+ #else
+-    for (uint32_t i = 0; i < ids.size(); ++i) {
+-        const uint32_t id = ids[i];
+-
+-        if (i == id || id == ids.size()) {
+-            continue;
+-        }
+-
+-        uint32_t nm = 1;
+-
+-        while (i + nm < ids.size() && ids[i + nm] == id + nm) {
+-            nm++;
+-        }
+-
+    for (const auto & move : moves) {
+         for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
+             const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+             const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+ 
+             ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
+-                    n_embd_k_gqa, nm,
+                    n_embd_k_gqa, move.len,
+                     ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
+-                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
+                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src));
+ 
+             ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
+-                    n_embd_k_gqa, nm,
+                    n_embd_k_gqa, move.len,
+                     ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
+-                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
+                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst));
+ 
+             ggml_tensor * view_v_src;
+             ggml_tensor * view_v_dst;
+@@ -701,34 +688,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+             if (cparams.flash_attn) {
+                 // NOTE: the V cache is not transposed when using flash attention
+                 view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
+-                        n_embd_v_gqa, nm,
+                        n_embd_v_gqa, move.len,
+                         ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
+-                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
+                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src));
+ 
+                 view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
+-                        n_embd_v_gqa, nm,
+                        n_embd_v_gqa, move.len,
+                         ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
+-                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
+                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst));
+             } else {
+                 view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
+-                        nm, n_embd_v_gqa,
+                        move.len, n_embd_v_gqa,
+                         ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
+-                        ggml_row_size(kv_self->v_l[il]->type, i));
+                        ggml_row_size(kv_self->v_l[il]->type, move.src));
+ 
+                 view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
+-                        nm, n_embd_v_gqa,
+                        move.len, n_embd_v_gqa,
+                         ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
+-                        ggml_row_size(kv_self->v_l[il]->type, id));
+                        ggml_row_size(kv_self->v_l[il]->type, move.dst));
+             }
+ 
+             ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+             ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+         }
+-
+-        i += nm - 1;
+     }
+-
+-    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+ #endif
+ 
+     return res;
+@@ -737,8 +720,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+ void llama_context::kv_self_update() {
+     auto & kv = kv_self;
+ 
+-    bool need_reserve = false;
+-
+     if (kv->has_shift) {
+         if (!kv->get_can_shift()) {
+             GGML_ABORT("The current context does not support K-shift");
+@@ -759,8 +740,6 @@ void llama_context::kv_self_update() {
+             res->set_inputs(nullptr);
+ 
+             graph_compute(gf, false);
+-
+-            need_reserve = true;
+         }
+ 
+         {
+@@ -775,49 +754,28 @@ void llama_context::kv_self_update() {
+     // defragment the KV cache if needed
+     if (kv->do_defrag) {
+         LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+        const uint32_t n_max_nodes = graph_max_nodes();
+        const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+        if (!kv->defrag_prepare(n_max_nodes)) {
+            LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+            return;
+        }
+ 
+-        if (kv->defrag_prepare(graph_max_nodes())) {
+-            ggml_backend_sched_reset(sched.get());
+        for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
+            std::vector<struct llama_kv_defrag_move> chunk;
+            auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
+            chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
+ 
+            ggml_backend_sched_reset(sched.get());
+             auto * gf = graph_init();
+-
+-            auto res = build_kv_self_defrag(ctx_compute.get(), gf);
+-
+            auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
+             ggml_backend_sched_alloc_graph(sched.get(), gf);
+-
+             res->set_inputs(nullptr);
+-
+             graph_compute(gf, false);
+-
+-            need_reserve = true;
+         }
+ 
+         kv->do_defrag = false;
+     }
+-
+-    // reserve a worst case graph if needed
+-    if (need_reserve) {
+-        LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
+-
+-        // build worst-case graph
+-        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+-        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+-
+-        // simulate full KV cache
+-        kv_self->n = kv_self->size;
+-
+-        llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+-        llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+-
+-        auto * gf = graph_init();
+-        graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
+-
+-        // initialize scheduler with the worst-case graph
+-        ggml_backend_sched_reset(sched.get());
+-        if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+-            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+-        }
+-    }
+ }
+ 
+ enum llama_pooling_type llama_context::pooling_type() const {
+@@ -1301,9 +1259,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+         // find KV slot
+         {
+             if (!kv_self->find_slot(ubatch)) {
+-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-                return 1;
+                kv_self->defrag();
+                kv_self_update();
+                if (!kv_self->find_slot(ubatch)) {
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                    return 1;
+                }
+             }
+ 
+             if (!kv_self->recurrent) {
+diff --git a/src/llama-context.h b/src/llama-context.h
+index baa03276..a59ff8fd 100644
+--- a/src/llama-context.h
+++ b/src/llama-context.h
+@@ -5,6 +5,7 @@
+ #include "llama-cparams.h"
+ #include "llama-graph.h"
+ #include "llama-adapter.h"
+#include "llama-kv-cache.h"
+ 
+ #include "ggml-cpp.h"
+ 
+@@ -180,7 +181,8 @@ private:
+ 
+     llm_graph_result_ptr build_kv_self_defrag(
+             ggml_context * ctx0,
+-            ggml_cgraph * gf) const;
+            ggml_cgraph * gf,
+            const std::vector<struct llama_kv_defrag_move> & moves) const;
+ 
+     // TODO: read/write lora adapters and cvec
+     size_t state_write_data(llama_io_write_i & io);
+diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
+index 9310f262..5c941e7c 100644
+--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
+@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+ 
+     assert(n_used <= n_kv);
+ 
+-    //const int64_t t_start = ggml_time_us();
+-
+-    // number of cells moved
+-    uint32_t n_moves = 0;
+-
+-    // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
+-    //   - source view, destination view, copy operation
+-    //   - x2 for keys and values
+-    //const uint32_t max_moves = max_nodes()/(6*n_layer);
+-    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
+-    const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
+    defrag_info.moves.clear();
+ 
+     // determine which KV cells to move where
+     //
+@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+     //
+     //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
+     //
+-    auto & ids = defrag_info.ids;
+-
+-    ids.clear();
+-    ids.resize(n_kv, n_kv);
+    std::vector<uint32_t> ids(n_kv, n_kv);
+ 
+     for (uint32_t i0 = 0; i0 < n_used; ++i0) {
+         const auto & cell0 = cells[i0];
+@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+         // are we moving a continuous block of memory?
+         bool cont = false;
+ 
+-        // should we stop searching for the next move?
+-        bool stop = false;
+-
+         // go back and move the nf cells to the hole
+         for (; i1 < n_kv; ++i1) {
+             auto & cell1 = cells[i1];
+ 
+             if (cell1.is_empty() || ids[i1] != n_kv) {
+-                if (n_moves == max_moves) {
+-                    stop = true;
+-                    break;
+-                }
+-
+                 cont = false;
+                 continue;
+             }
+@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+             head = n_used;
+ 
+             if (!cont) {
+-                n_moves++;
+                defrag_info.moves.push_back({i1, i0 + nf, 1});
+                 cont = true;
+            } else {
+                defrag_info.moves.back().len++;
+             }
+ 
+             nf++;
+@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+             }
+         }
+ 
+-        if (stop || n_moves == max_moves) {
+-            break;
+-        }
+-
+         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+ 
+         i0 += nh - 1;
+     }
+ 
+-    if (n_moves == 0) {
+    if (defrag_info.moves.size() == 0) {
+         return false;
+     }
+ 
+-    LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
+-
+-    LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
+    // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
+ 
+     return true;
+ }
+diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
+index 56c74035..25cbcb56 100644
+--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
+@@ -43,6 +43,13 @@ private:
+     llama_kv_cache * kv;
+ };
+ 
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+    uint32_t src;
+    uint32_t dst;
+    uint32_t len;
+};
+
+ struct llama_kv_cell {
+     llama_pos pos   = -1;
+     llama_pos delta =  0;
+@@ -131,7 +138,7 @@ public:
+     // defrag
+ 
+     struct {
+-        std::vector<uint32_t> ids;
+        std::vector<llama_kv_defrag_move> moves;
+     } defrag_info;
+ 
+     // return true if cells have been moved
--- a/llama/patches/0011-llama-Ensure-KV-cache-is-fully-defragmented.patch
+++ b/llama/patches/0011-llama-Ensure-KV-cache-is-fully-defragmented.patch
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jesse Gross <jesse@ollama.com>
-Date: Fri, 13 Dec 2024 16:11:59 -0800
-Subject: [PATCH] llama: Ensure KV cache is fully defragmented.
-
-Sometimes the KV cache requires defragmentation even without
-triggering the threshold heuristic. In this case, decoding
-will not being able to find a KV cache slot. This is particularly
-difficult for the caller to handle if it happens in between
-ubatches. To avoid this, we should immediately trigger a defrag.
-
-In addition, a heavily fragmented cache can require more than
-max_moves to defragment. Currently, we stop when we hit the limit
-but this can leave a cache that still does not have adequate space
-even after defragmentation is triggered. Instead, we should do
-multiple batches of processing until everything is complete.
---
- src/llama.cpp | 99 ++++++++++++++++++++++++---------------------------
- 1 file changed, 46 insertions(+), 53 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 8f7902df..01854fce 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -1054,6 +1054,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
-     return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
- }
- 
-+// block of KV slots to move when defragging
-+struct llama_kv_defrag_move {
-+    uint32_t src;
-+    uint32_t dst;
-+    uint32_t len;
-+};
-+
- struct llm_build_context {
-     const llama_model    & model;
-           llama_context  & lctx;
-@@ -1230,35 +1237,23 @@ struct llm_build_context {
-         return gf;
-     }
- 
-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
-+    struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
-         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
- 
-        for (uint32_t i = 0; i < ids.size(); ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == ids.size()) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
-+        for (const auto & move : moves) {
-             for (int il = 0; il < n_layer; ++il) {
-                 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-                 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
- 
-                 ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
-+                        n_embd_k_gqa, move.len,
-                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
-+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
- 
-                 ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
-+                        n_embd_k_gqa, move.len,
-                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
-+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
- 
-                 ggml_tensor * view_v_src;
-                 ggml_tensor * view_v_dst;
-@@ -1266,31 +1261,29 @@ struct llm_build_context {
-                 if (flash_attn) {
-                     // NOTE: the V cache is not transposed when using flash attention
-                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
-+                            n_embd_v_gqa, move.len,
-                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
-+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
- 
-                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
-+                            n_embd_v_gqa, move.len,
-                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
-+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
-                 } else {
-                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
-+                            move.len, n_embd_v_gqa,
-                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, i));
-+                            ggml_row_size(kv_self.v_l[il]->type, move.src));
- 
-                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
-+                            move.len, n_embd_v_gqa,
-                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, id));
-+                            ggml_row_size(kv_self.v_l[il]->type, move.dst));
-                 }
- 
-                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
-                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
-             }
-
-            i += nm - 1;
-         }
- 
-         //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
-@@ -8508,7 +8501,7 @@ struct llm_build_context {
-     }
- };
- 
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
-     llama_ubatch dummy = {};
-     dummy.equal_seqs = true;
- 
-@@ -8518,7 +8511,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
- 
-     llm.init();
- 
-    struct ggml_cgraph * result = llm.build_defrag(ids);
-+    struct ggml_cgraph * result = llm.build_defrag(moves);
- 
-     llm.free();
- 
-@@ -8956,7 +8949,12 @@ static int llama_prepare_ubatch(
-             kv_self.head = 0;
-         }
- 
-        const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
-+        auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
-+        if (!slot) {
-+            llama_kv_cache_defrag(kv_self);
-+            llama_kv_cache_update(&lctx);
-+            slot = llama_kv_cache_find_slot(kv_self, ubatch);
-+        }
-         if (!slot) {
-             return 1;
-         }
-@@ -9431,8 +9429,8 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
- 
-     //const int64_t t_start = ggml_time_us();
- 
-    // number of cells moved
-    uint32_t n_moves = 0;
-+    // groups of cells moved
-+    std::vector<struct llama_kv_defrag_move> moves;
- 
-     // each move requires 6*n_layer tensors (see build_defrag)
-     //   - source view, destination view, copy operation
-@@ -9496,19 +9494,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
-         // are we moving a continuous block of memory?
-         bool cont = false;
- 
-        // should we stop searching for the next move?
-        bool stop = false;
-
-         // go back and move the nf cells to the hole
-         for (; i1 < n_kv; ++i1) {
-             auto & cell1 = kv_self.cells[i1];
- 
-             if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
-                 cont = false;
-                 continue;
-             }
-@@ -9524,8 +9514,10 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
-             kv_self.head = n_used;
- 
-             if (!cont) {
-                n_moves++;
-+                moves.push_back({i1, i0 + nf, 1});
-                 cont = true;
-+            } else {
-+                moves.back().len++;
-             }
- 
-             nf++;
-@@ -9535,22 +9527,16 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
-             }
-         }
- 
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
-         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
- 
-         i0 += nh - 1;
-     }
- 
-    if (n_moves == 0) {
-+    if (moves.size() == 0) {
-         return;
-     }
- 
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
-+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n",  moves.size());
- 
- #if 0
-     // CPU defrag
-@@ -9625,11 +9611,18 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
- #else
-     // ggml_graph defrag
- 
-    ggml_backend_sched_reset(lctx.sched.get());
-+    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
-+        std::vector<struct llama_kv_defrag_move> chunk;
-+        auto end = std::min(i + max_moves, moves.size());
-+        chunk.assign(moves.begin() + i, moves.begin() + end);
- 
-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
-+        ggml_backend_sched_reset(lctx.sched.get());
-+
-+        //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
-+        ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
- 
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
-+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
-+    }
- #endif
- 
-     //const int64_t t_end = ggml_time_us();