Unverified commit 7a81daf0, authored by Jeffrey Morgan, committed by GitHub

llama: update vendor code to commit ba1cb19c (#8101)

parent 60f75560
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -444,6 +444,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
case LLAMA_VOCAB_PRE_TYPE_EXAONE:
case LLAMA_VOCAB_PRE_TYPE_MINERVA:
regex_exprs = {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -190,6 +190,7 @@ enum llm_arch {
LLM_ARCH_QWEN,
LLM_ARCH_QWEN2,
LLM_ARCH_QWEN2MOE,
LLM_ARCH_QWEN2VL,
LLM_ARCH_PHI2,
LLM_ARCH_PHI3,
LLM_ARCH_PLAMO,
......@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PLAMO, "plamo" },
......@@ -340,6 +342,7 @@ enum llm_kv {
LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
 
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
......@@ -458,6 +461,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
 
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
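The new `%s.rope.dimension_sections` key carries the per-axis split that multi-axis RoPE (m-rope, used by the Qwen2-VL support added below) applies to the rotary dimensions. A standalone sketch of the partitioning idea follows; the section values and the role names are illustrative assumptions, and the authoritative interpretation is whatever ggml_rope_multi implements.

// Standalone sketch (not llama.cpp code): map a rotary dimension index to the m-rope
// section it falls into, given cumulative section boundaries. The values {16, 24, 24, 0}
// and the role names are illustrative assumptions only.
#include <cstdio>
#include <initializer_list>

static int section_of(int idx, const int sections[4]) {
    int end = 0;
    for (int s = 0; s < 4; ++s) {
        end += sections[s];
        if (idx < end) {
            return s;  // 0 = temporal, 1 = height, 2 = width, 3 = extra (assumed roles)
        }
    }
    return 3;
}

int main() {
    const int sections[4] = {16, 24, 24, 0};
    for (int idx : {0, 15, 16, 39, 40, 63}) {
        printf("rotary dim %2d -> section %d\n", idx, section_of(idx, sections));
    }
    return 0;
}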
......@@ -975,6 +979,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_QWEN2VL,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_QWEN2MOE,
{
......@@ -1113,6 +1134,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
......@@ -1778,9 +1801,10 @@ struct LLM_TN {
//
 
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
{ LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};
 
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
......@@ -1886,7 +1910,7 @@ private:
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
if (!bufLen) {
ret = format("Win32 error code: %s", error_code);
ret = format("Win32 error code: %lx", error_code);
} else {
ret = lpMsgBuf;
LocalFree(lpMsgBuf);
......@@ -2224,7 +2248,7 @@ struct llama_mmap {
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
 
// may fail on pre-Windows 8 systems
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
 
if (pPrefetchVirtualMemory) {
// advise the kernel to preload the mapped memory
......@@ -2571,11 +2595,12 @@ struct llama_hparams {
uint32_t time_decay_extra_dim = 0;
uint32_t wkv_head_size = 0;
 
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul;
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul;
int rope_sections[4];
 
// for State Space Models
uint32_t ssm_d_conv = 0;
......@@ -2634,6 +2659,9 @@ struct llama_hparams {
 
if (this->rope_finetuned != other.rope_finetuned) return true;
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
if (!std::equal(std::begin(this->rope_sections),
std::end(this->rope_sections),
std::begin(other.rope_sections))) return true;
 
if (this->ssm_d_conv != other.ssm_d_conv) return true;
if (this->ssm_d_inner != other.ssm_d_inner) return true;
......@@ -3504,6 +3532,11 @@ struct llama_context {
// whether we are computing encoder output or decoder output
bool is_encoding = false;
 
// TODO: find a better way to accommodate multi-dimensional position encoding methods
// number of position ids each token gets, 1 per token in most cases.
// when using m-rope, each token gets 3 position ids representing a 3-dimensional coordinate.
int n_pos_per_token = 1;
// output of the encoder part of the encoder-decoder models
std::vector<float> embd_enc;
std::vector<std::set<llama_seq_id>> seq_ids_enc;
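The new n_pos_per_token field is what later scales the inp_pos tensor and its upload size in llama_set_inputs (further down in this diff). The sketch below shows the kind of buffer this implies; the planar layout (all first coordinates, then all second, and so on) and the idea that the fourth slot is unused padding are illustration-only assumptions, not claims about the vendored code.

// Standalone sketch (not llama.cpp code): a position buffer when each token carries
// n_pos_per_token position ids (4 here, matching the inp_pos allocation in build_qwen2vl below).
// Layout assumption: planar, i.e. pos[k*n_tokens + i] is coordinate k of token i.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t n_tokens        = 6;
    const int     n_pos_per_token = 4;

    std::vector<int32_t> pos(n_tokens * n_pos_per_token, 0);
    for (int64_t i = 0; i < n_tokens; ++i) {
        pos[0*n_tokens + i] = (int32_t) i;  // temporal / sequence position
        pos[1*n_tokens + i] = (int32_t) i;  // height coordinate (placeholder values)
        pos[2*n_tokens + i] = (int32_t) i;  // width coordinate  (placeholder values)
        // pos[3*n_tokens + i] stays 0      // assumed unused padding
    }

    // bytes a caller would upload for this buffer (cf. the llama_set_inputs hunk below)
    const size_t nbytes = (size_t) n_tokens * n_pos_per_token * sizeof(int32_t);
    printf("position buffer: %zu bytes\n", nbytes);
    return 0;
}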
......@@ -4739,9 +4772,6 @@ struct llama_model_loader {
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
......@@ -5505,9 +5535,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
 
default: return "unknown, may not work";
}
......@@ -5756,8 +5783,12 @@ static void llm_load_hparams(
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
 
switch (hparams.n_layer) {
case 52: model.type = e_model::MODEL_1B; break;
case 40: model.type = e_model::MODEL_2B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
......@@ -5922,6 +5953,13 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN2VL:
{
std::array<int, 4> section_dims;
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
}
// fall through
case LLM_ARCH_QWEN2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
......@@ -6654,6 +6692,9 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
......@@ -7248,7 +7289,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
 
if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
......@@ -7882,7 +7923,13 @@ static bool llm_load_tensors(
 
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
else {
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
 
if (n_expert == 0) {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
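With LLAMA_ROPE_SCALING_TYPE_LONGROPE the model provides two factor vectors of length n_rot/2 instead of a single rope_freqs tensor. Which one applies is typically decided by comparing the runtime context size with the model's original training context; the function below is a hedged standalone sketch of that rule, not the exact logic the graph-building code uses.

// Standalone sketch (not llama.cpp code): choose between long and short RoPE factors.
// Assumption: the long factors apply once the configured context exceeds the original
// training context, otherwise the short factors apply.
#include <cstdint>
#include <vector>

static const std::vector<float> & pick_rope_factors(
        uint32_t n_ctx,                            // runtime context size
        uint32_t n_ctx_orig,                       // original training context
        const std::vector<float> & factors_long,   // length n_rot/2
        const std::vector<float> & factors_short)  // length n_rot/2
{
    return n_ctx > n_ctx_orig ? factors_long : factors_short;
}

int main() {
    const std::vector<float> flong (64, 1.25f);
    const std::vector<float> fshort(64, 1.00f);
    const std::vector<float> & f = pick_rope_factors(131072, 4096, flong, fshort);
    return f.size() == 64 ? 0 : 1;
}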
......@@ -8396,6 +8443,7 @@ static bool llm_load_tensors(
}
} break;
case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2VL:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
......@@ -13064,6 +13112,124 @@ struct llm_build_context {
return gf;
}
 
struct ggml_cgraph * build_qwen2vl() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
cb(lctx.inp_pos, "inp_pos", -1);
ggml_set_input(lctx.inp_pos);
struct ggml_tensor * inp_pos = lctx.inp_pos;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
Qcur = ggml_rope_multi(
ctx0,
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_multi(
ctx0,
ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_qwen2moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
......@@ -14015,153 +14181,6 @@ struct llm_build_context {
return gf;
}
 
// ref: https://arxiv.org/abs/2203.03466
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
// based on the original build_llama() function
struct ggml_cgraph * build_minicpm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
const int64_t n_embd = hparams.n_embd;
//TODO: if the model varies, these parameters need to be read from the model
const int64_t n_embd_base = 256;
const float scale_embd = 12.0f;
const float scale_depth = 1.4f;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// scale the input embeddings
inpL = ggml_scale(ctx0, inpL, scale_embd);
cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// scale_res - scale the hidden states for residual connection
const float scale_res = scale_depth/sqrtf(float(n_layer));
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled", -1);
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
{
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
// scale the hidden states for residual connection
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled_ffn", -1);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head scaling
const float scale_lmhead = float(n_embd_base)/float(n_embd);
cur = ggml_scale(ctx0, cur, scale_lmhead);
cb(cur, "lmhead_scaling", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_minicpm3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
......@@ -17412,6 +17431,7 @@ static struct ggml_cgraph * llama_build_graph(
 
switch (model.arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_MINICPM:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
{
......@@ -17467,6 +17487,11 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_qwen2();
} break;
case LLM_ARCH_QWEN2VL:
{
lctx.n_pos_per_token = 4;
result = llm.build_qwen2vl();
} break;
case LLM_ARCH_QWEN2MOE:
{
result = llm.build_qwen2moe();
......@@ -17499,10 +17524,6 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_internlm2();
} break;
case LLM_ARCH_MINICPM:
{
result = llm.build_minicpm();
} break;
case LLM_ARCH_MINICPM3:
{
result = llm.build_minicpm3();
......@@ -17702,8 +17723,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
 
if (ubatch.pos && lctx.inp_pos) {
const int64_t n_tokens = ubatch.n_tokens;
ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
auto n_pos = lctx.n_pos_per_token;
ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
}
 
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
......@@ -19191,10 +19212,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S;
}
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
new_type == GGML_TYPE_Q4_0_8_8) {
new_type = GGML_TYPE_Q4_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
new_type = GGML_TYPE_Q4_K;
}
......@@ -19517,9 +19534,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
 
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
......@@ -19860,14 +19874,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data = (float *) f32_conv_buf.data();
}
 
int chunk_size_multiplier = 1;
if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
}
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout);
 
......@@ -19880,8 +19886,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const int64_t nrows = tensor->ne[1];
 
static const int64_t min_chunk_size = 32 * 512;
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
chunk_size_multiplier;
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
 
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
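With the Q4_0_4_4/4_8/8_8 types removed, the quantization chunk size no longer carries a per-type multiplier; it is just the row length, rounded up so each chunk covers whole rows and at least 32*512 elements. A quick standalone check of the formula with illustrative row lengths:

// Standalone sketch (not llama.cpp code): the chunk-size formula from the hunk above.
#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
    static const int64_t min_chunk_size = 32 * 512;  // 16384 elements

    for (int64_t n_per_row : {4096, 11008, 32768}) {
        const int64_t chunk_size = n_per_row >= min_chunk_size
            ? n_per_row
            : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
        // 4096  -> 16384 (4 rows per chunk)
        // 11008 -> 22016 (2 rows per chunk)
        // 32768 -> 32768 (a single row already meets the minimum)
        printf("n_per_row = %6lld -> chunk_size = %lld\n",
               (long long) n_per_row, (long long) chunk_size);
    }
    return 0;
}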
......@@ -20859,6 +20864,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_MINICPM3:
return LLAMA_ROPE_TYPE_NEOX;
 
case LLM_ARCH_QWEN2VL:
return LLAMA_ROPE_TYPE_MROPE;
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");
......@@ -22434,7 +22442,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
}
} else if ((size_t) i >= ctx->output_ids.size()) {
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
} else {
j = ctx->output_ids[i];
}
......
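Two of the hunks above are pure format-string corrections: the Win32 error path now prints the numeric error code with %lx instead of %s, and the out-of-range message uses %zu, the portable conversion for size_t (with %lu the call would be wrong on LLP64 Windows, where unsigned long is 32-bit). A minimal illustration of matching the conversions to the argument types:

// Standalone sketch: printf conversions matching the argument types used above.
#include <cstdio>
#include <vector>

int main() {
    unsigned long error_code = 0x57;   // stand-in for a Win32 DWORD error code
    std::vector<int> output_ids(128);

    printf("Win32 error code: %lx\n", error_code);          // unsigned long, hexadecimal
    printf("out of range [0, %zu)\n", output_ids.size());   // size_t needs %zu
    return 0;
}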
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -130,12 +130,15 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
};
enum llama_rope_type {
LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
};
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
......@@ -197,9 +200,9 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
//LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
//LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
//LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
......@@ -211,7 +214,8 @@ extern "C" {
LLAMA_ROPE_SCALING_TYPE_NONE = 0,
LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
LLAMA_ROPE_SCALING_TYPE_YARN = 2,
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3,
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
};
enum llama_pooling_type {
......@@ -485,6 +489,7 @@ extern "C" {
// Functions to access the model's GGUF metadata scalar values
// - The functions return the length of the string on success, or -1 on failure
// - The output string is always null-terminated and cleared on failure
// - When retrieving a string, an extra byte must be allocated to account for the null terminator
// - GGUF array values are not supported by these functions
// Get metadata value as a string by key name
......
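The added comment documents that callers of the metadata accessors must reserve one extra byte for the terminating NUL. A hedged usage sketch, assuming the llama_model_meta_val_str signature from llama.h (model, key, output buffer, buffer size) and an illustrative key name:

// Hedged usage sketch: read one GGUF metadata string. The key name and buffer size are
// illustrative; the function returns the string length on success, or -1 on failure.
#include <cstdint>
#include <cstdio>
#include "llama.h"

static void print_general_name(const struct llama_model * model) {
    char buf[256];  // sized to hold the value plus the terminating NUL, per the comment above
    const int32_t n = llama_model_meta_val_str(model, "general.name", buf, sizeof(buf));
    if (n >= 0) {
        printf("general.name = %s (length %d)\n", buf, n);
    }
}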
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -285,25 +285,33 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
if (clip_is_minicpmv(ctx_clip)) {
if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init();
for (size_t i = 0; i < img_res_v.size; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
int patch_size=14;
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
if (clip_is_qwen2vl(ctx_clip)) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
else {
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
}
if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
......@@ -316,8 +324,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
n_img_pos_out += clip_n_patches(ctx_clip);
std::memcpy(
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
......@@ -413,7 +424,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
if (clip_is_minicpmv(ctx_clip)) {
num_max_patches = 10;
}
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
float * image_embd;
if (clip_is_qwen2vl(ctx_clip)) {
// qwen2vl doesn't split the image into chunks, so `num_max_patches` is not needed.
image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
} else {
image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
}
if (!image_embd) {
LOG_ERR("Unable to allocate memory for image embeddings\n");
return false;
......
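For the Qwen2-VL path the image embedding buffer is sized from the actual image dimensions via clip_embd_nbytes_by_img rather than a fixed per-image size times num_max_patches. The sketch below only illustrates the kind of arithmetic involved; the patch size, the 2x2 spatial merge and the projector width are assumed values, not what clip_embd_nbytes_by_img necessarily computes.

// Standalone sketch (not clip.cpp code): estimate an image-embedding buffer size from the
// image dimensions. patch_size, the 2x2 merge and n_mmproj_embd are illustrative assumptions.
#include <cstddef>
#include <cstdio>

static size_t estimate_embd_nbytes(int nx, int ny) {
    const int patch_size    = 14;    // assumed ViT patch size
    const int merge         = 2;     // assumed 2x2 spatial merge before the projector
    const int n_mmproj_embd = 3584;  // assumed projector output width

    const int n_patches = (nx / (patch_size * merge)) * (ny / (patch_size * merge));
    return (size_t) n_patches * n_mmproj_embd * sizeof(float);
}

int main() {
    printf("approx. %zu bytes for a 1036x1036 image\n", estimate_embd_nbytes(1036, 1036));
    return 0;
}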
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -44,10 +44,6 @@
#include <unistd.h>
#endif
#if defined(_OPENMP)
#include <omp.h>
#endif
#if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict
#else
......@@ -1408,13 +1404,13 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size
template<typename TB, int BLOCK_K>
void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) {
void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) {
const int NB = N / TILE_N;
const int KB = K / BLOCK_K;
const int TILE_SIZE = get_tile_size<TB>();
// parallel on NB should be enough
parallel_for(n_threads, NB, [&](int begin, int end) {
parallel_for(NB, [&](int begin, int end) {
for (int n = begin; n < end; ++n) {
for (int k = 0; k < KB; ++k) {
int n0 = n * TILE_N;
......@@ -2360,15 +2356,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
const int K = tensor->ne[0]; // ne0: in_features
const int N = tensor->ne[1]; // ne1: out_features
#if defined(_OPENMP)
// the buffer ctx is not initialized when .set_tensor is called
int n_threads = omp_get_num_threads();
#else
int n_threads = 1;
#endif
GGML_DISPATCH_QTYPES(TYPE, [&] {
convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K, n_threads);
convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K);
});
}
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -27,16 +27,10 @@
#pragma once
#include "common.h"
#ifdef __cplusplus
extern "C" {
#endif
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus
}
#endif
......@@ -26,7 +26,7 @@ index fdb4b986..9b80fe07 100644
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d6e4bfdd..52aec229 100644
index c180adc8..000f1777 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context {
......
......@@ -8,10 +8,10 @@ Subject: [PATCH] pretokenizer
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 6a6f4c2a..fa09f3b3 100644
index abc1252e..626c3e3f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6362,16 +6362,7 @@ static void llm_load_vocab(
@@ -6400,16 +6400,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
......@@ -29,9 +29,9 @@ index 6a6f4c2a..fa09f3b3 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6473,7 +6464,8 @@ static void llm_load_vocab(
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
@@ -6514,7 +6505,8 @@ static void llm_load_vocab(
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
......
......@@ -8,10 +8,10 @@ Subject: [PATCH] embeddings
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index fa09f3b3..d1791af0 100644
index 626c3e3f..9e292c4f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17398,7 +17398,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
@@ -17419,7 +17419,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
......@@ -20,7 +20,7 @@ index fa09f3b3..d1791af0 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -17693,7 +17693,6 @@ static int llama_decode_internal(
@@ -17714,7 +17714,6 @@ static int llama_decode_internal(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
......@@ -28,7 +28,7 @@ index fa09f3b3..d1791af0 100644
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
@@ -17701,11 +17700,15 @@ static int llama_decode_internal(
@@ -17722,11 +17721,15 @@ static int llama_decode_internal(
break;
}
}
......
......@@ -8,7 +8,7 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index d7c94352..427d5e02 100644
index ba28c07c..46998e4c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -56,6 +56,19 @@
......@@ -31,7 +31,7 @@ index d7c94352..427d5e02 100644
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
@@ -1242,8 +1255,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1322,8 +1335,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
......@@ -62,7 +62,7 @@ index d7c94352..427d5e02 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
@@ -1283,7 +1317,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1363,7 +1397,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
......
......@@ -15,10 +15,10 @@ in general, the values are (bskcn_tv, 1 - bskcn_tv)
1 file changed, 253 insertions(+), 14 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index d1791af0..b01770d0 100644
index 9e292c4f..26be6254 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -195,6 +195,7 @@ enum llm_arch {
@@ -196,6 +196,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
......@@ -26,7 +26,7 @@ index d1791af0..b01770d0 100644
LLM_ARCH_UNKNOWN,
};
@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -251,6 +252,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
......@@ -34,15 +34,15 @@ index d1791af0..b01770d0 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -306,6 +308,7 @@ enum llm_kv {
@@ -308,6 +310,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -408,20 +411,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -411,20 +414,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
......@@ -77,8 +77,8 @@ index d1791af0..b01770d0 100644
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -603,6 +607,7 @@ enum llm_tensor {
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -607,6 +611,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
......@@ -86,7 +86,7 @@ index d1791af0..b01770d0 100644
};
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -1541,6 +1546,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1564,6 +1569,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
......@@ -111,7 +111,7 @@ index d1791af0..b01770d0 100644
{
LLM_ARCH_UNKNOWN,
{
@@ -2401,6 +2424,7 @@ enum e_model {
@@ -2425,6 +2448,7 @@ enum e_model {
MODEL_15B,
MODEL_16B,
MODEL_20B,
......@@ -119,7 +119,7 @@ index d1791af0..b01770d0 100644
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -2451,6 +2475,8 @@ struct llama_hparams {
@@ -2475,6 +2499,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
......@@ -128,7 +128,7 @@ index d1791af0..b01770d0 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -2521,6 +2547,7 @@ struct llama_hparams {
@@ -2546,6 +2572,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
......@@ -136,7 +136,7 @@ index d1791af0..b01770d0 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2630,6 +2657,14 @@ struct llama_hparams {
@@ -2658,6 +2685,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
......@@ -151,7 +151,7 @@ index d1791af0..b01770d0 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2816,6 +2851,8 @@ struct llama_layer {
@@ -2844,6 +2879,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
......@@ -160,7 +160,7 @@ index d1791af0..b01770d0 100644
};
// very similar to llama_batch,
@@ -6209,6 +6246,21 @@ static void llm_load_hparams(
@@ -6247,6 +6284,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
......@@ -182,7 +182,7 @@ index d1791af0..b01770d0 100644
default: (void)0;
}
@@ -7198,6 +7250,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
@@ -7239,6 +7291,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
......@@ -190,7 +190,7 @@ index d1791af0..b01770d0 100644
};
// checks if the weight tensor can be used with the specified buffer type and device
@@ -9205,6 +9258,35 @@ static bool llm_load_tensors(
@@ -9253,6 +9306,35 @@ static bool llm_load_tensors(
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
......@@ -226,7 +226,7 @@ index d1791af0..b01770d0 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -16652,6 +16734,158 @@ struct llm_build_context {
@@ -16671,6 +16753,158 @@ struct llm_build_context {
return gf;
}
......@@ -385,7 +385,7 @@ index d1791af0..b01770d0 100644
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16921,6 +17155,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -16942,6 +17176,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
......@@ -396,7 +396,7 @@ index d1791af0..b01770d0 100644
default:
GGML_ABORT("fatal error");
}
@@ -20132,6 +20370,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20137,6 +20375,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
......
......@@ -8,7 +8,7 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 52aec229..cbf4fddf 100644
index 000f1777..8fd7c1a3 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
......
......@@ -18,10 +18,10 @@ remaining is to implement the cross attention mask
3 files changed, 467 insertions(+), 20 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 4ca53a0b..d56644a8 100644
index 16f30c56..0f0f3f62 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -412,7 +412,7 @@ struct llava_embd_batch {
@@ -429,7 +429,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
......@@ -30,7 +30,7 @@ index 4ca53a0b..d56644a8 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -424,6 +424,7 @@ struct llava_embd_batch {
@@ -441,6 +441,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
......@@ -38,7 +38,7 @@ index 4ca53a0b..d56644a8 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -447,7 +448,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
@@ -464,7 +465,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
......@@ -48,10 +48,10 @@ index 4ca53a0b..d56644a8 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/include/llama.h b/include/llama.h
index e85f459f..aba85f86 100644
index c67988a3..0f266283 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -245,6 +245,7 @@ extern "C" {
@@ -249,6 +249,7 @@ extern "C" {
llama_token * token;
float * embd;
......@@ -59,7 +59,7 @@ index e85f459f..aba85f86 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -419,6 +420,10 @@ extern "C" {
@@ -423,6 +424,10 @@ extern "C" {
struct llama_model * model,
struct llama_context_params params);
......@@ -71,7 +71,7 @@ index e85f459f..aba85f86 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama.cpp b/src/llama.cpp
index b01770d0..46881642 100644
index 26be6254..4778a9ed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
......@@ -82,7 +82,7 @@ index b01770d0..46881642 100644
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK,
@@ -201,6 +202,7 @@ enum llm_arch {
@@ -202,6 +203,7 @@ enum llm_arch {
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
......@@ -90,23 +90,23 @@ index b01770d0..46881642 100644
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
@@ -309,6 +311,7 @@ enum llm_kv {
@@ -311,6 +313,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -426,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -429,6 +432,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -608,6 +612,14 @@ enum llm_tensor {
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -612,6 +616,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
......@@ -121,7 +121,7 @@ index b01770d0..46881642 100644
};
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -637,6 +649,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -641,6 +653,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
......@@ -162,7 +162,7 @@ index b01770d0..46881642 100644
{
LLM_ARCH_BAICHUAN,
{
@@ -2432,6 +2478,7 @@ enum e_model {
@@ -2456,6 +2502,7 @@ enum e_model {
MODEL_40B,
MODEL_65B,
MODEL_70B,
......@@ -170,7 +170,7 @@ index b01770d0..46881642 100644
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
@@ -2476,6 +2523,7 @@ struct llama_hparams {
@@ -2500,6 +2547,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
......@@ -178,7 +178,7 @@ index b01770d0..46881642 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -2544,10 +2592,11 @@ struct llama_hparams {
@@ -2569,10 +2617,11 @@ struct llama_hparams {
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
......@@ -194,7 +194,7 @@ index b01770d0..46881642 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2665,6 +2714,10 @@ struct llama_hparams {
@@ -2693,6 +2742,10 @@ struct llama_hparams {
GGML_ABORT("fatal error");
}
......@@ -205,7 +205,7 @@ index b01770d0..46881642 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2694,6 +2747,9 @@ struct llama_cparams {
@@ -2722,6 +2775,9 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
......@@ -215,7 +215,7 @@ index b01770d0..46881642 100644
enum llama_pooling_type pooling_type;
@@ -2853,6 +2909,16 @@ struct llama_layer {
@@ -2881,6 +2937,16 @@ struct llama_layer {
struct ggml_tensor * ffn_down_scale;
struct ggml_tensor * bskcn_tv;
......@@ -232,7 +232,7 @@ index b01770d0..46881642 100644
};
// very similar to llama_batch,
@@ -3439,6 +3505,8 @@ struct llama_context {
@@ -3472,6 +3538,8 @@ struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
......@@ -241,7 +241,7 @@ index b01770d0..46881642 100644
};
struct llama_lora_weight {
@@ -3577,6 +3645,39 @@ static bool llama_kv_cache_init(
@@ -3610,6 +3678,39 @@ static bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);
for (int i = 0; i < (int) n_layer; i++) {
......@@ -281,7 +281,7 @@ index b01770d0..46881642 100644
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -5520,12 +5621,14 @@ static void llm_load_hparams(
@@ -5547,12 +5648,14 @@ static void llm_load_hparams(
}
// zero-out the per-layer hparams
......@@ -301,7 +301,7 @@ index b01770d0..46881642 100644
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5574,7 +5677,7 @@ static void llm_load_hparams(
@@ -5601,7 +5704,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
......@@ -310,7 +310,7 @@ index b01770d0..46881642 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -5614,6 +5717,16 @@ static void llm_load_hparams(
@@ -5641,6 +5744,16 @@ static void llm_load_hparams(
}
}
} break;
......@@ -327,7 +327,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7250,7 +7363,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
@@ -7291,7 +7404,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
......@@ -344,7 +344,7 @@ index b01770d0..46881642 100644
};
// checks if the weight tensor can be used with the specified buffer type and device
@@ -7754,6 +7875,53 @@ static bool llm_load_tensors(
@@ -7801,6 +7922,53 @@ static bool llm_load_tensors(
}
}
} break;
......@@ -398,7 +398,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM3:
{
const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9463,7 +9631,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
@@ -9511,7 +9679,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
......@@ -407,7 +407,7 @@ index b01770d0..46881642 100644
}
if (params.vocab_only) {
@@ -9546,6 +9714,21 @@ static struct ggml_tensor * llm_build_inp_embd(
@@ -9594,6 +9762,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
......@@ -429,7 +429,7 @@ index b01770d0..46881642 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -10513,6 +10696,7 @@ struct llm_build_context {
@@ -10561,6 +10744,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
......@@ -437,7 +437,7 @@ index b01770d0..46881642 100644
}
void free() {
@@ -10992,6 +11176,240 @@ struct llm_build_context {
@@ -11040,6 +11224,240 @@ struct llm_build_context {
return gf;
}
......@@ -678,7 +678,7 @@ index b01770d0..46881642 100644
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -16973,6 +17391,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -16993,6 +17411,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
......@@ -689,7 +689,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN:
{
result = llm.build_baichuan();
@@ -17237,10 +17659,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
@@ -17258,10 +17680,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
}
if (ubatch.embd) {
......@@ -712,7 +712,7 @@ index b01770d0..46881642 100644
}
if (ubatch.pos && lctx.inp_pos) {
@@ -17841,7 +18272,7 @@ static int llama_decode_internal(
@@ -17862,7 +18293,7 @@ static int llama_decode_internal(
n_outputs = 1;
}
......@@ -721,7 +721,7 @@ index b01770d0..46881642 100644
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);
@@ -18151,7 +18582,7 @@ static int llama_encode_internal(
@@ -18172,7 +18603,7 @@ static int llama_encode_internal(
const int64_t n_embd = hparams.n_embd;
......@@ -730,7 +730,7 @@ index b01770d0..46881642 100644
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
@@ -19189,7 +19620,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
@@ -19203,7 +19634,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
......@@ -741,7 +741,7 @@ index b01770d0..46881642 100644
}
size_t total_size_org = 0;
@@ -20355,6 +20788,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20360,6 +20793,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
......@@ -749,7 +749,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
@@ -21782,6 +22216,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -21790,6 +22224,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}
......@@ -760,7 +760,7 @@ index b01770d0..46881642 100644
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens) {
@@ -21789,6 +22227,7 @@ struct llama_batch llama_batch_get_one(
@@ -21797,6 +22235,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
......@@ -768,7 +768,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21801,6 +22240,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21809,6 +22248,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
......@@ -776,7 +776,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21809,6 +22249,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21817,6 +22257,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
......
......@@ -5,30 +5,30 @@ Subject: [PATCH] add unpad operator
---
ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 57 ++++++++++++++++++++++++++++
ggml/src/ggml-cpu/ggml-cpu.c | 58 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 ++++++++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++
ggml/src/ggml.c | 25 +++++++++++-
8 files changed, 219 insertions(+), 2 deletions(-)
8 files changed, 220 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 65cb92c4..acbcccc6 100644
index b0c1ac9c..091e6e6b 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
+ GGML_OP_UNPAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1695,6 +1696,15 @@ extern "C" {
int p2,
int p3);
@@ -1718,6 +1719,15 @@ extern "C" {
int p0,
int p1);
+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
+ GGML_API struct ggml_tensor * ggml_unpad(
......@@ -43,10 +43,10 @@ index 65cb92c4..acbcccc6 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 23ae2e10..111ff3b0 100644
index 67e67a08..bebff207 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10439,6 +10439,58 @@ static void ggml_compute_forward_pad(
@@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d(
}
}
......@@ -102,12 +102,13 @@ index 23ae2e10..111ff3b0 100644
+ }
+ }
+}
+
// ggml_compute_forward_arange
@@ -12535,6 +12587,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
static void ggml_compute_forward_arange_f32(
@@ -12690,6 +12743,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
......@@ -116,16 +117,16 @@ index 23ae2e10..111ff3b0 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -12877,6 +12933,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
@@ -13033,6 +13090,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cbf4fddf..9ca6cb77 100644
index 8fd7c1a3..7c351b89 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
......@@ -210,34 +211,34 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 093ae900..cb9a1307 100644
index 28f590f9..787fc713 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
@@ -311,6 +311,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -877,6 +878,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
@@ -910,6 +911,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
@@ -1145,6 +1147,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -3258,6 +3261,36 @@ static void ggml_metal_encode_node(
@@ -3348,6 +3351,36 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
......@@ -275,10 +276,10 @@ index 093ae900..cb9a1307 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 5caa0846..47038c31 100644
index 8ba43904..204c93e6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2897,6 +2897,51 @@ kernel void kernel_pad_f32(
@@ -2944,6 +2944,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}
......@@ -331,44 +332,44 @@ index 5caa0846..47038c31 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1a9a7efa..ea2b259b 100644
index 51cc8566..0e74e554 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D_BACK",
@@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
+ "UNPAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d_back(x)",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
+ "unpad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4097,6 +4099,25 @@ struct ggml_tensor * ggml_pad(
@@ -4180,6 +4182,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}
......
......@@ -11,7 +11,7 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d1dc9627..05ef0e71 100644
index 8c9aaf5a..3e372dc3 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
......