Merge branch 'main' into drifkin/array-head-count-simple

b2b270ad · Devon Rifkin · 20c5fd39 · 2bb69b40 · b2b270ad · 20c5fd39
Commit b2b270ad authored Jun 23, 2025 by Devon Rifkin
20 changed files
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -6,7 +6,7 @@ package llama
 #cgo CXXFLAGS: -std=c++17
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common
-#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/examples/llava
+#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/tools/mtmd
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/src
 #cgo CPPFLAGS: -I${SRCDIR}/../ml/backend/ggml/ggml/include

@@ -17,7 +17,6 @@ package llama
 #include "llava.h"
 #include "gguf.h"

-#include "mllama.h"
 #include "sampling_ext.h"

 extern bool llamaProgressCallback(float progress, void *user_data);
@@ -40,8 +39,8 @@ import (
 	"unsafe"

 	_ "github.com/ollama/ollama/llama/llama.cpp/common"
-	_ "github.com/ollama/ollama/llama/llama.cpp/examples/llava"
 	_ "github.com/ollama/ollama/llama/llama.cpp/src"
+	_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 )

@@ -510,63 +509,6 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
 	return embed, nil
 }

-type MllamaContext struct {
-	c *C.struct_mllama_ctx
-}
-
-func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
-	mp := C.CString(modelPath)
-	defer C.free(unsafe.Pointer(mp))
-	c := C.mllama_model_load(mp, 1)
-	if c == nil {
-		return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
-	}
-
-	projEmbedSize := int(C.mllama_n_embd(c))
-	modelEmbedSize := llamaContext.Model().NEmbd()
-	if projEmbedSize != modelEmbedSize {
-		return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
-	}
-
-	return &MllamaContext{c: c}, nil
-}
-
-func (m *MllamaContext) Free() {
-	C.mllama_free(m.c)
-}
-
-func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
-	img := C.mllama_image_init()
-	defer C.mllama_image_free(img)
-
-	ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
-	if !ok {
-		return nil, errors.New("unable to load mllama image data")
-	}
-
-	rows := make([]float32, m.EmbedSize(llamaContext))
-	ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
-	if !ok {
-		return nil, errors.New("unable to make mllama embedding from image")
-	}
-
-	embed := make([][]float32, 1)
-	embed[0] = rows
-
-	return embed, nil
-}
-
-func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
-	numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
-	numEmbed := llamaContext.Model().NEmbd()
-
-	return numTokens * numEmbed
-}
-
-func (c *Context) SetCrossAttention(state bool) {
-	C.llama_set_cross_attention(c.c, C.bool(state))
-}
-
 func (c *Context) Synchronize() {
 	C.llama_synchronize(c.c)
 }
@@ -602,7 +544,7 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
 	cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
 	cparams.penalty_repeat = C.float(params.PenaltyRepeat)
 	cparams.penalty_freq = C.float(params.PenaltyFreq)
-	cparams.penalty_present = C.float(params.PenaltyFreq)
+	cparams.penalty_present = C.float(params.PenaltyPresent)
 	cparams.seed = C.uint32_t(params.Seed)

 	grammar := C.CString(params.Grammar)
@@ -637,8 +579,8 @@ func SchemaToGrammar(schema []byte) []byte {
 	cStr := C.CString(string(schema))
 	defer C.free(unsafe.Pointer(cStr))

-	// Allocate buffer for grammar output with reasonable size
-	const maxLen = 32768 // 32KB
+	// Allocate buffer for grammar based on schema length but with upper bound
+	maxLen := max(32768, min(1024*1024, len(schema)*4))
 	buf := make([]byte, maxLen)

 	// Call C function to convert schema to grammar
@@ -660,7 +602,7 @@ type Grammar struct {
 	mu sync.Mutex
 }

-func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar {
+func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar {
 	cGrammar := C.CString(grammar)
 	defer C.free(unsafe.Pointer(cGrammar))

@@ -680,7 +622,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke
 		cEogTokens[i] = C.uint32_t(token)
 	}

-	g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens)))
+	g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens)))
 	if g == nil {
 		return nil
 	}

--- a/llama/mllama.cpp
+++ b/llama/mllama.cpp
-// NOTE: This is modified from clip.cpp for Mllama only
-#include "mllama.h"
-
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#include "ggml-cpu.h"
-#include "ggml.h"
-#include "gguf.h"
-
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
-#endif
-
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-
-#include <algorithm>
-#include <cmath>
-#include <cstdarg>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <stdexcept>
-#include <vector>
-
-#define REQUIRE(x)                                           \
-    do {                                                     \
-        if (!(x)) {                                          \
-            throw std::runtime_error("REQUIRE failed: " #x); \
-        }                                                    \
-    } while (0)
-
-#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__)
-
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-    #define NOMINMAX
-#endif
-#include <windows.h>
-#if __GLIBCXX__
-#include <cstdio>
-#include <ext/stdio_filebuf.h>
-#include <fcntl.h>
-#endif
-#endif
-
-struct mllama_image {
-    int width;
-    int height;
-
-    int num_channels = 3;
-    int num_tiles = 4;
-
-    int aspect_ratio_id;
-
-    std::vector<float> data;
-};
-
-static std::string format(const char *fmt, ...) {
-    va_list args;
-    va_start(args, fmt);
-    std::vector<char> b(128);
-    int n = vsnprintf(b.data(), b.size(), fmt, args);
-    REQUIRE(n >= 0 && n < b.size());
-    va_end(args);
-    return std::string(b.data(), b.size());
-}
-
-//
-// utilities to get data from a gguf file
-//
-
-static int get_key_index(const gguf_context *ctx, const char *key) {
-    int key_index = gguf_find_key(ctx, key);
-    REQUIRE(key_index != -1);
-    return key_index;
-}
-
-static std::vector<uint32_t> get_u32_array(const gguf_context *ctx, const std::string &key) {
-    const int i = get_key_index(ctx, key.c_str());
-    const int n = gguf_get_arr_n(ctx, i);
-    const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i);
-
-    std::vector<uint32_t> s(n);
-    for (size_t j = 0; j < s.size(); j++) {
-        s[j] = data[j];
-    }
-
-    return s;
-}
-
-static uint32_t get_u32(const gguf_context *ctx, const std::string &key) {
-    return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str()));
-}
-
-static float get_f32(const gguf_context *ctx, const std::string &key) {
-    return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str()));
-}
-
-static std::string get_ftype(int ftype) {
-    return ggml_type_name(static_cast<ggml_type>(ftype));
-}
-
-//
-// mllama layers
-//
-
-struct mllama_hparams {
-    uint32_t image_size;
-    uint32_t patch_size;
-    uint32_t hidden_size;
-    uint32_t n_intermediate;
-    uint32_t projection_dim;
-    uint32_t n_head;
-    uint32_t n_layer;
-    uint32_t n_global_layer;
-    uint32_t n_tiles;
-
-    float eps;
-
-    std::vector<bool> intermediate_layers;
-};
-
-struct mllama_layer {
-    // attention
-    struct ggml_tensor *k_w;
-    struct ggml_tensor *k_b;
-    struct ggml_tensor *q_w;
-    struct ggml_tensor *q_b;
-    struct ggml_tensor *v_w;
-    struct ggml_tensor *v_b;
-
-    struct ggml_tensor *o_w;
-    struct ggml_tensor *o_b;
-
-    struct ggml_tensor *attn_gate;
-
-    // layernorm 1
-    struct ggml_tensor *ln_1_w;
-    struct ggml_tensor *ln_1_b;
-
-    // ff
-    struct ggml_tensor *ff_i_w;
-    struct ggml_tensor *ff_i_b;
-
-    struct ggml_tensor *ff_o_w;
-    struct ggml_tensor *ff_o_b;
-
-    struct ggml_tensor *ff_gate;
-
-    // layernorm 2
-    struct ggml_tensor *ln_2_w;
-    struct ggml_tensor *ln_2_b;
-};
-
-struct mllama_vision_model {
-    struct mllama_hparams hparams;
-
-    // embeddings
-    struct ggml_tensor *class_embedding;
-    struct ggml_tensor *patch_embeddings;
-    struct ggml_tensor *position_embeddings;
-    struct ggml_tensor *position_embeddings_gate;
-    struct ggml_tensor *tile_position_embeddings;
-    struct ggml_tensor *tile_position_embeddings_gate;
-    struct ggml_tensor *pre_tile_position_embeddings;
-    struct ggml_tensor *pre_tile_position_embeddings_gate;
-    struct ggml_tensor *post_tile_position_embeddings;
-    struct ggml_tensor *post_tile_position_embeddings_gate;
-
-    struct ggml_tensor *pre_ln_w;
-    struct ggml_tensor *pre_ln_b;
-
-    std::vector<mllama_layer> layers;
-    std::vector<mllama_layer> global_layers;
-
-    struct ggml_tensor *post_ln_w;
-    struct ggml_tensor *post_ln_b;
-
-    struct ggml_tensor *mm_0_w;
-    struct ggml_tensor *mm_0_b;
-};
-
-struct mllama_ctx {
-    struct mllama_vision_model vision_model;
-
-    uint32_t ftype = 1;
-
-    struct gguf_context *ctx_gguf;
-    struct ggml_context *ctx_data;
-
-    std::vector<uint8_t> buf_compute_meta;
-
-    // memory buffers to evaluate the model
-    ggml_backend_buffer_t params_buffer = nullptr;
-
-    ggml_backend_t backend = nullptr;
-    ggml_gallocr_t compute_alloc = nullptr;
-};
-
-static ggml_tensor *mllama_image_build_encoder_layer(
-    struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings,
-    const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) {
-    struct ggml_tensor *cur = embeddings;
-
-    {
-        // layernorm1
-        cur = ggml_norm(ctx0, cur, eps);
-        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b);
-        ggml_set_name(cur, format("%d pre layernorm", il).c_str());
-    }
-
-    {
-        // self-attention
-        struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur);
-        if (layer.q_b != nullptr) {
-            Q = ggml_add(ctx0, Q, layer.q_b);
-        }
-
-        Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size);
-        Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
-        ggml_set_name(Q, format("%d query", il).c_str());
-
-        struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur);
-        if (layer.k_b != nullptr) {
-            K = ggml_add(ctx0, K, layer.k_b);
-        }
-
-        K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size);
-        K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
-        ggml_set_name(K, format("%d key", il).c_str());
-
-        struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur);
-        if (layer.v_b != nullptr) {
-            V = ggml_add(ctx0, V, layer.v_b);
-        }
-
-        V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size);
-        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
-        ggml_set_name(V, format("%d value", il).c_str());
-
-        struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
-        KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
-        KQ = ggml_soft_max_inplace(ctx0, KQ);
-        ggml_set_name(KQ, format("%d KQ", il).c_str());
-
-        struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
-        KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size);
-        KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-        KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size);
-        ggml_set_name(KQV, format("%d KQV", il).c_str());
-
-        cur = ggml_mul_mat(ctx0, layer.o_w, KQV);
-        if (layer.o_b != nullptr) {
-            cur = ggml_add(ctx0, cur, layer.o_b);
-        }
-        ggml_set_name(cur, format("%d self attention", il).c_str());
-
-        if (layer.attn_gate != nullptr) {
-            cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate);
-            ggml_set_name(cur, format("%d self attention gate", il).c_str());
-        }
-    }
-
-    cur = ggml_add(ctx0, cur, embeddings);
-    ggml_set_name(cur, format("%d residual", il).c_str());
-
-    embeddings = cur;
-
-    {
-        // layernorm2
-        cur = ggml_norm(ctx0, cur, eps);
-        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b);
-        ggml_set_name(cur, format("%d post layernorm", il).c_str());
-    }
-
-    {
-        // feed forward
-        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b);
-        cur = ggml_gelu_inplace(ctx0, cur);
-        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b);
-        ggml_set_name(cur, format("%d feed forward", il).c_str());
-
-        if (layer.ff_gate != nullptr) {
-            cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate);
-            ggml_set_name(cur, format("%d feed forward gate", il).c_str());
-        }
-    }
-
-    // residual 2
-    cur = ggml_add(ctx0, cur, embeddings);
-    ggml_set_name(cur, format("%d residual", il).c_str());
-
-    embeddings = cur;
-
-    return embeddings;
-}
-
-static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) {
-    const auto &model = ctx->vision_model;
-    const auto &hparams = model.hparams;
-
-    const int image_size = hparams.image_size;
-    const int image_size_width = image_size;
-    const int image_size_height = image_size;
-
-    const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
-    const int hidden_size = hparams.hidden_size;
-    const int n_head = hparams.n_head;
-    const int d_head = hidden_size / n_head;
-
-    const int batch_size = imgs->size;
-    REQUIRE(batch_size == 1);
-
-    int num_tiles = 4;
-    int num_channels = 3;
-    if (imgs->data != nullptr) {
-        num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles;
-        num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels;
-    }
-
-    struct ggml_init_params params = {
-        ctx->buf_compute_meta.size(), // mem_size
-        ctx->buf_compute_meta.data(), // mem_buffer
-        true,                         // no_alloc
-    };
-
-    struct ggml_context *ctx0 = ggml_init(params);
-    struct ggml_cgraph *gf = ggml_new_graph(ctx0);
-
-    struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles);
-    ggml_set_name(inp_raw, "inp_raw");
-    ggml_set_input(inp_raw);
-
-    struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-
-    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles);
-    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
-
-    struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size);
-    ggml_set_name(aspect_ratios, "aspect_ratios");
-    ggml_set_input(aspect_ratios);
-
-    if (model.pre_tile_position_embeddings != nullptr) {
-        struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios);
-        ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings");
-
-        pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles);
-        if (model.pre_tile_position_embeddings_gate != nullptr) {
-            pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate);
-        }
-
-        inp = ggml_add(ctx0, inp, pre_tile_position_embeddings);
-    }
-
-    struct ggml_tensor *embeddings = inp;
-
-    if (model.class_embedding != nullptr) {
-        // concat class_embeddings and patch_embeddings
-        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles);
-        ggml_set_name(embeddings, "embeddings");
-        ggml_set_input(embeddings);
-        for (int i = 0; i < num_tiles; ++i) {
-            // repeat class embeddings for each tile
-            embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]);
-        }
-
-        embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-    }
-
-    struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
-    ggml_set_name(positions, "positions");
-    ggml_set_input(positions);
-
-    struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
-    if (model.position_embeddings_gate != nullptr) {
-        position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate);
-    }
-
-    embeddings = ggml_add(ctx0, embeddings, position_embd);
-
-    if (model.tile_position_embeddings != nullptr) {
-        struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios);
-        ggml_set_name(tile_position_embeddings, "tile_position_embeddings");
-
-        tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles);
-        if (model.tile_position_embeddings_gate != nullptr) {
-            tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate);
-        }
-
-        embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings);
-    }
-
-    // pre-layernorm
-    if (model.pre_ln_w != nullptr) {
-        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w);
-        if (model.pre_ln_b != nullptr) {
-            embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b);
-        }
-
-        ggml_set_name(embeddings, "pre layernorm");
-    }
-
-    const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8;
-
-    embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
-    embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0);
-
-    std::vector<struct ggml_tensor *> intermediate_embeddings;
-
-    // encoder
-    for (size_t il = 0; il < model.layers.size(); il++) {
-        if (hparams.intermediate_layers[il]) {
-            intermediate_embeddings.push_back(embeddings);
-        }
-
-        embeddings = mllama_image_build_encoder_layer(
-            ctx0, il, model.layers[il], embeddings,
-            hparams.eps, hidden_size, batch_size, n_head, d_head);
-    }
-
-    // post-layernorm
-    if (model.post_ln_w != nullptr) {
-        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w);
-        if (model.post_ln_b != nullptr) {
-            embeddings = ggml_add(ctx0, embeddings, model.post_ln_b);
-        }
-
-        ggml_set_name(embeddings, "post layernorm");
-    }
-
-    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
-
-    if (model.post_tile_position_embeddings != nullptr) {
-        struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios);
-        ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings");
-
-        post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles);
-        if (model.post_tile_position_embeddings_gate != nullptr) {
-            post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate);
-        }
-
-        embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings);
-    }
-
-    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1);
-
-    // global encoder
-    for (size_t il = 0; il < model.global_layers.size(); il++) {
-        embeddings = mllama_image_build_encoder_layer(
-            ctx0, il, model.global_layers[il], embeddings,
-            hparams.eps, hidden_size, batch_size, n_head, d_head);
-    }
-
-    struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles);
-    for (size_t i = 0; i < intermediate_embeddings.size(); ++i) {
-        stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0);
-    }
-
-    stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size);
-    stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0);
-
-    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
-    embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
-    embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0);
-
-    // mllama projector
-    embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b);
-    ggml_set_name(embeddings, "multi modal projector");
-
-    // build the graph
-    ggml_build_forward_expand(gf, embeddings);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) {
-    struct ggml_tensor *cur = ggml_get_tensor(ctx, name);
-    REQUIRE(cur != nullptr || optional);
-    return cur;
-}
-
-static std::vector<struct mllama_layer> mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) {
-    std::vector<struct mllama_layer> layers(n);
-    for (size_t i = 0; i < layers.size(); i++) {
-        auto &layer = layers[i];
-        layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false);
-        layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false);
-        layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false);
-        layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false);
-
-        layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false);
-        layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true);
-        layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false);
-        layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true);
-        layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false);
-        layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true);
-        layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false);
-        layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true);
-
-        layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false);
-        layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false);
-        layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false);
-        layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false);
-
-        layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true);
-        layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true);
-    }
-
-    return layers;
-}
-
-// read and create ggml_context containing the tensors and their data
-struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) {
-    struct ggml_context *meta = nullptr;
-
-    struct gguf_init_params params = {
-        true,  // no_alloc
-        &meta, // ctx
-    };
-
-    struct gguf_context *ctx = gguf_init_from_file(fname, params);
-    REQUIRE(ctx != nullptr);
-
-    if (verbosity >= 1) {
-        const int n_tensors = gguf_get_n_tensors(ctx);
-        const int n_kv = gguf_get_n_kv(ctx);
-        const std::string ftype = get_ftype(get_u32(ctx, "general.file_type"));
-        const int idx_desc = get_key_index(ctx, "general.description");
-        const std::string description = gguf_get_val_str(ctx, idx_desc);
-        const int idx_name = gguf_find_key(ctx, "general.name");
-        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
-            const std::string name = gguf_get_val_str(ctx, idx_name);
-            LOG("model name:   %s", name.c_str());
-        }
-        LOG("description:  %s", description.c_str());
-        LOG("GGUF version: %d", gguf_get_version(ctx));
-        LOG("alignment:    %zu", gguf_get_alignment(ctx));
-        LOG("n_tensors:    %d", n_tensors);
-        LOG("n_kv:         %d", n_kv);
-        LOG("ftype:        %s", ftype.c_str());
-        LOG("");
-    }
-    const int n_tensors = gguf_get_n_tensors(ctx);
-
-    mllama_ctx *new_mllama = new mllama_ctx{};
-
-    ggml_backend_t backend = ggml_backend_init_best();
-    if (backend == nullptr) {
-        LOG("%s: failed to initialize backend\n", __func__);
-        mllama_free(new_mllama);
-        gguf_free(ctx);
-        return nullptr;
-    }
-    LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend));
-    new_mllama->backend = backend;
-
-    // load tensors
-    {
-        std::vector<uint8_t> read_buf;
-        struct ggml_init_params params = {
-            (n_tensors + 1) * ggml_tensor_overhead(), // mem_size
-            nullptr,                                  // mem_buffer
-            true,                                     // no_alloc
-        };
-
-        new_mllama->ctx_data = ggml_init(params);
-        if (!new_mllama->ctx_data) {
-            LOG("ggml_init() failed");
-            mllama_free(new_mllama);
-            gguf_free(ctx);
-            return nullptr;
-        }
-
-#ifdef _WIN32
-        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
-        if (!wlen) {
-            return NULL;
-        }
-        wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
-        wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
-        if (!wlen) {
-            free(wbuf);
-            return NULL;
-        }
-#if __GLIBCXX__
-        int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
-        __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
-        std::istream fin(&buffer);
-#else // MSVC
-        // unused in our current build
-        auto fin = std::ifstream(wbuf, std::ios::binary);
-#endif
-        free(wbuf);
-#else
-        auto fin = std::ifstream(fname, std::ios::binary);
-#endif
-        if (!fin) {
-            LOG("cannot open model file for loading tensors\n");
-            mllama_free(new_mllama);
-            gguf_free(ctx);
-            return nullptr;
-        }
-
-        // add tensors to context
-        for (int i = 0; i < n_tensors; ++i) {
-            const char *name = gguf_get_tensor_name(ctx, i);
-            struct ggml_tensor *t = ggml_get_tensor(meta, name);
-            struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t);
-            ggml_set_name(cur, name);
-        }
-
-        // alloc memory and offload data
-        new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend);
-        for (int i = 0; i < n_tensors; ++i) {
-            const char *name = gguf_get_tensor_name(ctx, i);
-            struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name);
-            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
-            fin.seekg(offset, std::ios::beg);
-            if (!fin) {
-                LOG("failed to seek for tensor %s\n", name);
-                mllama_free(new_mllama);
-                gguf_free(ctx);
-                return nullptr;
-            }
-            int num_bytes = ggml_nbytes(cur);
-            if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) {
-                // for the CPU and Metal backend, we can read directly into the tensor
-                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
-            } else {
-                // read into a temporary buffer first, then copy to device memory
-                read_buf.resize(num_bytes);
-                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
-                ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
-            }
-        }
-
-#if defined(_WIN32) && defined(__GLIBCXX__)
-        close(fd);
-#else
-        fin.close();
-#endif
-    }
-
-    // vision model
-    // load vision model
-    auto &vision_model = new_mllama->vision_model;
-    auto &hparams = vision_model.hparams;
-    hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length");
-    hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count");
-    hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length");
-    hparams.n_layer = get_u32(ctx, "mllama.vision.block_count");
-    hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count");
-    hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles");
-    hparams.image_size = get_u32(ctx, "mllama.vision.image_size");
-    hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size");
-    hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim");
-    hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon");
-
-    std::vector<uint32_t> intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices");
-    hparams.intermediate_layers.resize(hparams.n_layer);
-    for (size_t i = 0; i < intermediate_layers_indices.size(); i++) {
-        hparams.intermediate_layers[intermediate_layers_indices[i]] = true;
-    }
-
-    if (verbosity >= 2) {
-        LOG("");
-        LOG("vision model hparams");
-        LOG("image_size         %d", hparams.image_size);
-        LOG("patch_size         %d", hparams.patch_size);
-        LOG("v_hidden_size      %d", hparams.hidden_size);
-        LOG("v_n_intermediate   %d", hparams.n_intermediate);
-        LOG("v_projection_dim   %d", hparams.projection_dim);
-        LOG("v_n_head           %d", hparams.n_head);
-        LOG("v_n_layer          %d", hparams.n_layer);
-        LOG("v_n_global_layer   %d", hparams.n_global_layer);
-        LOG("v_eps              %f", hparams.eps);
-    }
-
-    vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true);
-    vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true);
-
-    vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true);
-    vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true);
-
-    vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true);
-    vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true);
-    vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true);
-    vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true);
-
-    vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true);
-    vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true);
-
-    vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true);
-    vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true);
-
-    vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true);
-    vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true);
-
-    vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false);
-    vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false);
-
-    vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer);
-    vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer);
-
-    ggml_free(meta);
-
-    new_mllama->ctx_gguf = ctx;
-
-    {
-        // measure mem requirement and allocate
-        new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-        new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend));
-        struct mllama_image_batch batch;
-        batch.size = 1;
-        ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch);
-        ggml_gallocr_reserve(new_mllama->compute_alloc, gf);
-        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0);
-        LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
-    }
-
-    return new_mllama;
-}
-
-struct mllama_image *mllama_image_init() {
-    return new mllama_image();
-}
-
-void mllama_image_free(struct mllama_image *img) { delete img; }
-void mllama_image_batch_free(struct mllama_image_batch *batch) {
-    if (batch->size > 0) {
-        delete[] batch->data;
-        batch->size = 0;
-    }
-}
-
-bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) {
-    img->width = width;
-    img->height = height;
-    img->num_channels = num_channels;
-    img->num_tiles = num_tiles;
-    img->aspect_ratio_id = aspect_ratio_id;
-    img->data.resize(n);
-
-    memcpy(img->data.data(), data, n);
-    return true;
-}
-
-inline int mllama(int x, int lower, int upper) {
-    return std::max(lower, std::min(x, upper));
-}
-
-void mllama_free(mllama_ctx *ctx) {
-    ggml_free(ctx->ctx_data);
-    gguf_free(ctx->ctx_gguf);
-
-    ggml_backend_buffer_free(ctx->params_buffer);
-    ggml_backend_free(ctx->backend);
-    ggml_gallocr_free(ctx->compute_alloc);
-    delete ctx;
-}
-
-bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) {
-    mllama_image_batch imgs{};
-    imgs.size = 1;
-    imgs.data = img;
-    return mllama_image_batch_encode(ctx, n_threads, &imgs, vec);
-}
-
-bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) {
-    int batch_size = imgs->size;
-    REQUIRE(batch_size == 1);
-
-    // build the inference graph
-    ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs);
-    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
-
-    // set inputs
-    const auto &model = ctx->vision_model;
-    const auto &hparams = model.hparams;
-
-    const int image_size = hparams.image_size;
-    int image_size_width = image_size;
-    int image_size_height = image_size;
-
-    const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
-
-    {
-        struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
-        ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw));
-    }
-
-    {
-        struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings");
-        if (embeddings != nullptr) {
-            void *zeros = malloc(ggml_nbytes(embeddings));
-            memset(zeros, 0, ggml_nbytes(embeddings));
-            ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings));
-            free(zeros);
-        }
-    }
-
-    {
-        struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions");
-        if (positions != nullptr) {
-            int *positions_data = (int *)malloc(ggml_nbytes(positions));
-            for (int i = 0; i < num_positions; i++) {
-                positions_data[i] = i;
-            }
-            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
-            free(positions_data);
-        }
-    }
-
-    {
-        struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios");
-        if (aspect_ratios != nullptr) {
-            int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios));
-            aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id;
-            ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios));
-            free(aspect_ratios_data);
-        }
-    }
-
-    if (ggml_backend_is_cpu(ctx->backend)) {
-        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
-    }
-
-    ggml_backend_graph_compute(ctx->backend, gf);
-
-    // the last node is the embedding tensor
-    struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
-
-    // copy the embeddings to the location passed by the user
-    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
-
-    return true;
-}
-
-int32_t mllama_image_size(const struct mllama_ctx *ctx) {
-    return ctx->vision_model.hparams.image_size;
-}
-
-int32_t mllama_patch_size(const struct mllama_ctx *ctx) {
-    return ctx->vision_model.hparams.patch_size;
-}
-
-int32_t mllama_hidden_size(const struct mllama_ctx *ctx) {
-    return ctx->vision_model.hparams.hidden_size;
-}
-
-int mllama_n_patches(const struct mllama_ctx *ctx) {
-    const auto &hparams = ctx->vision_model.hparams;
-    return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size);
-}
-
-int mllama_n_positions(const struct mllama_ctx *ctx) {
-    return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 0 : 1);
-}
-
-int mllama_n_tiles(const struct mllama_ctx *ctx) {
-    return ctx->vision_model.hparams.n_tiles;
-}
-
-int mllama_n_embd(const struct mllama_ctx *ctx) {
-    return ctx->vision_model.hparams.projection_dim;
-}
-
-size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) {
-    return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float);
-}
--- a/llama/mllama.h
+++ b/llama/mllama.h
-#ifndef MLLAMA_H
-#define MLLAMA_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef LLAMA_SHARED
-#if defined(_WIN32) && !defined(__MINGW32__)
-#ifdef LLAMA_BUILD
-#define MLLAMA_API __declspec(dllexport)
-#else
-#define MLLAMA_API __declspec(dllimport)
-#endif
-#else
-#define MLLAMA_API __attribute__((visibility("default")))
-#endif
-#else
-#define MLLAMA_API
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct mllama_ctx;
-
-struct mllama_image_batch {
-    struct mllama_image *data;
-    size_t size;
-};
-
-MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity);
-MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity);
-
-MLLAMA_API void mllama_free(struct mllama_ctx *ctx);
-
-MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx);
-MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx);
-MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx);
-
-MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx);
-MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx);
-MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx);
-MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx);
-MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx);
-
-MLLAMA_API struct mllama_image *mllama_image_init();
-
-MLLAMA_API void mllama_image_free(struct mllama_image *img);
-MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch);
-
-MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img);
-
-MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec);
-MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // MLLAMA_H
--- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
+++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
@@ -24,7 +24,7 @@ problem.
 9 files changed, 21 insertions(+), 2 deletions(-)

 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 273075f4..dd11f304 100644
+index b30b4cb3..0ce73a99 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
 @@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -43,7 +43,7 @@ index 273075f4..dd11f304 100644
 }
 
 static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-@@ -1867,6 +1867,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+@@ -1871,6 +1871,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_aligned_free(buffer->context, buffer->size);
@@ -55,7 +55,7 @@ index 273075f4..dd11f304 100644
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-@@ -1914,7 +1919,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+@@ -1918,7 +1923,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
 };
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
 
 /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 9fb2134f..04ce764e 100644
+index b4b85abc..cb0d8528 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -96,7 +96,7 @@ index 9fb2134f..04ce764e 100644
 }
 
 static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-@@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context {
+@@ -790,6 +791,7 @@ struct ggml_backend_cuda_split_buffer_context {
 static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     delete ctx;
@@ -104,7 +104,7 @@ index 9fb2134f..04ce764e 100644
 }
 
 static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
+@@ -1067,6 +1069,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
 
 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index d92392ed..425524d0 100644
+index 576f9581..1b56f858 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -5214,6 +5214,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     }
 
     free(ctx);
@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
 
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index 140a775f..e33c4ba0 100644
+index 4f0abb5a..de1ec184 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -483,6 +483,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
     GGML_ASSERT(status);
     delete ctx;
@@ -161,10 +161,10 @@ index 140a775f..e33c4ba0 100644
 
 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 66b6f2cc..e3e6deae 100644
+index 0ea72994..ae3a3c33 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -320,6 +320,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
     ggml_sycl_set_device(ctx->device);
 
     delete ctx;
@@ -172,7 +172,7 @@ index 66b6f2cc..e3e6deae 100644
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -765,6 +766,7 @@ struct ggml_backend_sycl_split_buffer_context {
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
     delete ctx;
@@ -180,7 +180,7 @@ index 66b6f2cc..e3e6deae 100644
 }
 
 static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1099,6 +1101,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
 
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_sycl_host_free(buffer->context);
@@ -189,10 +189,10 @@ index 66b6f2cc..e3e6deae 100644
 
 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index c0bdb9e1..03d03064 100644
+index e2b357fd..68768029 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -8962,6 +8962,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
@@ -200,7 +200,7 @@ index c0bdb9e1..03d03064 100644
 }
 
 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -9105,6 +9106,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(vk_instance.devices[0], buffer->context);

--- a/llama/patches/0002-pretokenizer.patch
+++ b/llama/patches/0002-pretokenizer.patch
@@ -10,10 +10,10 @@ logs instead of throwing an error
 1 file changed, 3 insertions(+), 11 deletions(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 50ded286..a9ee9f03 100644
+index 9389ca80..806c1b3d 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1503,16 +1503,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         if (type == LLAMA_VOCAB_TYPE_BPE) {
             add_space_prefix = false;
             clean_spaces = true;
@@ -31,8 +31,8 @@ index 50ded286..a9ee9f03 100644
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
-                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+@@ -1651,7 +1642,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+                 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
                 clean_spaces = false;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));

--- a/llama/patches/0003-embeddings.patch
+++ b/llama/patches/0003-embeddings.patch
@@ -11,10 +11,10 @@ instead of forcing one or the error
 1 file changed, 3 insertions(+), 3 deletions(-)

 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 5a2eef9b..9c1fe93f 100644
+index 62246c10..dca22d8b 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -901,7 +901,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     int64_t n_outputs_all = 0;
 
     // count outputs
@@ -23,7 +23,7 @@ index 5a2eef9b..9c1fe93f 100644
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs_all += batch.logits[i] != 0;
         }
-@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -982,7 +982,7 @@ int llama_context::decode(llama_batch & inp_batch) {
         //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
         //}
 
@@ -32,7 +32,7 @@ index 5a2eef9b..9c1fe93f 100644
         auto * t_embd   = cparams.embeddings ? res->get_embd() : nullptr;
 
         if (t_embd && res->get_embd_pooled()) {
-@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+@@ -1151,7 +1151,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead

--- a/llama/patches/0004-clip-unicode.patch
+++ b/llama/patches/0004-clip-unicode.patch
@@ -6,16 +6,16 @@ Subject: [PATCH] clip-unicode
 fixes loading vision models in llama.cpp on windows
 filesystems for paths that include wide characters
 ---
- examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
+ tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index ad3e7df1..b3218c78 100644
---- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -30,6 +30,19 @@
- #include <array>
+diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
+index 41ba45a7..cdd8ca44 100644
+--- a/tools/mtmd/clip.cpp
++++ b/tools/mtmd/clip.cpp
+@@ -31,6 +31,19 @@
 #include <numeric>
+ #include <functional>
 
 +#if defined(_WIN32)
 +#define WIN32_LEAN_AND_MEAN
@@ -32,8 +32,8 @@ index ad3e7df1..b3218c78 100644
 +
 struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
 
- //#define CLIP_DEBUG_FUNCTIONS
-@@ -1971,7 +1984,29 @@ struct clip_model_loader {
+ enum ffn_op_type {
+@@ -2190,7 +2203,29 @@ struct clip_model_loader {
         {
             std::vector<uint8_t> read_buf;
 
@@ -63,7 +63,7 @@ index ad3e7df1..b3218c78 100644
             if (!fin) {
                 throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
             }
-@@ -1998,7 +2033,11 @@ struct clip_model_loader {
+@@ -2217,7 +2252,11 @@ struct clip_model_loader {
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
                 }
             }

--- a/llama/patches/0005-solar-pro.patch
+++ b/llama/patches/0005-solar-pro.patch
@@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644
 };
 
 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index ea73a8a7..a012aeae 100644
+index 4cce5166..7f6617fa 100644
 --- a/src/llama-model-loader.cpp
 +++ b/src/llama-model-loader.cpp
 @@ -439,6 +439,7 @@ namespace GGUFMeta {
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
 llama_model_loader::llama_model_loader(
         const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 822e2bb2..572378c9 100644
+index 3a4e72a3..831b68c0 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                }
             } break;
@@ -175,7 +175,7 @@ index 822e2bb2..572378c9 100644
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -3774,6 +3789,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
@@ -210,7 +210,7 @@ index 822e2bb2..572378c9 100644
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
+@@ -12397,6 +12440,165 @@ struct llm_build_chameleon : public llm_graph_context {
     }
 };
 
@@ -270,7 +270,7 @@ index 822e2bb2..572378c9 100644
 +            // self-attention
 +            {
 +                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
++                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 +
 +                // compute Q and K and RoPE them
 +                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -376,7 +376,7 @@ index 822e2bb2..572378c9 100644
 struct llm_build_wavtokenizer_dec : public llm_graph_context {
     llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         ggml_tensor * cur;
-@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
+@@ -13157,6 +13359,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
             } break;
@@ -387,7 +387,7 @@ index 822e2bb2..572378c9 100644
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
-@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -13301,6 +13507,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_CHAMELEON:
@@ -396,10 +396,10 @@ index 822e2bb2..572378c9 100644
             return LLAMA_ROPE_TYPE_NORM;
 
 diff --git a/src/llama-model.h b/src/llama-model.h
-index 95eca002..856e6042 100644
+index 6bdec263..43746c7d 100644
 --- a/src/llama-model.h
 +++ b/src/llama-model.h
-@@ -64,6 +64,7 @@ enum llm_type {
+@@ -65,6 +65,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
@@ -407,7 +407,7 @@ index 95eca002..856e6042 100644
     LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
-@@ -311,6 +312,8 @@ struct llama_layer {
+@@ -315,6 +316,8 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_scale   = nullptr;
     struct ggml_tensor * ffn_down_scale = nullptr;
 

--- a/llama/patches/0006-add-mllama-support.patch
+++ b/llama/patches/0006-add-mllama-support.patch
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: jmorganca <jmorganca@gmail.com>
-Date: Sun, 20 Apr 2025 16:12:36 -0700
-Subject: [PATCH] add mllama support
-
-adds support for the llama 3.2 vision architecture
---
- examples/llava/llava.cpp      |   5 +-
- examples/llava/mtmd.cpp       |   6 +-
- ggml/src/ggml-backend-reg.cpp |   6 +-
- include/llama.h               |   6 +
- src/llama-arch.cpp            |  44 +++++
- src/llama-arch.h              |  10 ++
- src/llama-batch.cpp           |   3 +
- src/llama-context.cpp         |  25 ++-
- src/llama-context.h           |   1 +
- src/llama-cparams.h           |   1 +
- src/llama-graph.cpp           |  25 +++
- src/llama-graph.h             |  12 ++
- src/llama-hparams.cpp         |   4 +
- src/llama-hparams.h           |   7 +
- src/llama-kv-cache.cpp        |  12 +-
- src/llama-model-loader.cpp    |   2 +
- src/llama-model.cpp           | 309 +++++++++++++++++++++++++++++++++-
- src/llama-model.h             |  12 ++
- src/llama-quant.cpp           |   4 +-
- 19 files changed, 473 insertions(+), 21 deletions(-)
-
-diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
-index c00d16ae..bab027b5 100644
--- a/examples/llava/llava.cpp
-+++ b/examples/llava/llava.cpp
-@@ -457,7 +457,7 @@ struct llava_embd_batch {
-     std::vector<llama_seq_id *> seq_ids;
-     std::vector<int8_t>         logits;
-     llama_batch batch;
-    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-+    llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-         pos     .resize(n_tokens);
-         n_seq_id.resize(n_tokens);
-         seq_ids .resize(n_tokens + 1);
-@@ -469,6 +469,7 @@ struct llava_embd_batch {
-             /*n_tokens       =*/ n_tokens,
-             /*tokens         =*/ nullptr,
-             /*embd           =*/ embd,
-+            /*n_embd         =*/ n_embd,
-             /*pos            =*/ pos.data(),
-             /*n_seq_id       =*/ n_seq_id.data(),
-             /*seq_id         =*/ seq_ids.data(),
-@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
-             n_eval = n_batch;
-         }
-         float * embd = image_embed->embed+i*n_embd;
-        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
-+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
-         if (llama_decode(ctx_llama, llava_batch.batch)) {
-             LOG_ERR("%s : failed to eval\n", __func__);
-             return false;
-diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
-index 7081fd73..c14ac501 100644
--- a/examples/llava/mtmd.cpp
-+++ b/examples/llava/mtmd.cpp
-@@ -476,7 +476,7 @@ struct decode_embd_batch {
-     std::vector<llama_seq_id *> seq_ids;
-     std::vector<int8_t>         logits;
-     llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
-+    decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
-         pos     .resize(n_tokens * n_pos_per_embd);
-         n_seq_id.resize(n_tokens);
-         seq_ids .resize(n_tokens + 1);
-@@ -487,6 +487,7 @@ struct decode_embd_batch {
-             /*n_tokens       =*/ n_tokens,
-             /*tokens         =*/ nullptr,
-             /*embd           =*/ embd,
-+            /*n_embd         =*/ n_embd,
-             /*pos            =*/ pos.data(),
-             /*n_seq_id       =*/ n_seq_id.data(),
-             /*seq_id         =*/ seq_ids.data(),
-@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
-             int32_t i_batch = 0;
-             int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
-             float * embd = mtmd_get_output_embd(ctx);
-            decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
-+            int n_embd  = llama_model_n_embd(llama_get_model(lctx));
-+            decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);
- 
-             const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
-             const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
-diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp
-+++ b/ggml/src/ggml-backend-reg.cpp
-@@ -178,9 +178,9 @@ struct ggml_backend_registry {
- #ifdef GGML_USE_CANN
-         register_backend(ggml_backend_cann_reg());
- #endif
-#ifdef GGML_USE_BLAS
-        register_backend(ggml_backend_blas_reg());
-#endif
-+// #ifdef GGML_USE_BLAS
-+//         register_backend(ggml_backend_blas_reg());
-+// #endif
- #ifdef GGML_USE_RPC
-         register_backend(ggml_backend_rpc_reg());
- #endif
-diff --git a/include/llama.h b/include/llama.h
-index 06c56395..f1628e88 100644
--- a/include/llama.h
-+++ b/include/llama.h
-@@ -256,6 +256,7 @@ extern "C" {
- 
-         llama_token  *  token;
-         float        *  embd;
-+        int32_t         n_embd;
-         llama_pos    *  pos;
-         int32_t      *  n_seq_id;
-         llama_seq_id ** seq_id;
-@@ -358,6 +359,7 @@ extern "C" {
-         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-         bool no_perf;     // whether to measure performance timings
-+        bool cross_attn;  // whether to use cross attention
- 
-         // Abort callback
-         // if it returns true, execution of llama_decode() will be aborted
-@@ -459,6 +461,10 @@ extern "C" {
-             struct llama_context_params   params),
-             "use llama_init_from_model instead");
- 
-+    // TODO (jmorganca): this should most likely be passed in as part of a batch
-+    // and not set on the context for all batches.
-+    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
-+
-     // Frees all allocated memory
-     LLAMA_API void llama_free(struct llama_context * ctx);
- 
-diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 5ab3f572..eb7b5325 100644
--- a/src/llama-arch.cpp
-+++ b/src/llama-arch.cpp
-@@ -6,6 +6,7 @@
- 
- static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-     { LLM_ARCH_LLAMA,            "llama"            },
-+    { LLM_ARCH_MLLAMA,           "mllama"           },
-     { LLM_ARCH_LLAMA4,           "llama4"           },
-     { LLM_ARCH_DECI,             "deci"             },
-     { LLM_ARCH_FALCON,           "falcon"           },
-@@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-     { LLM_KV_ATTENTION_SLIDING_WINDOW,               "%s.attention.sliding_window"               },
-     { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
-     { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,        "%s.attention.block_skip_connection"        },
-+    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,       "%s.attention.cross_attention_layers"       },
-     { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
-     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,             "%s.attention.value_length_mla"             },
- 
-@@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
-             { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
-         },
-     },
-+    {
-+        LLM_ARCH_MLLAMA,
-+        {
-+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-+            { LLM_TENSOR_OUTPUT,          "output" },
-+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
-+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
-+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
-+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
-+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
-+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
-+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
-+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
-+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
-+            { LLM_TENSOR_CROSS_ATTN_K_NORM,    "blk.%d.cross_attn_k_norm" },
-+            { LLM_TENSOR_CROSS_ATTN_K_PROJ,    "blk.%d.cross_attn_k_proj" },
-+            { LLM_TENSOR_CROSS_ATTN_O_PROJ,    "blk.%d.cross_attn_o_proj" },
-+            { LLM_TENSOR_CROSS_ATTN_Q_NORM,    "blk.%d.cross_attn_q_norm" },
-+            { LLM_TENSOR_CROSS_ATTN_Q_PROJ,    "blk.%d.cross_attn_q_proj" },
-+            { LLM_TENSOR_CROSS_ATTN_V_PROJ,    "blk.%d.cross_attn_v_proj" },
-+            { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
-+            { LLM_TENSOR_CROSS_ATTN_MLP_GATE,  "blk.%d.cross_attn_mlp_gate" },
-+        },
-+    },
-     {
-         LLM_ARCH_DECI,
-         {
-@@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
-     // this tensor is loaded for T5, but never used
-     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
-     {LLM_TENSOR_BSKCN_TV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-+    {LLM_TENSOR_CROSS_ATTN_K_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-+    {LLM_TENSOR_CROSS_ATTN_K_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-+    {LLM_TENSOR_CROSS_ATTN_O_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-+    {LLM_TENSOR_CROSS_ATTN_Q_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-+    {LLM_TENSOR_CROSS_ATTN_Q_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-+    {LLM_TENSOR_CROSS_ATTN_V_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-+    {LLM_TENSOR_CROSS_ATTN_ATTN_GATE,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-+    {LLM_TENSOR_CROSS_ATTN_MLP_GATE,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-     {LLM_TENSOR_CONV1D,                     {LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL}},
-     {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-     {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 525c1b7d..bc8a4f0b 100644
--- a/src/llama-arch.h
-+++ b/src/llama-arch.h
-@@ -11,6 +11,7 @@
- enum llm_arch {
-     LLM_ARCH_LLAMA,
-     LLM_ARCH_LLAMA4,
-+    LLM_ARCH_MLLAMA,
-     LLM_ARCH_DECI,
-     LLM_ARCH_FALCON,
-     LLM_ARCH_BAICHUAN,
-@@ -148,6 +149,7 @@ enum llm_kv {
-     LLM_KV_ATTENTION_SLIDING_WINDOW,
-     LLM_KV_ATTENTION_SCALE,
-     LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
-+    LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
-     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
-     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
- 
-@@ -349,6 +351,14 @@ enum llm_tensor {
-     LLM_TENSOR_CLS,
-     LLM_TENSOR_CLS_OUT,
-     LLM_TENSOR_BSKCN_TV,
-+    LLM_TENSOR_CROSS_ATTN_K_NORM,
-+    LLM_TENSOR_CROSS_ATTN_K_PROJ,
-+    LLM_TENSOR_CROSS_ATTN_O_PROJ,
-+    LLM_TENSOR_CROSS_ATTN_Q_NORM,
-+    LLM_TENSOR_CROSS_ATTN_Q_PROJ,
-+    LLM_TENSOR_CROSS_ATTN_V_PROJ,
-+    LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
-+    LLM_TENSOR_CROSS_ATTN_MLP_GATE,
-     LLM_TENSOR_CONV1D,
-     LLM_TENSOR_CONVNEXT_DW,
-     LLM_TENSOR_CONVNEXT_NORM,
-diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
-index 01d5ca57..8682b0e6 100644
--- a/src/llama-batch.cpp
-+++ b/src/llama-batch.cpp
-@@ -316,6 +316,7 @@ struct llama_batch llama_batch_get_one(
-         /*n_tokens       =*/ n_tokens,
-         /*tokens         =*/ tokens,
-         /*embd           =*/ nullptr,
-+        /*n_embd         =*/ 0,
-         /*pos            =*/ nullptr,
-         /*n_seq_id       =*/ nullptr,
-         /*seq_id         =*/ nullptr,
-@@ -328,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
-         /*n_tokens       =*/ 0,
-         /*tokens         =*/ nullptr,
-         /*embd           =*/ nullptr,
-+        /*n_embd         =*/ 0,
-         /*pos            =*/ nullptr,
-         /*n_seq_id       =*/ nullptr,
-         /*seq_id         =*/ nullptr,
-@@ -336,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
- 
-     if (embd) {
-         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
-+        batch.n_embd = embd;
-     } else {
-         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
-     }
-diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 9c1fe93f..cd06ad91 100644
--- a/src/llama-context.cpp
-+++ b/src/llama-context.cpp
-@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) {
-             throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
-         }
- 
-        return logits + j*model.vocab.n_tokens();
-+        return logits + j*model.hparams.n_vocab;
-     } catch (const std::exception & err) {
-         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
- #ifndef NDEBUG
-@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) {
-     cparams.warmup = value;
- }
- 
-+void llama_context::set_cross_attn(bool value) {
-+    cparams.cross_attn = value;
-+}
-+
- void llama_context::set_adapter_lora(
-             llama_adapter_lora * adapter,
-             float scale) {
-@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) {
- 
-     const int64_t n_embd = hparams.n_embd;
- 
-    sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
-+    sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
- 
-     const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
- 
-@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) {
- 
-     const llama_batch & batch = batch_allocr.batch;
- 
-    const auto & vocab   = model.vocab;
-     const auto & hparams = model.hparams;
- 
-    const int32_t n_vocab = vocab.n_tokens();
-+    const int32_t n_vocab = hparams.n_vocab;
- 
-     const int64_t n_tokens_all = batch.n_tokens;
-     const int64_t n_embd       = hparams.n_embd;
-@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) {
- 
-     const bool logits_all = n_outputs_all == n_tokens_all;
- 
-    sbatch.from_batch(batch, n_embd,
-+    sbatch.from_batch(batch, batch.n_embd,
-             /* simple_split */ !kv_self->recurrent,
-             /* logits_all   */ logits_all);
- 
-@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) {
- 
- int32_t llama_context::output_reserve(int32_t n_outputs) {
-     const auto & hparams = model.hparams;
-    const auto & vocab   = model.vocab;
- 
-     const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
- 
-     const auto n_batch = cparams.n_batch;
-    const auto n_vocab = vocab.n_tokens();
-+    const auto n_vocab = hparams.n_vocab;
-     const auto n_embd  = hparams.n_embd;
- 
-     // TODO: use a per-batch flag for logits presence instead
-@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
- void llama_context::output_reorder() {
-     auto & out_ids = sbatch.out_ids;
-     if (!out_ids.empty()) {
-        const uint32_t n_vocab = model.vocab.n_tokens();
-+        const uint32_t n_vocab = model.hparams.n_vocab;
-         const uint32_t n_embd  = model.hparams.n_embd;
- 
-         GGML_ASSERT((size_t) n_outputs == out_ids.size());
-@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
-     {
-         LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
- 
-        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
-+        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab);
- 
-         io.write(&logits_size, sizeof(logits_size));
- 
-@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() {
-         /*.offload_kqv                 =*/ true,
-         /*.flash_attn                  =*/ false,
-         /*.no_perf                     =*/ true,
-+        /*.cross_attn                  =*/ false,
-         /*.abort_callback              =*/ nullptr,
-         /*.abort_callback_data         =*/ nullptr,
-     };
-@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
-     ctx->set_warmup(warmup);
- }
- 
-+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
-+    ctx->set_cross_attn(cross_attention);
-+}
-+
- void llama_synchronize(llama_context * ctx) {
-     ctx->synchronize();
- }
-diff --git a/src/llama-context.h b/src/llama-context.h
-index 5457f077..a50c4afa 100644
--- a/src/llama-context.h
-+++ b/src/llama-context.h
-@@ -65,6 +65,7 @@ struct llama_context {
-     void set_embeddings (bool value);
-     void set_causal_attn(bool value);
-     void set_warmup(bool value);
-+    void set_cross_attn(bool value);
- 
-     void set_adapter_lora(
-             llama_adapter_lora * adapter,
-diff --git a/src/llama-cparams.h b/src/llama-cparams.h
-index 30e550f0..85ad91b9 100644
--- a/src/llama-cparams.h
-+++ b/src/llama-cparams.h
-@@ -29,6 +29,7 @@ struct llama_cparams {
-     bool offload_kqv;
-     bool flash_attn;
-     bool no_perf;
-+    bool cross_attn;
-     bool warmup;
- 
-     enum llama_pooling_type pooling_type;
-diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
-index fabb9ca2..b67216a4 100644
--- a/src/llama-graph.cpp
-+++ b/src/llama-graph.cpp
-@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
-     }
- }
- 
-+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
-+    if (ubatch->embd) {
-+        ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
-+    }
-+}
-+
- //
- // llm_graph_context
- //
-@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
-     return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
- }
- 
-+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
-+    const int64_t n_embd = hparams.n_embd;
-+
-+    auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
-+
-+    ggml_tensor * cur = nullptr;
-+
-+    inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
-+    ggml_set_input(inp->cross_attn_state);
-+
-+    cur = inp->cross_attn_state;
-+
-+    cb(cur, "inp_cross_attn_state", -1);
-+
-+    res->add_input(std::move(inp));
-+
-+    return cur;
-+}
-+
- ggml_tensor * llm_graph_context::build_attn(
-         llm_graph_input_attn_cross * inp,
-         ggml_cgraph * gf,
-diff --git a/src/llama-graph.h b/src/llama-graph.h
-index d0c8d321..0fe18150 100644
--- a/src/llama-graph.h
-+++ b/src/llama-graph.h
-@@ -86,6 +86,7 @@ public:
- 
-     ggml_tensor * tokens = nullptr; // I32 [n_batch]
-     ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
-+    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
- };
- 
- class llm_graph_input_pos : public llm_graph_input_i {
-@@ -283,6 +284,16 @@ public:
-     const llama_cross * cross = nullptr;
- };
- 
-+class llm_graph_input_cross_attn_state : public llm_graph_input_i {
-+public:
-+    llm_graph_input_cross_attn_state()          = default;
-+    virtual ~llm_graph_input_cross_attn_state() = default;
-+
-+    void set_input(const llama_ubatch * ubatch) override;
-+
-+    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
-+};
-+
- //
- // llm_graph_result
- //
-@@ -491,6 +502,7 @@ struct llm_graph_context {
-     ggml_tensor * build_inp_cls() const;
-     ggml_tensor * build_inp_s_copy() const;
-     ggml_tensor * build_inp_s_mask() const;
-+    ggml_tensor * build_inp_cross_attn_state() const;
- 
-     ggml_tensor * build_inp_cross_embd() const;
-     ggml_tensor * build_inp_pos_bucket_enc() const;
-diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index 8a667960..6a02de03 100644
--- a/src/llama-hparams.cpp
-+++ b/src/llama-hparams.cpp
-@@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const {
- 
-     GGML_ABORT("fatal error");
- }
-+
-+bool llama_hparams::cross_attention_layers(uint32_t il) const {
-+    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
-+}
-diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 48dce407..b6fc7e6d 100644
--- a/src/llama-hparams.h
-+++ b/src/llama-hparams.h
-@@ -2,6 +2,8 @@
- 
- #include "llama.h"
- 
-+#include <algorithm>
-+
- #include <array>
- 
- // bump if necessary
-@@ -42,6 +44,7 @@ struct llama_hparams {
-     uint32_t n_expert = 0;
-     uint32_t n_expert_used = 0;
-     uint32_t n_rel_attn_bkts = 0;
-+    uint32_t n_vocab = 0;
- 
-     // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
-     uint32_t n_embd_head_k_mla = 0;
-@@ -56,6 +59,7 @@ struct llama_hparams {
-     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
- 
-     std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
-+    std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;
- 
-     uint32_t n_layer_dense_lead = 0;
-     uint32_t n_lora_q           = 0;
-@@ -159,6 +163,9 @@ struct llama_hparams {
-     // Block skip connection
-     bool n_bskcn(uint32_t n, uint32_t il) const;
- 
-+    // cross attention layers
-+    bool cross_attention_layers(uint32_t il) const;
-+
-     bool is_swa(uint32_t il) const;
- };
- 
-diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
-index 7c9d46d8..69f8d35a 100644
--- a/src/llama-kv-cache.cpp
-+++ b/src/llama-kv-cache.cpp
-@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
-             return false;
-         }
- 
-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
-+        ggml_tensor * k, *v;
-+
-+        // for cross attention layers
-+        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
-+            k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
-+            v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
-+        } else {
-+            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-+            v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
-+        }
-         ggml_format_name(k, "cache_k_l%d", i);
-         ggml_format_name(v, "cache_v_l%d", i);
-         k_l.push_back(k);
-diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index a012aeae..2e11507d 100644
--- a/src/llama-model-loader.cpp
-+++ b/src/llama-model-loader.cpp
-@@ -315,6 +315,8 @@ namespace GGUFMeta {
-         return true;
-     }
- 
-+    template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
-+
-     template<typename T, size_t N_MAX>
-     bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
-         const int kid = gguf_find_key(meta.get(), key.c_str());
-diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 572378c9..9d099f11 100644
--- a/src/llama-model.cpp
-+++ b/src/llama-model.cpp
-@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
- 
-     // get general kv
-     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
-+    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
- 
-     // everything past this point is not vocab-related
-     if (hparams.vocab_only) {
-@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
-     ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
-     ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
-     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
-+    ml.get_key(LLM_KV_VOCAB_SIZE,        hparams.n_vocab,       false);
- 
-     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
-         ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
-@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
-     std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
-     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
-     std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
-+    std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
- 
-     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
-     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
-+    ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
- 
-     // n_head_kv is optional, default to n_head
-     hparams.n_head_kv_arr = hparams.n_head_arr;
-@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
- 
-         ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
- 
-        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
-+        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
-             if (hparams.n_rot != hparams.n_embd_head_k) {
-                 throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
-             }
-@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
-                     hparams.use_kq_norm = false;
-                 }
-             } break;
-+        case LLM_ARCH_MLLAMA:
-+            {
-+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-+
-+                switch (hparams.n_layer) {
-+                    case 40: type = LLM_TYPE_11B; break;
-+                    case 100: type = LLM_TYPE_90B; break;
-+                    default: type = LLM_TYPE_UNKNOWN;
-+                }
-+            } break;
-         case LLM_ARCH_DECI:
-             {
-                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
-         const int64_t n_embd_head_v = hparams.n_embd_head_v;
-         const int64_t n_ff          = hparams.n_ff();
-         const int64_t n_embd_gqa    = n_embd_v_gqa;
-        const int64_t n_vocab       = vocab.n_tokens();
-+        const int64_t n_vocab       = hparams.n_vocab;
-         const int64_t n_token_types = vocab.n_token_types();
-         const int64_t n_rot         = hparams.n_rot;
-         const int64_t n_expert      = hparams.n_expert;
-@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
-                         }
-                     }
-                 } break;
-+            case LLM_ARCH_MLLAMA:
-+                {
-+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
-+
-+                    // output
-+                    {
-+                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-+                        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-+
-+                        // if output is NULL, init from the input tok embed
-+                        if (output == NULL) {
-+                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-+                        }
-+                    }
-+
-+                    for (int i = 0; i < n_layer; ++i) {
-+                        auto & layer = layers[i];
-+
-+                        if (hparams.cross_attention_layers(i)) {
-+                            layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM,   "weight", i), {128}, 0);
-+                            layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ,   "weight", i), {n_embd, 1024}, 0);
-+                            layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ,   "weight", i), {n_embd, n_embd}, 0);
-+                            layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
-+                            layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
-+                            layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
-+                            layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
-+                            layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
-+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-+                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-+                        } else {
-+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
-+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
-+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-+                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-+                        }
-+                    }
-+                } break;
-             case LLM_ARCH_DECI:
-                 {
-                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
-     }
- };
- 
-+struct llm_build_mllama: public llm_graph_context {
-+    llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
-+        // mutable variable, needed during the last layer of the computation to skip unused tokens
-+        int32_t n_tokens = this->n_tokens;
-+
-+        const int64_t n_embd_head = hparams.n_embd_head_v;
-+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-+        GGML_ASSERT(n_embd_head == hparams.n_rot);
-+
-+        ggml_tensor * cur;
-+        ggml_tensor * inpL;
-+        ggml_tensor * inpCAS;
-+
-+        inpL = build_inp_embd(model.tok_embd);
-+        inpCAS = build_inp_cross_attn_state();
-+
-+          // inp_pos - contains the positions
-+        ggml_tensor * inp_pos = build_inp_pos();
-+
-+        auto * inp_attn = build_attn_inp_kv_unified();
-+        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
-+
-+        for (int il = 0; il < n_layer; ++il) {
-+            ggml_tensor * inpSA = inpL;
-+
-+            // norm
-+            cur = build_norm(inpL,
-+                    model.layers[il].attn_norm, NULL,
-+                    LLM_NORM_RMS, il);
-+            cb(cur, "attn_norm", il);
-+
-+            if (hparams.cross_attention_layers(il)) {
-+                if (!ubatch.embd && !cparams.cross_attn) {
-+                    continue;
-+                }
-+
-+                // cross attention layer
-+                ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
-+                cb(Qcur, "Qcur", il);
-+
-+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-+                cb(Qcur, "Qcur", il);
-+
-+                Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
-+                cb(Qcur, "Qcur", il);
-+
-+                Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
-+                cb(Qcur, "Qcur", il);
-+
-+                ggml_tensor * Kcur, * Vcur;
-+                if (ubatch.embd) {
-+                    Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
-+                    cb(Kcur, "Kcur", il);
-+
-+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
-+                    cb(Kcur, "Kcur", il);
-+
-+                    Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
-+                    cb(Kcur, "Kcur", il);
-+
-+                    Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
-+                    cb(Kcur, "Kcur", il);
-+
-+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
-+
-+                    Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
-+                    cb(Vcur, "Vcur", il);
-+
-+                    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
-+                    cb(Vcur, "Vcur", il);
-+
-+                    Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
-+                    cb(Vcur, "Vcur", il);
-+
-+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
-+                } else {
-+                    Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
-+                    cb(Kcur, "Kcur (view)", il);
-+
-+                    Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
-+                    cb(Vcur, "Vcur (view)", il);
-+                }
-+
-+                struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
-+                cb(kq, "kq", il);
-+
-+                // TODO: apply causal masks
-+                struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
-+                cb(kq_soft_max, "kq_soft_max", il);
-+
-+                Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
-+                cb(Vcur, "Vcur", il);
-+
-+                struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
-+                cb(kqv, "kqv", il);
-+
-+                struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
-+                cb(kqv_merged, "kqv_merged", il);
-+
-+                cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
-+                cb(cur, "kqv_merged_cont", il);
-+
-+                cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
-+                cb(cur, "cur", il);
-+
-+                // TODO: do this in place once?
-+                cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
-+
-+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-+                cb(ffn_inp, "ffn_inp", il);
-+
-+                // feed-forward network
-+                cur = build_norm(ffn_inp,
-+                        model.layers[il].ffn_norm, NULL,
-+                        LLM_NORM_RMS, il);
-+                cb(cur, "ffn_norm", il);
-+
-+                cur = build_ffn(cur,
-+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
-+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-+                        NULL,
-+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
-+                cb(cur, "ffn_out", il);
-+
-+                // TODO: do this inplace once?
-+                cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
-+                cb(cur, "ffn_out", il);
-+
-+                cur = build_cvec(cur, il);
-+                cb(cur, "l_out", il);
-+
-+                // input for next layer
-+                inpL = cur;
-+            } else {
-+                // self attention layer
-+
-+                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
-+
-+                // compute Q and K and RoPE them
-+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-+                cb(Qcur, "Qcur", il);
-+                if (model.layers[il].bq) {
-+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-+                    cb(Qcur, "Qcur", il);
-+                }
-+
-+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-+                cb(Kcur, "Kcur", il);
-+                if (model.layers[il].bk) {
-+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-+                    cb(Kcur, "Kcur", il);
-+                }
-+
-+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-+                cb(Vcur, "Vcur", il);
-+                if (model.layers[il].bv) {
-+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-+                    cb(Vcur, "Vcur", il);
-+                }
-+
-+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-+
-+                Qcur = ggml_rope_ext(
-+                        ctx0, Qcur, inp_pos, rope_factors,
-+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                        ext_factor, attn_factor, beta_fast, beta_slow
-+                        );
-+
-+                Kcur = ggml_rope_ext(
-+                        ctx0, Kcur, inp_pos, rope_factors,
-+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                        ext_factor, attn_factor, beta_fast, beta_slow
-+                        );
-+
-+                cb(Qcur, "Qcur", il);
-+                cb(Kcur, "Kcur", il);
-+                cb(Vcur, "Vcur", il);
-+
-+                cur = build_attn(inp_attn, gf,
-+                    model.layers[il].wo, model.layers[il].bo,
-+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-+
-+                if (il == n_layer - 1) {
-+                    // skip computing output for unused tokens
-+                    struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-+                    n_tokens = n_outputs;
-+                    cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-+                    inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-+                }
-+
-+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-+                cb(ffn_inp, "ffn_inp", il);
-+
-+                // feed-forward network
-+                cur = build_norm(ffn_inp,
-+                        model.layers[il].ffn_norm, NULL,
-+                        LLM_NORM_RMS, il);
-+                cb(cur, "ffn_norm", il);
-+
-+                cur = build_ffn(cur,
-+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
-+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-+                        NULL,
-+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
-+                cb(cur, "ffn_out", il);
-+
-+                cur = ggml_add(ctx0, cur, ffn_inp);
-+                cb(cur, "ffn_out", il);
-+
-+                cur = build_cvec(cur, il);
-+                cb(cur, "l_out", il);
-+
-+                // input for next layer
-+                inpL = cur;
-+            }
-+        }
-+
-+        cur = inpL;
-+
-+        cur = build_norm(cur,
-+                model.output_norm, NULL,
-+                LLM_NORM_RMS, -1);
-+        cb(cur, "result_norm", -1);
-+        res->t_embd = cur;
-+
-+        // lm_head
-+        cur = build_lora_mm(model.output, cur);
-+
-+        cb(cur, "result_output", -1);
-+        res->t_logits = cur;
-+
-+        ggml_build_forward_expand(gf, cur);
-+    }
-+};
-+
- struct llm_build_deci : public llm_graph_context {
-     llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
-         const int64_t n_embd_head = hparams.n_embd_head_v;
-@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
-             {
-                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
-             } break;
-+        case LLM_ARCH_MLLAMA:
-+            {
-+                llm = std::make_unique<llm_build_mllama>(*this, params, gf);
-+            } break;
-         case LLM_ARCH_DECI:
-             {
-                 llm = std::make_unique<llm_build_deci>(*this, params, gf);
-@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
-         // use what we call a normal RoPE, operating on pairs of consecutive head values
-         case LLM_ARCH_LLAMA:
-         case LLM_ARCH_LLAMA4:
-+        case LLM_ARCH_MLLAMA:
-         case LLM_ARCH_DECI:
-         case LLM_ARCH_BAICHUAN:
-         case LLM_ARCH_STARCODER:
-diff --git a/src/llama-model.h b/src/llama-model.h
-index 856e6042..6be91282 100644
--- a/src/llama-model.h
-+++ b/src/llama-model.h
-@@ -11,6 +11,7 @@
- #include <string>
- #include <unordered_map>
- #include <vector>
-+#include <stdexcept>
- 
- struct llama_cparams;
- struct llama_ubatch;
-@@ -73,6 +74,7 @@ enum llm_type {
-     LLM_TYPE_40B,
-     LLM_TYPE_65B,
-     LLM_TYPE_70B,
-+    LLM_TYPE_90B,
-     LLM_TYPE_236B,
-     LLM_TYPE_290B,
-     LLM_TYPE_314B,
-@@ -314,6 +316,16 @@ struct llama_layer {
- 
-     struct ggml_tensor * bskcn_tv = nullptr;
- 
-+    // cross attention
-+    struct ggml_tensor * cross_attn_k_norm = nullptr;
-+    struct ggml_tensor * cross_attn_k_proj = nullptr;
-+    struct ggml_tensor * cross_attn_o_proj = nullptr;
-+    struct ggml_tensor * cross_attn_q_norm = nullptr;
-+    struct ggml_tensor * cross_attn_q_proj = nullptr;
-+    struct ggml_tensor * cross_attn_v_proj = nullptr;
-+    struct ggml_tensor * cross_attn_attn_gate = nullptr;
-+    struct ggml_tensor * cross_attn_mlp_gate = nullptr;
-+
-     struct llama_layer_posnet posnet;
- 
-     struct llama_layer_convnext convnext;
-diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
-index 7dc54227..223e1f3f 100644
--- a/src/llama-quant.cpp
-+++ b/src/llama-quant.cpp
-@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
-         if (llama_model_has_encoder(&model)) {
-             n_attn_layer *= 3;
-         }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
-+        if (qs.n_attention_wv != n_attn_layer) {
-+            LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
-+        }
-     }
- 
-     size_t total_size_org = 0;
--- a/llama/patches/0008-fix-deepseek-deseret-regex.patch
+++ b/llama/patches/0008-fix-deepseek-deseret-regex.patch
@@ -12,10 +12,10 @@ regex
 2 files changed, 22 insertions(+), 1 deletion(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index a9ee9f03..1306864e 100644
+index 806c1b3d..10f34d33 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
+@@ -298,7 +298,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                 regex_exprs = {
                     "[\r\n]",

--- a/llama/patches/0007-add-unpad-operator.patch
+++ b/llama/patches/0007-add-unpad-operator.patch
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: jmorganca <jmorganca@gmail.com>
-Date: Sun, 13 Apr 2025 22:10:06 -0400
-Subject: [PATCH] add unpad operator
-
-adds the unpad operator to GGML
---
- ggml/include/ggml.h                  | 10 +++++
- ggml/src/ggml-cpu/ggml-cpu.c         |  5 +++
- ggml/src/ggml-cpu/ops.cpp            | 55 ++++++++++++++++++++++++++++
- ggml/src/ggml-cpu/ops.h              |  1 +
- ggml/src/ggml-cuda/ggml-cuda.cu      |  4 ++
- ggml/src/ggml-cuda/pad.cu            | 46 +++++++++++++++++++++++
- ggml/src/ggml-cuda/pad.cuh           |  1 +
- ggml/src/ggml-metal/ggml-metal.m     | 33 +++++++++++++++++
- ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++
- ggml/src/ggml.c                      | 25 ++++++++++++-
- 10 files changed, 223 insertions(+), 2 deletions(-)
-
-diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
-index 1b8603e7..53ef31b2 100644
--- a/ggml/include/ggml.h
-+++ b/ggml/include/ggml.h
-@@ -489,6 +489,7 @@ extern "C" {
-         GGML_OP_UPSCALE, // nearest interpolate
-         GGML_OP_PAD,
-         GGML_OP_PAD_REFLECT_1D,
-+        GGML_OP_UNPAD,
-         GGML_OP_ARANGE,
-         GGML_OP_TIMESTEP_EMBEDDING,
-         GGML_OP_ARGSORT,
-@@ -1777,6 +1778,15 @@ extern "C" {
-             int                   p0,
-             int                   p1);
- 
-+    // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
-+    GGML_API struct ggml_tensor * ggml_unpad(
-+            struct ggml_context * ctx,
-+            struct ggml_tensor  * a,
-+            int                  p0,
-+            int                  p1,
-+            int                  p2,
-+            int                  p3);
-+
-     // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
-     // timesteps: [N,]
-     // return: [N, dim]
-diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 64405449..34624cca 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
-+++ b/ggml/src/ggml-cpu/ggml-cpu.c
-@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
-             {
-                 ggml_compute_forward_pad_reflect_1d(params, tensor);
-             } break;
-+        case GGML_OP_UNPAD:
-+            {
-+                ggml_compute_forward_unpad(params, tensor);
-+            } break;
-         case GGML_OP_ARANGE:
-             {
-                 ggml_compute_forward_arange(params, tensor);
-@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
-         case GGML_OP_UPSCALE:
-         case GGML_OP_PAD:
-         case GGML_OP_PAD_REFLECT_1D:
-+        case GGML_OP_UNPAD:
-         case GGML_OP_ARANGE:
-         case GGML_OP_TIMESTEP_EMBEDDING:
-         case GGML_OP_ARGSORT:
-diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
-index 7413192b..becdae07 100644
--- a/ggml/src/ggml-cpu/ops.cpp
-+++ b/ggml/src/ggml-cpu/ops.cpp
-@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
-     }
- }
- 
-+// ggml_compute_forward_unpad
-+
-+static void ggml_compute_forward_unpad_f32(
-+    const struct ggml_compute_params *params,
-+    struct ggml_tensor *dst) {
-+
-+    const struct ggml_tensor * src0 = dst->src[0];
-+
-+    GGML_ASSERT(src0->nb[0] == sizeof(float));
-+    GGML_ASSERT( dst->nb[0] == sizeof(float));
-+
-+    const int ith = params->ith;
-+    const int nth = params->nth;
-+
-+    GGML_TENSOR_UNARY_OP_LOCALS
-+
-+    float * dst_ptr = (float *) dst->data;
-+
-+    // TODO: optimize
-+
-+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
-+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
-+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-+
-+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-+
-+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
-+                        dst_ptr[dst_idx] = *src_ptr;
-+                    }
-+                }
-+            }
-+        }
-+    }
-+}
-+
-+void ggml_compute_forward_unpad(
-+    const struct ggml_compute_params * params,
-+    struct ggml_tensor * dst) {
-+
-+    const struct ggml_tensor * src0 = dst->src[0];
-+
-+    switch (src0->type) {
-+        case GGML_TYPE_F32:
-+            {
-+                ggml_compute_forward_unpad_f32(params, dst);
-+            } break;
-+        default:
-+            {
-+                GGML_ABORT("fatal error");
-+            }
-+    }
-+}
-+
- // ggml_compute_forward_arange
- 
- static void ggml_compute_forward_arange_f32(
-diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
-index dc081b9e..a7125555 100644
--- a/ggml/src/ggml-cpu/ops.h
-+++ b/ggml/src/ggml-cpu/ops.h
-@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
- void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
- void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
- void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
- void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
- void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
- void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 04ce764e..491acccb 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
-         case GGML_OP_PAD:
-             ggml_cuda_op_pad(ctx, dst);
-             break;
-+        case GGML_OP_UNPAD:
-+            ggml_cuda_op_unpad(ctx, dst);
-+            break;
-         case GGML_OP_ARANGE:
-             ggml_cuda_op_arange(ctx, dst);
-             break;
-@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
-         case GGML_OP_UPSCALE:
-             return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
-         case GGML_OP_PAD:
-+        case GGML_OP_UNPAD:
-         case GGML_OP_ARANGE:
-         case GGML_OP_TIMESTEP_EMBEDDING:
-         case GGML_OP_LEAKY_RELU:
-diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
-index 77432b04..7d45a7e1 100644
--- a/ggml/src/ggml-cuda/pad.cu
-+++ b/ggml/src/ggml-cuda/pad.cu
-@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
- }
-+
-+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
-+    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
-+    // blockIdx.y: idx of ne1
-+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-+    if (nidx >= ne0) {
-+        return;
-+    }
-+
-+    // operation
-+    int offset_dst =
-+        nidx +
-+        blockIdx.y * ne0 +
-+        blockIdx.z * ne0 * gridDim.y;
-+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
-+        int offset_src =
-+            nidx +
-+            blockIdx.y * ne00 +
-+            blockIdx.z * ne00 * ne01;
-+        dst[offset_dst] = x[offset_src];
-+    }
-+}
-+
-+static void unpad_f32_cuda(const float * x, float * dst,
-+    const int ne00, const int ne01, const int ne02, const int ne03,
-+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-+    dim3 gridDim(num_blocks, ne1, ne2*ne3);
-+    unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
-+}
-+
-+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-+    const ggml_tensor * src0 = dst->src[0];
-+    const float * src0_d = (const float *)src0->data;
-+    float * dst_d = (float *)dst->data;
-+    cudaStream_t stream = ctx.stream();
-+
-+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-+
-+    unpad_f32_cuda(src0_d, dst_d,
-+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
-+}
-\ No newline at end of file
-diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
-index 8fd386b0..e2ededc3 100644
--- a/ggml/src/ggml-cuda/pad.cuh
-+++ b/ggml/src/ggml-cuda/pad.cuh
-@@ -3,3 +3,4 @@
- #define CUDA_PAD_BLOCK_SIZE 256
- 
- void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 425524d0..112abef6 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
-+++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
-     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
-     GGML_METAL_KERNEL_TYPE_PAD_F32,
-     GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
-+    GGML_METAL_KERNEL_TYPE_UNPAD_F32,
-     GGML_METAL_KERNEL_TYPE_ARANGE_F32,
-     GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
-     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
-@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                     upscale_f32,                     true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                         pad_f32,                         true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,              pad_reflect_1d_f32,              true);
-+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32,                       unpad_f32,                       true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,          timestep_embedding_f32,          true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32,                      arange_f32,                      true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,             argsort_f32_i32_asc,             true);
-@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
-         case GGML_OP_POOL_2D:
-         case GGML_OP_PAD:
-         case GGML_OP_PAD_REFLECT_1D:
-+        case GGML_OP_UNPAD:
-         case GGML_OP_TIMESTEP_EMBEDDING:
-         case GGML_OP_ARGSORT:
-         case GGML_OP_LEAKY_RELU:
-@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
- 
-                 const int nth = MIN(1024, ne0);
- 
-+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-+            } break;
-+        case GGML_OP_UNPAD:
-+            {
-+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
-+
-+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
-+
-+                [encoder setComputePipelineState:pipeline];
-+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
-+
-+                const int nth = MIN(1024, ne0);
-+
-                 [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-             } break;
-         case GGML_OP_ARANGE:
-diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index 9f4147e9..6ceb3cef 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
-+++ b/ggml/src/ggml-metal/ggml-metal.metal
-@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
-     }
- }
- 
-+kernel void kernel_unpad_f32(
-+    device  const char * src0,
-+    device        char * dst,
-+    constant   int64_t & ne00,
-+    constant   int64_t & ne01,
-+    constant   int64_t & ne02,
-+    constant   int64_t & ne03,
-+    constant  uint64_t & nb00,
-+    constant  uint64_t & nb01,
-+    constant  uint64_t & nb02,
-+    constant  uint64_t & nb03,
-+    constant   int64_t & ne0,
-+    constant   int64_t & ne1,
-+    constant   int64_t & ne2,
-+    constant   int64_t & ne3,
-+    constant  uint64_t & nb0,
-+    constant  uint64_t & nb1,
-+    constant  uint64_t & nb2,
-+    constant  uint64_t & nb3,
-+    uint3 tgpig[[threadgroup_position_in_grid]],
-+    uint3 tpitg[[thread_position_in_threadgroup]],
-+    uint3   ntg[[threads_per_threadgroup]]) {
-+
-+    const int64_t i3 = tgpig.z;
-+    const int64_t i2 = tgpig.y;
-+    const int64_t i1 = tgpig.x;
-+
-+    const int64_t i03 = i3;
-+    const int64_t i02 = i2;
-+    const int64_t i01 = i1;
-+
-+    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
-+    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
-+
-+    if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
-+        for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-+            if (i0 < ne00) {
-+                dst_ptr[i0] = src0_ptr[i0];
-+            }
-+        }
-+
-+        return;
-+    }
-+}
-+
- kernel void kernel_arange_f32(
-     device        char * dst,
-     constant   ggml_metal_kargs_arange & args,
-diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
-index 7654ae17..3c57aff8 100644
--- a/ggml/src/ggml.c
-+++ b/ggml/src/ggml.c
-@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
-     "UPSCALE",
-     "PAD",
-     "PAD_REFLECT_1D",
-+    "UNPAD",
-     "ARANGE",
-     "TIMESTEP_EMBEDDING",
-     "ARGSORT",
-@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
-     "OPT_STEP_ADAMW",
- };
- 
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
- 
- static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
-     "none",
-@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
-     "upscale(x)",
-     "pad(x)",
-     "pad_reflect_1d(x)",
-+    "unpad(x)",
-     "arange(start, stop, step)",
-     "timestep_embedding(timesteps, dim, max_period)",
-     "argsort(x)",
-@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
-     "adamw(x)",
- };
- 
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
- 
- static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
- 
-@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
-     return result;
- }
- 
-+// ggml_unpad
-+
-+struct ggml_tensor * ggml_unpad(
-+    struct ggml_context * ctx,
-+    struct ggml_tensor  * a,
-+    int p0, int p1, int p2, int p3) {
-+
-+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-+            a->ne[0] - p0,
-+            a->ne[1] - p1,
-+            a->ne[2] - p2,
-+            a->ne[3] - p3);
-+
-+    result->op = GGML_OP_UNPAD;
-+    result->src[0] = a;
-+
-+    return result;
-+}
-+
- // ggml_arange
- 
- struct ggml_tensor * ggml_arange(
--- a/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch
+++ b/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch
--- a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch
+++ b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch
@@ -15,33 +15,139 @@ but this can leave a cache that still does not have adequate space
 even after defragmentation is triggered. Instead, we should do
 multiple batches of processing until everything is complete.
 ---
- src/llama-context.cpp  | 105 +++++++++++++----------------------------
- src/llama-context.h    |   4 +-
- src/llama-kv-cache.cpp |  39 +++------------
- src/llama-kv-cache.h   |   9 +++-
- 4 files changed, 51 insertions(+), 106 deletions(-)
+ src/llama-context.cpp  |  18 ++++---
+ src/llama-context.h    |   1 +
+ src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
+ src/llama-kv-cache.h   |  12 ++++-
+ 4 files changed, 59 insertions(+), 79 deletions(-)

 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index cd06ad91..77177c5e 100644
+index c22687e4..c5948e8f 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
+@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
 
- llm_graph_result_ptr llama_context::build_kv_self_defrag(
-         ggml_context * ctx0,
-        ggml_cgraph * gf) const {
-+        ggml_cgraph * gf,
-+        const std::vector<struct llama_kv_defrag_move> & moves) const {
-     auto res = std::make_unique<llm_graph_result>();
+         // find KV slot
+         if (!kv_self->find_slot(ubatch)) {
+-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-            return 1;
+            kv_self->defrag_sched(-1.0f);
+            kv_self->update(*this);
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                return 1;
+            }
+         }
+ 
+         ggml_backend_sched_reset(sched.get());
+@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
+ 
+             // TODO: not sure if this is needed
+             if (!kv_self->find_slot(ubatch)) {
+-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-                GGML_ABORT("TODO: handle this error");
+                kv_self->defrag_sched(-1.0f);
+                kv_self->update(*this);
+                if (!kv_self->find_slot(ubatch)) {
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                    GGML_ABORT("TODO: handle this error");
+                }
+             }
+ 
+             auto * gf = graph_init();
+diff --git a/src/llama-context.h b/src/llama-context.h
+index c0ceacb1..0264e937 100644
+--- a/src/llama-context.h
+++ b/src/llama-context.h
+@@ -5,6 +5,7 @@
+ #include "llama-cparams.h"
+ #include "llama-graph.h"
+ #include "llama-adapter.h"
+#include "llama-kv-cache.h"
 
-     const auto & hparams = model.hparams;
+ #include "ggml-cpp.h"
+ #include "ggml-opt.h"
+diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
+index 3dcad65b..60e67b03 100644
+--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
+@@ -364,8 +364,6 @@ void llama_kv_cache_unified::commit() {
+ }
+ 
+ bool llama_kv_cache_unified::update(llama_context & lctx) {
+-    bool need_reserve = false;
+-
+     auto * sched = lctx.get_sched();
+ 
+     if (has_shift) {
+@@ -388,8 +386,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
+             res->set_inputs(nullptr);
 
-    const auto & ids = kv_self->defrag_info.ids;
+             lctx.graph_compute(gf, false);
+-
+-            need_reserve = true;
+         }
+ 
+         {
+@@ -403,27 +399,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
+ 
+     if (do_defrag) {
+         LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+        const uint32_t n_max_nodes = lctx.graph_max_nodes();
+        const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+        if (!defrag_prepare(n_max_nodes)) {
+            LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+            return false;
+        }
+
+        for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
+            std::vector<struct llama_kv_defrag_move> chunk;
+            auto end = std::min(i + max_moves, defrag_info.moves.size());
+            chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end);
+ 
+-        if (defrag_prepare(lctx.graph_max_nodes())) {
+             ggml_backend_sched_reset(sched);
+ 
+             auto * gf = lctx.graph_init();
+ 
+-            auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
+            auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk);
+ 
+             ggml_backend_sched_alloc_graph(sched, gf);
+ 
+             res->set_inputs(nullptr);
+ 
+             lctx.graph_compute(gf, false);
+-
+-            need_reserve = true;
+         }
+ 
+         do_defrag = false;
+     }
+ 
+-    return need_reserve;
+    // we never need to reserve a worst case graph
+    return false;
+ }
+ 
+ void llama_kv_cache_unified::defrag_sched(float thold) {
+@@ -707,11 +712,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
+ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
+         const llama_cparams & cparams,
+                ggml_context * ctx,
+-                ggml_cgraph * gf) const {
+                ggml_cgraph * gf,
+                const std::vector<struct llama_kv_defrag_move> & moves) const {
+     auto res = std::make_unique<llm_graph_result>();
+ 
+-    const auto & ids = defrag_info.ids;
 -
 #if 0
     // CPU defrag
     //
-@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+@@ -783,32 +787,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
         ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
     }
 #else
@@ -63,188 +169,63 @@ index cd06ad91..77177c5e 100644
             const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
             const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
 
-             ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
+             ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
 -                    n_embd_k_gqa, nm,
 +                    n_embd_k_gqa, move.len,
-                     ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
-+                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src));
+                     ggml_row_size(k_l[il]->type, n_embd_k_gqa),
+-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
+                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src));
 
-             ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
+             ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
 -                    n_embd_k_gqa, nm,
 +                    n_embd_k_gqa, move.len,
-                     ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
-+                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst));
+                     ggml_row_size(k_l[il]->type, n_embd_k_gqa),
+-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
+                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst));
 
             ggml_tensor * view_v_src;
             ggml_tensor * view_v_dst;
-@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+@@ -816,31 +808,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
             if (cparams.flash_attn) {
                 // NOTE: the V cache is not transposed when using flash attention
-                 view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
+                 view_v_src = ggml_view_2d(ctx, v_l[il],
 -                        n_embd_v_gqa, nm,
 +                        n_embd_v_gqa, move.len,
-                         ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
-+                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src));
+                         ggml_row_size(v_l[il]->type, n_embd_v_gqa),
+-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
+                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.src));
 
-                 view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
+                 view_v_dst = ggml_view_2d(ctx, v_l[il],
 -                        n_embd_v_gqa, nm,
-+                        n_embd_v_gqa, move.len,
-                         ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
-+                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst));
+                        n_embd_v_gqa, move.len,
+                         ggml_row_size(v_l[il]->type, n_embd_v_gqa),
+-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
+                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst));
             } else {
-                 view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
+                 view_v_src = ggml_view_2d(ctx, v_l[il],
 -                        nm, n_embd_v_gqa,
 +                        move.len, n_embd_v_gqa,
-                         ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
-                        ggml_row_size(kv_self->v_l[il]->type, i));
-+                        ggml_row_size(kv_self->v_l[il]->type, move.src));
+                         ggml_row_size(v_l[il]->type, size),
+-                        ggml_row_size(v_l[il]->type, i));
+                        ggml_row_size(v_l[il]->type, move.src));
 
-                 view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
+                 view_v_dst = ggml_view_2d(ctx, v_l[il],
 -                        nm, n_embd_v_gqa,
 +                        move.len, n_embd_v_gqa,
-                         ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
-                        ggml_row_size(kv_self->v_l[il]->type, id));
-+                        ggml_row_size(kv_self->v_l[il]->type, move.dst));
+                         ggml_row_size(v_l[il]->type, size),
+-                        ggml_row_size(v_l[il]->type, id));
+                        ggml_row_size(v_l[il]->type, move.dst));
             }
 
-             ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
-             ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+             ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
+             ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
         }
 -
 -        i += nm - 1;
     }
-
-    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
- #endif
 
-     return res;
-@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
- void llama_context::kv_self_update() {
-     auto & kv = kv_self;
- 
-    bool need_reserve = false;
-
-     if (kv->has_shift) {
-         if (!kv->get_can_shift()) {
-             GGML_ABORT("The current context does not support K-shift");
-@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
-             res->set_inputs(nullptr);
- 
-             graph_compute(gf, false);
-
-            need_reserve = true;
-         }
- 
-         {
-@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
-     // defragment the KV cache if needed
-     if (kv->do_defrag) {
-         LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
-+        const uint32_t n_max_nodes = graph_max_nodes();
-+        const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
-+        if (!kv->defrag_prepare(n_max_nodes)) {
-+            LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
-+            return;
-+        }
- 
-        if (kv->defrag_prepare(graph_max_nodes())) {
-            ggml_backend_sched_reset(sched.get());
-+        for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
-+            std::vector<struct llama_kv_defrag_move> chunk;
-+            auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
-+            chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
- 
-+            ggml_backend_sched_reset(sched.get());
-             auto * gf = graph_init();
-
-            auto res = build_kv_self_defrag(ctx_compute.get(), gf);
-
-+            auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
-             ggml_backend_sched_alloc_graph(sched.get(), gf);
-
-             res->set_inputs(nullptr);
-
-             graph_compute(gf, false);
-
-            need_reserve = true;
-         }
- 
-         kv->do_defrag = false;
-     }
-
-    // reserve a worst case graph if needed
-    if (need_reserve) {
-        LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
-
-        // build worst-case graph
-        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
-        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
-        // simulate full KV cache
-        kv_self->n = kv_self->size;
-
-        llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-        llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
-        auto * gf = graph_init();
-        graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
-
-        // initialize scheduler with the worst-case graph
-        ggml_backend_sched_reset(sched.get());
-        if (!ggml_backend_sched_reserve(sched.get(), gf)) {
-            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
-        }
-    }
- }
- 
- enum llama_pooling_type llama_context::pooling_type() const {
-@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
-         // find KV slot
-         {
-             if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-                return 1;
-+                kv_self->defrag();
-+                kv_self_update();
-+                if (!kv_self->find_slot(ubatch)) {
-+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-+                    return 1;
-+                }
-             }
- 
-             if (!kv_self->recurrent) {
-diff --git a/src/llama-context.h b/src/llama-context.h
-index a50c4afa..30f84bfd 100644
--- a/src/llama-context.h
-+++ b/src/llama-context.h
-@@ -5,6 +5,7 @@
- #include "llama-cparams.h"
- #include "llama-graph.h"
- #include "llama-adapter.h"
-+#include "llama-kv-cache.h"
- 
- #include "ggml-cpp.h"
- 
-@@ -179,7 +180,8 @@ private:
- 
-     llm_graph_result_ptr build_kv_self_defrag(
-             ggml_context * ctx0,
-            ggml_cgraph * gf) const;
-+            ggml_cgraph * gf,
-+            const std::vector<struct llama_kv_defrag_move> & moves) const;
- 
-     // TODO: read/write lora adapters and cvec
-     size_t state_write_data(llama_io_write_i & io);
-diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
-index 69f8d35a..35a750d3 100644
--- a/src/llama-kv-cache.cpp
-+++ b/src/llama-kv-cache.cpp
-@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+     //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+@@ -857,17 +847,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
 
     assert(n_used <= n_kv);
 
@@ -263,7 +244,7 @@ index 69f8d35a..35a750d3 100644
 
     // determine which KV cells to move where
     //
-@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+@@ -875,10 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
     //
     //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
     //
@@ -275,7 +256,7 @@ index 69f8d35a..35a750d3 100644
 
     for (uint32_t i0 = 0; i0 < n_used; ++i0) {
         const auto & cell0 = cells[i0];
-@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+@@ -927,19 +904,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
         // are we moving a continuous block of memory?
         bool cont = false;
 
@@ -295,7 +276,7 @@ index 69f8d35a..35a750d3 100644
                 cont = false;
                 continue;
             }
-@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+@@ -955,8 +924,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
             head = n_used;
 
             if (!cont) {
@@ -307,7 +288,7 @@ index 69f8d35a..35a750d3 100644
             }
 
             nf++;
-@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+@@ -966,22 +937,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
             }
         }
 
@@ -325,37 +306,47 @@ index 69f8d35a..35a750d3 100644
         return false;
     }
 
-    LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
+-    LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
 -
-    LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
+-    LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
 +    // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
 
     return true;
 }
 diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
-index 56c74035..25cbcb56 100644
+index bf3b4b6a..928b9712 100644
 --- a/src/llama-kv-cache.h
 +++ b/src/llama-kv-cache.h
-@@ -43,6 +43,13 @@ private:
+@@ -82,6 +82,13 @@ struct llama_kv_cache_guard {
+ private:
     llama_kv_cache * kv;
 };
- 
+ 
 +// block of KV slots to move when defragging
 +struct llama_kv_defrag_move {
 +    uint32_t src;
 +    uint32_t dst;
 +    uint32_t len;
 +};
-+
- struct llama_kv_cell {
-     llama_pos pos   = -1;
-     llama_pos delta =  0;
-@@ -131,7 +138,7 @@ public:
-     // defrag
 
+ //
+ // llama_kv_cache_unified
+@@ -207,7 +214,7 @@ private:
+ 
+     // defrag
     struct {
 -        std::vector<uint32_t> ids;
 +        std::vector<llama_kv_defrag_move> moves;
     } defrag_info;
 
     // return true if cells have been moved
+@@ -249,7 +256,8 @@ private:
+     llm_graph_result_ptr build_graph_defrag(
+             const llama_cparams & cparams,
+                    ggml_context * ctx,
+-                    ggml_cgraph * gf) const;
+                    ggml_cgraph * gf,
+                    const std::vector<llama_kv_defrag_move> & moves) const;
+ 
+     void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+     void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
--- a/llama/patches/0011-sort-devices-by-score.patch
+++ b/llama/patches/0011-sort-devices-by-score.patch
@@ -11,7 +11,7 @@ with the fastest acceleration is loaded
 1 file changed, 13 insertions(+), 8 deletions(-)

 diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 82ae1b5b..1487f322 100644
+index 405d8e31..4e67d243 100644
 --- a/ggml/src/ggml-backend-reg.cpp
 +++ b/ggml/src/ggml-backend-reg.cpp
 @@ -157,7 +157,7 @@ struct ggml_backend_reg_entry {

--- a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
+++ b/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
 1 file changed, 2 insertions(+)

 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 43d9fc4f..4c0d3824 100644
+index ddea5ad3..45918bf6 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
 @@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)

--- a/llama/patches/0013-remove-amx.patch
+++ b/llama/patches/0013-remove-amx.patch
@@ -9,7 +9,7 @@ disable amx as it reduces performance on some systems
 1 file changed, 4 deletions(-)

 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 4c0d3824..79c26312 100644
+index 45918bf6..0beaed86 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
 @@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)

--- a/llama/patches/0014-fix-string-arr-kv-loading.patch
+++ b/llama/patches/0014-fix-string-arr-kv-loading.patch
@@ -9,8 +9,8 @@ such as vocab fields
 ---
 ggml/include/gguf.h | 1 +
 ggml/src/gguf.cpp   | 7 +++++--
- src/llama-vocab.cpp | 2 +-
- 3 files changed, 7 insertions(+), 3 deletions(-)
+ src/llama-vocab.cpp | 4 +---
+ 3 files changed, 7 insertions(+), 5 deletions(-)

 diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
 index 79ee2020..3efb22f0 100644
@@ -53,15 +53,17 @@ index 381a9c7d..e45b453d 100644
 }
 
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 1306864e..d6515ff6 100644
+index 10f34d33..9f5fd57b 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
- 
+@@ -1469,9 +1469,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
             if (precompiled_charsmap_keyidx != -1) {
-                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
-+                size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
+                 const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
+-                GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+-
+-                const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+                const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
                 const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                 precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #ifdef IS_BIG_ENDIAN
--- a/llama/patches/0015-ollama-debug-tensor.patch
+++ b/llama/patches/0015-ollama-debug-tensor.patch
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
 1 file changed, 6 insertions(+)

 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 34624cca..59bd3c62 100644
+index a30e67f2..2462d2b8 100644
 --- a/ggml/src/ggml-cpu/ggml-cpu.c
 +++ b/ggml/src/ggml-cpu/ggml-cpu.c
 @@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index 34624cca..59bd3c62 100644
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         ggml_compute_forward(&params, node);
 

--- a/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch
+++ b/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch
@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644
                       const char * grammar_root,
                               bool lazy,
 diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
-index c0a5f934..75731053 100644
+index 804b11e0..15a10ca8 100644
 --- a/src/llama-sampling.cpp
 +++ b/src/llama-sampling.cpp
 @@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {

--- a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
+++ b/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <git@mxy.ng>
+Date: Thu, 1 May 2025 13:45:12 -0700
+Subject: [PATCH] add argsort and cuda copy for i32
+
+---
+ ggml/src/ggml-cpu/ops.cpp     |  43 ++++++++++++++
+ ggml/src/ggml-cuda/argsort.cu | 102 +++++++++++++++++++++++++++++++++-
+ ggml/src/ggml-cuda/cpy.cu     |  49 ++++++++++++++++
+ 3 files changed, 192 insertions(+), 2 deletions(-)
+
+diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
+index becdae07..7a44b6cf 100644
+--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
+@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32(
+     }
+ }
+ 
+static void ggml_compute_forward_argsort_i32(
+    const ggml_compute_params * params,
+    ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(nb0 == sizeof(int32_t));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
+
+    for (int64_t i = ith; i < nr; i += nth) {
+        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+        const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
+
+        for (int64_t j = 0; j < ne0; j++) {
+            dst_data[j] = j;
+        }
+
+        // C doesn't have a functional sort, so we do a bubble sort instead
+        for (int64_t j = 0; j < ne0; j++) {
+            for (int64_t k = j + 1; k < ne0; k++) {
+                if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
+                    (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
+                    int32_t tmp = dst_data[j];
+                    dst_data[j] = dst_data[k];
+                    dst_data[k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+ void ggml_compute_forward_argsort(
+     const ggml_compute_params * params,
+     ggml_tensor * dst) {
+@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort(
+             {
+                 ggml_compute_forward_argsort_f32(params, dst);
+             } break;
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_argsort_i32(params, dst);
+            } break;
+         default:
+             {
+                 GGML_ABORT("fatal error");
+diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
+index 607ded85..53b02634 100644
+--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
+@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
+     }
+ }
+ 
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
+    extern __shared__ int shared_mem[];
+    int * indices = shared_mem;
+
+    const int tid = threadIdx.x;
+    const int row = blockIdx.y;
+
+    // Initialize all indices, handling the case where threads < ncols_pad
+    for (int i = tid; i < ncols_pad; i += blockDim.x) {
+        indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
+    }
+    __syncthreads();
+
+    // Bitonic sort
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k/2; j > 0; j /= 2) {
+            for (int i = tid; i < ncols_pad; i += blockDim.x) {
+                const int ij = i ^ j;
+                if (ij > i) {
+                    // Only compare values within the actual data range
+                    if (i < ncols && ij < ncols) {
+                        if ((i & k) == 0) {
+                            if (order == GGML_SORT_ORDER_ASC) {
+                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            } else {
+                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            }
+                        } else {
+                            if (order == GGML_SORT_ORDER_ASC) {
+                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            } else {
+                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // Write sorted indices to output, only threads handling valid data
+    for (int i = tid; i < ncols; i += blockDim.x) {
+        dst[row * ncols + i] = indices[i];
+    }
+}
+
+static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // Bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    // Ensure thread count doesn't exceed maximum (typically 1024)
+    const int max_threads = 1024;  // This is the typical max for most GPUs
+    const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
+
+    const dim3 block_dims(threads_per_block, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    // Check if shared memory size is within limits
+    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+
+    // Instead of logging an error, use GGML_ASSERT with a descriptive message
+    GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
+
+    // Launch kernels with the updated thread configuration
+    if (order == GGML_SORT_ORDER_ASC) {
+        k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+
+ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+     const float * src0_d = (const float *)src0->data;
+     float * dst_d = (float *)dst->data;
+     cudaStream_t stream = ctx.stream();
+ 
+-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
+     GGML_ASSERT( dst->type == GGML_TYPE_I32);
+     GGML_ASSERT(ggml_is_contiguous(src0));
+ 
+@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ 
+     enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+ 
+-    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    if (src0->type == GGML_TYPE_I32) {
+        argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    } else {
+        argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    }
+ }
+diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
+index 2d46176e..47383486 100644
+--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
+@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
+     *dsti = *xi;
+ }
+ 
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
+    // Copies a single int32_t from the byte address cxi to the byte address cdsti.
+    const int32_t * src = reinterpret_cast<const int32_t *>(cxi);
+    int32_t * dst = reinterpret_cast<int32_t *>(cdsti);
+    dst[0] = src[0];
+}
+
+ template <cpy_kernel_t cpy_1>
+ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
+                                    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
+     cpy_1(cx + x_offset, cdst + dst_offset);
+ }
+ 
+// I32 -> I32 copy kernel: each thread takes the element at flat index i and copies it with cpy_1.
+template <cpy_kernel_t cpy_1>
+static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                 const int nb12, const int nb13) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+    // one flat element index per thread; indices at or beyond ne have no element to copy
+    if (i >= ne) {
+        return;
+    }
+    // split i into 4D source coordinates (ne00 varies fastest), then scale by nb00..nb03 for a byte offset into cx
+    const int64_t i03 = i/(ne00 * ne01 * ne02);
+    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+    // repeat with the destination extents ne10..ne12 and strides nb10..nb13 for the byte offset into cdst
+    const int64_t i13 = i/(ne10 * ne11 * ne12);
+    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
+    // NOTE(review): writes through cdst as passed; the neighbouring cpy_f32_f16 names this argument cdst_direct - confirm no indirection is needed
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+// Host-side launcher: runs cpy_i32_i32<cpy_1_i32_i32> over ne elements on `stream`.
+static void ggml_cpy_i32_i32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
+    // NOTE(review): cdst_indirect and graph_cpynode_index are accepted but unused by this launcher - confirm that is intended.
+    const dim3 grid_dims((ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE, 1, 1); // enough blocks to cover ne elements
+    const dim3 block_dims(CUDA_CPY_BLOCK_SIZE, 1, 1);
+    cpy_i32_i32<cpy_1_i32_i32><<<grid_dims, block_dims, 0, stream>>>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+     const float * xi = (const float *) cxi;
+     block_q8_0 * dsti = (block_q8_0 *) cdsti;
+@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
+         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+        ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+     } else {
+         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
+                 ggml_type_name(src0->type), ggml_type_name(src1->type));
+@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
+         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+         return (void*) cpy_f32_f16<cpy_1_f16_f32>;
+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+        return (void*) cpy_i32_i32<cpy_1_i32_i32>;
+     } else {
+         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
+                 ggml_type_name(src0->type), ggml_type_name(src1->type));