llama: update vendored code to commit 40c6d79f (#7875)

527cc978 · Jeffrey Morgan · GitHub · a37f4a86 · 527cc978 · 527cc978
Unverified commit 527cc978, authored Dec 10, 2024 by Jeffrey Morgan; committed by GitHub on Dec 10, 2024
20 changed files
--- a/llama/patches/0002-pretokenizer.patch
+++ b/llama/patches/0002-pretokenizer.patch
@@ -8,10 +8,10 @@ Subject: [PATCH] pretokenizer
 1 file changed, 3 insertions(+), 11 deletions(-)

 diff --git a/src/llama.cpp b/src/llama.cpp
-index 4c0a1bb6..800dfb95 100644
+index 6a6f4c2a..fa09f3b3 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -6287,16 +6287,7 @@ static void llm_load_vocab(
+@@ -6362,16 +6362,7 @@ static void llm_load_vocab(
         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
             vocab.tokenizer_add_space_prefix = false;
             vocab.tokenizer_clean_spaces = true;
@@ -29,7 +29,7 @@ index 4c0a1bb6..800dfb95 100644
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -6398,7 +6389,8 @@ static void llm_load_vocab(
+@@ -6473,7 +6464,8 @@ static void llm_load_vocab(
                 vocab.tokenizer_add_bos = true;
                 vocab.tokenizer_clean_spaces = false;
             } else {

--- a/llama/patches/0005-embeddings.patch
+++ b/llama/patches/0005-embeddings.patch
@@ -4,14 +4,14 @@ Date: Mon, 16 Sep 2024 15:53:14 -0700
 Subject: [PATCH] embeddings

 ---
- src/llama.cpp | 15 +++++++++------
- 1 file changed, 9 insertions(+), 6 deletions(-)
+ src/llama.cpp | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)

 diff --git a/src/llama.cpp b/src/llama.cpp
-index 800dfb95..a639522d 100644
+index fa09f3b3..d1791af0 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -16920,7 +16920,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+@@ -17398,7 +17398,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
@@ -20,20 +20,15 @@ index 800dfb95..a639522d 100644
     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-@@ -17192,20 +17192,23 @@ static int llama_decode_internal(
-             // no output
+@@ -17693,7 +17693,6 @@ static int llama_decode_internal(
             res  = nullptr;
             embd = nullptr;
-        } else if (cparams.embeddings) {
+         } else if (cparams.embeddings) {
 -            res  = nullptr; // do not extract logits for embedding case
-            embd = nullptr;
-+        }
-+
-+        if (cparams.embeddings) {
+             embd = nullptr;
             for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
-+                embd = ggml_graph_node(gf, i);
                 if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
-                    embd = ggml_graph_node(gf, i);
+@@ -17701,11 +17700,15 @@ static int llama_decode_internal(
                     break;
                 }
             }
@@ -46,6 +41,7 @@ index 800dfb95..a639522d 100644
 +        if (!cparams.causal_attn) {
 +            res = nullptr; // do not extract logits when not needed
 +        }
+
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-         ggml_backend_sched_alloc_graph(lctx.sched, gf);
+         ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
--- a/llama/patches/0003-metal.patch
+++ b/llama/patches/0003-metal.patch
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:12 -0700
-Subject: [PATCH] metal
-
---
- ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
- 1 file changed, 13 insertions(+), 17 deletions(-)
-
-diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
-index 9da08fe2..3a433703 100644
--- a/ggml/src/ggml-metal.m
-+++ b/ggml/src/ggml-metal.m
-@@ -1720,27 +1720,23 @@ static void ggml_metal_encode_node(
-                 // to the matrix-vector kernel
-                 int ne11_mm_min = 1;
- 
-#if 0
-                 // the numbers below are measured on M2 Ultra for 7B and 13B models
-                 // these numbers do not translate to other devices or model sizes
-                 // TODO: need to find a better approach
-                        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
-                            switch (src0t) {
-                                case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-                                case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-                                case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q4_0:
-                                case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-                                case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-                                case GGML_TYPE_Q5_0:                          // not tested yet
-                                case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-                                case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-                                default:             ne11_mm_min = 1;  break;
-                            }
-+                        switch (src0t) {
-+                            case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-+                            case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-+                            case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-+                            case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-+                            case GGML_TYPE_Q4_0:
-+                            case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-+                            case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-+                            case GGML_TYPE_Q5_0:                          // not tested yet
-+                            case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-+                            case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-+                            case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-+                            default:             ne11_mm_min = 1;  break;
-                         }
-#endif
- 
-                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
--- a/llama/patches/0006-clip-unicode.patch
+++ b/llama/patches/0006-clip-unicode.patch
@@ -8,12 +8,12 @@ Subject: [PATCH] clip-unicode
 1 file changed, 39 insertions(+), 1 deletion(-)

 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 14e02c8d..6e849d8e 100644
+index d7c94352..427d5e02 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -44,6 +44,19 @@
- #define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- #define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+@@ -56,6 +56,19 @@
+ #   define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+ #endif // defined(LLAVA_LOG_OFF)
 
 +#if defined(_WIN32)
 +#define WIN32_LEAN_AND_MEAN
@@ -31,7 +31,7 @@ index 14e02c8d..6e849d8e 100644
 //#define CLIP_DEBUG_FUNCTIONS
 
 // RGB uint8 image
-@@ -1225,8 +1238,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1242,8 +1255,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             gguf_free(ctx);
             return nullptr;
         }
@@ -62,7 +62,7 @@ index 14e02c8d..6e849d8e 100644
         if (!fin) {
             LOG_ERR("cannot open model file for loading tensors\n");
             clip_free(new_clip);
-@@ -1266,7 +1300,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1283,7 +1317,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
             }
         }

--- a/llama/patches/0004-ggml-metal.patch
+++ b/llama/patches/0004-ggml-metal.patch
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: jmorganca <jmorganca@gmail.com>
-Date: Wed, 12 Jun 2024 12:18:40 -0700
-Subject: [PATCH] ggml-metal
-
---
- ggml/src/ggml-metal.m | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
-index 3a433703..829c5e39 100644
--- a/ggml/src/ggml-metal.m
-+++ b/ggml/src/ggml-metal.m
-@@ -392,8 +392,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
- #if GGML_METAL_EMBED_LIBRARY
-             GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
- 
-            extern const char ggml_metallib_start[];
-            extern const char ggml_metallib_end[];
-+            extern const char *ggml_metallib_start;
-+            extern const char *ggml_metallib_end;
- 
-             NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
- #else
--- a/llama/patches/0007-solar-pro.patch
+++ b/llama/patches/0007-solar-pro.patch
@@ -11,14 +11,14 @@ tensor to store the scalar. the scalar is implemented as a 1-dimensional
 tensor with 2 elements derived from the model's bskcn_tv configuration.
 in general, the values are (bskcn_tv, 1 - bskcn_tv)
 ---
- src/llama.cpp | 269 +++++++++++++++++++++++++++++++++++++++++++++++---
- 1 file changed, 255 insertions(+), 14 deletions(-)
+ src/llama.cpp | 267 +++++++++++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 253 insertions(+), 14 deletions(-)

 diff --git a/src/llama.cpp b/src/llama.cpp
-index a639522d..83b80b59 100644
+index d1791af0..b01770d0 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -217,6 +217,7 @@ enum llm_arch {
+@@ -195,6 +195,7 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
@@ -26,7 +26,7 @@ index a639522d..83b80b59 100644
     LLM_ARCH_UNKNOWN,
 };
 
-@@ -270,6 +271,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE,         "granite"      },
     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
     { LLM_ARCH_CHAMELEON,       "chameleon"    },
@@ -34,7 +34,7 @@ index a639522d..83b80b59 100644
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
 
-@@ -327,6 +329,7 @@ enum llm_kv {
+@@ -306,6 +308,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
@@ -42,7 +42,7 @@ index a639522d..83b80b59 100644
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
-@@ -421,20 +424,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -408,20 +411,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESIDUAL_SCALE,                    "%s.residual_scale"                    },
     { LLM_KV_EMBEDDING_SCALE,                   "%s.embedding_scale"                   },
 
@@ -78,15 +78,15 @@ index a639522d..83b80b59 100644
 
     { LLM_KV_ROPE_DIMENSION_COUNT,             "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_FREQ_BASE,                   "%s.rope.freq_base"                       },
-@@ -608,6 +612,7 @@ enum llm_tensor {
+@@ -603,6 +607,7 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
 +    LLM_TENSOR_BSKCN_TV,
 };
 
- static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
-@@ -1527,6 +1532,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
+ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+@@ -1541,6 +1546,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
@@ -111,15 +111,15 @@ index a639522d..83b80b59 100644
     {
         LLM_ARCH_UNKNOWN,
         {
-@@ -2360,6 +2383,7 @@ enum e_model {
+@@ -2401,6 +2424,7 @@ enum e_model {
     MODEL_15B,
     MODEL_16B,
     MODEL_20B,
 +    MODEL_22B,
     MODEL_30B,
+     MODEL_32B,
     MODEL_34B,
-     MODEL_35B,
-@@ -2409,6 +2433,8 @@ struct llama_hparams {
+@@ -2451,6 +2475,8 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
 
@@ -128,7 +128,7 @@ index a639522d..83b80b59 100644
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
-@@ -2479,6 +2505,7 @@ struct llama_hparams {
+@@ -2521,6 +2547,7 @@ struct llama_hparams {
         if (this->n_head_arr    != other.n_head_arr)    return true;
         if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
         if (this->n_ff_arr      != other.n_ff_arr)      return true;
@@ -136,7 +136,7 @@ index a639522d..83b80b59 100644
 
         if (this->n_rel_attn_bkts    != other.n_rel_attn_bkts)    return true;
         if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2588,6 +2615,14 @@ struct llama_hparams {
+@@ -2630,6 +2657,14 @@ struct llama_hparams {
             return ssm_d_state * ssm_d_inner;
         }
     }
@@ -151,7 +151,7 @@ index a639522d..83b80b59 100644
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2769,6 +2804,8 @@ struct llama_layer {
+@@ -2816,6 +2851,8 @@ struct llama_layer {
     struct ggml_tensor * ffn_gate_scale;
     struct ggml_tensor * ffn_up_scale;
     struct ggml_tensor * ffn_down_scale;
@@ -160,7 +160,7 @@ index a639522d..83b80b59 100644
 };
 
 // very similar to llama_batch,
-@@ -6134,6 +6171,21 @@ static void llm_load_hparams(
+@@ -6209,6 +6246,21 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                }
             } break;
@@ -182,46 +182,51 @@ index a639522d..83b80b59 100644
         default: (void)0;
     }
 
-@@ -8831,6 +8883,38 @@ static bool llm_load_tensors(
+@@ -7198,6 +7250,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
+     {LLM_TENSOR_FFN_UP_EXPS,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+     // this tensor is loaded for T5, but never used
+     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+    {LLM_TENSOR_BSKCN_TV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
+ };
+ 
+ // checks if the weight tensor can be used with the specified buffer type and device
+@@ -9205,6 +9258,35 @@ static bool llm_load_tensors(
 
-                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
 +                    }
 +                } break;
 +            case LLM_ARCH_SOLAR:
 +                {
-+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 +
 +                    // output
 +                    {
-+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 +                    }
 +
 +                    for (int i = 0; i < n_layer; ++i) {
-+                        ggml_context * ctx_layer = ctx_for_layer(i);
-+                        ggml_context * ctx_split = ctx_for_layer_split(i);
-+
 +                        auto & layer = model.layers[i];
 +
-+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 +
-+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
-+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 +
-+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 +
-+                        layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
 +
-                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-@@ -16179,6 +16263,158 @@ struct llm_build_context {
+                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+@@ -16652,6 +16734,158 @@ struct llm_build_context {
 
         return gf;
     }
@@ -239,7 +244,7 @@ index a639522d..83b80b59 100644
 +        struct ggml_tensor * cur;
 +        struct ggml_tensor * inpL;
 +
-+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 +
 +        // inp_pos - contains the positions
 +        struct ggml_tensor * inp_pos = build_inp_pos();
@@ -380,7 +385,7 @@ index a639522d..83b80b59 100644
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-@@ -16443,6 +16679,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -16921,6 +17155,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_chameleon();
             } break;
@@ -391,7 +396,7 @@ index a639522d..83b80b59 100644
         default:
             GGML_ABORT("fatal error");
     }
-@@ -19589,6 +19829,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+@@ -20132,6 +20370,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_CHAMELEON:

--- a/llama/patches/0008-conditional-fattn.patch
+++ b/llama/patches/0008-conditional-fattn.patch
@@ -4,14 +4,14 @@ Date: Wed, 9 Oct 2024 17:26:23 -0700
 Subject: [PATCH] conditional-fattn

 ---
- ggml/src/ggml-cuda.cu | 2 ++
+ ggml/src/ggml-cuda/ggml-cuda.cu | 2 ++
 1 file changed, 2 insertions(+)

-diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
-index 809d6ab1..fe77b81c 100644
--- a/ggml/src/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda.cu
-@@ -2347,9 +2347,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index 52aec229..cbf4fddf 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;

--- a/llama/patches/0009-blas.patch
+++ b/llama/patches/0009-blas.patch
@@ -4,22 +4,23 @@ Date: Mon, 30 Sep 2024 16:31:04 -0700
 Subject: [PATCH] blas

 ---
- ggml/src/ggml-blas.cpp | 4 ++++
+ ggml/src/ggml-blas/ggml-blas.cpp | 4 ++++
 1 file changed, 4 insertions(+)

-diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
-index 6d99c6be..8e1ab99d 100644
--- a/ggml/src/ggml-blas.cpp
-+++ b/ggml/src/ggml-blas.cpp
+diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
+index ec158dfa..b3ac1fa4 100644
+--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
 @@ -1,3 +1,5 @@
 +#ifdef GGML_USE_BLAS
 +
 #include "ggml-impl.h"
 #include "ggml-blas.h"
 #include "ggml-backend-impl.h"
-@@ -366,3 +368,5 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads)
-     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
-     ctx->n_threads = n_threads;
+@@ -515,3 +517,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) {
 }
+ 
+ GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
 +
-+#endif
+#endif // GGML_USE_BLAS
+\ No newline at end of file
--- a/llama/patches/0010-add-mllama-support.patch
+++ b/llama/patches/0010-add-mllama-support.patch
@@ -12,29 +12,46 @@ kv cache once per run

 remaining is to implement the cross attention mask
 ---
- examples/llava/llava.cpp |   2 +-
+ examples/llava/llava.cpp |   5 +-
 include/llama.h          |   5 +
- src/llama.cpp            | 447 +++++++++++++++++++++++++++++++++++++--
- 3 files changed, 436 insertions(+), 18 deletions(-)
+ src/llama.cpp            | 477 +++++++++++++++++++++++++++++++++++++--
+ 3 files changed, 467 insertions(+), 20 deletions(-)

 diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
-index 8558c6bd..37b2f2e2 100644
+index 4ca53a0b..d56644a8 100644
 --- a/examples/llava/llava.cpp
 +++ b/examples/llava/llava.cpp
-@@ -409,7 +409,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
-         if (n_eval > n_batch) {
+@@ -412,7 +412,7 @@ struct llava_embd_batch {
+     std::vector<llama_seq_id *> seq_ids;
+     std::vector<int8_t>         logits;
+     llama_batch batch;
+-    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+    llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+         pos     .resize(n_tokens);
+         n_seq_id.resize(n_tokens);
+         seq_ids .resize(n_tokens + 1);
+@@ -424,6 +424,7 @@ struct llava_embd_batch {
+             /*n_tokens       =*/ n_tokens,
+             /*tokens         =*/ nullptr,
+             /*embd           =*/ embd,
+            /*n_embd         =*/ n_embd,
+             /*pos            =*/ pos.data(),
+             /*n_seq_id       =*/ n_seq_id.data(),
+             /*seq_id         =*/ seq_ids.data(),
+@@ -447,7 +448,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-+        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), n_embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-         if (llama_decode(ctx_llama, batch)) {
+         float * embd = image_embed->embed+i*n_embd;
+-        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
+         if (llama_decode(ctx_llama, llava_batch.batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
 diff --git a/include/llama.h b/include/llama.h
-index 7cae1bbe..aca09310 100644
+index e85f459f..aba85f86 100644
 --- a/include/llama.h
 +++ b/include/llama.h
-@@ -240,6 +240,7 @@ extern "C" {
+@@ -245,6 +245,7 @@ extern "C" {
 
         llama_token  *  token;
         float        *  embd;
@@ -42,7 +59,7 @@ index 7cae1bbe..aca09310 100644
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
-@@ -423,6 +424,10 @@ extern "C" {
+@@ -419,6 +420,10 @@ extern "C" {
                      struct llama_model * model,
             struct llama_context_params   params);
 
@@ -54,10 +71,10 @@ index 7cae1bbe..aca09310 100644
     LLAMA_API void llama_free(struct llama_context * ctx);
 
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 83b80b59..35748488 100644
+index b01770d0..46881642 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -169,6 +169,7 @@ static std::string format(const char * fmt, ...) {
+@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
 
 enum llm_arch {
     LLM_ARCH_LLAMA,
@@ -65,7 +82,7 @@ index 83b80b59..35748488 100644
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
     LLM_ARCH_GROK,
-@@ -223,6 +224,7 @@ enum llm_arch {
+@@ -201,6 +202,7 @@ enum llm_arch {
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA,           "llama"        },
@@ -73,7 +90,7 @@ index 83b80b59..35748488 100644
     { LLM_ARCH_FALCON,          "falcon"       },
     { LLM_ARCH_GROK,            "grok"         },
     { LLM_ARCH_GPT2,            "gpt2"         },
-@@ -330,6 +332,7 @@ enum llm_kv {
+@@ -309,6 +311,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
@@ -81,7 +98,7 @@ index 83b80b59..35748488 100644
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
-@@ -439,6 +442,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -426,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"           },
     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                    },
     { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection.%d" },
@@ -89,7 +106,7 @@ index 83b80b59..35748488 100644
 
     { LLM_KV_ROPE_DIMENSION_COUNT,             "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_FREQ_BASE,                   "%s.rope.freq_base"                       },
-@@ -613,6 +617,14 @@ enum llm_tensor {
+@@ -608,6 +612,14 @@ enum llm_tensor {
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
     LLM_TENSOR_BSKCN_TV,
@@ -103,8 +120,8 @@ index 83b80b59..35748488 100644
 +    LLM_TENSOR_CROSS_ATTN_MLP_GATE,
 };
 
- static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
-@@ -642,6 +654,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
+ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+@@ -637,6 +649,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
         },
     },
@@ -145,7 +162,7 @@ index 83b80b59..35748488 100644
     {
         LLM_ARCH_BAICHUAN,
         {
-@@ -2390,6 +2436,7 @@ enum e_model {
+@@ -2432,6 +2478,7 @@ enum e_model {
     MODEL_40B,
     MODEL_65B,
     MODEL_70B,
@@ -153,7 +170,7 @@ index 83b80b59..35748488 100644
     MODEL_236B,
     MODEL_314B,
     MODEL_SMALL,
-@@ -2434,6 +2481,7 @@ struct llama_hparams {
+@@ -2476,6 +2523,7 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
 
     std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
@@ -161,7 +178,7 @@ index 83b80b59..35748488 100644
 
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q = 0;
-@@ -2502,10 +2550,11 @@ struct llama_hparams {
+@@ -2544,10 +2592,11 @@ struct llama_hparams {
         if (this->n_expert      != other.n_expert)      return true;
         if (this->n_expert_used != other.n_expert_used) return true;
 
@@ -177,7 +194,7 @@ index 83b80b59..35748488 100644
 
         if (this->n_rel_attn_bkts    != other.n_rel_attn_bkts)    return true;
         if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2623,6 +2672,10 @@ struct llama_hparams {
+@@ -2665,6 +2714,10 @@ struct llama_hparams {
 
         GGML_ABORT("fatal error");
     }
@@ -188,7 +205,7 @@ index 83b80b59..35748488 100644
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2652,6 +2705,9 @@ struct llama_cparams {
+@@ -2694,6 +2747,9 @@ struct llama_cparams {
     bool offload_kqv;
     bool flash_attn;
     bool no_perf;
@@ -198,7 +215,7 @@ index 83b80b59..35748488 100644
 
     enum llama_pooling_type pooling_type;
 
-@@ -2806,6 +2862,16 @@ struct llama_layer {
+@@ -2853,6 +2909,16 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_scale;
 
     struct ggml_tensor * bskcn_tv;
@@ -215,7 +232,7 @@ index 83b80b59..35748488 100644
 };
 
 // very similar to llama_batch,
-@@ -3452,6 +3518,8 @@ struct llama_context {
+@@ -3439,6 +3505,8 @@ struct llama_context {
     struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
     struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
@@ -224,13 +241,34 @@ index 83b80b59..35748488 100644
 };
 
 struct llama_lora_weight {
-@@ -3686,6 +3754,18 @@ static bool llama_kv_cache_init(
+@@ -3577,6 +3645,39 @@ static bool llama_kv_cache_init(
     cache.v_l.reserve(n_layer);
 
     for (int i = 0; i < (int) n_layer; i++) {
 +        // for cross attention layers
 +        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
-+            struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
+            const llama_model::buft_list_t * buft_list;
+            if (offload) {
+                buft_list = model.dev_layer.at(i).buft_list;
+            } else {
+                buft_list = &model.cpu_buft_list;
+            }
+            ggml_backend_buffer_type_t buft = select_buft(*buft_list,
+                [&](ggml_context * ctx) {
+                    ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+                    if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
+                        return k;
+                    }
+                    ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+                    return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
+                });
+            ggml_context * ctx = ctx_for_buft(buft);
+
+            if (!ctx) {
+                LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+                return false;
+            }
 +            ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
 +            ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
 +            ggml_format_name(k, "cache_k_l%d", i);
@@ -243,7 +281,7 @@ index 83b80b59..35748488 100644
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
-@@ -5460,12 +5540,14 @@ static void llm_load_hparams(
+@@ -5520,12 +5621,14 @@ static void llm_load_hparams(
     }
 
     // zero-out the per-layer hparams
@@ -263,7 +301,7 @@ index 83b80b59..35748488 100644
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
-@@ -5514,7 +5596,7 @@ static void llm_load_hparams(
+@@ -5574,7 +5677,7 @@ static void llm_load_hparams(
 
         ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
@@ -272,7 +310,7 @@ index 83b80b59..35748488 100644
             if (hparams.n_rot != hparams.n_embd_head_k) {
                 throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
             }
-@@ -5554,6 +5636,16 @@ static void llm_load_hparams(
+@@ -5614,6 +5717,16 @@ static void llm_load_hparams(
                     }
                 }
             } break;
@@ -289,63 +327,78 @@ index 83b80b59..35748488 100644
         case LLM_ARCH_MINICPM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-@@ -7249,6 +7341,55 @@ static bool llm_load_tensors(
-                         layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+@@ -7250,7 +7363,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
+     {LLM_TENSOR_FFN_UP_EXPS,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+     // this tensor is loaded for T5, but never used
+     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+-    {LLM_TENSOR_BSKCN_TV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
+    {LLM_TENSOR_BSKCN_TV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_K_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_K_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_O_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_Q_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_Q_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_V_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_ATTN_GATE,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_MLP_GATE,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ };
+ 
+ // checks if the weight tensor can be used with the specified buffer type and device
+@@ -7754,6 +7875,53 @@ static bool llm_load_tensors(
+                         }
                     }
                 } break;
 +            case LLM_ARCH_MLLAMA:
 +                {
-+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
 +
 +                    // output
 +                    {
-+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 +
 +                        // if output is NULL, init from the input tok embed
 +                        if (model.output == NULL) {
-+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                            model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 +                        }
 +                    }
 +
 +                    for (int i = 0; i < n_layer; ++i) {
-+                        ggml_context * ctx_layer = ctx_for_layer(i);
-+                        ggml_context * ctx_split = ctx_for_layer_split(i);
 +
 +                        auto & layer = model.layers[i];
 +
 +                        if (hparams.cross_attention_layers(i)) {
-+                            layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM,   "weight", i), {128});
-+                            layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ,   "weight", i), {n_embd, 1024});
-+                            layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ,   "weight", i), {n_embd, n_embd});
-+                            layer.cross_attn_q_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128});
-+                            layer.cross_attn_q_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd});
-+                            layer.cross_attn_v_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024});
-+                            layer.cross_attn_attn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1});
-+                            layer.cross_attn_mlp_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1});
-+                            layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-+                            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-+                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-+                            layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-+                            layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                            layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM,   "weight", i), {128}, 0);
+                            layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ,   "weight", i), {n_embd, 1024}, 0);
+                            layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ,   "weight", i), {n_embd, n_embd}, 0);
+                            layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
+                            layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
+                            layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
+                            layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
+                            layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 +                        } else {
-+                            layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-+                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
-+                            layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-+                            layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-+                            layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
-+                            layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-+                            layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-+                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-+                            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-+                            layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
 +                        }
 +                    }
 +                } break;
-             case LLM_ARCH_GROK:
+             case LLM_ARCH_MINICPM3:
                 {
-                     if (n_expert == 0) {
-@@ -9093,7 +9234,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
+                     const int64_t n_embd_head_qk_rope = hparams.n_rot;
+@@ -9463,7 +9631,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 
         if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
             model.hparams.n_vocab != model.vocab.id_to_token.size()) {
@@ -354,7 +407,7 @@ index 83b80b59..35748488 100644
         }
 
         if (params.vocab_only) {
-@@ -9193,6 +9334,21 @@ static struct ggml_tensor * llm_build_inp_embd(
+@@ -9546,6 +9714,21 @@ static struct ggml_tensor * llm_build_inp_embd(
     return inpL;
 }
 
@@ -376,7 +429,7 @@ index 83b80b59..35748488 100644
 static void llm_build_kv_store(
         struct ggml_context * ctx,
         const llama_hparams & hparams,
-@@ -10167,6 +10323,7 @@ struct llm_build_context {
+@@ -10513,6 +10696,7 @@ struct llm_build_context {
         lctx.inp_pos_bucket    = nullptr;
         lctx.inp_embd_enc      = nullptr;
         lctx.inp_KQ_mask_cross = nullptr;
@@ -384,18 +437,10 @@ index 83b80b59..35748488 100644
     }
 
     void free() {
-@@ -10754,6 +10911,239 @@ struct llm_build_context {
-                 LLM_NORM_RMS, cb, -1);
-         cb(cur, "result_norm", -1);
+@@ -10992,6 +11176,240 @@ struct llm_build_context {
+         return gf;
+     }
 
-+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-+        cb(cur, "result_output", -1);
-+
-+        ggml_build_forward_expand(gf, cur);
-+
-+        return gf;
-+    }
-+
 +    struct ggml_cgraph * build_mllama() {
 +        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 +
@@ -410,7 +455,7 @@ index 83b80b59..35748488 100644
 +        struct ggml_tensor * inpL;
 +        struct ggml_tensor * inpCAS;
 +
-+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 +        inpCAS = llm_build_inp_cross_attn_state(ctx0, lctx, hparams, cb);
 +
 +        // inp_pos - contains the positions
@@ -429,7 +474,7 @@ index 83b80b59..35748488 100644
 +            cb(cur, "attn_norm", il);
 +
 +            if (hparams.cross_attention_layers(il)) {
-+                if (!batch.embd && !cparams.cross_attn) {
+                if (!ubatch.embd && !cparams.cross_attn) {
 +                    continue;
 +                }
 +
@@ -447,7 +492,7 @@ index 83b80b59..35748488 100644
 +                cb(Qcur, "Qcur", il);
 +
 +                struct ggml_tensor * Kcur, * Vcur;
-+                if (batch.embd) {
+                if (ubatch.embd) {
 +                    Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
 +                    cb(Kcur, "Kcur", il);
 +
@@ -621,10 +666,19 @@ index 83b80b59..35748488 100644
 +                LLM_NORM_RMS, cb, -1);
 +        cb(cur, "result_norm", -1);
 +
-         // lm_head
-         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-         cb(cur, "result_output", -1);
-@@ -16501,6 +16891,10 @@ static struct ggml_cgraph * llama_build_graph(
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+     struct ggml_cgraph * build_baichuan() {
+         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ 
+@@ -16973,6 +17391,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_llama();
             } break;
@@ -635,14 +689,14 @@ index 83b80b59..35748488 100644
         case LLM_ARCH_BAICHUAN:
             {
                 result = llm.build_baichuan();
-@@ -16761,10 +17155,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
+@@ -17237,10 +17659,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
     }
 
-     if (batch.embd) {
+     if (ubatch.embd) {
 -        const int64_t n_embd   = hparams.n_embd;
-        const int64_t n_tokens = batch.n_tokens;
+-        const int64_t n_tokens = ubatch.n_tokens;
 +        if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
-+            ggml_backend_tensor_set(lctx.inp_cross_attn_state, batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+            ggml_backend_tensor_set(lctx.inp_cross_attn_state, ubatch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
 +            // zero out inp_embd since it's not used
 +            float * inp_embd_data = (float *)lctx.inp_embd->data;
 +            for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
@@ -650,24 +704,24 @@ index 83b80b59..35748488 100644
 +            }
 +        } else {
 +            const int64_t n_embd   = hparams.n_embd;
-+            const int64_t n_tokens = batch.n_tokens;
+            const int64_t n_tokens = ubatch.n_tokens;
 
-        ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
-+            ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+-        ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+            ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
 +        }
     }
 
-     if (batch.pos && lctx.inp_pos) {
-@@ -17345,7 +17748,7 @@ static int llama_decode_internal(
+     if (ubatch.pos && lctx.inp_pos) {
+@@ -17841,7 +18272,7 @@ static int llama_decode_internal(
         n_outputs = 1;
     }
 
-    lctx.sbatch.from_batch(batch_all, n_embd,
-+    lctx.sbatch.from_batch(batch_all, batch_all.n_embd,
+-    lctx.sbatch.from_batch(batch, n_embd,
+    lctx.sbatch.from_batch(batch, batch.n_embd,
         /* simple_split */ !kv_self.recurrent,
         /* logits_all   */ n_outputs == n_tokens_all);
 
-@@ -17638,7 +18041,7 @@ static int llama_encode_internal(
+@@ -18151,7 +18582,7 @@ static int llama_encode_internal(
 
     const int64_t n_embd = hparams.n_embd;
 
@@ -676,7 +730,7 @@ index 83b80b59..35748488 100644
 
     const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
 
-@@ -18648,7 +19051,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+@@ -19189,7 +19620,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (llama_model_has_encoder(&model)) {
             n_attn_layer *= 3;
         }
@@ -687,7 +741,7 @@ index 83b80b59..35748488 100644
     }
 
     size_t total_size_org = 0;
-@@ -19814,6 +20219,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+@@ -20355,6 +20788,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
@@ -695,7 +749,7 @@ index 83b80b59..35748488 100644
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
-@@ -21230,6 +21636,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
+@@ -21782,6 +22216,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
     ctx->cparams.causal_attn = causal_attn;
 }
 
@@ -705,8 +759,8 @@ index 83b80b59..35748488 100644
 +
 struct llama_batch llama_batch_get_one(
              llama_token * tokens,
-                  int32_t   n_tokens,
-@@ -21239,6 +21649,7 @@ struct llama_batch llama_batch_get_one(
+                  int32_t   n_tokens) {
+@@ -21789,6 +22227,7 @@ struct llama_batch llama_batch_get_one(
         /*n_tokens       =*/ n_tokens,
         /*tokens         =*/ tokens,
         /*embd           =*/ nullptr,
@@ -714,7 +768,7 @@ index 83b80b59..35748488 100644
         /*pos            =*/ nullptr,
         /*n_seq_id       =*/ nullptr,
         /*seq_id         =*/ nullptr,
-@@ -21254,6 +21665,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
+@@ -21801,6 +22240,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
         /*n_tokens       =*/ 0,
         /*tokens         =*/ nullptr,
         /*embd           =*/ nullptr,
@@ -722,7 +776,7 @@ index 83b80b59..35748488 100644
         /*pos            =*/ nullptr,
         /*n_seq_id       =*/ nullptr,
         /*seq_id         =*/ nullptr,
-@@ -21265,6 +21677,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
+@@ -21809,6 +22249,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
 
     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);

--- a/llama/patches/0011-add-unpad-operator.patch
+++ b/llama/patches/0011-add-unpad-operator.patch
@@ -4,20 +4,21 @@ Date: Thu, 17 Oct 2024 17:19:25 -0700
 Subject: [PATCH] add unpad operator

 ---
- ggml/include/ggml.h        | 10 ++++
- ggml/src/ggml-cuda.cu      |  4 ++
- ggml/src/ggml-cuda/pad.cu  | 46 +++++++++++++++++++
+ ggml/include/ggml.h                  | 10 +++++
+ ggml/src/ggml-cpu/ggml-cpu.c         | 57 ++++++++++++++++++++++++++++
+ ggml/src/ggml-cuda/ggml-cuda.cu      |  4 ++
+ ggml/src/ggml-cuda/pad.cu            | 46 ++++++++++++++++++++++
 ggml/src/ggml-cuda/pad.cuh           |  1 +
- ggml/src/ggml-metal.m      | 33 ++++++++++++++
- ggml/src/ggml-metal.metal  | 45 ++++++++++++++++++
- ggml/src/ggml.c            | 93 +++++++++++++++++++++++++++++++++++++-
- 7 files changed, 230 insertions(+), 2 deletions(-)
+ ggml/src/ggml-metal/ggml-metal.m     | 33 ++++++++++++++++
+ ggml/src/ggml-metal/ggml-metal.metal | 45 ++++++++++++++++++++++
+ ggml/src/ggml.c                      | 25 +++++++++++-
+ 8 files changed, 219 insertions(+), 2 deletions(-)

 diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
-index ce3d92cb..962cb5f7 100644
+index 65cb92c4..acbcccc6 100644
 --- a/ggml/include/ggml.h
 +++ b/ggml/include/ggml.h
-@@ -506,6 +506,7 @@ extern "C" {
+@@ -499,6 +499,7 @@ extern "C" {
         GGML_OP_POOL_2D_BACK,
         GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
@@ -25,7 +26,7 @@ index ce3d92cb..962cb5f7 100644
         GGML_OP_ARANGE,
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
-@@ -1764,6 +1765,15 @@ extern "C" {
+@@ -1695,6 +1696,15 @@ extern "C" {
             int                  p2,
             int                  p3);
 
@@ -41,11 +42,93 @@ index ce3d92cb..962cb5f7 100644
     // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
     // timesteps: [N,]
     // return: [N, dim]
-diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
-index fe77b81c..6e84af56 100644
--- a/ggml/src/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda.cu
-@@ -2270,6 +2270,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
+index 23ae2e10..111ff3b0 100644
+--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
+@@ -10439,6 +10439,58 @@ static void ggml_compute_forward_pad(
+     }
+ }
+ 
+static void ggml_compute_forward_unpad_f32(
+    const struct ggml_compute_params *params,
+    struct ggml_tensor *dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float * dst_ptr = (float *) dst->data;
+
+    // TODO: optimize
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        dst_ptr[dst_idx] = *src_ptr;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_unpad(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_unpad_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+ 
+ // ggml_compute_forward_arange
+ 
+@@ -12535,6 +12587,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
+             {
+                 ggml_compute_forward_pad(params, tensor);
+             } break;
+        case GGML_OP_UNPAD:
+            {
+                ggml_compute_forward_unpad(params, tensor);
+            } break;
+         case GGML_OP_ARANGE:
+             {
+                 ggml_compute_forward_arange(params, tensor);
+@@ -12877,6 +12933,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+             } break;
+         case GGML_OP_UPSCALE:
+         case GGML_OP_PAD:
+        case GGML_OP_UNPAD:
+         case GGML_OP_ARANGE:
+         case GGML_OP_TIMESTEP_EMBEDDING:
+         case GGML_OP_ARGSORT:
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index cbf4fddf..9ca6cb77 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_PAD:
             ggml_cuda_op_pad(ctx, dst);
             break;
@@ -55,7 +138,7 @@ index fe77b81c..6e84af56 100644
         case GGML_OP_ARANGE:
             ggml_cuda_op_arange(ctx, dst);
             break;
-@@ -2992,6 +2995,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
+@@ -3012,6 +3015,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
@@ -126,35 +209,35 @@ index 8fd386b0..e2ededc3 100644
 
 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
-index 829c5e39..25702d85 100644
--- a/ggml/src/ggml-metal.m
-+++ b/ggml/src/ggml-metal.m
-@@ -193,6 +193,7 @@
-     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
+diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
+index 093ae900..cb9a1307 100644
+--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
+@@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
+     GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
     GGML_METAL_KERNEL_TYPE_PAD_F32,
 +    GGML_METAL_KERNEL_TYPE_UNPAD_F32,
     GGML_METAL_KERNEL_TYPE_ARANGE_F32,
     GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
-@@ -689,6 +690,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32,                    im2col_f32,                     true);
+@@ -877,6 +878,7 @@ @implementation GGMLMetalClass
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,     conv_transpose_1d_f16_f32,      true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                   upscale_f32,                    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                       pad_f32,                        true);
 +        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32,                     unpad_f32,                        true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,        timestep_embedding_f32,         true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32,                    arange_f32,                     true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,           argsort_f32_i32_asc,            true);
-@@ -846,6 +848,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
-             return false;
+@@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
+         case GGML_OP_POOL_2D:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
 +        case GGML_OP_UNPAD:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
-@@ -2655,6 +2658,36 @@ static void ggml_metal_encode_node(
+@@ -3258,6 +3261,36 @@ static void ggml_metal_encode_node(
 
                 const int nth = MIN(1024, ne0);
 
@@ -191,11 +274,11 @@ index 829c5e39..25702d85 100644
                 [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
         case GGML_OP_ARANGE:
-diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
-index 2b200032..09887511 100644
--- a/ggml/src/ggml-metal.metal
-+++ b/ggml/src/ggml-metal.metal
-@@ -2029,6 +2029,51 @@ kernel void kernel_pad_f32(
+diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
+index 5caa0846..47038c31 100644
+--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
+@@ -2897,6 +2897,51 @@ kernel void kernel_pad_f32(
     }
 }
 
@@ -248,10 +331,10 @@ index 2b200032..09887511 100644
     device        char * dst,
     constant   int64_t & ne0,
 diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
-index bcbc32d9..f4864ac8 100644
+index 1a9a7efa..ea2b259b 100644
 --- a/ggml/src/ggml.c
 +++ b/ggml/src/ggml.c
-@@ -2997,6 +2997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_2D_BACK",
     "UPSCALE",
     "PAD",
@@ -259,16 +342,16 @@ index bcbc32d9..f4864ac8 100644
     "ARANGE",
     "TIMESTEP_EMBEDDING",
     "ARGSORT",
-@@ -3030,7 +3031,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
-+static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
-@@ -3091,6 +3092,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_2d_back(x)",
     "upscale(x)",
     "pad(x)",
@@ -276,16 +359,16 @@ index bcbc32d9..f4864ac8 100644
     "arange(start, stop, step)",
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
-@@ -3124,7 +3126,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
-+static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
-@@ -6955,6 +6957,32 @@ struct ggml_tensor * ggml_pad(
+@@ -4097,6 +4099,25 @@ struct ggml_tensor * ggml_pad(
     return result;
 }
 
@@ -295,12 +378,6 @@ index bcbc32d9..f4864ac8 100644
 +    struct ggml_context * ctx,
 +    struct ggml_tensor  * a,
 +    int p0, int p1, int p2, int p3) {
-+    bool is_node = false;
-+
-+    if (a->grad) {
-+        GGML_ABORT("fatal error"); // TODO: implement backward
-+        is_node = true;
-+    }
 +
 +    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
 +            a->ne[0] - p0,
@@ -309,7 +386,6 @@ index bcbc32d9..f4864ac8 100644
 +            a->ne[3] - p3);
 +
 +    result->op = GGML_OP_UNPAD;
-+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 +    result->src[0] = a;
 +
 +    return result;
@@ -318,92 +394,3 @@ index bcbc32d9..f4864ac8 100644
 // ggml_arange
 
 struct ggml_tensor * ggml_arange(
-@@ -15312,6 +15340,58 @@ static void ggml_compute_forward_pad(
-     }
- }
- 
-+static void ggml_compute_forward_unpad_f32(
-+    const struct ggml_compute_params *params,
-+    struct ggml_tensor *dst) {
-+
-+    const struct ggml_tensor * src0 = dst->src[0];
-+
-+    GGML_ASSERT(src0->nb[0] == sizeof(float));
-+    GGML_ASSERT( dst->nb[0] == sizeof(float));
-+
-+    const int ith = params->ith;
-+    const int nth = params->nth;
-+
-+    GGML_TENSOR_UNARY_OP_LOCALS
-+
-+    float * dst_ptr = (float *) dst->data;
-+
-+    // TODO: optimize
-+
-+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
-+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
-+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-+
-+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-+
-+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
-+                        dst_ptr[dst_idx] = *src_ptr;
-+                    }
-+                }
-+            }
-+        }
-+    }
-+}
-+
-+static void ggml_compute_forward_unpad(
-+    const struct ggml_compute_params * params,
-+    struct ggml_tensor * dst) {
-+
-+    const struct ggml_tensor * src0 = dst->src[0];
-+
-+    switch (src0->type) {
-+        case GGML_TYPE_F32:
-+            {
-+                ggml_compute_forward_unpad_f32(params, dst);
-+            } break;
-+        default:
-+            {
-+                GGML_ABORT("fatal error");
-+            }
-+    }
-+}
- 
- // ggml_compute_forward_arange
- 
-@@ -17294,6 +17374,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
-             {
-                 ggml_compute_forward_pad(params, tensor);
-             } break;
-+        case GGML_OP_UNPAD:
-+            {
-+                ggml_compute_forward_unpad(params, tensor);
-+            } break;
-         case GGML_OP_ARANGE:
-             {
-                 ggml_compute_forward_arange(params, tensor);
-@@ -18369,6 +18453,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
-             {
-                 GGML_ABORT("fatal error"); // TODO: not implemented
-             }
-+        case GGML_OP_UNPAD:
-+            {
-+                GGML_ABORT("fatal error"); // TODO: not implemented
-+            }
-         case GGML_OP_ARANGE:
-             {
-                 GGML_ABORT("fatal error"); // TODO: not implemented
-@@ -19165,6 +19253,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
-             } break;
-         case GGML_OP_UPSCALE:
-         case GGML_OP_PAD:
-+        case GGML_OP_UNPAD:
-         case GGML_OP_ARANGE:
-         case GGML_OP_TIMESTEP_EMBEDDING:
-         case GGML_OP_ARGSORT:
--- a/llama/patches/0012-fix-deepseek-deseret-regex.patch
+++ b/llama/patches/0012-fix-deepseek-deseret-regex.patch
@@ -7,11 +7,11 @@ On windows compiled with gcc the c++ regex library failed to handle
 the characters
 ---
 src/llama-vocab.cpp |  2 +-
- src/unicode.cpp     | 21 +++++++++++++++++++++
- 2 files changed, 22 insertions(+), 1 deletion(-)
+ src/unicode.cpp     | 22 ++++++++++++++++++++++
+ 2 files changed, 23 insertions(+), 1 deletion(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index d2f34ddd..3ef6af19 100644
+index d1dc9627..05ef0e71 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -24,7 +24,7 @@ index d2f34ddd..3ef6af19 100644
                     "\\s+$",
                     "[一-龥ࠀ-一가-퟿]+",
 diff --git a/src/unicode.cpp b/src/unicode.cpp
-index f4e941cd..9d78ff16 100644
+index 3d459263..51dd81fb 100644
 --- a/src/unicode.cpp
 +++ b/src/unicode.cpp
 @@ -2,6 +2,11 @@
@@ -39,7 +39,7 @@ index f4e941cd..9d78ff16 100644
 #include "unicode.h"
 #include "unicode-data.h"
 
-@@ -201,8 +206,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
+@@ -201,6 +206,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
 }
 
 static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -58,7 +58,13 @@ index f4e941cd..9d78ff16 100644
 +    free(wbuf);
 +    return ret;
 +#else
-     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+
+ #if defined(__clang__)
+     // disable C++17 deprecation warning for std::codecvt_utf8
+ #    pragma clang diagnostic push
+@@ -214,6 +235,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+ #endif
+ 
     return conv.from_bytes(s);
 +#endif
 }

--- a/llama/patches/0011-relative-include-paths.patch
+++ b/llama/patches/0011-relative-include-paths.patch
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 3 Dec 2024 21:30:51 -0800
+Subject: [PATCH] relative include paths
+
+---
+ ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 2 +-
+ ggml/src/ggml-cpu/ggml-cpu.c         | 2 +-
+ ggml/src/ggml-cpu/ggml-cpu.cpp       | 2 +-
+ ggml/src/ggml-quants.c               | 2 +-
+ 4 files changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+index 11152385..bbf8934e 100644
+--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+@@ -4,7 +4,7 @@
+ #include "ggml-quants.h"
+ #include "ggml-impl.h"
+ #include "ggml-cpu.h"
+-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
+ 
+ #include <math.h>
+ #include <string.h>
+diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
+index 111ff3b0..df0bd3c6 100644
+--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
+@@ -10,7 +10,7 @@
+ #include "ggml-quants.h"
+ #include "ggml-cpu-quants.h"
+ #include "ggml-threading.h"
+-#include "amx/amx.h"
+#include "amx.h"
+ #include "ggml.h"
+ 
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
+index 77e5d87a..91476ad0 100644
+--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
+@@ -3,7 +3,7 @@
+ #include "ggml-cpu.h"
+ #include "ggml-cpu-aarch64.h"
+ #include "ggml-impl.h"
+-#include "amx/amx.h"
+#include "amx.h"
+ #include <cctype>
+ #include <string>
+ #include <vector>
+diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
+index 7301a9c6..49ab3daf 100644
+--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
+@@ -3,7 +3,7 @@
+ 
+ #include "ggml-quants.h"
+ #include "ggml-impl.h"
+-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
+ #include "ggml-cpu.h"
+ 
+ #include <math.h>
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -559,7 +559,6 @@ type Options struct {
 	TopK             int      `json:"top_k"`
 	TopP             float32  `json:"top_p"`
 	MinP             float32  `json:"min_p"`
-	TFSZ             float32  `json:"tfs_z"`
 	TypicalP         float32  `json:"typical_p"`
 	RepeatLastN      int      `json:"repeat_last_n"`
 	Temperature      float32  `json:"temperature"`
@@ -632,7 +631,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 	samplingParams.TopK = req.TopK
 	samplingParams.TopP = req.TopP
 	samplingParams.MinP = req.MinP
-	samplingParams.TfsZ = req.TFSZ
 	samplingParams.TypicalP = req.TypicalP
 	samplingParams.Temp = req.Temperature
 	samplingParams.RepeatLastN = req.RepeatLastN
@@ -930,6 +928,7 @@ func Execute(args []string) error {
 	level := slog.LevelInfo
 	if *verbose {
 		level = slog.LevelDebug
+		llama.EnableDebug()
 	}
 	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
 		Level:     level,

--- a/llama/sampling.cpp
+++ b/llama/sampling.cpp
 /**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
 *
 * MIT License
 *
@@ -124,8 +124,8 @@ struct ring_buffer {
    std::vector<T> data;
 };

-struct gpt_sampler {
-    gpt_sampler_params params;
+struct common_sampler {
+    common_params_sampling params;

    struct llama_sampler * grmr;
    struct llama_sampler * chain;
@@ -151,26 +151,28 @@ struct gpt_sampler {
    }
 };

-std::string gpt_sampler_params::print() const {
+std::string common_params_sampling::print() const {
    char result[1024];

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
-            top_k, tfs_z, top_p, min_p, typ_p, temp,
+            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
            mirostat, mirostat_eta, mirostat_tau);

    return std::string(result);
 }

-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

    lparams.no_perf = params.no_perf;

-    auto * result = new gpt_sampler {
+    auto * result = new common_sampler {
        /* .params = */ params,
        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
        /* .chain  = */ llama_sampler_chain_init(lparams),
@@ -197,33 +199,45 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
                params.penalize_nl,
                params.ignore_eos));

-    if (params.temp > 0.0f) {
    if (params.mirostat == 0) {
        for (const auto & cnstr : params.samplers) {
            switch (cnstr) {
-                    case GPT_SAMPLER_TYPE_TOP_K:
+                    case COMMON_SAMPLER_TYPE_DRY:
+                    {
+                        std::vector<const char*> c_breakers;
+                        c_breakers.reserve(params.dry_sequence_breakers.size());
+                        for (const auto& str : params.dry_sequence_breakers) {
+                            c_breakers.push_back(str.c_str());
+                        }
+
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    }
+                        break;
+                case COMMON_SAMPLER_TYPE_TOP_K:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
                    break;
-                    case GPT_SAMPLER_TYPE_TOP_P:
+                case COMMON_SAMPLER_TYPE_TOP_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
                    break;
-                    case GPT_SAMPLER_TYPE_MIN_P:
+                case COMMON_SAMPLER_TYPE_MIN_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
                    break;
-                    case GPT_SAMPLER_TYPE_TFS_Z:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                    break;
-                    case GPT_SAMPLER_TYPE_TYPICAL_P:
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
                    break;
-                    case GPT_SAMPLER_TYPE_TEMPERATURE:
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
+                    break;
                default:
                    GGML_ASSERT(false && "unknown sampler type");
            }
        }
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
    } else if (params.mirostat == 1) {
        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
@@ -234,23 +248,11 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
    } else {
        GGML_ASSERT(false && "unknown mirostat version");
    }
-    } else {
-        if (params.n_probs > 0) {
-            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
-            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
-            //
-            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
-            // it is much faster, since we avoid sorting all tokens and should give a good approximation
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-        }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
-    }

    return result;
 }

-void gpt_sampler_free(struct gpt_sampler * gsmpl) {
+void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
        llama_sampler_free(gsmpl->grmr);

@@ -260,7 +262,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
    }
 }

-void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    if (accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }
@@ -270,14 +272,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce
    gsmpl->prev.push_back(token);
 }

-void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
+void common_sampler_reset(struct common_sampler * gsmpl) {
    llama_sampler_reset(gsmpl->grmr);

    llama_sampler_reset(gsmpl->chain);
 }

-struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
-    return new gpt_sampler {
+struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
+    return new common_sampler {
        /* .params = */ gsmpl->params,
        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
@@ -287,7 +289,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
    };
 }

-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
    // TODO: measure grammar performance

    if (gsmpl) {
@@ -298,7 +300,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
    }
 }

-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
    gsmpl->set_logits(ctx, idx);

    auto & grmr  = gsmpl->grmr;
@@ -344,21 +346,60 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
    return cur_p.data[cur_p.selected].id;
 }

-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
+
+    std::vector<llama_token> result;
+    result.reserve(idxs.size());
+
+    size_t i = 0;
+    for (; i < draft.size(); i++) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+
+        if (draft[i] != id) {
+            break;
+        }
+    }
+
+    if (i == draft.size()) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+    }
+
+    return result;
+}
+
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+    std::vector<int> idxs(draft.size() + 1);
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        idxs[i] = i;
+    }
+
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+}
+
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
    return llama_sampler_get_seed(gsmpl->chain);
 }

 // helpers

-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
    return &gsmpl->cur_p;
 }

-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
+llama_token common_sampler_last(const struct common_sampler * gsmpl) {
    return gsmpl->prev.rat(0);
 }

-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
+std::string common_sampler_print(const struct common_sampler * gsmpl) {
    std::string result = "logits ";

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
@@ -369,7 +410,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
    return result;
 }

-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
    n = std::min(n, (int) gsmpl->prev.size());

    if (n <= 0) {
@@ -384,63 +425,67 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,

        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");

-        result += llama_token_to_piece(ctx_main, id);
+        result += common_token_to_piece(ctx_main, id);
    }

    return result;
 }

-char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
+char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
    switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
-        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
-        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
-        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
+        case COMMON_SAMPLER_TYPE_DRY:         return 'd';
+        case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
+        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+        case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+        case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
+        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
+        case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
        default : return '?';
    }
 }

-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
    switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
-        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
-        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
-        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+        case COMMON_SAMPLER_TYPE_DRY:         return "dry";
+        case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
+        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
+        case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
+        case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
        default : return "";
    }
 }

-std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
-        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
+std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
+        { "dry",         COMMON_SAMPLER_TYPE_DRY },
+        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
+        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
    };

    // sampler names can be written in multiple ways,
    // so accept both the canonical (system) names and the alternative (input) names
-    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
-        { "top-k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top-p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "nucleus",     GPT_SAMPLER_TYPE_TOP_P },
-        { "typical-p",   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typical",     GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ-p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ",         GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min-p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs-z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "tfs",         GPT_SAMPLER_TYPE_TFS_Z },
-        { "temp",        GPT_SAMPLER_TYPE_TEMPERATURE },
+    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
+        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
+        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
    };

-    std::vector<gpt_sampler_type> samplers;
+    std::vector<common_sampler_type> samplers;
    samplers.reserve(names.size());

    for (const auto & name : names) {
@@ -460,17 +505,19 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
    return samplers;
 }

-std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
+std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
+    std::unordered_map<char, common_sampler_type> sampler_name_map = {
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY),         COMMON_SAMPLER_TYPE_DRY },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
    };

-    std::vector<gpt_sampler_type> samplers;
+    std::vector<common_sampler_type> samplers;
    samplers.reserve(chars.size());

    for (const auto & c : chars) {

--- a/llama/sampling.h
+++ b/llama/sampling.h
 /**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
 *
 * MIT License
 *
@@ -33,7 +33,7 @@
 #include <string>
 #include <vector>

-// gpt_sampler extends llama_sampler with additional functionality:
+// common_sampler extends llama_sampler with additional functionality:
 //
 //  - grammar support
 //  - custom sampler logic based on the parameters
@@ -49,30 +49,30 @@
 // token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
 // grammar constraints are applied to the full vocabulary and the token is resampled.
 //
-// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
+// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
 // be moved into the core llama library.
 //
-// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
+// For convenience, the common_sampler also maintains a container with the current candidate tokens.
 // This can be used to access the probabilities of the rest of the non-sampled tokens.
 //
 // TODO: measure grammar performance
 //

-struct gpt_sampler;
+struct common_sampler;

 // llama_sampler API overloads

-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);

-void gpt_sampler_free(struct gpt_sampler * gsmpl);
+void common_sampler_free(struct common_sampler * gsmpl);

 // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
-void                 gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
-void                 gpt_sampler_reset (struct gpt_sampler * gsmpl);
-struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
+void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+void                    common_sampler_reset (struct common_sampler * gsmpl);
+struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);

 // arguments can be nullptr to skip printing
-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

 // extended sampling implementation:
 //
@@ -84,26 +84,47 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 // if grammar_first is true, the grammar is applied before the samplers (slower)
 // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
 //
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+// generalized version of common_sampler_sample
+//
+// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
+// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
+//
+//      common_sampler_sample_and_accept_n(gsmpl, ctx, { idx }, {});
+//
+// is equivalent to
+//
+//      common_sampler_sample(gsmpl, ctx, idx);
+//      common_sampler_accept(gsmpl, token, true);
+//
+// requires: idxs.size() == draft.size() + 1
+//
+// returns at least 1 token, up to idxs.size()
+//
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+
+// assume idxs == [ 0, 1, 2, ..., draft.size() ]
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

 // helpers

 // access the internal list of current candidate tokens
-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);

 // get the last accepted token
-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
+llama_token common_sampler_last(const struct common_sampler * gsmpl);

 // print the sampler chain into a string
-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
+std::string common_sampler_print(const struct common_sampler * gsmpl);

 // get a string representation of the last accepted tokens
-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);

-char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
+char        common_sampler_type_to_chr(enum common_sampler_type cnstr);
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
+std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
--- a/llama/sampling_ext.cpp
+++ b/llama/sampling_ext.cpp
@@ -3,16 +3,12 @@
 #include "sampling_ext.h"
 #include "json-schema-to-grammar.h"

-struct gpt_sampler *gpt_sampler_cinit(
-    const struct llama_model *model, struct gpt_sampler_cparams *params)
-{
-    try
-    {
-        gpt_sampler_params sparams;
+struct common_sampler *common_sampler_cinit(const struct llama_model *model, struct common_sampler_cparams *params) {
+    try {
+        common_params_sampling sparams;
        sparams.top_k = params->top_k;
        sparams.top_p = params->top_p;
        sparams.min_p = params->min_p;
-        sparams.tfs_z = params->tfs_z;
        sparams.typ_p = params->typical_p;
        sparams.temp = params->temp;
        sparams.penalty_last_n = params->penalty_last_n;
@@ -25,38 +21,28 @@ struct gpt_sampler *gpt_sampler_cinit(
        sparams.penalize_nl = params->penalize_nl;
        sparams.seed = params->seed;
        sparams.grammar = params->grammar;
-        return gpt_sampler_init(model, sparams);
-    }
-    catch (const std::exception &err)
-    {
+        sparams.xtc_probability = 0.0;
+        sparams.xtc_threshold = 0.5;
+        return common_sampler_init(model, sparams);
+    } catch (const std::exception &err) {
        return nullptr;
    }
 }

-void gpt_sampler_cfree(struct gpt_sampler *sampler)
-{
-    gpt_sampler_free(sampler);
+void common_sampler_cfree(struct common_sampler *sampler) {
+    common_sampler_free(sampler);
 }

-void gpt_sampler_creset(struct gpt_sampler *sampler)
-{
-    gpt_sampler_reset(sampler);
+void common_sampler_creset(struct common_sampler *sampler) {
+    common_sampler_reset(sampler);
 }

-llama_token gpt_sampler_csample(
-    struct gpt_sampler *sampler,
-    struct llama_context *ctx_main,
-    int idx)
-{
-    return gpt_sampler_sample(sampler, ctx_main, idx);
+void common_sampler_caccept(struct common_sampler *sampler, llama_token id, bool apply_grammar) {
+    common_sampler_accept(sampler, id, apply_grammar);
 }

-void gpt_sampler_caccept(
-    struct gpt_sampler *sampler,
-    llama_token id,
-    bool apply_grammar)
-{
-    gpt_sampler_accept(sampler, id, apply_grammar);
+llama_token common_sampler_csample(struct common_sampler *sampler, struct llama_context *ctx, int idx) {
+    return common_sampler_sample(sampler, ctx, idx);
 }

 int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len)

--- a/llama/sampling_ext.h
+++ b/llama/sampling_ext.h
 // TODO: this is a temporary wrapper to allow calling C++ code from CGo
-#ifndef GPT_SAMPLER_EXT_H
-#define GPT_SAMPLER_EXT_H
+#ifndef SAMPLING_EXT_H
+#define SAMPLING_EXT_H

 #ifdef __cplusplus
 extern "C"
@@ -9,14 +9,11 @@ extern "C"

    // Forward declaration to avoid include of "sampling.h" which has c++
    // includes
-    struct gpt_sampler;
-
-    struct gpt_sampler_cparams
-    {
+    struct common_sampler;
+    struct common_sampler_cparams {
        int32_t top_k;
        float top_p;
        float min_p;
-        float tfs_z;
        float typical_p;
        float temp;
        int32_t penalty_last_n;
@@ -31,21 +28,11 @@ extern "C"
        char *grammar;
    };

-    struct gpt_sampler *gpt_sampler_cinit(
-        const struct llama_model *model,
-        struct gpt_sampler_cparams *params);
-    void gpt_sampler_cfree(struct gpt_sampler *sampler);
-    void gpt_sampler_creset(struct gpt_sampler *sampler);
-
-    llama_token gpt_sampler_csample(
-        struct gpt_sampler *sampler,
-        struct llama_context *ctx_main,
-        int idx);
-
-    void gpt_sampler_caccept(
-        struct gpt_sampler *sampler,
-        llama_token id,
-        bool apply_grammar);
+    struct common_sampler *common_sampler_cinit(const struct llama_model *model, struct common_sampler_cparams *params);
+    void common_sampler_cfree(struct common_sampler *sampler);
+    void common_sampler_creset(struct common_sampler *sampler);
+    void common_sampler_caccept(struct common_sampler *sampler, llama_token id, bool apply_grammar);
+    llama_token common_sampler_csample(struct common_sampler *sampler, struct llama_context *ctx, int idx);

    int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len);

@@ -53,4 +40,4 @@ extern "C"
 }
 #endif

-#endif // GPT_SAMPLER_EXT_H
+#endif // SAMPLING_EXT_H
--- a/llama/sgemm.cpp
+++ b/llama/sgemm.cpp
@@ -106,6 +106,10 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
 inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

+#if defined(__MMA__)
+typedef vector unsigned char vec_t;
+typedef __vector_quad acc_t;
+#endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // VECTORIZED FUSED MULTIPLY ADD

@@ -942,6 +946,36 @@ class tinyBLAS_Q0_AVX {
        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
    }

+    // Builds the 32-byte operand for a q5_0 block by OR-ing the result of
+    // denibble(qs) with the high-nibble mask that bittobyte derives from qh.
+    inline __m256i load(const block_q5_0 *b) {
+        const __m256i nibbles   = denibble(b->qs);
+        const __m256i high_mask = bittobyte(b->qh);
+        return _mm256_or_si256(nibbles, high_mask);
+    }
+
+    // 128-bit variant for the first 16 elements of a q5_0 block: low nibbles
+    // of qs, with 0xF0 OR-ed into every byte whose matching bit in qh bytes
+    // 0..1 is clear.
+    inline __m128i load0(const block_q5_0* b) {
+        uint32_t qh;
+        memcpy(&qh, b->qh, sizeof(uint32_t));
+        const __m128i packed = _mm_loadu_si128((const __m128i *)(b->qs));
+        const __m128i low_nibbles = _mm_and_si128(_mm_set1_epi8(15), packed);
+        // Lanes 0..7 receive qh byte 0, lanes 8..15 receive qh byte 1.
+        const __m128i byte_select = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
+        const __m128i spread = _mm_shuffle_epi8(_mm_set1_epi32(qh), byte_select);
+        // Within each 8-byte group, byte i of this mask has every bit set except
+        // bit i, so the OR equals 0xFF exactly when bit i of the qh byte is set.
+        const __m128i all_but_own_bit = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
+        const __m128i bit_is_set = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
+                                                  _mm_or_si128(all_but_own_bit, spread));
+        const __m128i high = _mm_andnot_si128(bit_is_set, _mm_set1_epi8((char)0xF0));
+        return _mm_or_si128(low_nibbles, high);
+    }
+
+    // 128-bit variant for the second 16 elements of a q5_0 block: high nibbles
+    // of qs (shifted down), with 0xF0 OR-ed into every byte whose matching bit
+    // in qh bytes 2..3 is clear.
+    inline __m128i load1(const block_q5_0* b) {
+        uint32_t qh;
+        memcpy(&qh, b->qh, sizeof(uint32_t));
+        const __m128i packed = _mm_loadu_si128((const __m128i *)(b->qs));
+        const __m128i upper_nibbles = _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(packed, 4));
+        // Lanes 0..7 receive qh byte 2, lanes 8..15 receive qh byte 3.
+        const __m128i byte_select = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
+        const __m128i spread = _mm_shuffle_epi8(_mm_set1_epi32(qh), byte_select);
+        // Within each 8-byte group, byte i of this mask has every bit set except
+        // bit i, so the OR equals 0xFF exactly when bit i of the qh byte is set.
+        const __m128i all_but_own_bit = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
+        const __m128i bit_is_set = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
+                                                  _mm_or_si128(all_but_own_bit, spread));
+        const __m128i high = _mm_andnot_si128(bit_is_set, _mm_set1_epi8((char)0xF0));
+        return _mm_or_si128(upper_nibbles, high);
+    }
+
    inline __m256i load(const block_iq4_nl *b) {
        return MM256_SET_M128I(load1(b), load0(b));
    }
@@ -973,6 +1007,17 @@ class tinyBLAS_Q0_AVX {
                                                        _mm_srli_epi16(x, 4), 1));
    }

+    // Expands the 32 bits read from p into 32 bytes: a byte is 0x00 when its
+    // bit is set and 0xF0 when its bit is clear.
+    static inline __m256i bittobyte(const uint8_t *p) {
+        uint32_t bits;
+        memcpy(&bits, p, sizeof(uint32_t));
+        // Each group of eight lanes receives one source byte (0, 1, 2, 3).
+        const __m256i byte_select = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202,
+                                                      0x0101010101010101, 0x0000000000000000);
+        const __m256i spread = _mm256_shuffle_epi8(_mm256_set1_epi32(bits), byte_select);
+        // Byte i of each 8-byte group has every bit set except bit i, so the OR
+        // is 0xFF exactly when bit i of the source byte is set.
+        const __m256i all_but_own_bit = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
+        const __m256i bit_is_set = _mm256_cmpeq_epi8(_mm256_set1_epi64x(-1),
+                                                     _mm256_or_si256(all_but_own_bit, spread));
+        return _mm256_andnot_si256(bit_is_set, _mm256_set1_epi8((char)0xF0));
+    }
+
    const TA *const A;
    const TB *const B;
    TC *const C;
@@ -985,6 +1030,600 @@ class tinyBLAS_Q0_AVX {
 };
 #endif // __AVX__

+//PPC Implementation
+#if defined(__MMA__)
+
+// Disassembles the accumulator ACC into vec_C and writes its 4x4 floats into C:
+// float J of vec_C[I] goes to C[ii + I + (jj + J) * ldc].
+// Relies on `vec_C`, `C` and `ldc` being in scope at the expansion site.
+#define SAVE_ACC(ACC, ii, jj) \
+   __builtin_mma_disassemble_acc(vec_C, ACC); \
+   for (int I = 0; I < 4; I++) { \
+      for (int J = 0; J < 4; J++) { \
+         *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); \
+      } \
+   } \
+
+template <typename TA, typename TB, typename TC>
+class tinyBLAS_PPC {
+  public:
+    // Stores the operand pointers, their leading dimensions, the shared
+    // dimension k and the work-partition index (ith out of nth).
+    tinyBLAS_PPC(int64_t k,
+                 const TA *A, int64_t lda,
+                 const TB *B, int64_t ldb,
+                 TC *C, int64_t ldc,
+                 int ith, int nth)
+        : A(A),
+          B(B),
+          C(C),
+          k(k),
+          lda(lda),
+          ldb(ldb),
+          ldc(ldc),
+          ith(ith),
+          nth(nth)
+    {}
+
+    // Entry point: hands the whole [0, m) x [0, n) output range to mnpack.
+    void matmul(int64_t m, int64_t n) {
+       mnpack(0, m, 0, n);
+    }
+
+  private:
+
+    void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
+
+    // Loads 8 consecutive floats starting at src as two 4-float vectors.
+    static inline void load_pair(float *src, vector float out[2]) {
+        __vector_pair pair = __builtin_vsx_lxvp(0, (__vector_pair*)src);
+        __builtin_vsx_disassemble_pair(out, &pair);
+    }
+
+    // Transposes a 4-row x 4-column tile (one vector per row) into dst:
+    // 16 floats, element 0 of every row first, then element 1, and so on.
+    static inline void pack_4x4(vector float r1, vector float r2,
+                                vector float r3, vector float r4, float *dst) {
+        vector float t1, t2;
+        t1 = vec_mergeh(r1, r2);
+        t2 = vec_mergeh(r3, r4);
+        vec_xst(vec_xxpermdi(t1, t2, 0), 0, dst);
+        vec_xst(vec_xxpermdi(t1, t2, 3), 0, dst+4);
+
+        t1 = vec_mergel(r1, r2);
+        t2 = vec_mergel(r3, r4);
+        vec_xst(vec_xxpermdi(t1, t2, 0), 0, dst+8);
+        vec_xst(vec_xxpermdi(t1, t2, 3), 0, dst+12);
+    }
+
+    // Transposes an 8-row x 4-column tile (one vector per row) into dst:
+    // 32 floats; for each column, rows 1-4 are stored first, then rows 5-8.
+    static inline void pack_8x4(vector float r1, vector float r2,
+                                vector float r3, vector float r4,
+                                vector float r5, vector float r6,
+                                vector float r7, vector float r8, float *dst) {
+        vector float t1, t2, t3, t4;
+        t1 = vec_mergeh(r1, r2);
+        t2 = vec_mergeh(r3, r4);
+        t3 = vec_mergeh(r5, r6);
+        t4 = vec_mergeh(r7, r8);
+        vec_xst(vec_xxpermdi(t1, t2, 0), 0, dst);
+        vec_xst(vec_xxpermdi(t3, t4, 0), 0, dst+4);
+        vec_xst(vec_xxpermdi(t1, t2, 3), 0, dst+8);
+        vec_xst(vec_xxpermdi(t3, t4, 3), 0, dst+12);
+
+        t1 = vec_mergel(r1, r2);
+        t2 = vec_mergel(r3, r4);
+        t3 = vec_mergel(r5, r6);
+        t4 = vec_mergel(r7, r8);
+        vec_xst(vec_xxpermdi(t1, t2, 0), 0, dst+16);
+        vec_xst(vec_xxpermdi(t3, t4, 0), 0, dst+20);
+        vec_xst(vec_xxpermdi(t1, t2, 3), 0, dst+24);
+        vec_xst(vec_xxpermdi(t3, t4, 3), 0, dst+28);
+    }
+
+    // Packs a rows x cols tile of `a` (row stride lda, columns contiguous) into
+    // `vec` in transposed order, ready to be fed to the MMA outer-product
+    // builtins one column at a time.
+    //
+    //   a    : first element of the tile
+    //   lda  : distance in floats between consecutive rows of a
+    //   rows : tile height; handled in groups of 8, then 4, then a 1-3 row tail
+    //   cols : tile width; handled in groups of 8, then an optional group of 4
+    //   vec  : destination buffer, written sequentially
+    //
+    // Differences from the previous implementation (none of which changes the
+    // output for the tile shapes the kernels in this class request):
+    //   * the wide-column loops now step every row pointer 8 columns forward
+    //     instead of 8*lda floats (eight rows down), and step all eight
+    //     pointers rather than only the first four;
+    //   * the 1-3 row tail only loads rows that exist and zero-fills the rest,
+    //     instead of always reading three rows;
+    //   * the destination pointer is advanced after the 4-column tails so a
+    //     following row group does not overwrite them.
+    void READ_BLOCK(const float* a, int64_t lda, int rows, int cols, float* vec) {
+        float *aoffset = const_cast<float*>(a);
+        float *boffset = vec;
+
+        // Groups of eight rows.
+        for (int64_t j = (rows >> 3); j > 0; j--) {
+            float *aoffset1 = aoffset;
+            float *aoffset2 = aoffset1 + lda;
+            float *aoffset3 = aoffset2 + lda;
+            float *aoffset4 = aoffset3 + lda;
+            float *aoffset5 = aoffset4 + lda;
+            float *aoffset6 = aoffset5 + lda;
+            float *aoffset7 = aoffset6 + lda;
+            float *aoffset8 = aoffset7 + lda;
+            aoffset += 8 * lda;
+
+            for (int64_t i = (cols >> 3); i > 0; i--) {
+                vector float c1[2], c2[2], c3[2], c4[2], c5[2], c6[2], c7[2], c8[2];
+                load_pair(aoffset1, c1);
+                load_pair(aoffset2, c2);
+                load_pair(aoffset3, c3);
+                load_pair(aoffset4, c4);
+                load_pair(aoffset5, c5);
+                load_pair(aoffset6, c6);
+                load_pair(aoffset7, c7);
+                load_pair(aoffset8, c8);
+
+                // Columns 0-3 of this group, then columns 4-7.
+                pack_8x4(c1[0], c2[0], c3[0], c4[0], c5[0], c6[0], c7[0], c8[0], boffset);
+                pack_8x4(c1[1], c2[1], c3[1], c4[1], c5[1], c6[1], c7[1], c8[1], boffset+32);
+
+                // Move on to the next eight columns of the same rows.
+                aoffset1 += 8;
+                aoffset2 += 8;
+                aoffset3 += 8;
+                aoffset4 += 8;
+                aoffset5 += 8;
+                aoffset6 += 8;
+                aoffset7 += 8;
+                aoffset8 += 8;
+                boffset += 64;
+            }
+            if (cols & 4) {
+                pack_8x4(vec_xl(0, aoffset1), vec_xl(0, aoffset2),
+                         vec_xl(0, aoffset3), vec_xl(0, aoffset4),
+                         vec_xl(0, aoffset5), vec_xl(0, aoffset6),
+                         vec_xl(0, aoffset7), vec_xl(0, aoffset8), boffset);
+                boffset += 32;
+            }
+        }
+
+        // One group of four rows.
+        if (rows & 4) {
+            float *aoffset1 = aoffset;
+            float *aoffset2 = aoffset1 + lda;
+            float *aoffset3 = aoffset2 + lda;
+            float *aoffset4 = aoffset3 + lda;
+            aoffset += 4 * lda;
+
+            for (int64_t i = (cols >> 3); i > 0; i--) {
+                vector float c1[2], c2[2], c3[2], c4[2];
+                load_pair(aoffset1, c1);
+                load_pair(aoffset2, c2);
+                load_pair(aoffset3, c3);
+                load_pair(aoffset4, c4);
+
+                pack_4x4(c1[0], c2[0], c3[0], c4[0], boffset);
+                pack_4x4(c1[1], c2[1], c3[1], c4[1], boffset+16);
+
+                aoffset1 += 8;
+                aoffset2 += 8;
+                aoffset3 += 8;
+                aoffset4 += 8;
+                boffset += 32;
+            }
+            if (cols & 4) {
+                pack_4x4(vec_xl(0, aoffset1), vec_xl(0, aoffset2),
+                         vec_xl(0, aoffset3), vec_xl(0, aoffset4), boffset);
+                boffset += 16;
+            }
+        }
+
+        // Tail of one to three rows, padded with zeros up to four rows.
+        if (rows & 3) {
+            const int tail = rows & 3;
+            if (cols & 4) {
+                vector float zero = {0};
+                vector float c1 = vec_xl(0, aoffset);
+                vector float c2 = zero;
+                vector float c3 = zero;
+                if (tail > 1) {
+                    c2 = vec_xl(0, aoffset + lda);
+                }
+                if (tail > 2) {
+                    c3 = vec_xl(0, aoffset + 2 * lda);
+                }
+                pack_4x4(c1, c2, c3, zero, boffset);
+            }
+        }
+    }
+
+    // Computes the 4x4 tile of C whose origin is (ii, jj): walks the shared
+    // dimension k four elements at a time, packing a 4x4 block of A and of B
+    // and accumulating four rank-1 updates into a single MMA accumulator.
+    void KERNEL_4x4(int64_t ii, int64_t jj) {
+        vec_t vec_A[4], vec_B[4], vec_C[4];
+        acc_t acc_0;
+        __builtin_mma_xxsetaccz(&acc_0);
+        // k is int64_t, so the index must be 64-bit as well (as in the 4x8 and
+        // 8x4 kernels); an int index would overflow for k > INT_MAX.
+        for (int64_t l = 0; l < k; l+=4) {
+            READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
+            READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+    }
+
+    // Computes the tile of C at (ii, jj) from 4 rows of A and 8 rows of B,
+    // using two accumulators: acc_0 for B rows jj..jj+3 and acc_1 for B rows
+    // jj+4..jj+7.
+    void KERNEL_4x8(int64_t ii, int64_t jj) {
+        vec_t vec_A[4], vec_B[8], vec_C[4];
+        acc_t acc_0, acc_1;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        for (int64_t l = 0; l < k; l += 4) {
+            READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
+            READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B);
+            // For column x of the packed block, vec_B[2*x] feeds acc_0 and
+            // vec_B[2*x+1] feeds acc_1.
+            for (int x = 0; x < 4; x++) {
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[x], vec_B[2*x]);
+                __builtin_mma_xvf32gerpp(&acc_1, vec_A[x], vec_B[2*x+1]);
+            }
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii, jj+4);
+    }
+
+    // Computes the tile of C at (ii, jj) from 8 rows of A and 4 rows of B,
+    // using two accumulators: acc_0 for A rows ii..ii+3 and acc_1 for A rows
+    // ii+4..ii+7.
+    void KERNEL_8x4(int64_t ii, int64_t jj) {
+        vec_t vec_A[8], vec_B[4], vec_C[4];
+        acc_t acc_0, acc_1;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        for (int64_t l = 0; l < k; l += 4) {
+            READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A);
+            READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+            // For column x of the packed block, vec_A[2*x] feeds acc_0 and
+            // vec_A[2*x+1] feeds acc_1.
+            for (int x = 0; x < 4; x++) {
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[2*x], vec_B[x]);
+                __builtin_mma_xvf32gerpp(&acc_1, vec_A[2*x+1], vec_B[x]);
+            }
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii+4, jj);
+    }
+
+    // Computes the 8x8 tile of C at (ii, jj) with four accumulators, one per
+    // 4x4 quadrant. The shared dimension k is consumed eight elements at a
+    // time; each packed column contributes two vectors per operand (first four
+    // rows at even index x, last four rows at x+1).
+    void KERNEL_8x8(int64_t ii, int64_t jj) {
+        vec_t vec_A[16], vec_B[16], vec_C[4];
+        acc_t acc_0, acc_1, acc_2, acc_3;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        __builtin_mma_xxsetaccz(&acc_2);
+        __builtin_mma_xxsetaccz(&acc_3);
+        // k is int64_t, so the index must be 64-bit as well (as in the 4x8 and
+        // 8x4 kernels); an int index would overflow for k > INT_MAX.
+        for (int64_t l = 0; l < k; l+=8) {
+            READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A);
+            READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B);
+            for(int x = 0; x < 16; x+=2) {
+                __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
+                __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
+                __builtin_mma_xvf32gerpp(&acc_2, (vec_t)vec_A[x+1], vec_B[x]);
+                __builtin_mma_xvf32gerpp(&acc_3, (vec_t)vec_A[x+1], vec_B[x+1]);
+            }
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii, jj+4);
+        SAVE_ACC(&acc_2, ii+4, jj);
+        SAVE_ACC(&acc_3, ii+4, jj+4);
+    }
+
+    // Recursively covers the [m0, m) x [n0, n) region of C.
+    //
+    // A tile shape (mc x nc) is chosen from the remaining extent (capped at
+    // 16 in each direction), the largest grid of such tiles that fits is
+    // computed, and the uncovered strips below and to the right are handled by
+    // two recursive calls. Recursion stops when either remaining extent is
+    // below 1.
+    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        const int m_rem = MIN(m - m0, 16);
+        const int n_rem = MIN(n - n0, 16);
+        int64_t mc, nc;
+
+        if (m_rem >= 4 && n_rem >= 4) {
+            // Room for an MMA kernel: 8 in a direction when at least 8 remain,
+            // otherwise 4.
+            mc = (m_rem >= 8) ? 8 : 4;
+            nc = (n_rem >= 8) ? 8 : 4;
+            if (mc == 8) {
+                if (nc == 8) {
+                    gemm<8,8>(m0, m, n0, n);
+                } else {
+                    gemm<8,4>(m0, m, n0, n);
+                }
+            } else {
+                if (nc == 8) {
+                    gemm<4,8>(m0, m, n0, n);
+                } else {
+                    gemm<4,4>(m0, m, n0, n);
+                }
+            }
+        } else {
+            // At least one direction has fewer than 4 left: use the generic
+            // small-tile path, clamping the other direction to 4.
+            if (m_rem < 4 && n_rem > 4) {
+                mc = m_rem;
+                nc = 4;
+            } else if (m_rem > 4 && n_rem < 4) {
+                mc = 4;
+                nc = n_rem;
+            } else {
+                mc = m_rem;
+                nc = n_rem;
+            }
+            if (mc < 1 || nc < 1) {
+                return;
+            }
+            gemm_small(m0, m, n0, n, mc, nc);
+        }
+
+        const int64_t mp = m0 + (m - m0) / mc * mc;
+        const int64_t np = n0 + (n - n0) / nc * nc;
+        mnpack(mp, m, n0, np);
+        mnpack(m0, m, np, n);
+    }
+
+     // Generic path for tiles smaller than the fixed MMA kernels (RM x RN with
+     // RM, RN <= 4). The tiles of [m0, m) x [n0, n) are split into nth
+     // contiguous shares and this instance processes share ith. Each tile is
+     // accumulated in one 4x4 accumulator and only its RM x RN part is stored.
+     void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            vec_t vec_C[4];
+            acc_t acc_0;
+            __builtin_mma_xxsetaccz(&acc_0);
+            vec_t vec_A[4], vec_B[4];
+            // k is int64_t, so the index must be 64-bit too; an int index
+            // would overflow for k > INT_MAX.
+            for (int64_t l=0; l<k; l+=4) {
+                if (RN >= 4 && RM == 1) {
+                    // Single row of A: load its four values directly, then
+                    // broadcast elements 1..3 for the matching rank-1 updates
+                    // (only accumulator row 0 is stored below).
+                    float* a = const_cast<float*>(A+(ii)*lda+l);
+                    READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+                    vec_A[0] = (vec_t)vec_xl(0,a);
+                    vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1));
+                    vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2));
+                    vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3));
+                } else {
+                    READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A);
+                    READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B);
+                }
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
+            }
+            __builtin_mma_disassemble_acc(vec_C, &acc_0);
+            // Store only the RM x RN corner that belongs to this tile.
+            for (int I = 0; I < RM; I++) {
+                for (int J = 0; J < RN; J++) {
+                    *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J);
+                }
+            }
+       }
+    }
+
+    // Runs the fixed-size RM x RN kernel over this instance's share of the
+    // tiles in [m0, m) x [n0, n). Tiles are numbered row-major and divided
+    // into nth contiguous shares; share ith is processed here.
+    template <int RM, int RN>
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        // Select the member kernel for this tile shape; any other shape leaves
+        // `kernel` as it was.
+        if (RM == 8 && RN == 8) {
+            kernel = &tinyBLAS_PPC::KERNEL_8x8;
+        } else if (RM == 8 && RN == 4) {
+            kernel = &tinyBLAS_PPC::KERNEL_8x4;
+        } else if (RM == 4 && RN == 8) {
+            kernel = &tinyBLAS_PPC::KERNEL_4x8;
+        } else if (RM == 4 && RN == 4) {
+            kernel = &tinyBLAS_PPC::KERNEL_4x4;
+        }
+
+        const int64_t tile_rows = (m - m0) / RM;
+        const int64_t tile_cols = (n - n0) / RN;
+        const int64_t total = tile_cols * tile_rows;
+        const int64_t share = (total + nth - 1) / nth;
+        const int64_t first = share * ith;
+        const int64_t last = (first + share > total) ? total : first + share;
+
+        for (int64_t job = first; job < last; ++job) {
+            const int64_t ii = m0 + job / tile_cols * RM;
+            const int64_t jj = n0 + job % tile_cols * RN;
+            (this->*kernel)(ii, jj);
+        }
+    }
+
+    const TA *const A;
+    const TB *const B;
+    TC *C;
+    TA *At;
+    TB *Bt;
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
+    const int ith;
+    const int nth;
+};
+#endif
 } // namespace

 /**
@@ -1073,6 +1712,16 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            ith, nth};
        tb.matmul(m, n);
        return true;
+#elif defined(__MMA__)
+        if (k % 8)
+            return false;
+        tinyBLAS_PPC<float, float, float> tb{
+            k, (const float *)A, lda,
+            (const float *)B, ldb,
+            (float *)C, ldc,
+            ith, nth};
+        tb.matmul(m, n);
+        return true;
 #else
        return false;
 #endif
@@ -1182,6 +1831,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 #endif
    }

+    case GGML_TYPE_Q5_0: {
+        if (Btype != GGML_TYPE_Q8_0)
+            return false;
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q5_0, block_q8_0, float> tb{
+            k, (const block_q5_0 *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            ith, nth};
+        tb.matmul(m, n);
+        return true;
+#else
+        return false;
+#endif
+    }
+
    case GGML_TYPE_IQ4_NL: {
        if (Btype != GGML_TYPE_Q8_0)
            return false;

--- a/llama/unicode-data.cpp
+++ b/llama/unicode-data.cpp
 /**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
 *
 * MIT License
 *
@@ -33,7 +33,7 @@
 #include <unordered_map>
 #include <unordered_set>

-const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1
+const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1
 {0x000000, 0x0080},
 {0x000020, 0x0008},
 {0x000021, 0x0020},
@@ -2337,7 +2337,8 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
 0x003000,
 };

-const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
+// list is always in ascending order, to enable binary search
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
 {0x000041, 0x000061},
 {0x000042, 0x000062},
 {0x000043, 0x000063},
@@ -3773,7 +3774,8 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
 {0x01E921, 0x01E943},
 };

-const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
+// list is always in ascending order, to enable binary search
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
 {0x000061, 0x000041},
 {0x000062, 0x000042},
 {0x000063, 0x000043},
@@ -5226,7 +5228,7 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
 {0x01E943, 0x01E921},
 };

-const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd
+const std::initializer_list<range_nfd> unicode_ranges_nfd = {  // start, last, nfd
 {0x000000, 0x000000, 0x000000},
 {0x0000C0, 0x0000C5, 0x000041},
 {0x0000C7, 0x0000C7, 0x000043},

--- a/llama/unicode-data.h
+++ b/llama/unicode-data.h
 /**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
 *
 * MIT License
 *
@@ -39,8 +39,8 @@ struct range_nfd {

 static const uint32_t MAX_CODEPOINTS = 0x110000;

-extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
 extern const std::unordered_set<uint32_t> unicode_set_whitespace;
-extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
-extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
-extern const std::vector<range_nfd> unicode_ranges_nfd;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
+extern const std::initializer_list<range_nfd> unicode_ranges_nfd;