"torchvision/vscode:/vscode.git/clone" did not exist on "2ce6b189768d11633c93d9b2f94f2fb65a099554"
Unverified commit d7d7e996, authored by Jeffrey Morgan and committed by GitHub.

llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)

parent 2db96c18
 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=46e3556e01b824e52395fb050b29804b6cff2a7c
+FETCH_HEAD=d7cfe1ffe0f435d0048a6058d529daf76e072d9c

 .PHONY: help
 help:
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "46e3556e01b824e52395fb050b29804b6cff2a7c";
+char const *LLAMA_COMMIT = "d7cfe1ffe0f435d0048a6058d529daf76e072d9c";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
@@ -4,6 +4,7 @@
 #include "llama-cpp.h"

+#include <set>
 #include <string>
 #include <vector>
 #include <sstream>
@@ -24,11 +25,11 @@
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

-struct common_lora_adapter_info {
+struct common_adapter_lora_info {
     std::string path;
     float scale;

-    struct llama_lora_adapter * ptr;
+    struct llama_adapter_lora * ptr;
 };

 using llama_tokens = std::vector<llama_token>;
@@ -103,6 +104,17 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };

+enum common_conversation_mode {
+    COMMON_CONVERSATION_MODE_DISABLED = 0,
+    COMMON_CONVERSATION_MODE_ENABLED  = 1,
+    COMMON_CONVERSATION_MODE_AUTO     = 2,
+};
+
+struct common_grammar_trigger {
+    std::string word;
+    bool at_start;
+};
+
 // sampling parameters
 struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -128,6 +140,7 @@ struct common_params_sampling {
     int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
     int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
     int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   top_n_sigma        = -1.00f;// -1.0 = disabled
     float   mirostat_tau       = 5.00f; // target entropy
     float   mirostat_eta       = 0.10f; // learning rate
     bool    ignore_eos         = false;
@@ -149,6 +162,10 @@ struct common_params_sampling {
     };

     std::string grammar; // optional BNF-like grammar to constrain sampling
+    bool grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_trigger_words;  // optional trigger words to trigger lazy grammar
+    std::vector<llama_token> grammar_trigger_tokens;            // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
+    std::set<llama_token> preserved_tokens;

     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
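The four fields above are how lazy grammars reach the sampler: the grammar text stays in `grammar`, and enforcement is deferred until one of the trigger words or tokens appears in the output. A minimal sketch of how a caller might fill them in (the grammar body and trigger word below are hypothetical, not taken from this diff):

    common_params_sampling sparams;
    sparams.grammar      = "root ::= \"{\" [^}]* \"}\"";  // hypothetical BNF-like grammar
    sparams.grammar_lazy = true;                          // only enforce once a trigger fires
    // hypothetical trigger: start constraining output after "<tool_call>" appears
    sparams.grammar_trigger_words.push_back({ /* .word = */ "<tool_call>", /* .at_start = */ false });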
@@ -161,15 +178,19 @@ struct common_params_speculative {
     int32_t n_ctx        =     0; // draft context size
     int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_min        =     0; // minimum number of draft tokens to use for speculative decoding
     int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     float   p_split      =  0.1f; // speculative decoding split probability
-    float   p_min        =  0.9f; // minimum speculative decoding probability (greedy)
+    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)

     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;

+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+
     std::string model     = ""; // draft model for speculative decoding // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
 };

 struct common_params_vocoder {
@@ -178,6 +199,13 @@ struct common_params_vocoder {
     std::string model     = ""; // model path // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
+
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
+};
+
+enum common_reasoning_format {
+    COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
 };

 struct common_params {
@@ -240,14 +268,13 @@ struct common_params {
     std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding  // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file          = ""; // file for saving *all* logits // NOLINT
-    std::string rpc_servers          = ""; // comma separated list of RPC servers // NOLINT

     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale

     std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
@@ -271,11 +298,11 @@ struct common_params {
     bool kl_divergence = false; // compute KL divergence

     bool usage             = false; // print usage
+    bool completion        = false; // print source-able completion script
     bool use_color         = false; // use color to distinguish generations and inputs
     bool special           = false; // enable special token output
     bool interactive       = false; // interactive mode
     bool interactive_first = false; // wait for user input immediately
-    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool prompt_cache_all  = false; // save user input and generations to prompt cache
     bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
@@ -301,6 +328,8 @@ struct common_params {
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

+    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
     // multimodal models (see examples/llava)
     std::string mmproj = "";        // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
@@ -322,7 +351,9 @@ struct common_params {
     std::string hostname      = "127.0.0.1";
     std::string public_path   = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
+    bool use_jinja = false;         // NOLINT
     bool enable_chat_template = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

     std::vector<std::string> api_keys;
@@ -401,13 +432,13 @@ bool set_process_priority(enum ggml_sched_priority prio);
 //

 #ifdef __GNUC__
-#ifdef __MINGW32__
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+# if defined(__MINGW32__) && !defined(__clang__)
+#  define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+# else
+#  define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+# endif
 #else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-#else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
 #endif

 LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
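As a sanity check on what this macro buys: with `LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)` attached to `string_format` (declared just below), GCC- and Clang-compatible compilers type-check the variadic arguments against the format string. A hedged illustration, not part of the diff:

    std::string s = string_format("%d tokens", 32);     // fine
    // std::string t = string_format("%s tokens", 32);  // -Wformat warning: %s expects a C string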
@@ -416,6 +447,10 @@ std::string string_format(const char * fmt, ...);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();

+std::string string_join(const std::vector<std::string> & values, const std::string & separator);
+std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
+std::string string_repeat(const std::string & str, size_t n);
+
 void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

 template<class T>
@@ -454,6 +489,11 @@ static bool string_starts_with(const std::string & str,
     return str.rfind(prefix, 0) == 0;
 }

+static bool string_ends_with(const std::string & str,
+                             const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
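The new `string_join`/`string_split`/`string_repeat` declarations above replace file-local helpers that json-schema-to-grammar.cpp previously carried (those are removed later in this diff). A small usage sketch with hypothetical values:

    std::vector<std::string> parts = string_split("a,b,c", ",");  // {"a", "b", "c"}
    std::string joined = string_join(parts, " | ");               // "a | b | c"
    std::string dashes = string_repeat("-", 3);                   // "---"
    bool ok = string_ends_with("model.gguf", ".gguf");            // true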
@@ -481,7 +521,7 @@ struct common_init_result {
     llama_model_ptr   model;
     llama_context_ptr context;

-    std::vector<llama_lora_adapter_ptr> lora;
+    std::vector<llama_adapter_lora_ptr> lora;
 };

 struct common_init_result common_init_from_params(common_params & params);
@@ -495,6 +535,7 @@ struct llama_model * common_load_model_from_url(
     const std::string & local_path,
     const std::string & hf_token,
     const struct llama_model_params & params);
+
 struct llama_model * common_load_model_from_hf(
     const std::string & repo,
     const std::string & remote_path,
@@ -502,8 +543,12 @@ struct llama_model * common_load_model_from_hf(
     const std::string & hf_token,
     const struct llama_model_params & params);

+std::pair<std::string, std::string> common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & hf_token);
+
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

 //
 // Batch utils
@@ -541,7 +586,7 @@ std::vector<llama_token> common_tokenize(
     bool parse_special = false);

 std::vector<llama_token> common_tokenize(
-    const struct llama_model * model,
+    const struct llama_vocab * vocab,
     const std::string & text,
     bool add_special,
     bool parse_special = false);
@@ -553,48 +598,23 @@ std::string common_token_to_piece(
         llama_token token,
         bool special = true);

+std::string common_token_to_piece(
+        const struct llama_vocab * vocab,
+        llama_token token,
+        bool special = true);
+
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string common_detokenize(
-        llama_context * ctx,
+        const struct llama_context * ctx,
         const std::vector<llama_token> & tokens,
         bool special = true);

+std::string common_detokenize(
+        const struct llama_vocab * vocab,
+        const std::vector<llama_token> & tokens,
+        bool special = true);
+
-//
-// Chat template utils
-//
-
-// same with llama_chat_message, but uses std::string
-struct common_chat_msg {
-    std::string role;
-    std::string content;
-};
-
-// Get the built-in chat template for the model. Return empty string if not present.
-std::string common_get_builtin_chat_template(const struct llama_model * model);
-
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool common_chat_verify_template(const std::string & tmpl);
-
-// CPP wrapper for llama_chat_apply_template
-// If the built-in template is not supported, we default to chatml
-// If the custom "tmpl" is not supported, we throw an error
-std::string common_chat_apply_template(const struct llama_model * model,
-        const std::string & tmpl,
-        const std::vector<common_chat_msg> & chat,
-        bool add_ass);
-
-// Format single message, while taking into account the position of that message in chat history
-std::string common_chat_format_single(const struct llama_model * model,
-        const std::string & tmpl,
-        const std::vector<common_chat_msg> & past_msg,
-        const common_chat_msg & new_msg,
-        bool add_ass);
-
-// Returns an example of formatted chat
-std::string common_chat_format_example(const struct llama_model * model,
-        const std::string & tmpl);
-
 //
 // KV cache utils
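With the overloads added above, tokenization no longer needs a full model or context when a vocab is at hand. A hypothetical round-trip, assuming `vocab` came from `llama_model_get_vocab(model)` (that accessor is used elsewhere in this diff):

    std::vector<llama_token> toks = common_tokenize(vocab, "hello world", /* add_special = */ true);
    std::string text = common_detokenize(vocab, toks, /* special = */ true);  // back to a string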
#include "json-schema-to-grammar.h" #include "json-schema-to-grammar.h"
#include "common.h"
#include <algorithm> #include <algorithm>
#include <fstream> #include <fstream>
#include <map> #include <map>
@@ -11,11 +13,6 @@
 using json = nlohmann::ordered_json;

-template <typename Iterator>
-static std::string join(Iterator begin, Iterator end, const std::string & separator);
-
-static std::string repeat(const std::string & str, size_t n);
-
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
     auto has_max = max_items != std::numeric_limits<int>::max();
@@ -128,8 +125,8 @@ static void _build_min_max_int
         if (sub_len > 0) {
             auto from_sub = from.substr(i + 1);
             auto to_sub = to.substr(i + 1);
-            auto sub_zeros = repeat("0", sub_len);
-            auto sub_nines = repeat("9", sub_len);
+            auto sub_zeros = string_repeat("0", sub_len);
+            auto sub_nines = string_repeat("9", sub_len);

             auto to_reached = false;
             out << "(";
@@ -188,8 +185,8 @@ static void _build_min_max_int
         auto max_digits = max_s.length();

         for (auto digits = min_digits; digits < max_digits; digits++) {
-            uniform_range(min_s, repeat("9", digits));
-            min_s = "1" + repeat("0", digits);
+            uniform_range(min_s, string_repeat("9", digits));
+            min_s = "1" + string_repeat("0", digits);
             out << " | ";
         }
         uniform_range(min_s, max_s);
@@ -318,49 +315,6 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
 std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

-template <typename Iterator>
-std::string join(Iterator begin, Iterator end, const std::string & separator) {
-    std::ostringstream result;
-    if (begin != end) {
-        result << *begin;
-        for (Iterator it = begin + 1; it != end; ++it) {
-            result << separator << *it;
-        }
-    }
-    return result.str();
-}
-
-static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
-    std::vector<std::string> tokens;
-    size_t start = 0;
-    size_t end = str.find(delimiter);
-
-    while (end != std::string::npos) {
-        tokens.push_back(str.substr(start, end - start));
-        start = end + delimiter.length();
-        end = str.find(delimiter, start);
-    }
-
-    tokens.push_back(str.substr(start));
-
-    return tokens;
-}
-
-static std::string repeat(const std::string & str, size_t n) {
-    if (n == 0) {
-        return "";
-    }
-
-    std::string result;
-    result.reserve(str.length() * n);
-
-    for (size_t i = 0; i < n; ++i) {
-        result += str;
-    }
-
-    return result;
-}
-
 static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
     std::smatch match;
     std::string result;
@@ -389,6 +343,7 @@ static std::string format_literal(const std::string & literal) {
 class SchemaConverter {
 private:
+    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
     std::function<json(const std::string &)> _fetch_json;
     bool _dotall;
     std::unordered_map<std::string, std::string> _rules;
@@ -418,7 +373,7 @@ private:
         for (size_t i = 0; i < alt_schemas.size(); i++) {
             rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
         }
-        return join(rules.begin(), rules.end(), " | ");
+        return string_join(rules, " | ");
     }

     std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@@ -481,7 +436,7 @@ private:
             for (const auto & item : ret) {
                 results.push_back(to_rule(item));
             }
-            return std::make_pair(join(results.begin(), results.end(), " "), false);
+            return std::make_pair(string_join(results, " "), false);
         };

         while (i < length) {
@@ -539,7 +494,7 @@ private:
                 }
                 curly_brackets += '}';
                 i++;
-                auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
+                auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
                 int min_times = 0;
                 int max_times = std::numeric_limits<int>::max();
                 try {
@@ -809,10 +764,11 @@ private:
 public:
     SchemaConverter(
         const std::function<json(const std::string &)> & fetch_json,
-        bool dotall)
+        bool dotall,
+        bool compact_spaces)
       : _fetch_json(fetch_json), _dotall(dotall)
     {
-        _rules["space"] = SPACE_RULE;
+        _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
     }

     void resolve_refs(json & schema, const std::string & url) {
@@ -854,7 +810,7 @@ public:
                     return;
                 }
                 std::string pointer = ref.substr(ref.find('#') + 1);
-                std::vector<std::string> tokens = split(pointer, "/");
+                std::vector<std::string> tokens = string_split(pointer, "/");
                 for (size_t i = 1; i < tokens.size(); ++i) {
                     std::string sel = tokens[i];
                     if (target.is_null() || !target.contains(sel)) {
@@ -905,7 +861,7 @@ public:
             for (const auto & v : schema["enum"]) {
                 enum_values.push_back(_generate_constant_rule(v));
             }
-            return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
+            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
         } else if ((schema_type.is_null() || schema_type == "object")
                 && (schema.contains("properties") ||
                     (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -1019,10 +975,10 @@ public:
     void check_errors() {
         if (!_errors.empty()) {
-            throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
+            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
         }
         if (!_warnings.empty()) {
-            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
+            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
         }
     }
@@ -1035,11 +991,35 @@ public:
     }
 };

-std::string json_schema_to_grammar(const json & schema) {
-    SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
-    auto copy = schema;
-    converter.resolve_refs(copy, "input");
-    converter.visit(copy, "");
+std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
+#ifdef LLAMA_USE_LLGUIDANCE
+    if (!force_gbnf) {
+        return "%llguidance {}\nstart: %json " + schema.dump();
+    }
+#else
+    (void)force_gbnf;
+#endif // LLAMA_USE_LLGUIDANCE
+    return build_grammar([&](const common_grammar_builder & callbacks) {
+        auto copy = schema;
+        callbacks.resolve_refs(copy);
+        callbacks.add_schema("", copy);
+    });
+}
+
+std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
+    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
+    common_grammar_builder builder {
+        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
+            return converter._add_rule(name, rule);
+        },
+        /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
+            return converter.visit(schema, name == "root" ? "" : name);
+        },
+        /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
+            converter.resolve_refs(schema, "");
+        }
+    };
+    cb(builder);
     converter.check_errors();
     return converter.format_grammar();
 }
@@ -5,4 +5,18 @@
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

-std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
+std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
+    bool force_gbnf = false);
+
+struct common_grammar_builder {
+    std::function<std::string(const std::string &, const std::string &)> add_rule;
+    std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
+    std::function<void(nlohmann::ordered_json &)> resolve_refs;
+};
+
+struct common_grammar_options {
+    bool dotall = false;
+    bool compact_spaces = false;
+};
+
+std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
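For orientation, a minimal sketch of the builder API declared above; the rule text is hypothetical, not from this diff:

    common_grammar_options opts;
    opts.compact_spaces = true;  // emit "\" \"?" for the space rule instead of the default SPACE_RULE

    std::string gbnf = build_grammar([](const common_grammar_builder & b) {
        b.add_rule("root", "\"yes\" | \"no\"");  // hypothetical top-level rule
    }, opts);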
#include "log.h" #include "log.h"
#include <chrono>
#include <condition_variable> #include <condition_variable>
#include <cstdarg> #include <cstdarg>
#include <cstdio> #include <cstdio>
@@ -14,16 +15,6 @@ void common_log_set_verbosity_thold(int verbosity) {
     common_log_verbosity_thold = verbosity;
 }

-#define LOG_COL_DEFAULT "\033[0m"
-#define LOG_COL_BOLD    "\033[1m"
-#define LOG_COL_RED     "\033[31m"
-#define LOG_COL_GREEN   "\033[32m"
-#define LOG_COL_YELLOW  "\033[33m"
-#define LOG_COL_BLUE    "\033[34m"
-#define LOG_COL_MAGENTA "\033[35m"
-#define LOG_COL_CYAN    "\033[36m"
-#define LOG_COL_WHITE   "\033[37m"
-
 static int64_t t_us() {
     return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@@ -206,6 +197,7 @@ public:
                 vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
             }
 #endif
+            va_end(args_copy);
         }

         entry.level = level;
@@ -2,9 +2,20 @@

 #include "ggml.h" // for ggml_log_level

+#define LOG_CLR_TO_EOL  "\033[K\r"
+#define LOG_COL_DEFAULT "\033[0m"
+#define LOG_COL_BOLD    "\033[1m"
+#define LOG_COL_RED     "\033[31m"
+#define LOG_COL_GREEN   "\033[32m"
+#define LOG_COL_YELLOW  "\033[33m"
+#define LOG_COL_BLUE    "\033[34m"
+#define LOG_COL_MAGENTA "\033[35m"
+#define LOG_COL_CYAN    "\033[36m"
+#define LOG_COL_WHITE   "\033[37m"
+
 #ifndef __GNUC__
 #    define LOG_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
+#elif defined(__MINGW32__) && !defined(__clang__)
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
@@ -113,7 +113,10 @@ struct common_sampler {
     void set_logits(struct llama_context * ctx, int idx) {
         const auto * logits = llama_get_logits_ith(ctx, idx);

-        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+
+        const int n_vocab = llama_vocab_n_tokens(vocab);

         cur.resize(n_vocab);
@@ -131,24 +134,47 @@ std::string common_params_sampling::print() const {
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
             "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
             dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
            mirostat, mirostat_eta, mirostat_tau);

     return std::string(result);
 }

 struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

     lparams.no_perf = params.no_perf;

+    struct llama_sampler * grmr;
+    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
+#ifdef LLAMA_USE_LLGUIDANCE
+        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+#else
+        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+#endif // LLAMA_USE_LLGUIDANCE
+    } else {
+        std::vector<const char *> trigger_words;
+        trigger_words.reserve(params.grammar_trigger_words.size());
+        for (const auto & str : params.grammar_trigger_words) {
+            trigger_words.push_back(str.word.c_str());
+        }
+        grmr = params.grammar_lazy
+             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
+                                               trigger_words.data(), trigger_words.size(),
+                                               params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
+             : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+    }
+
     auto * result = new common_sampler {
         /* .params = */ params,
-        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
+        /* .grmr   = */ grmr,
         /* .chain  = */ llama_sampler_chain_init(lparams),
         /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur    = */ {},
@@ -157,11 +183,16 @@ struct common_sampler * common_sampler_init
     llama_sampler_chain_add(result->chain,
             llama_sampler_init_logit_bias(
-                llama_n_vocab(model),
+                llama_vocab_n_tokens(vocab),
                 params.logit_bias.size(),
                 params.logit_bias.data()));

     if (params.mirostat == 0) {
+        if (params.top_n_sigma >= 0) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp        (params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+        } else {
         for (const auto & cnstr : params.samplers) {
             switch (cnstr) {
                 case COMMON_SAMPLER_TYPE_DRY:
@@ -172,7 +203,7 @@ struct common_sampler * common_sampler_init
                         c_breakers.push_back(str.c_str());
                     }

-                    llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                 }
                 break;
             case COMMON_SAMPLER_TYPE_TOP_K:
@@ -194,7 +225,7 @@ struct common_sampler * common_sampler_init
                 llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                 break;
             case COMMON_SAMPLER_TYPE_INFILL:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
+                llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
                 break;
             case COMMON_SAMPLER_TYPE_PENALTIES:
                 llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
@@ -203,10 +234,11 @@ struct common_sampler * common_sampler_init
                     GGML_ASSERT(false && "unknown sampler type");
             }
         }
+        }
         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
     } else if (params.mirostat == 1) {
         llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
     } else if (params.mirostat == 2) {
         llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
         llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
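Tracing the branch added above: with `mirostat == 0` and `top_n_sigma >= 0`, the configurable sampler list is bypassed and the chain reduces to a fixed pipeline. A hedged equivalent, with placeholder parameter values standing in for `params`:

    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));          // params.top_k (placeholder)
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));         // params.temp (placeholder)
    llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(1.5f));  // params.top_n_sigma (placeholder)
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));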
@@ -102,3 +102,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

 std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
+    const char * grammar_kind, const char * grammar_data);
@@ -55,6 +55,7 @@ CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
 CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);

 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
+CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);

 CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
@@ -73,6 +74,9 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
 CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

+/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
+CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
+
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);

 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
@@ -89,10 +93,14 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);

+CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
+
 CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

 #ifdef __cplusplus
 }
 #endif
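The new `clip_build_img_from_pixels` entry point lets callers skip stb_image decoding when pixels are already in memory. A sketch of intended use, assuming `clip_image_u8_init()` as the allocator (that function is not shown in this hunk) and an RGB buffer of exactly 3*nx*ny bytes:

    struct clip_image_u8 * img = clip_image_u8_init();    // assumed allocator, not in this diff
    clip_build_img_from_pixels(rgb_pixels, nx, ny, img);  // rgb_pixels: RGBRGB..., 3*nx*ny bytes
    // ... encode img, then release it:
    clip_image_u8_free(img);                              // assumed counterpart of the batch free above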
@@ -216,7 +216,7 @@ static bool clip_llava_handle_patches
     return true;
 }

-static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
+static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
     int width = image->nx;
     int height = image->ny;
     int num_patches = (height / patch_size) * (width / patch_size);
@@ -277,13 +277,7 @@ static bool encode_image_with_clip
                 encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
             }
             else {
-                int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
-                if (has_minicpmv_projector == 2) {
-                    encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
-                }
-                else if (has_minicpmv_projector == 3) {
-                    encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
-                }
+                encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
             }

             if (!encoded) {
@@ -313,6 +307,23 @@ static bool encode_image_with_clip
         load_image_size->height = img->ny;
         clip_add_load_image_size(ctx_clip, load_image_size);
         LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        delete[] img_res_v.data;
+        img_res_v.size = 0;
+        img_res_v.data = nullptr;
+    }
+    else if (clip_is_glm(ctx_clip)){
+        struct clip_image_size * load_image_size = clip_image_size_init();
+        load_image_size->width  = img_res_v.data[0].nx;
+        load_image_size->height = img_res_v.data[0].ny;
+        clip_add_load_image_size(ctx_clip, load_image_size);
+
+        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
+        int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
+        *n_img_pos = (pos * pos + 2);
+        if (!encoded){
+            LOG_ERR("Unable to encode image \n");
+            return false;
+        }
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
@@ -342,9 +353,10 @@ static bool encode_image_with_clip
         LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

         const int32_t * image_grid = clip_image_grid(ctx_clip);
+        const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);

         std::vector<std::pair<int, int>> grid_pinpoints;
-        for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
+        for (size_t i = 0; i < num_gridpoints; i += 2) {
             grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
         }
@@ -384,7 +396,7 @@ static bool encode_image_with_clip
 bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
     // make sure that the correct mmproj was used, i.e., compare apples to apples
-    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+    int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
     auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
     if (n_image_embd != n_llama_embd) {
         LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
@@ -394,10 +406,14 @@ bool llava_validate_embed_size
 }

 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    int num_max_patches = 6;
+    // Granite vision uses up to 10 patches + base patch
+    int num_max_patches = 11;
     if (clip_is_minicpmv(ctx_clip)) {
         num_max_patches = 10;
     }
+    if (clip_is_glm(ctx_clip)) {
+        num_max_patches = 1;
+    }
     float * image_embd;
     if (clip_is_qwen2vl(ctx_clip)) {
         // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
@@ -457,7 +473,7 @@ struct llava_embd_batch {
 };

 bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
-    int n_embd = llama_n_embd(llama_get_model(ctx_llama));
+    int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));

     for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
         int n_eval = image_embed->n_image_pos - i;
@@ -9,7 +9,7 @@
 #include "llama.h"

 struct llama_model_deleter {
-    void operator()(llama_model * model) { llama_free_model(model); }
+    void operator()(llama_model * model) { llama_model_free(model); }
 };

 struct llama_context_deleter {
@@ -20,11 +20,11 @@ struct llama_sampler_deleter {
     void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
 };

-struct llama_lora_adapter_deleter {
-    void operator()(llama_lora_adapter * lora_adapter) { llama_lora_adapter_free(lora_adapter); }
+struct llama_adapter_lora_deleter {
+    void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
 };

 typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
 typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
 typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
-typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr;
+typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;
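Renames aside, usage of these smart pointers is unchanged: the matching free function runs via the deleter when the pointer goes out of scope. A minimal sketch; the loader call and file name are assumptions, not shown in this diff:

    // assumes llama_model_load_from_file / llama_model_default_params from llama.h
    llama_model_ptr model(llama_model_load_from_file("model.gguf", llama_model_default_params()));
    // llama_model_free(...) is invoked automatically through llama_model_deleter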
#include "llama-adapter.h" #include "llama-adapter.h"
#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h" #include "llama-model.h"
#include <algorithm> #include <algorithm>
@@ -9,7 +11,7 @@
 // vec

-struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
@@ -17,7 +19,7 @@ struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     return tensors[il];
 }

-struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -26,12 +28,12 @@ struct ggml_tensor * llama_adapter_cvec::apply_to
     return cur;
 }

-static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+bool llama_adapter_cvec::init(const llama_model & model) {
     const auto & hparams = model.hparams;

-    GGML_ASSERT(cvec.tensors.empty());
-    GGML_ASSERT(cvec.ctxs.empty());
-    GGML_ASSERT(cvec.bufs.empty());
+    GGML_ASSERT(tensors.empty());
+    GGML_ASSERT(ctxs.empty());
+    GGML_ASSERT(bufs.empty());

     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +52,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
             }
             ctx_map[buft] = ctx;
-            cvec.ctxs.emplace_back(ctx);
+            ctxs.emplace_back(ctx);

             return ctx;
         }
@@ -59,21 +61,21 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     };

     // make tensors
-    cvec.tensors.reserve(hparams.n_layer);
-    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
             return false;
         }
         ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-        cvec.tensors.push_back(tensor);
+        tensors.push_back(tensor);
     }

     // allocate tensors / buffers and zero
-    cvec.bufs.reserve(ctx_map.size());
+    bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -83,14 +85,13 @@ bool llama_adapter_cvec::init(const llama_model & model) {
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.emplace_back(buf);
+        bufs.emplace_back(buf);
     }

     return true;
 }

-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
+int32_t llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -101,8 +102,8 @@ int32_t llama_adapter_cvec::apply(
     if (data == nullptr) {
         // disable the current control vector (but leave allocated for later)
-        cvec.layer_start = -1;
-        cvec.layer_end   = -1;
+        layer_start = -1;
+        layer_end   = -1;

         return 0;
     }
@@ -111,21 +112,21 @@ int32_t llama_adapter_cvec::apply(
         return 1;
     }

-    if (cvec.tensors.empty()) {
-        if (!llama_control_vector_init(cvec, model)) {
+    if (tensors.empty()) {
+        if (!init(model)) {
             return 1;
         }
     }

-    cvec.layer_start = il_start;
-    cvec.layer_end   = il_end;
+    layer_start = il_start;
+    layer_end   = il_end;

     for (size_t il = 1; il < hparams.n_layer; il++) {
-        assert(cvec.tensors[il] != nullptr);
+        assert(tensors[il] != nullptr);

         const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
         if (off + n_embd <= len) {
-            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
         }
     }
...@@ -134,7 +135,7 @@ int32_t llama_control_vector_apply( ...@@ -134,7 +135,7 @@ int32_t llama_control_vector_apply(
 // lora

-llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
     const std::string name(w->name);
     const auto pos = ab_map.find(name);
@@ -145,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
     return nullptr;
 }

-void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
-    delete adapter;
-}
-
-static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

     ggml_context * ctx_init;
@@ -221,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     };

     // bundle lora_a and lora_b into pairs
-    std::map<std::string, llama_lora_weight> ab_map;
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
     auto str_endswith = [](const std::string & str, const std::string & suffix) {
         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
     };
@@ -231,17 +228,21 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         if (str_endswith(name, ".lora_a")) {
             replace_all(name, ".lora_a", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(cur, nullptr);
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
             } else {
                 ab_map[name].a = cur;
             }
         } else if (str_endswith(name, ".lora_b")) {
             replace_all(name, ".lora_b", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(nullptr, cur);
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
             } else {
                 ab_map[name].b = cur;
             }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
         } else {
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
@@ -250,33 +251,41 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
+        llama_adapter_lora_weight & w = it.second;
+        bool is_token_embd = str_endswith(name, "token_embd.weight");

         if (!w.a || !w.b) {
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
         }

         // device buft and device ctx
-        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+        const auto * model_tensor = model.get_tensor(name.c_str());
         if (!model_tensor) {
-            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }

         struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
-        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
-        }
-        if (w.a->ne[1] != w.b->ne[0]) {
-            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
-        }
+        if (is_token_embd) {
+            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
+        }

         // save tensor to adapter
         struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
         struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
     }

     // allocate tensors / buffers and zero
@@ -318,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }

-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+    struct llama_adapter_lora * adapter = new llama_adapter_lora();

     try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -332,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,
     return nullptr;
 }
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+    delete adapter;
+}
 #pragma once

-#include "llama-impl.h"
-#include "llama-hparams.h"
+#include "llama.h"

 #include "ggml-cpp.h"

+#include <string>
 #include <unordered_map>
 #include <vector>

+// TODO: pimpl

 //
 // llama_adapter_cvec
 //

-// TODO: rename to llama_adapter_cvec
-struct llama_control_vector {
-    std::vector<ggml_context_ptr> ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
-
-    std::vector<struct ggml_tensor *> tensors; // per layer
-
-    int32_t layer_start = -1;
-    int32_t layer_end   = -1;
-
+struct llama_adapter_cvec {
     struct ggml_tensor * tensor_for(int il) const;

     struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
-};

-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
+    int32_t apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -36,31 +27,48 @@ int32_t llama_control_vector_apply(
         int32_t il_start,
         int32_t il_end);
+
+private:
+    bool init(const llama_model & model);
+
+    int32_t layer_start = -1;
+    int32_t layer_end   = -1;
+
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    std::vector<struct ggml_tensor *> tensors; // per layer
+};

 //
 // llama_adapter_lora
 //

-// TODO: rename to llama_adapter_lora_weight
-struct llama_lora_weight {
+struct llama_adapter_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;

-    llama_lora_weight() = default;
-    llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    // get actual scale based on rank and alpha
+    float get_scale(float alpha, float adapter_scale) const {
+        const float rank  = (float) b->ne[0];
+        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+        return scale;
+    }
+
+    llama_adapter_lora_weight() = default;
+    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };

-// TODO: rename to llama_adapter_lora
-struct llama_lora_adapter {
+struct llama_adapter_lora {
     // map tensor name to lora_a_b
-    std::unordered_map<std::string, struct llama_lora_weight> ab_map;
+    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;

     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;

     float alpha;

-    llama_lora_adapter() = default;
-    ~llama_lora_adapter() = default;
+    llama_adapter_lora() = default;
+    ~llama_adapter_lora() = default;

-    llama_lora_weight * get_weight(struct ggml_tensor * w);
+    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
 };
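The new get_scale helper bakes the usual LoRA scaling rule into the weight pair: the stored alpha divided by the adapter rank, multiplied by the caller-supplied scale, with a plain fall-through when no alpha was stored. The rank is read from b->ne[0]. A quick worked example with illustrative numbers:

    // Illustrative numbers only: an adapter saved with alpha = 16 and rank 8
    // (so b->ne[0] == 8), attached with adapter_scale = 0.5f:
    //   scale = adapter_scale * alpha / rank = 0.5f * 16.0f / 8.0f = 1.0f
    // With alpha == 0 (no alpha stored), get_scale() returns adapter_scale unchanged.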
@@ -28,6 +28,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2VL, "qwen2vl" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
+    { LLM_ARCH_PHIMOE, "phimoe" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
@@ -57,6 +58,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_NEMOTRON, "nemotron" },
     { LLM_ARCH_EXAONE, "exaone" },
     { LLM_ARCH_RWKV6, "rwkv6" },
+    { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
     { LLM_ARCH_GRANITE, "granite" },
     { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_CHAMELEON, "chameleon" },
@@ -107,6 +109,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
     { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
     { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
+    { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -179,6 +182,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+    { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
+    { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N, "tokenizer.chat_template.%s" },
     { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
     { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
     { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
@@ -622,6 +627,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHIMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_PLAMO,
         {
@@ -1036,6 +1062,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_OUTPUT, "output" },
             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
@@ -1182,6 +1211,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
             { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
             { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
             { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
             { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
             { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
@@ -1199,6 +1229,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
         },
     },
+    {
+        LLM_ARCH_RWKV6QWEN2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
+            { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
+            { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
+            { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_GRANITE,
         {
@@ -1253,6 +1309,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
         },
     },
+    {
+        LLM_ARCH_SOLAR,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_BSKCN_TV, "bskcn_tv" },
+        },
+    },
     {
         LLM_ARCH_WAVTOKENIZER_DEC,
         {
@@ -1278,24 +1352,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
         },
     },
-    {
-        LLM_ARCH_SOLAR,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
-            { LLM_TENSOR_BSKCN_TV, "bskcn_tv" },
-        },
-    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1399,6 +1455,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
     {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -1455,10 +1512,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 };

-LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {}
+LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

 std::string LLM_KV::operator()(llm_kv kv) const {
-    return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+    return suffix ? ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch), suffix)
+                  : ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
 }

 std::string LLM_TN_IMPL::str() const {
...
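The suffix argument defaults to nullptr, so existing LLM_KV(arch) call sites keep their old behavior; only suffix-aware lookups take the new branch, which passes the suffix as an additional format argument for %s-bearing keys such as "tokenizer.chat_template.%s". A hypothetical call site, not taken from this diff:

    // Hypothetical illustration of the two constructor forms:
    LLM_KV kv_plain(LLM_ARCH_LLAMA);             // kv_plain(LLM_KV_TOKENIZER_CHAT_TEMPLATE) -> "tokenizer.chat_template"
    LLM_KV kv_named(LLM_ARCH_LLAMA, "tool_use"); // "tool_use" is a made-up suffix fed to the %s-style key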
@@ -32,6 +32,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
+    LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
@@ -61,6 +62,7 @@ enum llm_arch {
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
+    LLM_ARCH_RWKV6QWEN2,
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
@@ -111,6 +113,7 @@ enum llm_kv {
     LLM_KV_TIME_DECAY_EXTRA_DIM,
     LLM_KV_RESIDUAL_SCALE,
     LLM_KV_EMBEDDING_SCALE,
+    LLM_KV_TOKEN_SHIFT_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -177,6 +180,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
+    LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+    LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -256,6 +261,7 @@ enum llm_tensor {
     LLM_TENSOR_TIME_MIX_LERP_V,
     LLM_TENSOR_TIME_MIX_LERP_R,
     LLM_TENSOR_TIME_MIX_LERP_G,
+    LLM_TENSOR_TIME_MIX_LERP_FUSED,
     LLM_TENSOR_TIME_MIX_FIRST,
     LLM_TENSOR_TIME_MIX_DECAY,
     LLM_TENSOR_TIME_MIX_DECAY_W1,
@@ -343,9 +349,10 @@ enum llm_tensor_layer {
 };

 struct LLM_KV {
-    LLM_KV(llm_arch arch);
+    LLM_KV(llm_arch arch, const char * suffix = nullptr);

     llm_arch arch;
+    const char * suffix;

     std::string operator()(llm_kv kv) const;
 };
...
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
     { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+    { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
     { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
     { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
     { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
@@ -50,6 +51,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
     { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
     { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
     { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
     { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
     { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
@@ -73,7 +75,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return tmpl.find(haystack) != std::string::npos;
     };
     if (tmpl_contains("<|im_start|>")) {
-        return LLM_CHAT_TEMPLATE_CHATML;
+        return tmpl_contains("<|im_sep|>")
+            ? LLM_CHAT_TEMPLATE_PHI_4
+            : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {
             return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -112,7 +116,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
-        return LLM_CHAT_TEMPLATE_FALCON_3;
+        return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -149,7 +153,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_MINICPM;
     } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
         return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
-    } else if (tmpl_contains(LU8("'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'"))) {
+    } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
         return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
     } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
@@ -269,6 +273,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_4) {
+        // chatml template
+        for (auto message : chat) {
+            ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
+        }
+        if (add_ass) {
+            ss << "<|im_start|>assistant<|im_sep|>";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
         // Falcon 3
         for (auto message : chat) {
@@ -429,6 +441,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << "\n" << message->content;
+        }
+        if (add_ass) {
+            ss << "<|assistant|>";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         for (auto message : chat) {
...
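Hand-tracing the two new writer branches above for a single user turn {role: "user", content: "Hi"} with add_ass set gives the following shapes (derived from the stream operations shown, not output captured from this diff):

    // PHI_4:   <|im_start|>user<|im_sep|>Hi<|im_end|><|im_start|>assistant<|im_sep|>
    // GLMEDGE: <|user|>
    //          Hi<|assistant|>     (the role tag is followed by a literal newline)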