Unverified Commit 0cf7794b authored by Daniel Hiltgen's avatar Daniel Hiltgen Committed by GitHub
Browse files

ggml update to b7108 (#12992)

* Revert "vulkan: temporary carry of vulkan fixes (#12971)"

This reverts commit 3a9e8e9f.

* ggml update to b7087

* fix argsort on metal

* update to b7108

* fix bakllava regression

This model lacks the metadata for the projector type.

* update to b7209

* fix TopK perf

* only build arm code on arm
parent 854d40ed
...@@ -6,8 +6,10 @@ ...@@ -6,8 +6,10 @@
#include <cmath> #include <cmath>
#include <algorithm> #include <algorithm>
#include <cstdint>
#include <stdexcept> #include <stdexcept>
#define MAX_REPETITION_THRESHOLD 2000
// //
// helpers // helpers
// //
...@@ -345,8 +347,10 @@ const char * llama_grammar_parser::parse_sequence( ...@@ -345,8 +347,10 @@ const char * llama_grammar_parser::parse_sequence(
size_t last_sym_start = rule.size(); size_t last_sym_start = rule.size();
const char * pos = src; const char * pos = src;
auto handle_repetitions = [&](int min_times, int max_times) { // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
// (though it's technically the same as -1 now)
auto handle_repetitions = [&](uint64_t min_times, uint64_t max_times) {
bool no_max = max_times == UINT64_MAX;
if (last_sym_start == rule.size()) { if (last_sym_start == rule.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos); throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
} }
...@@ -373,20 +377,20 @@ const char * llama_grammar_parser::parse_sequence( ...@@ -373,20 +377,20 @@ const char * llama_grammar_parser::parse_sequence(
rule.resize(last_sym_start); rule.resize(last_sym_start);
} else { } else {
// Repeat the previous elements (min_times - 1) times // Repeat the previous elements (min_times - 1) times
for (int i = 1; i < min_times; i++) { for (uint64_t i = 1; i < min_times; i++) {
rule.insert(rule.end(), prev_rule.begin(), prev_rule.end()); rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
} }
} }
uint32_t last_rec_rule_id = 0; uint32_t last_rec_rule_id = 0;
auto n_opt = max_times < 0 ? 1 : max_times - min_times; auto n_opt = no_max ? 1 : max_times - min_times;
llama_grammar_rule rec_rule(prev_rule); llama_grammar_rule rec_rule(prev_rule);
for (int i = 0; i < n_opt; i++) { for (uint64_t i = 0; i < n_opt; i++) {
rec_rule.resize(prev_rule.size()); rec_rule.resize(prev_rule.size());
uint32_t rec_rule_id = generate_symbol_id( rule_name); uint32_t rec_rule_id = generate_symbol_id( rule_name);
if (i > 0 || max_times < 0) { if (i > 0 || no_max) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id}); rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, no_max ? rec_rule_id : last_rec_rule_id});
} }
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0}); rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
rec_rule.push_back({LLAMA_GRETYPE_END, 0}); rec_rule.push_back({LLAMA_GRETYPE_END, 0});
...@@ -478,10 +482,10 @@ const char * llama_grammar_parser::parse_sequence( ...@@ -478,10 +482,10 @@ const char * llama_grammar_parser::parse_sequence(
throw std::runtime_error(std::string("expecting an int at ") + pos); throw std::runtime_error(std::string("expecting an int at ") + pos);
} }
const char * int_end = parse_int(pos); const char * int_end = parse_int(pos);
int min_times = std::stoul(std::string(pos, int_end - pos)); uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested); pos = parse_space(int_end, is_nested);
int max_times = -1; uint64_t max_times = UINT64_MAX; // default: no max limit
if (*pos == '}') { if (*pos == '}') {
max_times = min_times; max_times = min_times;
...@@ -502,6 +506,10 @@ const char * llama_grammar_parser::parse_sequence( ...@@ -502,6 +506,10 @@ const char * llama_grammar_parser::parse_sequence(
} else { } else {
throw std::runtime_error(std::string("expecting ',' at ") + pos); throw std::runtime_error(std::string("expecting ',' at ") + pos);
} }
bool has_max = max_times != UINT64_MAX;
if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) {
throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions"));
}
handle_repetitions(min_times, max_times); handle_repetitions(min_times, max_times);
} else { } else {
break; break;
......
...@@ -810,6 +810,9 @@ ggml_tensor * llm_graph_context::build_ffn( ...@@ -810,6 +810,9 @@ ggml_tensor * llm_graph_context::build_ffn(
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
//expand here so that we can fuse ffn gate
ggml_build_forward_expand(gf, cur);
if (gate && type_gate == LLM_FFN_PAR) { if (gate && type_gate == LLM_FFN_PAR) {
cur = ggml_mul(ctx0, cur, tmp); cur = ggml_mul(ctx0, cur, tmp);
cb(cur, "ffn_gate_par", il); cb(cur, "ffn_gate_par", il);
...@@ -958,14 +961,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ...@@ -958,14 +961,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// organize experts into n_expert_groups // organize experts into n_expert_groups
ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens] ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens] ggml_tensor * group_scores = ggml_argsort_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens] group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
// get top n_group_used expert groups // get top n_group_used expert groups
group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens] group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens] group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens] ggml_tensor * expert_groups = ggml_argsort_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
cb(expert_groups, "ffn_moe_group_topk", il); cb(expert_groups, "ffn_moe_group_topk", il);
// mask out the other groups // mask out the other groups
...@@ -976,7 +979,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ...@@ -976,7 +979,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
} }
// select experts // select experts
ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
cb(selected_experts->src[0], "ffn_moe_argsort", il); cb(selected_experts->src[0], "ffn_moe_argsort", il);
cb(selected_experts, "ffn_moe_topk", il); cb(selected_experts, "ffn_moe_topk", il);
...@@ -1006,10 +1009,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ...@@ -1006,10 +1009,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
cb(weights_sum, "ffn_moe_weights_sum", il); cb(weights_sum, "ffn_moe_weights_sum", il);
if (arch == LLM_ARCH_BAILINGMOE2) { // Avoid division by zero, clamp to smallest number representable by F16
weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20); weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
cb(weights_sum, "ffn_moe_weights_sum_biased", il); cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
}
weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
cb(weights, "ffn_moe_weights_norm", il); cb(weights, "ffn_moe_weights_norm", il);
...@@ -1091,6 +1093,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ...@@ -1091,6 +1093,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
//expand here so that we can fuse ffn gate
ggml_build_forward_expand(gf, cur);
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il); cb(experts, "ffn_moe_down", il);
...@@ -1137,7 +1142,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ...@@ -1137,7 +1142,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// input embeddings with optional lora // input embeddings with optional lora
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd_inp();
auto inp = std::make_unique<llm_graph_input_embd>(); auto inp = std::make_unique<llm_graph_input_embd>();
...@@ -1274,7 +1279,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const { ...@@ -1274,7 +1279,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
// return cur; // return cur;
//} //}
const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd; const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train; const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
...@@ -1587,9 +1592,10 @@ ggml_tensor * llm_graph_context::build_attn( ...@@ -1587,9 +1592,10 @@ ggml_tensor * llm_graph_context::build_attn(
int il) const { int il) const {
// these nodes are added to the graph together so that they are not reordered // these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced // by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache
ggml_build_forward_expand(gf, q_cur); ggml_build_forward_expand(gf, q_cur);
ggml_build_forward_expand(gf, k_cur);
ggml_build_forward_expand(gf, v_cur); ggml_build_forward_expand(gf, v_cur);
ggml_build_forward_expand(gf, k_cur);
const auto * mctx_cur = inp->mctx; const auto * mctx_cur = inp->mctx;
...@@ -2030,7 +2036,7 @@ int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buck ...@@ -2030,7 +2036,7 @@ int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buck
if (bidirectional) { if (bidirectional) {
relative_bucket += (relative_position > 0) * n_buckets; relative_bucket += (relative_position > 0) * n_buckets;
relative_position = abs(relative_position); relative_position = std::abs(relative_position);
} else { } else {
relative_position = -std::min<int32_t>(relative_position, 0); relative_position = -std::min<int32_t>(relative_position, 0);
} }
......
...@@ -60,6 +60,16 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const { ...@@ -60,6 +60,16 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
return n_head/n_head_kv; return n_head/n_head_kv;
} }
// total dimension of the model input embeddings:
// the main n_embd slice plus one extra n_embd-sized slice per deepstack layer
// (deepstack layers are used by e.g. qwen3vl; 0 for all other models)
uint32_t llama_hparams::n_embd_inp() const {
    // adding n_embd*0 is a no-op, so no special-casing is needed
    return n_embd + n_embd*n_deepstack_layers;
}
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const { uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il); const uint32_t n_head_kv = this->n_head_kv(il);
...@@ -148,7 +158,7 @@ bool llama_hparams::is_recurrent(uint32_t il) const { ...@@ -148,7 +158,7 @@ bool llama_hparams::is_recurrent(uint32_t il) const {
} }
uint32_t llama_hparams::n_pos_per_embd() const { uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1; return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
} }
bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const { bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
// bump if necessary // bump if necessary
#define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_LAYERS 512
#define LLAMA_MAX_EXPERTS 384 // Kimi-K2 #define LLAMA_MAX_EXPERTS 512 // Qwen3 Next
enum llama_expert_gating_func_type { enum llama_expert_gating_func_type {
LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0, LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
...@@ -185,6 +185,9 @@ struct llama_hparams { ...@@ -185,6 +185,9 @@ struct llama_hparams {
std::array<float, LLAMA_MAX_LAYERS> xielu_beta; std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
std::array<float, LLAMA_MAX_LAYERS> xielu_eps; std::array<float, LLAMA_MAX_LAYERS> xielu_eps;
// qwen3vl deepstack
uint32_t n_deepstack_layers = 0;
// needed by encoder-decoder models (e.g. T5, FLAN-T5) // needed by encoder-decoder models (e.g. T5, FLAN-T5)
// ref: https://github.com/ggerganov/llama.cpp/pull/8141 // ref: https://github.com/ggerganov/llama.cpp/pull/8141
llama_token dec_start_token_id = LLAMA_TOKEN_NULL; llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
...@@ -226,6 +229,9 @@ struct llama_hparams { ...@@ -226,6 +229,9 @@ struct llama_hparams {
uint32_t n_gqa(uint32_t il = 0) const; uint32_t n_gqa(uint32_t il = 0) const;
// dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const;
// dimension of key embeddings across all k-v heads // dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const; uint32_t n_embd_k_gqa(uint32_t il = 0) const;
......
...@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state; ...@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {} time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
time_meas::~time_meas() { time_meas::~time_meas() {
if (t_start_us >= 0) { if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us; t_acc += ggml_time_us() - t_start_us;
}
} }
}
void llama_log_set(ggml_log_callback log_callback, void * user_data) { void llama_log_set(ggml_log_callback log_callback, void * user_data) {
ggml_log_set(log_callback, user_data); ggml_log_set(log_callback, user_data);
......
...@@ -45,7 +45,9 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( ...@@ -45,7 +45,9 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
const uint32_t size_base = kv_size; const uint32_t size_base = kv_size;
uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad)); // note: the SWA cache is always padded to 256 for performance
// https://github.com/ggml-org/llama.cpp/issues/17037
uint32_t size_swa = GGML_PAD(std::min(size_base, hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch), 256);
// when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
if (swa_full) { if (swa_full) {
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <cmath> #include <cmath>
#include <cstring>
#include <limits> #include <limits>
#include <map> #include <map>
#include <stdexcept> #include <stdexcept>
...@@ -37,8 +38,15 @@ llama_kv_cache::llama_kv_cache( ...@@ -37,8 +38,15 @@ llama_kv_cache::llama_kv_cache(
const uint32_t n_layer_kv = hparams.n_layer_kv(); const uint32_t n_layer_kv = hparams.n_layer_kv();
// comparator for the buft -> ctx map: order buffer types by their name so
// that iteration over the map is deterministic across runs
struct ggml_backend_buft_comparator {
    bool operator()(const ggml_backend_buffer_type_t & a, const ggml_backend_buffer_type_t & b) const {
        const int cmp = strcmp(ggml_backend_buft_name(a), ggml_backend_buft_name(b));
        return cmp < 0;
    }
};
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
// create a context for each buffer type // create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft); auto it = ctx_map.find(buft);
if (it == ctx_map.end()) { if (it == ctx_map.end()) {
...@@ -53,13 +61,12 @@ llama_kv_cache::llama_kv_cache( ...@@ -53,13 +61,12 @@ llama_kv_cache::llama_kv_cache(
return nullptr; return nullptr;
} }
ctx_map[buft] = ctx; ctx_map.emplace(buft, ctx);
ctxs.emplace_back(ctx);
return ctx; return ctx;
} }
return it->second; return it->second.get();
}; };
GGML_ASSERT(n_stream == 1 || n_stream == n_seq_max); GGML_ASSERT(n_stream == 1 || n_stream == n_seq_max);
...@@ -167,11 +174,8 @@ llama_kv_cache::llama_kv_cache( ...@@ -167,11 +174,8 @@ llama_kv_cache::llama_kv_cache(
} }
// allocate tensors and initialize the buffers to avoid NaNs in the padding // allocate tensors and initialize the buffers to avoid NaNs in the padding
for (auto it : ctx_map) { for (auto & [buft, ctx] : ctx_map) {
auto * buft = it.first; ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
auto * ctx = it.second;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) { if (!buf) {
throw std::runtime_error("failed to allocate buffer for kv cache"); throw std::runtime_error("failed to allocate buffer for kv cache");
} }
...@@ -179,7 +183,7 @@ llama_kv_cache::llama_kv_cache( ...@@ -179,7 +183,7 @@ llama_kv_cache::llama_kv_cache(
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
ggml_backend_buffer_clear(buf, 0); ggml_backend_buffer_clear(buf, 0);
bufs.emplace_back(buf); ctxs_bufs.emplace_back(std::move(ctx), buf);
} }
{ {
...@@ -203,7 +207,7 @@ void llama_kv_cache::clear(bool data) { ...@@ -203,7 +207,7 @@ void llama_kv_cache::clear(bool data) {
} }
if (data) { if (data) {
for (auto & buf : bufs) { for (auto & [_, buf] : ctxs_bufs) {
ggml_backend_buffer_clear(buf.get(), 0); ggml_backend_buffer_clear(buf.get(), 0);
} }
} }
...@@ -334,6 +338,8 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll ...@@ -334,6 +338,8 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll
llama_pos pos = v_cells[s0].pos_get(i); llama_pos pos = v_cells[s0].pos_get(i);
llama_pos shift = v_cells[s0].get_shift(i); llama_pos shift = v_cells[s0].get_shift(i);
llama_kv_cell_ext ext = v_cells[s0].ext_get(i);
if (shift != 0) { if (shift != 0) {
pos -= shift; pos -= shift;
assert(pos >= 0); assert(pos >= 0);
...@@ -345,6 +351,8 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll ...@@ -345,6 +351,8 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll
if (shift != 0) { if (shift != 0) {
v_cells[s1].pos_add(i, shift); v_cells[s1].pos_add(i, shift);
} }
v_cells[s1].ext_set(i, ext);
} }
} }
...@@ -379,6 +387,7 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) { ...@@ -379,6 +387,7 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) {
void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_add() is only supported for n_pos_per_embd() == 1");
auto & cells = v_cells[seq_to_stream[seq_id]]; auto & cells = v_cells[seq_to_stream[seq_id]];
auto & head = v_heads[seq_to_stream[seq_id]]; auto & head = v_heads[seq_to_stream[seq_id]];
...@@ -423,6 +432,7 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll ...@@ -423,6 +432,7 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll
void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_div() is only supported for n_pos_per_embd() == 1");
auto & cells = v_cells[seq_to_stream[seq_id]]; auto & cells = v_cells[seq_to_stream[seq_id]];
...@@ -472,8 +482,8 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const { ...@@ -472,8 +482,8 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const { std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret; std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : bufs) { for (const auto & [_, buf] : ctxs_bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get()); ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
} }
return ret; return ret;
} }
...@@ -896,6 +906,14 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ...@@ -896,6 +906,14 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch &
cells.pos_set(idx, ubatch.pos[i]); cells.pos_set(idx, ubatch.pos[i]);
if (ubatch.is_pos_2d()) {
llama_kv_cell_ext ext {
/*.x =*/ ubatch.pos[i + ubatch.n_tokens*2],
/*.y =*/ ubatch.pos[i + ubatch.n_tokens],
};
cells.ext_set(idx, ext);
}
for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) { for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
cells.seq_add(idx, ubatch.seq_id[i][s]); cells.seq_add(idx, ubatch.seq_id[i][s]);
} }
...@@ -957,10 +975,14 @@ bool llama_kv_cache::get_has_shift() const { ...@@ -957,10 +975,14 @@ bool llama_kv_cache::get_has_shift() const {
uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const { uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
uint32_t result = 0; uint32_t result = 0;
// pad the n_kv value so that the graph remains constant across batches and can be reused
// note: this also helps some backends with performance (f.ex https://github.com/ggml-org/llama.cpp/pull/16812#issuecomment-3455112220)
const uint32_t n_pad_cur = std::max(n_pad, 256u);
for (uint32_t s = 0; s < sinfo.n_stream(); ++s) { for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
const auto & cells = v_cells[sinfo.strm[s]]; const auto & cells = v_cells[sinfo.strm[s]];
result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result); result = std::max(std::min(cells.size(), std::max(n_pad_cur, GGML_PAD(cells.used_max_p1(), n_pad_cur))), result);
} }
return result; return result;
...@@ -1239,6 +1261,11 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u ...@@ -1239,6 +1261,11 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
const llama_pos p1 = ubatch->pos[i]; const llama_pos p1 = ubatch->pos[i];
// for M-RoPE
const bool is_2d = ubatch->is_pos_2d();
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii); const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
for (uint32_t j = 0; j < n_kv; ++j) { for (uint32_t j = 0; j < n_kv; ++j) {
...@@ -1258,6 +1285,14 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u ...@@ -1258,6 +1285,14 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
continue; continue;
} }
// M-RoPE causal mask
if (causal_attn && is_2d && p0 == p1) {
const auto & p0_ext = cells.ext_get(j);
if (p0_ext.is_2d_gt(p1_x, p1_y)) {
continue;
}
}
// apply SWA if any // apply SWA if any
if (is_masked_swa(p0, p1)) { if (is_masked_swa(p0, p1)) {
continue; continue;
...@@ -1298,7 +1333,7 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch ...@@ -1298,7 +1333,7 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
size_t llama_kv_cache::total_size() const { size_t llama_kv_cache::total_size() const {
size_t size = 0; size_t size = 0;
for (const auto & buf : bufs) { for (const auto & [_, buf] : ctxs_bufs) {
size += ggml_backend_buffer_get_size(buf.get()); size += ggml_backend_buffer_get_size(buf.get());
} }
...@@ -1340,7 +1375,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift( ...@@ -1340,7 +1375,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
const auto & yarn_beta_slow = cparams.yarn_beta_slow; const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & n_rot = hparams.n_rot; const auto & n_rot = hparams.n_rot;
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
// @ngxson : this is a workaround // @ngxson : this is a workaround
// for M-RoPE, we want to rotate the whole vector when doing KV shift // for M-RoPE, we want to rotate the whole vector when doing KV shift
// a normal RoPE should work, we just need to use the correct ordering // a normal RoPE should work, we just need to use the correct ordering
...@@ -1551,6 +1586,9 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t ...@@ -1551,6 +1586,9 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t
io.write(&pos, sizeof(pos)); io.write(&pos, sizeof(pos));
io.write(&n_seq_id, sizeof(n_seq_id)); io.write(&n_seq_id, sizeof(n_seq_id));
// TODO: we also need to save llama_kv_cell_ext when apply_ubatch() support loading it
// see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
for (const auto & seq_id : seq_ids) { for (const auto & seq_id : seq_ids) {
io.write(&seq_id, sizeof(seq_id)); io.write(&seq_id, sizeof(seq_id));
} }
...@@ -1696,6 +1734,8 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32 ...@@ -1696,6 +1734,8 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
return false; return false;
} }
// TODO: we cannot yet restore llama_kv_cell_ext as the apply_ubatch() does not support it yet
// see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
apply_ubatch(sinfo, ubatch); apply_ubatch(sinfo, ubatch);
const auto head_cur = sinfo.head(); const auto head_cur = sinfo.head();
...@@ -2010,8 +2050,3 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub ...@@ -2010,8 +2050,3 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub
void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
kv->set_input_pos_bucket(dst, ubatch); kv->set_input_pos_bucket(dst, ubatch);
} }
// returns the KV cell count padding required by the attention kernels
uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) {
    // the FA kernels require padding to avoid extra runtime boundary checks
    if (cparams.flash_attn) {
        return 256u;
    }

    return 32u;
}
...@@ -19,8 +19,6 @@ struct llama_context; ...@@ -19,8 +19,6 @@ struct llama_context;
class llama_kv_cache : public llama_memory_i { class llama_kv_cache : public llama_memory_i {
public: public:
static uint32_t get_padding(const llama_cparams & cparams);
struct stream_copy_info { struct stream_copy_info {
bool empty() const { bool empty() const {
assert(ssrc.size() == sdst.size()); assert(ssrc.size() == sdst.size());
...@@ -217,8 +215,8 @@ private: ...@@ -217,8 +215,8 @@ private:
// this is the SWA type of the cache - not to be confused with the model SWA type // this is the SWA type of the cache - not to be confused with the model SWA type
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
std::vector<ggml_context_ptr> ctxs; // ggml contexts for the KV cache along with the allocated backend buffers:
std::vector<ggml_backend_buffer_ptr> bufs; std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
// the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot()) // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
// note: this is not part of the KV state and it's only used to speed-up the find_slot() method // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
......
...@@ -5,9 +5,27 @@ ...@@ -5,9 +5,27 @@
#include <bitset> #include <bitset>
#include <cassert> #include <cassert>
#include <vector> #include <cstring>
#include <set>
#include <map> #include <map>
#include <set>
#include <vector>
struct llama_kv_cell_ext {
// 2D spatial positions, typically used for M-RoPE
llama_pos x = 0;
llama_pos y = 0;
// return true if the current 2D spatial position is greater than other
bool is_2d_gt(llama_pos ox, llama_pos oy) const {
return (y > oy) || (y == oy && x > ox);
}
void reset() {
static_assert(std::is_trivially_copyable_v<llama_kv_cell_ext>);
memset(this, 0, sizeof(*this));
}
};
// meta information about KV cells that can be part of multiple sequences at the same time // meta information about KV cells that can be part of multiple sequences at the same time
// TODO: add unit tests // TODO: add unit tests
...@@ -16,6 +34,7 @@ public: ...@@ -16,6 +34,7 @@ public:
void reset() { void reset() {
for (uint32_t i = 0; i < pos.size(); ++i) { for (uint32_t i = 0; i < pos.size(); ++i) {
pos[i] = -1; pos[i] = -1;
ext[i].reset();
shift[i] = 0; shift[i] = 0;
seq[i].reset(); seq[i].reset();
} }
...@@ -43,6 +62,7 @@ public: ...@@ -43,6 +62,7 @@ public:
void resize(uint32_t n) { void resize(uint32_t n) {
pos.resize(n); pos.resize(n);
ext.resize(n);
shift.resize(n); shift.resize(n);
seq.resize(n); seq.resize(n);
...@@ -108,6 +128,7 @@ public: ...@@ -108,6 +128,7 @@ public:
const auto idx = i + j; const auto idx = i + j;
res.pos[j] = pos[idx]; res.pos[j] = pos[idx];
res.ext[j] = ext[idx];
res.seq[j] = seq[idx]; res.seq[j] = seq[idx];
assert(shift[idx] == 0); assert(shift[idx] == 0);
...@@ -126,6 +147,7 @@ public: ...@@ -126,6 +147,7 @@ public:
const auto idx = idxs[j]; const auto idx = idxs[j];
res.pos[j] = pos[idx]; res.pos[j] = pos[idx];
res.ext[j] = ext[idx];
res.seq[j] = seq[idx]; res.seq[j] = seq[idx];
assert(shift[idx] == 0); assert(shift[idx] == 0);
...@@ -154,6 +176,7 @@ public: ...@@ -154,6 +176,7 @@ public:
} }
pos[idx] = other.pos[j]; pos[idx] = other.pos[j];
ext[idx] = other.ext[j];
seq[idx] = other.seq[j]; seq[idx] = other.seq[j];
if (pos[idx] != -1) { if (pos[idx] != -1) {
...@@ -184,6 +207,7 @@ public: ...@@ -184,6 +207,7 @@ public:
} }
pos[idx] = other.pos[j]; pos[idx] = other.pos[j];
ext[idx] = other.ext[j];
seq[idx] = other.seq[j]; seq[idx] = other.seq[j];
if (pos[idx] != -1) { if (pos[idx] != -1) {
...@@ -203,6 +227,7 @@ public: ...@@ -203,6 +227,7 @@ public:
seq[i].reset(); seq[i].reset();
pos[i] = -1; pos[i] = -1;
ext[i].reset();
shift[i] = 0; shift[i] = 0;
used.erase(i); used.erase(i);
...@@ -221,6 +246,7 @@ public: ...@@ -221,6 +246,7 @@ public:
if (seq[i].none()) { if (seq[i].none()) {
pos[i] = -1; pos[i] = -1;
ext[i].reset();
shift[i] = 0; shift[i] = 0;
used.erase(i); used.erase(i);
...@@ -250,6 +276,7 @@ public: ...@@ -250,6 +276,7 @@ public:
seq[i].reset(); seq[i].reset();
pos[i] = -1; pos[i] = -1;
ext[i].reset();
shift[i] = 0; shift[i] = 0;
used.erase(i); used.erase(i);
...@@ -340,6 +367,13 @@ public: ...@@ -340,6 +367,13 @@ public:
return pos[i]; return pos[i];
} }
const llama_kv_cell_ext & ext_get(uint32_t i) const {
assert(i < pos.size());
assert(pos[i] != -1);
return ext[i];
}
// note: call only if the cell is not empty // note: call only if the cell is not empty
llama_pos get_shift(uint32_t i) const { llama_pos get_shift(uint32_t i) const {
assert(i < pos.size()); assert(i < pos.size());
...@@ -368,6 +402,11 @@ public: ...@@ -368,6 +402,11 @@ public:
used.insert(i); used.insert(i);
} }
void ext_set(uint32_t i, llama_kv_cell_ext p) {
assert(i < ext.size());
ext[i] = p;
}
// pos[i] = pos[i] + d // pos[i] = pos[i] + d
// sets "has_shift" to true // sets "has_shift" to true
// note: call only if the cell is not empty // note: call only if the cell is not empty
...@@ -424,6 +463,9 @@ private: ...@@ -424,6 +463,9 @@ private:
std::vector<llama_pos> pos; std::vector<llama_pos> pos;
// stores extra info per cell
std::vector<llama_kv_cell_ext> ext;
// this array accumulates any applied shifts to the pos array since the last reset_shift() call // this array accumulates any applied shifts to the pos array since the last reset_shift() call
// this is used to queue multiple updates to the pos array, which in the end can be applied in one go: // this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
// //
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <cstring>
#include <limits> #include <limits>
#include <map> #include <map>
#include <stdexcept> #include <stdexcept>
...@@ -32,8 +33,15 @@ llama_memory_recurrent::llama_memory_recurrent( ...@@ -32,8 +33,15 @@ llama_memory_recurrent::llama_memory_recurrent(
cells.clear(); cells.clear();
cells.resize(mem_size); cells.resize(mem_size);
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
struct ggml_backend_buft_comparator {
bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
}
};
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
// create a context for each buffer type // create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft); auto it = ctx_map.find(buft);
if (it == ctx_map.end()) { if (it == ctx_map.end()) {
...@@ -48,13 +56,12 @@ llama_memory_recurrent::llama_memory_recurrent( ...@@ -48,13 +56,12 @@ llama_memory_recurrent::llama_memory_recurrent(
return nullptr; return nullptr;
} }
ctx_map[buft] = ctx; ctx_map.emplace(buft, ctx);
ctxs.emplace_back(ctx);
return ctx; return ctx;
} }
return it->second; return it->second.get();
}; };
r_l.resize(n_layer); r_l.resize(n_layer);
...@@ -93,17 +100,14 @@ llama_memory_recurrent::llama_memory_recurrent( ...@@ -93,17 +100,14 @@ llama_memory_recurrent::llama_memory_recurrent(
} }
// allocate tensors and initialize the buffers to avoid NaNs in the padding // allocate tensors and initialize the buffers to avoid NaNs in the padding
for (auto it : ctx_map) { for (auto & [buft, ctx] : ctx_map) {
auto * buft = it.first; ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
auto * ctx = it.second;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) { if (!buf) {
throw std::runtime_error("failed to allocate buffer for rs cache"); throw std::runtime_error("failed to allocate buffer for rs cache");
} }
ggml_backend_buffer_clear(buf, 0); ggml_backend_buffer_clear(buf, 0);
LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
bufs.emplace_back(buf); ctxs_bufs.emplace_back(std::move(ctx), buf);
} }
{ {
...@@ -129,7 +133,7 @@ void llama_memory_recurrent::clear(bool data) { ...@@ -129,7 +133,7 @@ void llama_memory_recurrent::clear(bool data) {
used = 0; used = 0;
if (data) { if (data) {
for (auto & buf : bufs) { for (auto & [_, buf] : ctxs_bufs) {
ggml_backend_buffer_clear(buf.get(), 0); ggml_backend_buffer_clear(buf.get(), 0);
} }
} }
...@@ -147,7 +151,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos ...@@ -147,7 +151,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
p1 = std::numeric_limits<llama_pos>::max(); p1 = std::numeric_limits<llama_pos>::max();
} }
// models like Mamba or RWKV can't have a state partially erased // models like Mamba or RWKV can't have a state partially erased at the end
// of the sequence because their state isn't preserved for previous tokens
if (seq_id >= (int64_t) size) { if (seq_id >= (int64_t) size) {
// could be fatal // could be fatal
return false; return false;
...@@ -156,8 +161,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos ...@@ -156,8 +161,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
int32_t & tail_id = cells[seq_id].tail; int32_t & tail_id = cells[seq_id].tail;
if (tail_id >= 0) { if (tail_id >= 0) {
const auto & cell = cells[tail_id]; const auto & cell = cells[tail_id];
// partial intersection is invalid // partial intersection is invalid if it includes the final pos
if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) { if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n"); //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
return false; return false;
} }
...@@ -364,8 +369,8 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const { ...@@ -364,8 +369,8 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const { std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret; std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : bufs) { for (const auto & [_, buf] : ctxs_bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get()); ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
} }
return ret; return ret;
} }
...@@ -662,7 +667,7 @@ bool llama_memory_recurrent::get_can_shift() const { ...@@ -662,7 +667,7 @@ bool llama_memory_recurrent::get_can_shift() const {
size_t llama_memory_recurrent::total_size() const { size_t llama_memory_recurrent::total_size() const {
size_t size = 0; size_t size = 0;
for (const auto & buf : bufs) { for (const auto & [_, buf] : ctxs_bufs) {
size += ggml_backend_buffer_get_size(buf.get()); size += ggml_backend_buffer_get_size(buf.get());
} }
......
...@@ -109,8 +109,8 @@ private: ...@@ -109,8 +109,8 @@ private:
const uint32_t n_seq_max = 1; const uint32_t n_seq_max = 1;
std::vector<ggml_context_ptr> ctxs; // ggml contexts for the KV cache along with the allocated backend buffers:
std::vector<ggml_backend_buffer_ptr> bufs; std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
size_t total_size() const; size_t total_size() const;
......
This diff is collapsed.
...@@ -77,6 +77,7 @@ enum llm_type { ...@@ -77,6 +77,7 @@ enum llm_type {
LLM_TYPE_16B, LLM_TYPE_16B,
LLM_TYPE_20B, LLM_TYPE_20B,
LLM_TYPE_22B, LLM_TYPE_22B,
LLM_TYPE_26B,
LLM_TYPE_27B, LLM_TYPE_27B,
LLM_TYPE_30B, LLM_TYPE_30B,
LLM_TYPE_32B, LLM_TYPE_32B,
...@@ -113,8 +114,10 @@ enum llm_type { ...@@ -113,8 +114,10 @@ enum llm_type {
LLM_TYPE_16B_A1B, LLM_TYPE_16B_A1B,
LLM_TYPE_21B_A3B, // Ernie MoE small LLM_TYPE_21B_A3B, // Ernie MoE small
LLM_TYPE_30B_A3B, LLM_TYPE_30B_A3B,
LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B, LLM_TYPE_100B_A6B,
LLM_TYPE_106B_A12B, // GLM-4.5-Air LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B, LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_355B_A32B, // GLM-4.5 LLM_TYPE_355B_A32B, // GLM-4.5
...@@ -234,6 +237,7 @@ struct llama_layer { ...@@ -234,6 +237,7 @@ struct llama_layer {
struct ggml_tensor * wk_enc = nullptr; struct ggml_tensor * wk_enc = nullptr;
struct ggml_tensor * wv_enc = nullptr; struct ggml_tensor * wv_enc = nullptr;
struct ggml_tensor * wo_enc = nullptr; struct ggml_tensor * wo_enc = nullptr;
struct ggml_tensor * wqkv_gate = nullptr;
// attention bias // attention bias
struct ggml_tensor * bq = nullptr; struct ggml_tensor * bq = nullptr;
...@@ -307,6 +311,9 @@ struct llama_layer { ...@@ -307,6 +311,9 @@ struct llama_layer {
struct ggml_tensor * ssm_conv1d_b = nullptr; struct ggml_tensor * ssm_conv1d_b = nullptr;
struct ggml_tensor * ssm_dt_b = nullptr; struct ggml_tensor * ssm_dt_b = nullptr;
// qwen3next
struct ggml_tensor * ssm_beta_alpha = nullptr;
// rwkv // rwkv
struct ggml_tensor * time_mix_w1 = nullptr; struct ggml_tensor * time_mix_w1 = nullptr;
struct ggml_tensor * time_mix_w2 = nullptr; struct ggml_tensor * time_mix_w2 = nullptr;
...@@ -385,6 +392,13 @@ struct llama_layer { ...@@ -385,6 +392,13 @@ struct llama_layer {
// openai-moe // openai-moe
struct ggml_tensor * attn_sinks = nullptr; struct ggml_tensor * attn_sinks = nullptr;
// cogvlm
struct ggml_tensor * visexp_attn_wqkv = nullptr;
struct ggml_tensor * visexp_attn_wo = nullptr;
struct ggml_tensor * visexp_ffn_gate = nullptr;
struct ggml_tensor * visexp_ffn_down = nullptr;
struct ggml_tensor * visexp_ffn_up = nullptr;
// xIELU activation parameters for Apertus // xIELU activation parameters for Apertus
struct ggml_tensor * ffn_act_alpha_n = nullptr; struct ggml_tensor * ffn_act_alpha_n = nullptr;
struct ggml_tensor * ffn_act_alpha_p = nullptr; struct ggml_tensor * ffn_act_alpha_p = nullptr;
...@@ -503,9 +517,8 @@ struct llama_model { ...@@ -503,9 +517,8 @@ struct llama_model {
ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const; ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
// note: can mutate `cparams`
// TODO: move this to new llm_arch_model_i interface // TODO: move this to new llm_arch_model_i interface
llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const; llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;
// TODO: move this to new llm_arch_model_i interface // TODO: move this to new llm_arch_model_i interface
ggml_cgraph * build_graph(const llm_graph_params & params) const; ggml_cgraph * build_graph(const llm_graph_params & params) const;
......
...@@ -653,7 +653,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -653,7 +653,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64); gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) { } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
// Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)abs(o.val_i64)); gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64));
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) { } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool); gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) { } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
...@@ -681,7 +681,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -681,7 +681,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
} }
LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str()); LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
continue; continue;
} else if (remapped_name != it.first) { }
if (remapped_name != it.first) {
ggml_set_name(it.second.tensor, remapped_name.c_str()); ggml_set_name(it.second.tensor, remapped_name.c_str());
LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor)); LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
} }
...@@ -726,13 +728,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -726,13 +728,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
{ {
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin(); const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
// attention layers have a non-zero number of kv heads // attention layers have a non-zero number of kv heads
int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0); int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
if (llama_model_has_encoder(&model)) { if (llama_model_has_encoder(&model)) {
// now n_attn_layer is the number of attention layers in the encoder // now n_layer_attn is the number of attention layers in the encoder
// for each decoder block, there are 2 attention layers // for each decoder block, there are 2 attention layers
n_attn_layer += 2 * model.hparams.dec_n_layer; n_layer_attn += 2 * model.hparams.dec_n_layer;
} }
GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
// note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
} }
size_t total_size_org = 0; size_t total_size_org = 0;
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include "llama-vocab.h" #include "llama-vocab.h"
#include "llama-grammar.h" #include "llama-grammar.h"
#include <array>
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <cfloat> #include <cfloat>
...@@ -471,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) { ...@@ -471,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
for (auto * smpl : chain->samplers) { for (auto * smpl : chain->samplers) {
llama_sampler_reset(smpl); llama_sampler_reset(smpl);
} }
chain->t_sample_us = 0;
chain->n_sample = 0;
} }
static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) { static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
...@@ -1625,10 +1623,12 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( ...@@ -1625,10 +1623,12 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
auto * ctx = new llama_sampler_grammar; auto * ctx = new llama_sampler_grammar;
if (grammar_str != nullptr && grammar_str[0] != '\0') { if (grammar_str != nullptr && grammar_str[0] != '\0') {
std::string trigger_pattern;
llama_grammar * grammar = nullptr;
// TODO: remove trigger_words support. // TODO: remove trigger_words support.
if (trigger_words != nullptr && num_trigger_words > 0) { if (trigger_words != nullptr && num_trigger_words > 0) {
GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0); GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
std::string trigger_pattern("[\\s\\S]*?("); trigger_pattern = "[\\s\\S]*?(";
for (size_t i = 0; i < num_trigger_words; ++i) { for (size_t i = 0; i < num_trigger_words; ++i) {
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]"); static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
if (i > 0) { if (i > 0) {
...@@ -1637,15 +1637,17 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( ...@@ -1637,15 +1637,17 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0"); trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
} }
trigger_pattern += ")[\\s\\S]*"; trigger_pattern += ")[\\s\\S]*";
const auto * trigger_pattern_c = trigger_pattern.c_str();
trigger_patterns = &trigger_pattern_c; std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };
num_trigger_patterns = 1; grammar = llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens);
} else {
grammar = llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
} }
*ctx = { *ctx = {
/* .vocab = */ vocab, /* .vocab = */ vocab,
/* .grammar_str = */ grammar_str, /* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root, /* .grammar_root = */ grammar_root,
/* .grammar = */ llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens), /* .grammar = */ grammar,
}; };
if (!ctx->grammar) { if (!ctx->grammar) {
delete ctx; delete ctx;
...@@ -2665,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c ...@@ -2665,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c
void llama_perf_sampler_print(const struct llama_sampler * chain) { void llama_perf_sampler_print(const struct llama_sampler * chain) {
const auto data = llama_perf_sampler(chain); const auto data = llama_perf_sampler(chain);
LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
__func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
} }
void llama_perf_sampler_reset(struct llama_sampler * chain) { void llama_perf_sampler_reset(struct llama_sampler * chain) {
...@@ -2676,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) { ...@@ -2676,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {
auto * ctx = (struct llama_sampler_chain *) chain->ctx; auto * ctx = (struct llama_sampler_chain *) chain->ctx;
ctx->t_sample_us = ctx->n_sample = 0; ctx->t_sample_us = 0;
ctx->n_sample = 0;
} }
...@@ -401,6 +401,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { ...@@ -401,6 +401,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_GPT4O: case LLAMA_VOCAB_PRE_TYPE_GPT4O:
case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2:
regex_exprs = { regex_exprs = {
// original regex from tokenizer.json // original regex from tokenizer.json
// "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
...@@ -442,6 +443,17 @@ struct llm_tokenizer_bpe : llm_tokenizer { ...@@ -442,6 +443,17 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_AFMOE:
regex_exprs = {
// Digit handling - uses custom implementation in unicode.cpp
// Groups digits with leading 1-2 based on total length modulo 3
"\\p{AFMoE_digits}",
// CJK and Asian scripts (using direct Unicode literals)
"[一-鿿㐀-䶿豈-﫿぀-ゟ゠-ヿ・-゚⼀-⿟เ-๿຀-໿ក-៿က-႟ꩠ-ꩿꧠ-꧿가-힯ᄀ-ᇿ]+",
// Main BPE pattern
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
default: default:
// default regex for BPE tokenization pre-processing // default regex for BPE tokenization pre-processing
regex_exprs = { regex_exprs = {
...@@ -1012,7 +1024,7 @@ private: ...@@ -1012,7 +1024,7 @@ private:
} }
private: private:
uint32_t get_node(size_t index) { uint32_t get_node(size_t index) {
if (index > xcda_array_size) { if (index >= xcda_array_size) {
throw std::runtime_error("Index out of array bounds in XCDA array!"); throw std::runtime_error("Index out of array bounds in XCDA array!");
} }
return xcda_array[index]; return xcda_array[index];
...@@ -1269,6 +1281,7 @@ struct llm_tokenizer_plamo2 : llm_tokenizer { ...@@ -1269,6 +1281,7 @@ struct llm_tokenizer_plamo2 : llm_tokenizer {
// Build suffix list in lexicographical order of reversed strings // Build suffix list in lexicographical order of reversed strings
std::vector<std::string> suffixes; std::vector<std::string> suffixes;
suffixes.reserve(suffix_to_score.size() + 1);
for (const auto & pair : suffix_to_score) { for (const auto & pair : suffix_to_score) {
suffixes.push_back(pair.first); suffixes.push_back(pair.first);
} }
...@@ -1981,6 +1994,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1981,6 +1994,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "grok-2") { tokenizer_pre == "grok-2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2; pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
clean_spaces = false; clean_spaces = false;
} else if (
tokenizer_pre == "afmoe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_AFMOE;
clean_spaces = false;
} else if (
tokenizer_pre == "minimax-m2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false;
} else { } else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
......
...@@ -49,6 +49,8 @@ enum llama_vocab_pre_type { ...@@ -49,6 +49,8 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38, LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39, LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40, LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
}; };
struct LLM_KV; struct LLM_KV;
......
...@@ -5,4 +5,8 @@ package llama ...@@ -5,4 +5,8 @@ package llama
// #cgo CPPFLAGS: -I${SRCDIR}/../../../ml/backend/ggml/ggml/include // #cgo CPPFLAGS: -I${SRCDIR}/../../../ml/backend/ggml/ggml/include
// #cgo windows CPPFLAGS: -D_WIN32_WINNT=0x0602 // #cgo windows CPPFLAGS: -D_WIN32_WINNT=0x0602
import "C" import "C"
import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
import (
_ "github.com/ollama/ollama/llama/llama.cpp/src/models"
_ "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
)
#include "models.h"
llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// MuP scaling: embeddings * sqrt(hidden_size)
// mup_enabled = true, hidden_size = 1024, scale = 32.0
inpL = ggml_scale(ctx0, inpL, sqrtf(float(n_embd)));
cb(inpL, "inp_embd_scaled", -1);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_iswa();
ggml_tensor * inp_out_ids = build_inp_out_ids();
const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// dual attention normalization (pre)
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
ggml_tensor * attn_inp = cur; // save input for gate computation
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
// compute gate from input
ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp);
cb(gate, "attn_gate_proj", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
// Q/K normalization
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
cb(Kcur, "Kcur_normed", il);
// RoPE only for sliding_attention layers
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
((il + 1) % hparams.n_no_rope_layer_step) != 0;
if (use_rope) {
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur_rope", il);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur_rope", il);
}
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
cur = build_attn(inp_attn,
NULL, NULL, // wo will be applied after gating
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
// attention gating: attn_out * sigmoid(gate) BEFORE o_proj
gate = ggml_sigmoid(ctx0, gate);
cb(gate, "attn_gate_sig", il);
cur = ggml_mul(ctx0, cur, gate);
cb(cur, "attn_gated", il);
// now apply output projection
cur = build_lora_mm(model.layers[il].wo, cur);
cb(cur, "attn_o_proj", il);
}
// dual attention normalization (post)
cur = build_norm(cur,
model.layers[il].attn_post_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// dual ffn normalization (pre)
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
// MoE or dense FFN
if ((uint32_t)il >= hparams.n_layer_dense_lead) {
// MoE layer with sigmoid routing, normalization, and scaling
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU,
hparams.expert_weights_norm, // norm_w (route_norm=True)
hparams.expert_weights_scale, // scale_w
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
// shared expert
if (hparams.n_expert_shared > 0) {
ggml_tensor * ffn_shexp = build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
} else {
cur = moe_out;
}
} else {
// dense layer
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
}
// dual ffn normalization (post)
cur = build_norm(cur,
model.layers[il].ffn_post_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_post_norm", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
#include "models.h"
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    // Apertus decoder stack: per-layer RMS-normed self-attention with
    // QK-norm + RoPE, followed by an xIELU-activated feed-forward block.
    const int64_t head_dim = hparams.n_embd_head_v;

    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
    GGML_ASSERT(head_dim == hparams.n_rot);

    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);
    ggml_tensor * cur      = nullptr;

    ggml_tensor * inp_pos  = build_inp_pos();
    auto *        inp_attn = build_attn_inp_kv();

    // fall back to 1/sqrt(head_dim) when no explicit attention scale is configured
    float attn_scale = hparams.f_attention_scale;
    if (attn_scale == 0.0f) {
        attn_scale = 1.0f / sqrtf(float(head_dim));
    }

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * residual = layer_in;

        cur = build_norm(layer_in, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

            ggml_tensor * q = build_lora_mm(model.layers[il].wq, cur);
            cb(q, "Qcur", il);

            ggml_tensor * k = build_lora_mm(model.layers[il].wk, cur);
            cb(k, "Kcur", il);

            ggml_tensor * v = build_lora_mm(model.layers[il].wv, cur);
            cb(v, "Vcur", il);

            // split into per-head layout, then RMS-norm Q and K before RoPE
            q = ggml_reshape_3d(ctx0, q, head_dim, n_head, n_tokens);
            q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
            cb(q, "Qcur_normed", il);

            k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens);
            k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
            cb(k, "Kcur_normed", il);

            v = ggml_reshape_3d(ctx0, v, head_dim, n_head_kv, n_tokens);

            q = ggml_rope_ext(ctx0, q, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                              ext_factor, attn_factor, beta_fast, beta_slow);
            k = ggml_rope_ext(ctx0, k, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                              ext_factor, attn_factor, beta_fast, beta_slow);

            cb(q, "Qcur_pos", il);
            cb(k, "Kcur_pos", il);
            cb(v, "Vcur_pos", il);

            cur = build_attn(inp_attn,
                    model.layers[il].wo, model.layers[il].bo,
                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
            cb(cur, "attn_out", il);
        }

        // on the final layer, keep only the rows that logits are requested for
        if (il == n_layer - 1 && inp_out_ids) {
            cur      = ggml_get_rows(ctx0, cur, inp_out_ids);
            residual = ggml_get_rows(ctx0, residual, inp_out_ids);
        }

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
        cb(ffn_inp, "ffn_inp", il);

        // feed-forward network with xIELU activation
        {
            cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            // Up projection
            ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur);
            cb(up_proj, "ffn_up", il);

            // xIELU with per-layer learned parameters
            ggml_tensor * act = ggml_xielu(ctx0, up_proj,
                    hparams.xielu_alpha_n[il], hparams.xielu_alpha_p[il],
                    hparams.xielu_beta[il], hparams.xielu_eps[il]);
            cb(act, "ffn_xielu", il);

            // Down projection
            cur = build_lora_mm(model.layers[il].ffn_down, act);
            cb(cur, "ffn_down", il);
        }

        cur = ggml_add(ctx0, cur, ffn_inp);
        cb(cur, "ffn_out", il);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        layer_in = cur;
    }

    cur = build_norm(layer_in, model.output_norm, nullptr, LLM_NORM_RMS, -1);
    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);
    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment