"docs/vscode:/vscode.git/clone" did not exist on "248bece63376411e9f32330fe4e99c0c140b4514"
Unverified commit 20c5fd39, authored by Devon Rifkin, committed by GitHub

Merge branch 'main' into drifkin/array-head-count-simple

parents d2ee599d 6e9a7a25
@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -41,6 +41,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
...
@@ -114,7 +114,7 @@ llama_context::llama_context(
     }

     if (n_ctx_per_seq > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
         float freq_base,
-        float freq_scale,
-        ggml_backend_buffer * bbuf) const {
+        float freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);

-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);

-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);

         ggml_build_forward_expand(gf, cur);
     }
@@ -1510,8 +1499,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     // set all ids as invalid (negative)
     std::fill(output_ids.begin(), output_ids.end(), -1);

-    ggml_backend_buffer_clear(buf_output.get(), 0);
-
     this->n_outputs     = 0;
     this->n_outputs_max = n_outputs_max;
...
@@ -172,8 +172,7 @@ private:
             ggml_tensor * shift,
             ggml_tensor * factors,
             float freq_base,
-            float freq_scale,
-            ggml_backend_buffer * bbuf) const;
+            float freq_scale) const;

     llm_graph_result_ptr build_kv_self_shift(
             ggml_context * ctx0,
...
@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;

-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd == 4) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the 3 first dims are the same, and 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }
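For readers unfamiliar with the layout used above, here is a minimal standalone sketch (not part of the patch; the helper name is invented) of how n_tokens 1D positions are expanded into the 4 * n_tokens buffer that M-RoPE models such as Qwen2-VL expect when processing plain text: the first three rows repeat the original position and the fourth row is all zeros.

#include <cstdint>
#include <vector>

using llama_pos = int32_t; // mirrors the llama.cpp typedef

// expand 1D positions into the 4-row layout used when n_pos_per_embd == 4
std::vector<llama_pos> expand_mrope_positions(const llama_pos * pos, size_t n_tokens) {
    std::vector<llama_pos> out(4 * n_tokens);
    for (size_t i = 0; i < n_tokens; ++i) {
        out[0 * n_tokens + i] = pos[i]; // row 0: original position
        out[1 * n_tokens + i] = pos[i]; // row 1: same
        out[2 * n_tokens + i] = pos[i]; // row 2: same
        out[3 * n_tokens + i] = 0;      // row 3: always zero for text tokens
    }
    return out;
}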
@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
                 ) * f_attn_temp_scale + 1.0;
         }

-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }
@@ -598,7 +612,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }

-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }
@@ -809,6 +823,10 @@ ggml_tensor * llm_graph_context::build_ffn(
     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }

     if (down_b) {
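The four added lines request F32 accumulation for the down-projection mat-mul on GLM4. As a hedged illustration of the same ggml call outside the graph builder (the helper name is invented; it assumes ggml.h and an existing context and tensors):

#include "ggml.h"

// build a mat-mul node and opt it out of half-precision accumulation
static ggml_tensor * mul_mat_f32_acc(ggml_context * ctx, ggml_tensor * w, ggml_tensor * x) {
    ggml_tensor * y = ggml_mul_mat(ctx, w, x);
    ggml_mul_mat_set_prec(y, GGML_PREC_F32); // accumulate this node in full precision
    return y;
}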
@@ -916,28 +934,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);

-    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
+    ggml_tensor * experts = nullptr;
+    if (gate_exps) {
+        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate", il);
+    } else {
+        cur = up;
+    }

     switch (type_op) {
         case LLM_FFN_SILU:
             {
-                gate = ggml_silu(ctx0, gate);
-                cb(gate, "ffn_moe_silu", il);
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
             {
-                gate = ggml_gelu(ctx0, gate);
-                cb(gate, "ffn_moe_gelu", il);
+                cur = ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_moe_gelu", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }

-    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
+    if (gate_exps) {
+        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }

-    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);

     if (!weight_before_ffn) {
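The rewritten block makes the gate projection optional: with gate_exps present, the usual gated form act(gate(x)) * up(x) feeds the down projection; without it, the activation is applied to up(x) directly. A toy per-element illustration of the two dataflows (plain C++, not llama.cpp API):

#include <cmath>

static double silu(double v) { return v / (1.0 + std::exp(-v)); }

// "up" and "gate" stand in for one expert's up- and gate-projection outputs
static double expert_hidden(double up, double gate, bool has_gate) {
    return has_gate ? silu(gate) * up  // gated FFN: act(gate) * up
                    : silu(up);        // gate-less FFN: act(up)
}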
@@ -1020,11 +1045,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());

     auto & cur = inp->pos;

-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);

     res->add_input(std::move(inp));
@@ -1033,11 +1058,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);

     auto & cur = inp->attn_scale;

-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this need to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);

     res->add_input(std::move(inp));
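The scale tensor is now allocated as 1x1xN so ggml can broadcast one value per token across the attention heads. A small sketch of that shape choice, assuming an already-initialized ggml_context (the helper name is invented):

#include "ggml.h"

static ggml_tensor * make_attn_scale(ggml_context * ctx0, int64_t n_tokens) {
    // 1x1xN layout: broadcastable against a [head_dim, n_head, n_tokens] tensor
    ggml_tensor * scale = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
    ggml_set_input(scale); // values are filled in later, one per token
    return scale;
}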
...
@@ -91,29 +91,27 @@ public:
 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
     virtual ~llm_graph_input_pos() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * pos = nullptr; // I32 [n_batch]

-    const int64_t n_pos_per_token = 1;
+    const int64_t n_pos_per_embd = 1;
 };

 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
     virtual ~llm_graph_input_attn_temp() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

-    const int64_t n_pos_per_token = 1;
-
     const uint32_t n_attn_temp_floor_scale;
     const float f_attn_temp_scale;
 };
@@ -430,7 +428,7 @@ struct llm_graph_context {
     llm_graph_context(const llm_graph_params & params);

-    int64_t n_pos_per_token() const;
+    int64_t n_pos_per_embd() const;

     void cb(ggml_tensor * cur, const char * name, int il) const;
...
@@ -72,6 +72,7 @@ struct llama_hparams {
     float expert_weights_scale = 0.0;
     bool expert_weights_norm = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+    uint32_t moe_every_n_layers = 0;

     float f_norm_eps;
     float f_norm_rms_eps;
...
@@ -744,10 +744,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

-        // don't quantize vision stuff
-        quantize &= name.find("v.") == std::string::npos;
-        quantize &= name.find("mm.") == std::string::npos;
-
         // quantize only 2D and 3D tensors (experts)
         quantize &= (ggml_n_dims(tensor) >= 2);
...
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }

     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }

     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }
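With this change, k <= 0 leaves the candidate list untouched instead of being clamped to the full vocabulary, which previously forced a sort of every candidate. A standalone sketch of the same truncation logic on a plain vector (illustrative only, not the llama.cpp implementation):

#include <algorithm>
#include <functional>
#include <vector>

static void top_k_truncate(std::vector<float> & logits, int k) {
    if (k <= 0) {
        return; // no-op: keep all candidates, skip sorting entirely
    }
    k = std::min<int>(k, (int) logits.size());
    // keep the k largest values, sorted in descending order
    std::partial_sort(logits.begin(), logits.begin() + k, logits.end(), std::greater<float>());
    logits.resize((size_t) k);
}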
...