Unverified commit 20c5fd39, authored by Devon Rifkin, committed by GitHub

Merge branch 'main' into drifkin/array-head-count-simple

parents d2ee599d 6e9a7a25
@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -41,6 +41,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
...
@@ -114,7 +114,7 @@ llama_context::llama_context(
     }

     if (n_ctx_per_seq > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
               float   freq_base,
-              float   freq_scale,
-        ggml_backend_buffer * bbuf) const {
+              float   freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);

-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
             shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
             yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
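Context for the hunk above: a K-cache shift re-rotates cached keys to their new positions, and because RoPE rotations compose additively in position, shifting by a delta is just one more rotation. A minimal plain-C++ sketch of the per-pair rotation (illustrative only, not the ggml implementation; the theta schedule follows the usual RoPE convention):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Rotate consecutive pairs of a vector by position-dependent angles --
    // the core operation ggml_rope_ext applies to each cached K row.
    static void rope_rotate(std::vector<float> & x, int pos, float freq_base) {
        const int d = (int) x.size();
        for (int i = 0; i < d; i += 2) {
            const float theta = pos * std::pow(freq_base, -(float) i / d);
            const float c = std::cos(theta), s = std::sin(theta);
            const float x0 = x[i], x1 = x[i + 1];
            x[i]     = x0 * c - x1 * s;
            x[i + 1] = x0 * s + x1 * c;
        }
    }

    int main() {
        std::vector<float> k = {1.0f, 0.0f, 1.0f, 0.0f};
        rope_rotate(k, /*position delta from k_shift*/ 3, 10000.0f);
        for (float v : k) std::printf("%.4f ", v);
        std::printf("\n");
    }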
@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);

-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);

         ggml_build_forward_expand(gf, cur);
     }
@@ -1510,8 +1499,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     // set all ids as invalid (negative)
     std::fill(output_ids.begin(), output_ids.end(), -1);

-    ggml_backend_buffer_clear(buf_output.get(), 0);
-
     this->n_outputs     = 0;
     this->n_outputs_max = n_outputs_max;
...
@@ -172,8 +172,7 @@ private:
             ggml_tensor * shift,
             ggml_tensor * factors,
                   float   freq_base,
-                  float   freq_scale,
-            ggml_backend_buffer * bbuf) const;
+                  float   freq_scale) const;

     llm_graph_result_ptr build_kv_self_shift(
             ggml_context * ctx0,
...
@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;

-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd == 4) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the 3 first dims are the same, and 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }
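For intuition about the layout the new branch writes, here is a minimal standalone sketch (plain C++; n_tokens and the positions are illustrative). The buffer is laid out section-by-section rather than interleaved: three sections repeating the 1D position, then a zero section:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using llama_pos = int32_t; // matches the typedef in llama.h

    int main() {
        const int n_tokens = 3, n_pos_per_embd = 4;
        const llama_pos pos_1d[] = {5, 6, 7};

        std::vector<llama_pos> pos_data(n_tokens * n_pos_per_embd);
        for (int i = 0; i < n_tokens; ++i) {
            pos_data[                i] = pos_1d[i];
            pos_data[    n_tokens + i] = pos_1d[i];
            pos_data[2 * n_tokens + i] = pos_1d[i];
            pos_data[3 * n_tokens + i] = 0; // 4th section stays 0 for text
        }

        // prints: 5 6 7 5 6 7 5 6 7 0 0 0
        for (llama_pos p : pos_data) std::printf("%d ", p);
        std::printf("\n");
    }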
@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
                 ) * f_attn_temp_scale + 1.0;
             }

-            ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+            ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
         }
     }
@@ -598,7 +612,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }

-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }
@@ -809,6 +823,10 @@ ggml_tensor * llm_graph_context::build_ffn(
     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }

     if (down_b) {
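The comment in the hunk hedges with "seems to"; the general failure mode it guards against is that a low-precision accumulator stops absorbing small per-element products once the running sum grows. A rough standalone illustration using float vs. double accumulators as a stand-in for f16 vs. f32 (the exact numbers are illustrative):

    #include <cstdio>

    int main() {
        // 1e8 additions of 1e-8: the exact sum is 1.0. The float accumulator
        // stalls near 0.25, where the increment falls below half an ulp of
        // the running sum; the wider accumulator stays close to 1.0.
        const int n = 100000000;
        const float prod = 1e-8f;
        float  acc_lo = 0.0f;
        double acc_hi = 0.0;
        for (int i = 0; i < n; ++i) {
            acc_lo += prod;
            acc_hi += prod;
        }
        std::printf("float accumulator:  %f\n", acc_lo);
        std::printf("double accumulator: %f\n", acc_hi);
    }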
@@ -916,28 +934,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);

-    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
+    ggml_tensor * experts = nullptr;
+    if (gate_exps) {
+        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate", il);
+    } else {
+        cur = up;
+    }

     switch (type_op) {
         case LLM_FFN_SILU:
             {
-                gate = ggml_silu(ctx0, gate);
-                cb(gate, "ffn_moe_silu", il);
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
             {
-                gate = ggml_gelu(ctx0, gate);
-                cb(gate, "ffn_moe_gelu", il);
+                cur = ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_moe_gelu", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }

-    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
+    if (gate_exps) {
+        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }

-    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);

     if (!weight_before_ffn) {
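The refactor above makes the gate projection optional so that gateless expert FFNs (like the NOMIC_BERT_MOE path added later in this commit) can share the code. In scalar form, a minimal sketch of the two dataflows (the weights here are illustrative stand-ins for the up/gate/down matmuls):

    #include <cmath>
    #include <cstdio>

    static float silu(float x) { return x / (1.0f + std::exp(-x)); }
    static float gelu(float x) { return 0.5f * x * (1.0f + std::erf(x / std::sqrt(2.0f))); }

    int main() {
        const float x = 0.7f;
        const float w_up = 1.3f, w_gate = -0.4f, w_down = 0.9f;

        // gated path (gate_exps != nullptr): down(act(gate(x)) * up(x))
        const float gated    = w_down * (silu(w_gate * x) * (w_up * x));

        // gateless path (gate_exps == nullptr): down(act(up(x)))
        const float gateless = w_down * gelu(w_up * x);

        std::printf("gated: %f  gateless: %f\n", gated, gateless);
    }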
@@ -1020,11 +1045,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());

     auto & cur = inp->pos;

-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);

     res->add_input(std::move(inp));
@@ -1033,11 +1058,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);

     auto & cur = inp->attn_scale;

-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this need to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);

     res->add_input(std::move(inp));
...
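The new comment notes the scale tensor must be 1x1xN so it broadcasts across the other dimensions. A plain-C++ sketch of what that per-token broadcast amounts to (shapes and values are illustrative; this mirrors my reading of ggml's broadcasting rules rather than its implementation):

    #include <cstdio>

    int main() {
        const int n_embd = 4, n_tokens = 3;
        float x[3][4] = { {1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3} };
        const float attn_scale[3] = {1.0f, 1.5f, 2.0f}; // conceptually 1x1xN

        // one scale value per token multiplies every element of that token
        for (int t = 0; t < n_tokens; ++t)
            for (int e = 0; e < n_embd; ++e)
                x[t][e] *= attn_scale[t];

        for (int t = 0; t < n_tokens; ++t)
            std::printf("token %d -> %.1f\n", t, x[t][0]);
    }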
@@ -91,29 +91,27 @@ public:
 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
     virtual ~llm_graph_input_pos() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * pos = nullptr; // I32 [n_batch]

-    const int64_t n_pos_per_token = 1;
+    const int64_t n_pos_per_embd = 1;
 };

 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
     virtual ~llm_graph_input_attn_temp() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

-    const int64_t n_pos_per_token = 1;
     const uint32_t n_attn_temp_floor_scale;
     const float    f_attn_temp_scale;
 };
@@ -430,7 +428,7 @@ struct llm_graph_context {
     llm_graph_context(const llm_graph_params & params);

-    int64_t n_pos_per_token() const;
+    int64_t n_pos_per_embd() const;

     void cb(ggml_tensor * cur, const char * name, int il) const;
...
@@ -72,6 +72,7 @@ struct llama_hparams {
     float    expert_weights_scale = 0.0;
     bool     expert_weights_norm  = false;
     uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+    uint32_t moe_every_n_layers   = 0;

     float f_norm_eps;
     float f_norm_rms_eps;
...
@@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
         case LLM_TYPE_0_5B: return "0.5B";
+        case LLM_TYPE_0_6B: return "0.6B";
         case LLM_TYPE_1B: return "1B";
         case LLM_TYPE_1_3B: return "1.3B";
         case LLM_TYPE_1_4B: return "1.4B";
         case LLM_TYPE_1_5B: return "1.5B";
         case LLM_TYPE_1_6B: return "1.6B";
+        case LLM_TYPE_1_7B: return "1.7B";
         case LLM_TYPE_1_8B: return "1.8B";
         case LLM_TYPE_2B: return "2B";
         case LLM_TYPE_2_8B: return "2.8B";
@@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_15B: return "15B";
         case LLM_TYPE_16B: return "16B";
         case LLM_TYPE_20B: return "20B";
+        case LLM_TYPE_27B: return "27B";
         case LLM_TYPE_30B: return "30B";
         case LLM_TYPE_32B: return "32B";
         case LLM_TYPE_34B: return "34B";
@@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_65B: return "65B";
         case LLM_TYPE_70B: return "70B";
         case LLM_TYPE_236B: return "236B";
+        case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
         case LLM_TYPE_671B: return "671B";
         case LLM_TYPE_SMALL: return "0.1B";
@@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16x3_8B: return "16x3.8B";
         case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
         case LLM_TYPE_57B_A14B: return "57B.A14B";
-        case LLM_TYPE_27B: return "27B";
-        case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
+        case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_235B_A22B: return "235B.A22B";
         default: return "?B";
     }
 }
@@ -709,10 +713,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             }
         } break;
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);

                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
                     type = LLM_TYPE_137M;
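The `i % hparams.moe_every_n_layers == 1` test used later in load_tensors and llm_build_bert decides which layers get expert tensors. A quick sketch of which indices that selects (with n = 2, every odd layer becomes MoE; the layer count is illustrative):

    #include <cstdio>

    int main() {
        const unsigned moe_every_n_layers = 2; // e.g. read from the GGUF key
        const int n_layer = 12;
        for (int i = 0; i < n_layer; ++i) {
            const bool is_moe = moe_every_n_layers > 0 && i % moe_every_n_layers == 1;
            std::printf("layer %2d: %s\n", i, is_moe ? "MoE FFN" : "dense FFN");
        }
    }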
@@ -805,6 +811,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -814,6 +824,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -1425,7 +1437,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_MISTRAL3: break;
         default: throw std::runtime_error("unsupported model architecture");
     }
@@ -2133,6 +2144,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_BERT:
             case LLM_ARCH_NOMIC_BERT:
+            case LLM_ARCH_NOMIC_BERT_MOE:
                 {
                     tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
                     type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
@@ -2166,21 +2178,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                     }

+                    if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+                    }
+
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                     layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                     layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

+                    if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
+                        layer.bo            = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+                    } else {
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);

-                    if (arch == LLM_ARCH_BERT) {
+                        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
                             layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
                             layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
                             layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
                         } else {
                             layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                         }
+                    }

                     layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
                     layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
@@ -6074,6 +6097,11 @@ struct llm_build_bert : public llm_graph_context {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

+                if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
+
                 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                 Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -6126,13 +6154,29 @@ struct llm_build_bert : public llm_graph_context {
         cb(ffn_inp, "ffn_inp", il);

         // feed-forward network
-        if (model.arch == LLM_ARCH_BERT) {
+        if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+            // MoE branch
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    nullptr,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    hparams.n_expert,
+                    hparams.n_expert_used,
+                    LLM_FFN_GELU,
+                    false, false,
+                    0.0f,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+            cb(cur, "ffn_moe_out", il);
+        } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                     NULL,                      NULL,                        NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
         } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up, NULL, NULL,
@@ -6140,6 +6184,7 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
         } else {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up, NULL, NULL,
@@ -6147,8 +6192,8 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, il);
-        }
             cb(cur, "ffn_out", il);
+        }

         // attentions bypass the intermediate layer
         cur = ggml_add(ctx0, cur, ffn_inp);
@@ -13349,6 +13394,7 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 llm = std::make_unique<llm_build_bert>(*this, params, gf);
             } break;
@@ -13705,7 +13751,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_SOLAR:
         case LLM_ARCH_BAILINGMOE:
-        case LLM_ARCH_MISTRAL3:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
@@ -13714,6 +13759,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_STABLELM:
         case LLM_ARCH_BITNET:
         case LLM_ARCH_QWEN:
...
@@ -744,10 +744,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // This used to be a regex, but <regex> has an extreme cost to compile times.
     bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

-    // don't quantize vision stuff
-    quantize &= name.find("v.") == std::string::npos;
-    quantize &= name.find("mm.") == std::string::npos;
-
     // quantize only 2D and 3D tensors (experts)
     quantize &= (ggml_n_dims(tensor) >= 2);
...
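With the vision-tensor exclusions gone, the surviving filter keeps tensors whose names end in "weight" (and, per the next check, have at least two dimensions). A minimal sketch of the suffix test (tensor names are illustrative):

    #include <cstdio>
    #include <string>

    // mirrors the rfind test above: true when name ends with "weight"
    static bool ends_with_weight(const std::string & name) {
        return name.size() >= 6 && name.rfind("weight") == name.size() - 6;
    }

    int main() {
        for (const std::string name : {"blk.0.ffn_up.weight", "blk.0.ffn_up.bias", "output_norm.weight"}) {
            std::printf("%-24s quantize=%d\n", name.c_str(), ends_with_weight(name));
        }
    }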