Unverified commit 20c5fd39, authored by Devon Rifkin and committed by GitHub

Merge branch 'main' into drifkin/array-head-count-simple

parents d2ee599d 6e9a7a25
...@@ -29,8 +29,8 @@ enum llm_chat_template { ...@@ -29,8 +29,8 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_DEEPSEEK_3, LLM_CHAT_TEMPLATE_DEEPSEEK_3,
LLM_CHAT_TEMPLATE_COMMAND_R, LLM_CHAT_TEMPLATE_COMMAND_R,
LLM_CHAT_TEMPLATE_LLAMA_3, LLM_CHAT_TEMPLATE_LLAMA_3,
LLM_CHAT_TEMPLATE_CHATGML_3, LLM_CHAT_TEMPLATE_CHATGLM_3,
LLM_CHAT_TEMPLATE_CHATGML_4, LLM_CHAT_TEMPLATE_CHATGLM_4,
LLM_CHAT_TEMPLATE_GLMEDGE, LLM_CHAT_TEMPLATE_GLMEDGE,
LLM_CHAT_TEMPLATE_MINICPM, LLM_CHAT_TEMPLATE_MINICPM,
LLM_CHAT_TEMPLATE_EXAONE_3, LLM_CHAT_TEMPLATE_EXAONE_3,
...@@ -41,6 +41,7 @@ enum llm_chat_template { ...@@ -41,6 +41,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_YANDEX, LLM_CHAT_TEMPLATE_YANDEX,
LLM_CHAT_TEMPLATE_BAILING, LLM_CHAT_TEMPLATE_BAILING,
LLM_CHAT_TEMPLATE_LLAMA4, LLM_CHAT_TEMPLATE_LLAMA4,
LLM_CHAT_TEMPLATE_SMOLVLM,
LLM_CHAT_TEMPLATE_UNKNOWN, LLM_CHAT_TEMPLATE_UNKNOWN,
}; };
......
...@@ -114,7 +114,7 @@ llama_context::llama_context( ...@@ -114,7 +114,7 @@ llama_context::llama_context(
} }
if (n_ctx_per_seq > hparams.n_ctx_train) { if (n_ctx_per_seq > hparams.n_ctx_train) {
LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
__func__, n_ctx_per_seq, hparams.n_ctx_train); __func__, n_ctx_per_seq, hparams.n_ctx_train);
} }
...@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift( ...@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
ggml_tensor * shift, ggml_tensor * shift,
ggml_tensor * factors, ggml_tensor * factors,
float freq_base, float freq_base,
float freq_scale, float freq_scale) const {
ggml_backend_buffer * bbuf) const {
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor; const auto & yarn_ext_factor = cparams.yarn_ext_factor;
...@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift( ...@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
// dequantize to f32 -> RoPE -> quantize back // dequantize to f32 -> RoPE -> quantize back
tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
if (bbuf) { tmp = ggml_rope_ext(ctx0, tmp,
for (const auto & backend : backends) {
// Figure out which backend KV cache belongs to
if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
break;
}
}
}
tmp = ggml_rope_ext_inplace(ctx0, tmp,
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
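
With the per-backend pinning removed above, the K-shift path is a plain dequantize -> RoPE -> re-quantize chain and the graph scheduler is left to place the ops itself. A condensed sketch of that chain, assuming a quantized K view k_quant and the RoPE parameters already in scope (a sketch, not a verbatim excerpt of this diff):

    ggml_tensor * tmp = ggml_cast(ctx0, k_quant, GGML_TYPE_F32);   // dequantize to f32
    tmp = ggml_rope_ext(ctx0, tmp, shift, factors, n_rot, rope_type, n_ctx_orig,
            freq_base, freq_scale, yarn_ext_factor, yarn_attn_factor,
            yarn_beta_fast, yarn_beta_slow);                       // apply the position shift
    tmp = ggml_cpy(ctx0, tmp, k_quant);                            // quantize back into the cache
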
...@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( ...@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
0); 0);
ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer); ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
} }
...@@ -1510,8 +1499,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { ...@@ -1510,8 +1499,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
// set all ids as invalid (negative) // set all ids as invalid (negative)
std::fill(output_ids.begin(), output_ids.end(), -1); std::fill(output_ids.begin(), output_ids.end(), -1);
ggml_backend_buffer_clear(buf_output.get(), 0);
this->n_outputs = 0; this->n_outputs = 0;
this->n_outputs_max = n_outputs_max; this->n_outputs_max = n_outputs_max;
......
...@@ -172,8 +172,7 @@ private: ...@@ -172,8 +172,7 @@ private:
ggml_tensor * shift, ggml_tensor * shift,
ggml_tensor * factors, ggml_tensor * factors,
float freq_base, float freq_base,
float freq_scale, float freq_scale) const;
ggml_backend_buffer * bbuf) const;
llm_graph_result_ptr build_kv_self_shift( llm_graph_result_ptr build_kv_self_shift(
ggml_context * ctx0, ggml_context * ctx0,
......
...@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { ...@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && pos) { if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens; const int64_t n_tokens = ubatch->n_tokens;
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); if (ubatch->token && n_pos_per_embd == 4) {
// in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
// the 3 first dims are the same, and 4th dim is all 0
std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
// copy the first dimension
for (int i = 0; i < n_tokens; ++i) {
pos_data[ i] = ubatch->pos[i];
pos_data[ n_tokens + i] = ubatch->pos[i];
pos_data[2 * n_tokens + i] = ubatch->pos[i];
pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
}
ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
} else {
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
}
} }
} }
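
The 1D -> 4D expansion above reads in isolation as: text tokens under M-RoPE carry the same scalar position in the first three sections and zero in the fourth. A self-contained sketch under that assumption (expand_mrope_text_positions is a hypothetical helper, not part of this diff):

    #include <cstdint>
    #include <vector>

    std::vector<int32_t> expand_mrope_text_positions(const std::vector<int32_t> & pos) {
        const size_t n = pos.size();
        std::vector<int32_t> out(4 * n);
        for (size_t i = 0; i < n; ++i) {
            out[0*n + i] = pos[i]; // first dimension: the 1D position
            out[1*n + i] = pos[i]; // second dimension: same as the first for text
            out[2*n + i] = pos[i]; // third dimension: same as the first for text
            out[3*n + i] = 0;      // fourth dimension is unused for text tokens
        }
        return out;
    }
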
...@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { ...@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
) * f_attn_temp_scale + 1.0; ) * f_attn_temp_scale + 1.0;
} }
ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale)); ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
} }
} }
...@@ -598,7 +612,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : ...@@ -598,7 +612,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
res (std::make_unique<llm_graph_result>()) { res (std::make_unique<llm_graph_result>()) {
} }
int64_t llm_graph_context::n_pos_per_token() const { int64_t llm_graph_context::n_pos_per_embd() const {
return arch == LLM_ARCH_QWEN2VL ? 4 : 1; return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
} }
...@@ -809,6 +823,10 @@ ggml_tensor * llm_graph_context::build_ffn( ...@@ -809,6 +823,10 @@ ggml_tensor * llm_graph_context::build_ffn(
if (down) { if (down) {
cur = build_lora_mm(down, cur); cur = build_lora_mm(down, cur);
if (arch == LLM_ARCH_GLM4) {
// GLM4 seems to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
} }
if (down_b) { if (down_b) {
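
The GLM4 special case above uses ggml's standard escape hatch for half-precision accumulator problems: mark the one affected mat-mul as F32-precision instead of changing the whole graph. Minimal sketch (w and x assumed in scope):

    ggml_tensor * y = ggml_mul_mat(ctx0, w, x);
    ggml_mul_mat_set_prec(y, GGML_PREC_F32); // f32 accumulation for this op only
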
...@@ -916,28 +934,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ...@@ -916,28 +934,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(up, "ffn_moe_up", il); cb(up, "ffn_moe_up", il);
ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] ggml_tensor * experts = nullptr;
cb(gate, "ffn_moe_gate", il); if (gate_exps) {
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(cur, "ffn_moe_gate", il);
} else {
cur = up;
}
switch (type_op) { switch (type_op) {
case LLM_FFN_SILU: case LLM_FFN_SILU:
{ {
gate = ggml_silu(ctx0, gate); cur = ggml_silu(ctx0, cur);
cb(gate, "ffn_moe_silu", il); cb(cur, "ffn_moe_silu", il);
} break; } break;
case LLM_FFN_GELU: case LLM_FFN_GELU:
{ {
gate = ggml_gelu(ctx0, gate); cur = ggml_gelu(ctx0, cur);
cb(gate, "ffn_moe_gelu", il); cb(cur, "ffn_moe_gelu", il);
} break; } break;
default: default:
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] if (gate_exps) {
cb(par, "ffn_moe_gate_par", il); cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
cb(cur, "ffn_moe_gate_par", il);
}
ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il); cb(experts, "ffn_moe_down", il);
if (!weight_before_ffn) { if (!weight_before_ffn) {
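
The rewrite above makes the gate projection optional so one helper covers both expert-FFN shapes. Condensed for the GELU case, reusing the surrounding names (a sketch, not a verbatim excerpt):

    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts);
    ggml_tensor * h  = gate_exps
        ? ggml_mul(ctx0, ggml_gelu(ctx0, build_lora_mm_id(gate_exps, cur, selected_experts)), up) // gated: gelu(gate) * up
        : ggml_gelu(ctx0, up);                                                                    // no gate: gelu(up)
    ggml_tensor * experts = build_lora_mm_id(down_exps, h, selected_experts);
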
...@@ -1020,11 +1045,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { ...@@ -1020,11 +1045,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
} }
ggml_tensor * llm_graph_context::build_inp_pos() const { ggml_tensor * llm_graph_context::build_inp_pos() const {
auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token()); auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
auto & cur = inp->pos; auto & cur = inp->pos;
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
ggml_set_input(cur); ggml_set_input(cur);
res->add_input(std::move(inp)); res->add_input(std::move(inp));
...@@ -1033,11 +1058,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { ...@@ -1033,11 +1058,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
} }
ggml_tensor * llm_graph_context::build_inp_attn_scale() const { ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
auto & cur = inp->attn_scale; auto & cur = inp->attn_scale;
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token()); // this need to be 1x1xN for broadcasting
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
ggml_set_input(cur); ggml_set_input(cur);
res->add_input(std::move(inp)); res->add_input(std::move(inp));
......
...@@ -91,29 +91,27 @@ public: ...@@ -91,29 +91,27 @@ public:
class llm_graph_input_pos : public llm_graph_input_i { class llm_graph_input_pos : public llm_graph_input_i {
public: public:
llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
virtual ~llm_graph_input_pos() = default; virtual ~llm_graph_input_pos() = default;
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * pos = nullptr; // I32 [n_batch] ggml_tensor * pos = nullptr; // I32 [n_batch]
const int64_t n_pos_per_token = 1; const int64_t n_pos_per_embd = 1;
}; };
// temperature tuning, used by llama4 // temperature tuning, used by llama4
class llm_graph_input_attn_temp : public llm_graph_input_i { class llm_graph_input_attn_temp : public llm_graph_input_i {
public: public:
llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
: n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
virtual ~llm_graph_input_attn_temp() = default; virtual ~llm_graph_input_attn_temp() = default;
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * attn_scale = nullptr; // F32 [n_batch] ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
const int64_t n_pos_per_token = 1;
const uint32_t n_attn_temp_floor_scale; const uint32_t n_attn_temp_floor_scale;
const float f_attn_temp_scale; const float f_attn_temp_scale;
}; };
...@@ -430,7 +428,7 @@ struct llm_graph_context { ...@@ -430,7 +428,7 @@ struct llm_graph_context {
llm_graph_context(const llm_graph_params & params); llm_graph_context(const llm_graph_params & params);
int64_t n_pos_per_token() const; int64_t n_pos_per_embd() const;
void cb(ggml_tensor * cur, const char * name, int il) const; void cb(ggml_tensor * cur, const char * name, int il) const;
......
...@@ -72,6 +72,7 @@ struct llama_hparams { ...@@ -72,6 +72,7 @@ struct llama_hparams {
float expert_weights_scale = 0.0; float expert_weights_scale = 0.0;
bool expert_weights_norm = false; bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;
float f_norm_eps; float f_norm_eps;
float f_norm_rms_eps; float f_norm_rms_eps;
......
...@@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) { ...@@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_770M: return "770M"; case LLM_TYPE_770M: return "770M";
case LLM_TYPE_780M: return "780M"; case LLM_TYPE_780M: return "780M";
case LLM_TYPE_0_5B: return "0.5B"; case LLM_TYPE_0_5B: return "0.5B";
case LLM_TYPE_0_6B: return "0.6B";
case LLM_TYPE_1B: return "1B"; case LLM_TYPE_1B: return "1B";
case LLM_TYPE_1_3B: return "1.3B"; case LLM_TYPE_1_3B: return "1.3B";
case LLM_TYPE_1_4B: return "1.4B"; case LLM_TYPE_1_4B: return "1.4B";
case LLM_TYPE_1_5B: return "1.5B"; case LLM_TYPE_1_5B: return "1.5B";
case LLM_TYPE_1_6B: return "1.6B"; case LLM_TYPE_1_6B: return "1.6B";
case LLM_TYPE_1_7B: return "1.7B";
case LLM_TYPE_1_8B: return "1.8B"; case LLM_TYPE_1_8B: return "1.8B";
case LLM_TYPE_2B: return "2B"; case LLM_TYPE_2B: return "2B";
case LLM_TYPE_2_8B: return "2.8B"; case LLM_TYPE_2_8B: return "2.8B";
...@@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) { ...@@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_15B: return "15B"; case LLM_TYPE_15B: return "15B";
case LLM_TYPE_16B: return "16B"; case LLM_TYPE_16B: return "16B";
case LLM_TYPE_20B: return "20B"; case LLM_TYPE_20B: return "20B";
case LLM_TYPE_27B: return "27B";
case LLM_TYPE_30B: return "30B"; case LLM_TYPE_30B: return "30B";
case LLM_TYPE_32B: return "32B"; case LLM_TYPE_32B: return "32B";
case LLM_TYPE_34B: return "34B"; case LLM_TYPE_34B: return "34B";
...@@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) { ...@@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_65B: return "65B"; case LLM_TYPE_65B: return "65B";
case LLM_TYPE_70B: return "70B"; case LLM_TYPE_70B: return "70B";
case LLM_TYPE_236B: return "236B"; case LLM_TYPE_236B: return "236B";
case LLM_TYPE_290B: return "290B";
case LLM_TYPE_314B: return "314B"; case LLM_TYPE_314B: return "314B";
case LLM_TYPE_671B: return "671B"; case LLM_TYPE_671B: return "671B";
case LLM_TYPE_SMALL: return "0.1B"; case LLM_TYPE_SMALL: return "0.1B";
...@@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) { ...@@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_16x3_8B: return "16x3.8B"; case LLM_TYPE_16x3_8B: return "16x3.8B";
case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B"; case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
case LLM_TYPE_57B_A14B: return "57B.A14B"; case LLM_TYPE_57B_A14B: return "57B.A14B";
case LLM_TYPE_27B: return "27B";
case LLM_TYPE_290B: return "290B";
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
default: return "?B"; default: return "?B";
} }
} }
...@@ -709,10 +713,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { ...@@ -709,10 +713,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} }
} break; } break;
case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
   
if (hparams.n_layer == 12 && hparams.n_embd == 768) { if (hparams.n_layer == 12 && hparams.n_embd == 768) {
type = LLM_TYPE_137M; type = LLM_TYPE_137M;
...@@ -805,6 +811,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { ...@@ -805,6 +811,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
case 40: type = LLM_TYPE_14B; break;
case 64: type = LLM_TYPE_32B; break;
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
...@@ -814,6 +824,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ...@@ -814,6 +824,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
   
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 48: type = LLM_TYPE_30B_A3B; break;
case 94: type = LLM_TYPE_235B_A22B; break;
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
...@@ -1425,7 +1437,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ...@@ -1425,7 +1437,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_MISTRAL3: break;
default: throw std::runtime_error("unsupported model architecture"); default: throw std::runtime_error("unsupported model architecture");
} }
   
...@@ -2133,6 +2144,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ...@@ -2133,6 +2144,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} break; } break;
case LLM_ARCH_BERT: case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
...@@ -2166,20 +2178,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ...@@ -2166,20 +2178,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
} }
   
if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
}
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
   
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0); layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
   
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
if (arch == LLM_ARCH_BERT) {
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
} else { } else {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
} else {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
}
} }
   
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
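
The i % hparams.moe_every_n_layers == 1 check above decides per layer whether expert tensors or a dense FFN pair get created; with moe_every_n_layers == 2, layers 1, 3, 5, ... carry the experts. The same condition as a predicate (hypothetical helper mirroring the quoted code):

    #include <cstdint>

    static bool is_moe_layer(uint32_t il, uint32_t moe_every_n_layers) {
        return moe_every_n_layers > 0 && il % moe_every_n_layers == 1;
    }
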
...@@ -6074,6 +6097,11 @@ struct llm_build_bert : public llm_graph_context { ...@@ -6074,6 +6097,11 @@ struct llm_build_bert : public llm_graph_context {
cur = build_lora_mm(model.layers[il].wqkv, cur); cur = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il); cb(cur, "wqkv", il);
   
if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
cb(cur, "bqkv", il);
}
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
...@@ -6126,13 +6154,29 @@ struct llm_build_bert : public llm_graph_context { ...@@ -6126,13 +6154,29 @@ struct llm_build_bert : public llm_graph_context {
cb(ffn_inp, "ffn_inp", il); cb(ffn_inp, "ffn_inp", il);
   
// feed-forward network // feed-forward network
if (model.arch == LLM_ARCH_BERT) { if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
// MoE branch
cur = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
nullptr,
model.layers[il].ffn_down_exps,
nullptr,
hparams.n_expert,
hparams.n_expert_used,
LLM_FFN_GELU,
false, false,
0.0f,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
cb(cur, "ffn_moe_out", il);
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
cur = build_ffn(cur, cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, il); LLM_FFN_GELU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) { } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
cur = build_ffn(cur, cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_up, NULL, NULL,
...@@ -6140,6 +6184,7 @@ struct llm_build_bert : public llm_graph_context { ...@@ -6140,6 +6184,7 @@ struct llm_build_bert : public llm_graph_context {
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_PAR, il); LLM_FFN_GELU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else { } else {
cur = build_ffn(cur, cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_up, NULL, NULL,
...@@ -6147,8 +6192,8 @@ struct llm_build_bert : public llm_graph_context { ...@@ -6147,8 +6192,8 @@ struct llm_build_bert : public llm_graph_context {
model.layers[il].ffn_down, NULL, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il); LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} }
cb(cur, "ffn_out", il);
   
// attentions bypass the intermediate layer // attentions bypass the intermediate layer
cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, ffn_inp);
...@@ -13349,6 +13394,7 @@ llm_graph_result_ptr llama_model::build_graph( ...@@ -13349,6 +13394,7 @@ llm_graph_result_ptr llama_model::build_graph(
case LLM_ARCH_BERT: case LLM_ARCH_BERT:
case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_JINA_BERT_V2:
case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
{ {
llm = std::make_unique<llm_build_bert>(*this, params, gf); llm = std::make_unique<llm_build_bert>(*this, params, gf);
} break; } break;
...@@ -13705,7 +13751,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { ...@@ -13705,7 +13751,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR: case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE: case LLM_ARCH_BAILINGMOE:
case LLM_ARCH_MISTRAL3:
return LLAMA_ROPE_TYPE_NORM; return LLAMA_ROPE_TYPE_NORM;
   
// the pairs of head values are offset by n_rot/2 // the pairs of head values are offset by n_rot/2
...@@ -13714,6 +13759,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { ...@@ -13714,6 +13759,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DBRX: case LLM_ARCH_DBRX:
case LLM_ARCH_BERT: case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_STABLELM: case LLM_ARCH_STABLELM:
case LLM_ARCH_BITNET: case LLM_ARCH_BITNET:
case LLM_ARCH_QWEN: case LLM_ARCH_QWEN:
......
...@@ -40,11 +40,13 @@ enum llm_type { ...@@ -40,11 +40,13 @@ enum llm_type {
LLM_TYPE_770M, LLM_TYPE_770M,
LLM_TYPE_780M, LLM_TYPE_780M,
LLM_TYPE_0_5B, LLM_TYPE_0_5B,
LLM_TYPE_0_6B,
LLM_TYPE_1B, LLM_TYPE_1B,
LLM_TYPE_1_3B, LLM_TYPE_1_3B,
LLM_TYPE_1_4B, LLM_TYPE_1_4B,
LLM_TYPE_1_5B, LLM_TYPE_1_5B,
LLM_TYPE_1_6B, LLM_TYPE_1_6B,
LLM_TYPE_1_7B,
LLM_TYPE_1_8B, LLM_TYPE_1_8B,
LLM_TYPE_2B, LLM_TYPE_2B,
LLM_TYPE_2_8B, LLM_TYPE_2_8B,
...@@ -64,6 +66,7 @@ enum llm_type { ...@@ -64,6 +66,7 @@ enum llm_type {
LLM_TYPE_16B, LLM_TYPE_16B,
LLM_TYPE_20B, LLM_TYPE_20B,
LLM_TYPE_22B, LLM_TYPE_22B,
LLM_TYPE_27B,
LLM_TYPE_30B, LLM_TYPE_30B,
LLM_TYPE_32B, LLM_TYPE_32B,
LLM_TYPE_34B, LLM_TYPE_34B,
...@@ -73,6 +76,7 @@ enum llm_type { ...@@ -73,6 +76,7 @@ enum llm_type {
LLM_TYPE_70B, LLM_TYPE_70B,
LLM_TYPE_90B, LLM_TYPE_90B,
LLM_TYPE_236B, LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B, LLM_TYPE_314B,
LLM_TYPE_671B, LLM_TYPE_671B,
LLM_TYPE_SMALL, LLM_TYPE_SMALL,
...@@ -87,10 +91,10 @@ enum llm_type { ...@@ -87,10 +91,10 @@ enum llm_type {
LLM_TYPE_16x3_8B, LLM_TYPE_16x3_8B,
LLM_TYPE_10B_128x3_66B, LLM_TYPE_10B_128x3_66B,
LLM_TYPE_57B_A14B, LLM_TYPE_57B_A14B,
LLM_TYPE_27B,
LLM_TYPE_290B,
LLM_TYPE_17B_16E, // llama4 Scout LLM_TYPE_17B_16E, // llama4 Scout
LLM_TYPE_17B_128E, // llama4 Maverick LLM_TYPE_17B_128E, // llama4 Maverick
LLM_TYPE_30B_A3B,
LLM_TYPE_235B_A22B,
}; };
struct llama_layer_posnet { struct llama_layer_posnet {
......
...@@ -744,10 +744,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -744,10 +744,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// This used to be a regex, but <regex> has an extreme cost to compile times. // This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
// don't quantize vision stuff
quantize &= name.find("v.") == std::string::npos;
quantize &= name.find("mm.") == std::string::npos;
// quantize only 2D and 3D tensors (experts) // quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2); quantize &= (ggml_n_dims(tensor) >= 2);
......
...@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) ...@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
// } // }
if (k <= 0) { if (k <= 0) {
k = cur_p->size; return;
} }
k = std::min(k, (int) cur_p->size); k = std::min(k, (int) cur_p->size);
...@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) ...@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
} }
cur_p->sorted = true; cur_p->sorted = true;
} }
cur_p->size = k; cur_p->size = k;
} }
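
The early return above changes the meaning of a non-positive k from "clamp to the full candidate list and sort it" to "leave the candidates untouched". A standalone sketch of the new semantics (hypothetical free function over plain floats):

    #include <algorithm>
    #include <functional>
    #include <vector>

    void top_k(std::vector<float> & v, int k) {
        if (k <= 0) {
            return; // no truncation, no sort
        }
        k = std::min<int>(k, (int) v.size());
        std::partial_sort(v.begin(), v.begin() + k, v.end(), std::greater<float>());
        v.resize(k); // keep only the k largest
    }
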
......
...@@ -1497,7 +1497,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1497,7 +1497,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "llama3" || tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe"|| tokenizer_pre == "llama-bpe"||
tokenizer_pre == "falcon3") { tokenizer_pre == "falcon3" ||
tokenizer_pre == "pixtral") {
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
ignore_merges = true; ignore_merges = true;
add_bos = true; add_bos = true;
......
...@@ -2,6 +2,7 @@ package llama ...@@ -2,6 +2,7 @@ package llama
/* /*
#cgo CFLAGS: -std=c11 #cgo CFLAGS: -std=c11
#cgo windows CFLAGS: -Wno-dll-attribute-on-redeclaration
#cgo CXXFLAGS: -std=c++17 #cgo CXXFLAGS: -std=c++17
#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include
#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common
...@@ -198,7 +199,6 @@ type ModelParams struct { ...@@ -198,7 +199,6 @@ type ModelParams struct {
NumGpuLayers int NumGpuLayers int
MainGpu int MainGpu int
UseMmap bool UseMmap bool
UseMlock bool
TensorSplit []float32 TensorSplit []float32
Progress func(float32) Progress func(float32)
VocabOnly bool VocabOnly bool
...@@ -217,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) { ...@@ -217,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
cparams.n_gpu_layers = C.int(params.NumGpuLayers) cparams.n_gpu_layers = C.int(params.NumGpuLayers)
cparams.main_gpu = C.int32_t(params.MainGpu) cparams.main_gpu = C.int32_t(params.MainGpu)
cparams.use_mmap = C.bool(params.UseMmap) cparams.use_mmap = C.bool(params.UseMmap)
cparams.use_mlock = C.bool(params.UseMlock)
cparams.vocab_only = C.bool(params.VocabOnly) cparams.vocab_only = C.bool(params.VocabOnly)
if len(params.TensorSplit) > 0 { if len(params.TensorSplit) > 0 {
...@@ -461,24 +460,6 @@ func (m *Model) NEmbd() int { ...@@ -461,24 +460,6 @@ func (m *Model) NEmbd() int {
return int(C.llama_model_n_embd(m.c)) return int(C.llama_model_n_embd(m.c))
} }
func Quantize(infile, outfile string, ftype uint32) error {
cinfile := C.CString(infile)
defer C.free(unsafe.Pointer(cinfile))
coutfile := C.CString(outfile)
defer C.free(unsafe.Pointer(coutfile))
params := C.llama_model_quantize_default_params()
params.nthread = -1
params.ftype = ftype
if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
return fmt.Errorf("llama_model_quantize: %d", rc)
}
return nil
}
// vision processing // vision processing
type ClipContext struct { type ClipContext struct {
c *C.struct_clip_ctx c *C.struct_clip_ctx
...@@ -606,9 +587,6 @@ type SamplingParams struct { ...@@ -606,9 +587,6 @@ type SamplingParams struct {
PenaltyRepeat float32 PenaltyRepeat float32
PenaltyFreq float32 PenaltyFreq float32
PenaltyPresent float32 PenaltyPresent float32
Mirostat int
MirostatTau float32
MirostatEta float32
PenalizeNl bool PenalizeNl bool
Seed uint32 Seed uint32
Grammar string Grammar string
...@@ -625,9 +603,6 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, ...@@ -625,9 +603,6 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
cparams.penalty_repeat = C.float(params.PenaltyRepeat) cparams.penalty_repeat = C.float(params.PenaltyRepeat)
cparams.penalty_freq = C.float(params.PenaltyFreq) cparams.penalty_freq = C.float(params.PenaltyFreq)
cparams.penalty_present = C.float(params.PenaltyFreq) cparams.penalty_present = C.float(params.PenaltyFreq)
cparams.mirostat = C.int32_t(params.Mirostat)
cparams.mirostat_tau = C.float(params.MirostatTau)
cparams.mirostat_eta = C.float(params.MirostatEta)
cparams.seed = C.uint32_t(params.Seed) cparams.seed = C.uint32_t(params.Seed)
grammar := C.CString(params.Grammar) grammar := C.CString(params.Grammar)
......
...@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644 ...@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
/** /**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index a7febef7..31750b6f 100644 index 9fb2134f..04ce764e 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context { @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
...@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644 ...@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 266d8af4..12886cd3 100644 index d92392ed..425524d0 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) @@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
} }
free(ctx); free(ctx);
...@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644 ...@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index a0667b7d..bd83adc5 100644 index 140a775f..e33c4ba0 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status); GGML_ASSERT(status);
delete ctx; delete ctx;
...@@ -161,10 +161,10 @@ index a0667b7d..bd83adc5 100644 ...@@ -161,10 +161,10 @@ index a0667b7d..bd83adc5 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 1de34c96..4600f61e 100644 index 66b6f2cc..e3e6deae 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { @@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
ggml_sycl_set_device(ctx->device); ggml_sycl_set_device(ctx->device);
delete ctx; delete ctx;
...@@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644 ...@@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644
} }
catch (sycl::exception const &exc) { catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context { @@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
delete ctx; delete ctx;
...@@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644 ...@@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644
} }
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ @@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_sycl_host_free(buffer->context); ggml_sycl_host_free(buffer->context);
...@@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644 ...@@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 39f3cd34..c569a8a5 100644 index c0bdb9e1..03d03064 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer); ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx; delete ctx;
...@@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644 ...@@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644
} }
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe @@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context); ggml_vk_host_free(vk_instance.devices[0], buffer->context);
......
...@@ -10,7 +10,7 @@ logs instead of throwing an error ...@@ -10,7 +10,7 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-) 1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 48060517..a35b498c 100644 index 50ded286..a9ee9f03 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
...@@ -31,7 +31,7 @@ index 48060517..a35b498c 100644 ...@@ -31,7 +31,7 @@ index 48060517..a35b498c 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if ( } else if (
tokenizer_pre == "llama3" || tokenizer_pre == "llama3" ||
@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
clean_spaces = false; clean_spaces = false;
} else { } else {
......
...@@ -11,10 +11,10 @@ instead of forcing one or the error ...@@ -11,10 +11,10 @@ instead of forcing one or the error
1 file changed, 3 insertions(+), 3 deletions(-) 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 983385f8..32f59819 100644 index 5a2eef9b..9c1fe93f 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
int64_t n_outputs_all = 0; int64_t n_outputs_all = 0;
// count outputs // count outputs
...@@ -23,7 +23,7 @@ index 983385f8..32f59819 100644 ...@@ -23,7 +23,7 @@ index 983385f8..32f59819 100644
for (uint32_t i = 0; i < n_tokens_all; ++i) { for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs_all += batch.logits[i] != 0; n_outputs_all += batch.logits[i] != 0;
} }
@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot"); // ggml_graph_dump_dot(gf, NULL, "llama.dot");
//} //}
...@@ -32,7 +32,7 @@ index 983385f8..32f59819 100644 ...@@ -32,7 +32,7 @@ index 983385f8..32f59819 100644
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
if (t_embd && res->get_embd_pooled()) { if (t_embd && res->get_embd_pooled()) {
@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { @@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead // TODO: use a per-batch flag for logits presence instead
......
...@@ -10,12 +10,12 @@ filesystems for paths that include wide characters ...@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+) 1 file changed, 39 insertions(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 75970615..d57b4bd6 100644 index ad3e7df1..b3218c78 100644
--- a/examples/llava/clip.cpp --- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp
@@ -29,6 +29,19 @@ @@ -30,6 +30,19 @@
#include <limits>
#include <array> #include <array>
#include <numeric>
+#if defined(_WIN32) +#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN
...@@ -33,7 +33,7 @@ index 75970615..d57b4bd6 100644 ...@@ -33,7 +33,7 @@ index 75970615..d57b4bd6 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS //#define CLIP_DEBUG_FUNCTIONS
@@ -1430,7 +1443,29 @@ struct clip_model_loader { @@ -1971,7 +1984,29 @@ struct clip_model_loader {
{ {
std::vector<uint8_t> read_buf; std::vector<uint8_t> read_buf;
...@@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644 ...@@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644
if (!fin) { if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
} }
@@ -1457,7 +1492,11 @@ struct clip_model_loader { @@ -1998,7 +2033,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }
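
The clip.cpp patch above swaps the narrow-string file open for a wide-character path on Windows so model files under non-ASCII paths load. The usual shape of that conversion, as a sketch (fopen_utf8 is a hypothetical helper, not the code added by the patch):

    #if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #include <windows.h>
    #include <cstdio>
    #include <string>

    static FILE * fopen_utf8(const std::string & path, const wchar_t * mode) {
        int n = MultiByteToWideChar(CP_UTF8, 0, path.c_str(), -1, nullptr, 0); // length incl. NUL
        std::wstring wpath((size_t) n, L'\0');
        MultiByteToWideChar(CP_UTF8, 0, path.c_str(), -1, &wpath[0], n);
        return _wfopen(wpath.c_str(), mode); // e.g. mode = L"rb"
    }
    #endif
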
......
...@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture ...@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+) 7 files changed, 248 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 62e1480b..f754bc8f 100644 index f2bc8ca7..5ab3f572 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { @@ -69,6 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_CHAMELEON, "chameleon" },
...@@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644 ...@@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" }, { LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { @@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
...@@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644 ...@@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N @@ -1502,6 +1504,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
}, },
}, },
...@@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644 ...@@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644
{ {
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
{ {
@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { @@ -1680,6 +1700,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...@@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644 ...@@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index 98ca00a1..439aaeab 100644 index 41a023da..525c1b7d 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -72,6 +72,7 @@ enum llm_arch { @@ -73,6 +73,7 @@ enum llm_arch {
LLM_ARCH_GRANITE, LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE, LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
...@@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644 ...@@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM, LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE, LLM_ARCH_BAILINGMOE,
@@ -144,6 +145,7 @@ enum llm_kv { @@ -146,6 +147,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
...@@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644 ...@@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -344,6 +346,7 @@ enum llm_tensor { @@ -346,6 +348,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
...@@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644 ...@@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644
if (il < n_layer) { if (il < n_layer) {
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1); return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
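
The pattern check quoted above picks which layers run sliding-window attention: in every block of n_swa_pattern layers, the last one (il % n_swa_pattern == n_swa_pattern - 1) keeps full attention and the rest use the window. The same condition as a predicate:

    #include <cstdint>

    static bool is_swa_layer(uint32_t il, uint32_t n_swa, uint32_t n_swa_pattern) {
        return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
    }
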
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 80fcd65d..6e278945 100644 index 7ee6a5b7..48dce407 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -55,6 +55,8 @@ struct llama_hparams { @@ -55,6 +55,8 @@ struct llama_hparams {
...@@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644 ...@@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0; uint32_t n_lora_kv = 0;
@@ -153,6 +155,9 @@ struct llama_hparams { @@ -154,6 +156,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings // dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const; uint32_t n_embd_v_s() const;
...@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644 ...@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader( llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 6b7bfecf..aba42819 100644 index 822e2bb2..572378c9 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
...@@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644 ...@@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...@@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644 ...@@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context { @@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
} }
}; };
...@@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644 ...@@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context { struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur; ggml_tensor * cur;
@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph( @@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
llm = std::make_unique<llm_build_chameleon>(*this, params, gf); llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break; } break;
...@@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644 ...@@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf); llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
...@@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644 ...@@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644
return LLAMA_ROPE_TYPE_NORM; return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index fd82d106..5865d5e9 100644 index 95eca002..856e6042 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -62,6 +62,7 @@ enum llm_type { @@ -64,6 +64,7 @@ enum llm_type {
LLM_TYPE_15B, LLM_TYPE_15B,
LLM_TYPE_16B, LLM_TYPE_16B,
LLM_TYPE_20B, LLM_TYPE_20B,
+ LLM_TYPE_22B, + LLM_TYPE_22B,
LLM_TYPE_27B,
LLM_TYPE_30B, LLM_TYPE_30B,
LLM_TYPE_32B, LLM_TYPE_32B,
LLM_TYPE_34B, @@ -311,6 +312,8 @@ struct llama_layer {
@@ -307,6 +308,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr; struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr; struct ggml_tensor * ffn_down_scale = nullptr;
......
...@@ -5,7 +5,6 @@ Subject: [PATCH] add mllama support ...@@ -5,7 +5,6 @@ Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture adds support for the llama 3.2 vision architecture
--- ---
examples/llava/gemma3-cli.cpp | 3 +-
examples/llava/llava.cpp | 5 +- examples/llava/llava.cpp | 5 +-
examples/llava/mtmd.cpp | 6 +- examples/llava/mtmd.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +- ggml/src/ggml-backend-reg.cpp | 6 +-
...@@ -25,34 +24,13 @@ adds support for the llama 3.2 vision architecture ...@@ -25,34 +24,13 @@ adds support for the llama 3.2 vision architecture
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++- src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.h | 12 ++ src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +- src/llama-quant.cpp | 4 +-
20 files changed, 475 insertions(+), 22 deletions(-) 19 files changed, 473 insertions(+), 21 deletions(-)
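For orientation before the hunks: the heart of this patch is a per-batch embedding width (llama_batch.n_embd) plus a context-level cross-attention toggle, so vision embeddings whose width differs from the text model's n_embd can be decoded. A minimal sketch of the intended call sequence follows; llama_set_cross_attention sits in a hunk that is collapsed in this view, so its exact signature is an assumption rather than a confirmed declaration.

#include "llama.h"

// Hedged sketch: decoding vision embeddings with the patched API.
// Assumes the collapsed include/llama.h hunk declares
//   LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
static int32_t eval_image_embd(llama_context * ctx, const llama_model * model,
                               float * embd, int32_t n_tokens) {
    const int32_t n_embd = llama_model_n_embd(model);

    llama_set_cross_attention(ctx, true);  // assumed API from the collapsed hunk

    llama_batch batch = llama_batch_init(n_tokens, n_embd, 1);
    // ... fill batch.embd, batch.pos, batch.seq_id as decode_embd_batch does below ...
    const int32_t ret = llama_decode(ctx, batch);
    llama_batch_free(batch);
    return ret;
}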
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 3d566475..654d1358 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -118,6 +118,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 03a22cbb..5eb40bcd 100644 index c00d16ae..bab027b5 100644
--- a/examples/llava/llava.cpp --- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp
@@ -456,7 +456,7 @@ struct llava_embd_batch { @@ -457,7 +457,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids; std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits; std::vector<int8_t> logits;
llama_batch batch; llama_batch batch;
...@@ -61,7 +39,7 @@ index 03a22cbb..5eb40bcd 100644 ...@@ -61,7 +39,7 @@ index 03a22cbb..5eb40bcd 100644
pos .resize(n_tokens); pos .resize(n_tokens);
n_seq_id.resize(n_tokens); n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1); seq_ids .resize(n_tokens + 1);
@@ -468,6 +468,7 @@ struct llava_embd_batch { @@ -469,6 +469,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens, /*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr, /*tokens =*/ nullptr,
/*embd =*/ embd, /*embd =*/ embd,
...@@ -69,7 +47,7 @@ index 03a22cbb..5eb40bcd 100644 ...@@ -69,7 +47,7 @@ index 03a22cbb..5eb40bcd 100644
/*pos =*/ pos.data(), /*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(), /*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(), /*seq_id =*/ seq_ids.data(),
@@ -491,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ @@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch; n_eval = n_batch;
} }
float * embd = image_embed->embed+i*n_embd; float * embd = image_embed->embed+i*n_embd;
...@@ -79,19 +57,19 @@ index 03a22cbb..5eb40bcd 100644 ...@@ -79,19 +57,19 @@ index 03a22cbb..5eb40bcd 100644
LOG_ERR("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return false; return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 3fd5bebc..f0cec596 100644 index 7081fd73..c14ac501 100644
--- a/examples/llava/mtmd.cpp --- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp
@@ -233,7 +233,7 @@ struct decode_embd_batch { @@ -476,7 +476,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids; std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits; std::vector<int8_t> logits;
llama_batch batch; llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens); pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens); n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1); seq_ids .resize(n_tokens + 1);
@@ -245,6 +245,7 @@ struct decode_embd_batch { @@ -487,6 +487,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens, /*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr, /*tokens =*/ nullptr,
/*embd =*/ embd, /*embd =*/ embd,
...@@ -99,16 +77,16 @@ index 3fd5bebc..f0cec596 100644 ...@@ -99,16 +77,16 @@ index 3fd5bebc..f0cec596 100644
/*pos =*/ pos.data(), /*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(), /*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(), /*seq_id =*/ seq_ids.data(),
@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, @@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get()); int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx); float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0); - decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx)); + int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_img(embd, n_embd, n_tokens, n_past, 0); + decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);
int64_t t1 = ggml_time_ms();
ret = llama_decode(lctx, batch_img.batch); const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
if (ret != 0) { const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
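The helper structs above all change for the same reason: an embedding batch must now carry its own width instead of the code assuming hparams.n_embd. A minimal sketch of the aggregate initialization, using only the fields visible in these hunks (n_embd is the member inserted after embd):

// Sketch: building an embeddings-only llama_batch in the patched layout.
// Field order mirrors the initializer shown in the hunks above.
llama_batch make_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens,
                            llama_pos * pos, int32_t * n_seq_id,
                            llama_seq_id ** seq_id, int8_t * logits) {
    llama_batch batch = {
        /*n_tokens =*/ n_tokens,
        /*tokens   =*/ nullptr,  // embeddings in, so no token ids
        /*embd     =*/ embd,
        /*n_embd   =*/ n_embd,   // new: width of each row of embd
        /*pos      =*/ pos,
        /*n_seq_id =*/ n_seq_id,
        /*seq_id   =*/ seq_id,
        /*logits   =*/ logits,
    };
    return batch;
}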
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..82ae1b5b 100644 index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp --- a/ggml/src/ggml-backend-reg.cpp
...@@ -127,10 +105,10 @@ index 405d8e31..82ae1b5b 100644 ...@@ -127,10 +105,10 @@ index 405d8e31..82ae1b5b 100644
register_backend(ggml_backend_rpc_reg()); register_backend(ggml_backend_rpc_reg());
#endif #endif
diff --git a/include/llama.h b/include/llama.h diff --git a/include/llama.h b/include/llama.h
index 5657fbf0..f91896e4 100644 index 06c56395..f1628e88 100644
--- a/include/llama.h --- a/include/llama.h
+++ b/include/llama.h +++ b/include/llama.h
@@ -255,6 +255,7 @@ extern "C" { @@ -256,6 +256,7 @@ extern "C" {
llama_token * token; llama_token * token;
float * embd; float * embd;
...@@ -138,7 +116,7 @@ index 5657fbf0..f91896e4 100644 ...@@ -138,7 +116,7 @@ index 5657fbf0..f91896e4 100644
llama_pos * pos; llama_pos * pos;
int32_t * n_seq_id; int32_t * n_seq_id;
llama_seq_id ** seq_id; llama_seq_id ** seq_id;
@@ -357,6 +358,7 @@ extern "C" { @@ -358,6 +359,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings bool no_perf; // whether to measure performance timings
...@@ -146,7 +124,7 @@ index 5657fbf0..f91896e4 100644 ...@@ -146,7 +124,7 @@ index 5657fbf0..f91896e4 100644
// Abort callback // Abort callback
// if it returns true, execution of llama_decode() will be aborted // if it returns true, execution of llama_decode() will be aborted
@@ -458,6 +460,10 @@ extern "C" { @@ -459,6 +461,10 @@ extern "C" {
struct llama_context_params params), struct llama_context_params params),
"use llama_init_from_model instead"); "use llama_init_from_model instead");
...@@ -158,7 +136,7 @@ index 5657fbf0..f91896e4 100644 ...@@ -158,7 +136,7 @@ index 5657fbf0..f91896e4 100644
LLAMA_API void llama_free(struct llama_context * ctx); LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index f754bc8f..0568565f 100644 index 5ab3f572..eb7b5325 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@ @@ -6,6 +6,7 @@
...@@ -169,7 +147,7 @@ index f754bc8f..0568565f 100644 ...@@ -169,7 +147,7 @@ index f754bc8f..0568565f 100644
{ LLM_ARCH_LLAMA4, "llama4" }, { LLM_ARCH_LLAMA4, "llama4" },
{ LLM_ARCH_DECI, "deci" }, { LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_FALCON, "falcon" },
@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { @@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
...@@ -177,7 +155,7 @@ index f754bc8f..0568565f 100644 ...@@ -177,7 +155,7 @@ index f754bc8f..0568565f 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -271,6 +273,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N @@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
}, },
}, },
...@@ -218,7 +196,7 @@ index f754bc8f..0568565f 100644 ...@@ -218,7 +196,7 @@ index f754bc8f..0568565f 100644
{ {
LLM_ARCH_DECI, LLM_ARCH_DECI,
{ {
@@ -1681,6 +1717,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { @@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
...@@ -234,7 +212,7 @@ index f754bc8f..0568565f 100644 ...@@ -234,7 +212,7 @@ index f754bc8f..0568565f 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index 439aaeab..6a989034 100644 index 525c1b7d..bc8a4f0b 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -11,6 +11,7 @@ @@ -11,6 +11,7 @@
...@@ -245,7 +223,7 @@ index 439aaeab..6a989034 100644 ...@@ -245,7 +223,7 @@ index 439aaeab..6a989034 100644
LLM_ARCH_DECI, LLM_ARCH_DECI,
LLM_ARCH_FALCON, LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN, LLM_ARCH_BAICHUAN,
@@ -146,6 +147,7 @@ enum llm_kv { @@ -148,6 +149,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
...@@ -253,7 +231,7 @@ index 439aaeab..6a989034 100644 ...@@ -253,7 +231,7 @@ index 439aaeab..6a989034 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -347,6 +349,14 @@ enum llm_tensor { @@ -349,6 +351,14 @@ enum llm_tensor {
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV, LLM_TENSOR_BSKCN_TV,
...@@ -297,10 +275,10 @@ index 01d5ca57..8682b0e6 100644 ...@@ -297,10 +275,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
} }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 32f59819..0343ba8a 100644 index 9c1fe93f..cd06ad91 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) { @@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
} }
...@@ -309,7 +287,7 @@ index 32f59819..0343ba8a 100644 ...@@ -309,7 +287,7 @@ index 32f59819..0343ba8a 100644
} catch (const std::exception & err) { } catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG #ifndef NDEBUG
@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) { @@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value; cparams.warmup = value;
} }
...@@ -320,7 +298,7 @@ index 32f59819..0343ba8a 100644 ...@@ -320,7 +298,7 @@ index 32f59819..0343ba8a 100644
void llama_context::set_adapter_lora( void llama_context::set_adapter_lora(
llama_adapter_lora * adapter, llama_adapter_lora * adapter,
float scale) { float scale) {
@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) { @@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
...@@ -329,7 +307,7 @@ index 32f59819..0343ba8a 100644 ...@@ -329,7 +307,7 @@ index 32f59819..0343ba8a 100644
const llama_ubatch ubatch = sbatch.split_simple(n_tokens); const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch; const llama_batch & batch = batch_allocr.batch;
...@@ -341,7 +319,7 @@ index 32f59819..0343ba8a 100644 ...@@ -341,7 +319,7 @@ index 32f59819..0343ba8a 100644
const int64_t n_tokens_all = batch.n_tokens; const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const bool logits_all = n_outputs_all == n_tokens_all; const bool logits_all = n_outputs_all == n_tokens_all;
...@@ -350,7 +328,7 @@ index 32f59819..0343ba8a 100644 ...@@ -350,7 +328,7 @@ index 32f59819..0343ba8a 100644
/* simple_split */ !kv_self->recurrent, /* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all); /* logits_all */ logits_all);
@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) { int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
...@@ -364,7 +342,7 @@ index 32f59819..0343ba8a 100644 ...@@ -364,7 +342,7 @@ index 32f59819..0343ba8a 100644
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead // TODO: use a per-batch flag for logits presence instead
@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { @@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() { void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids; auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) { if (!out_ids.empty()) {
...@@ -373,7 +351,7 @@ index 32f59819..0343ba8a 100644 ...@@ -373,7 +351,7 @@ index 32f59819..0343ba8a 100644
const uint32_t n_embd = model.hparams.n_embd; const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size()); GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { @@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{ {
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
...@@ -382,7 +360,7 @@ index 32f59819..0343ba8a 100644 ...@@ -382,7 +360,7 @@ index 32f59819..0343ba8a 100644
io.write(&logits_size, sizeof(logits_size)); io.write(&logits_size, sizeof(logits_size));
@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() { @@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true, /*.offload_kqv =*/ true,
/*.flash_attn =*/ false, /*.flash_attn =*/ false,
/*.no_perf =*/ true, /*.no_perf =*/ true,
...@@ -390,7 +368,7 @@ index 32f59819..0343ba8a 100644 ...@@ -390,7 +368,7 @@ index 32f59819..0343ba8a 100644
/*.abort_callback =*/ nullptr, /*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr, /*.abort_callback_data =*/ nullptr,
}; };
@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { @@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup); ctx->set_warmup(warmup);
} }
...@@ -402,7 +380,7 @@ index 32f59819..0343ba8a 100644 ...@@ -402,7 +380,7 @@ index 32f59819..0343ba8a 100644
ctx->synchronize(); ctx->synchronize();
} }
diff --git a/src/llama-context.h b/src/llama-context.h diff --git a/src/llama-context.h b/src/llama-context.h
index 04facb54..baa03276 100644 index 5457f077..a50c4afa 100644
--- a/src/llama-context.h --- a/src/llama-context.h
+++ b/src/llama-context.h +++ b/src/llama-context.h
@@ -65,6 +65,7 @@ struct llama_context { @@ -65,6 +65,7 @@ struct llama_context {
...@@ -426,10 +404,10 @@ index 30e550f0..85ad91b9 100644 ...@@ -426,10 +404,10 @@ index 30e550f0..85ad91b9 100644
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index a85e9728..d740c120 100644 index fabb9ca2..b67216a4 100644
--- a/src/llama-graph.cpp --- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp +++ b/src/llama-graph.cpp
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { @@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
} }
} }
...@@ -442,7 +420,7 @@ index a85e9728..d740c120 100644 ...@@ -442,7 +420,7 @@ index a85e9728..d740c120 100644
// //
// llm_graph_context // llm_graph_context
// //
@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { @@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
} }
...@@ -469,7 +447,7 @@ index a85e9728..d740c120 100644 ...@@ -469,7 +447,7 @@ index a85e9728..d740c120 100644
llm_graph_input_attn_cross * inp, llm_graph_input_attn_cross * inp,
ggml_cgraph * gf, ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h diff --git a/src/llama-graph.h b/src/llama-graph.h
index d192dc14..260a2af2 100644 index d0c8d321..0fe18150 100644
--- a/src/llama-graph.h --- a/src/llama-graph.h
+++ b/src/llama-graph.h +++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public: @@ -86,6 +86,7 @@ public:
...@@ -480,7 +458,7 @@ index d192dc14..260a2af2 100644 ...@@ -480,7 +458,7 @@ index d192dc14..260a2af2 100644
}; };
class llm_graph_input_pos : public llm_graph_input_i { class llm_graph_input_pos : public llm_graph_input_i {
@@ -285,6 +286,16 @@ public: @@ -283,6 +284,16 @@ public:
const llama_cross * cross = nullptr; const llama_cross * cross = nullptr;
}; };
...@@ -497,7 +475,7 @@ index d192dc14..260a2af2 100644 ...@@ -497,7 +475,7 @@ index d192dc14..260a2af2 100644
// //
// llm_graph_result // llm_graph_result
// //
@@ -493,6 +504,7 @@ struct llm_graph_context { @@ -491,6 +502,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const; ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const; ggml_tensor * build_inp_s_mask() const;
...@@ -518,7 +496,7 @@ index 8a667960..6a02de03 100644 ...@@ -518,7 +496,7 @@ index 8a667960..6a02de03 100644
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); + return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+} +}
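The helper added above is the dispatch point for the new architecture: during graph construction, a layer whose index appears in the list gets the cross-attention block instead of regular self-attention. Isolated, the check is just a membership test; a sketch (the real member is a fixed-size array inside llama_hparams, and the indices shown are illustrative):

#include <algorithm>
#include <cstdint>
#include <vector>

struct hparams_sketch {
    std::vector<uint32_t> cross_attn_layers;  // e.g. {3, 8, 13, 18, ...}

    // mirrors the llama_hparams check in the hunk above
    bool is_cross_attn_layer(uint32_t il) const {
        return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il)
            != cross_attn_layers.end();
    }
};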
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 6e278945..c8a34d52 100644 index 48dce407..b6fc7e6d 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -2,6 +2,8 @@ @@ -2,6 +2,8 @@
...@@ -546,7 +524,7 @@ index 6e278945..c8a34d52 100644 ...@@ -546,7 +524,7 @@ index 6e278945..c8a34d52 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
@@ -158,6 +162,9 @@ struct llama_hparams { @@ -159,6 +163,9 @@ struct llama_hparams {
// Block skip connection // Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const; bool n_bskcn(uint32_t n, uint32_t il) const;
...@@ -593,10 +571,10 @@ index a012aeae..2e11507d 100644 ...@@ -593,10 +571,10 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str()); const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index aba42819..d051696c 100644 index 572378c9..9d099f11 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv // get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false); ml.get_key(LLM_KV_GENERAL_NAME, name, false);
...@@ -604,7 +582,7 @@ index aba42819..d051696c 100644 ...@@ -604,7 +582,7 @@ index aba42819..d051696c 100644
// everything past this point is not vocab-related // everything past this point is not vocab-related
if (hparams.vocab_only) { if (hparams.vocab_only) {
@@ -430,6 +431,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
...@@ -612,7 +590,7 @@ index aba42819..d051696c 100644 ...@@ -612,7 +590,7 @@ index aba42819..d051696c 100644
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
...@@ -624,7 +602,7 @@ index aba42819..d051696c 100644 ...@@ -624,7 +602,7 @@ index aba42819..d051696c 100644
// n_head_kv is optional, default to n_head // n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr; hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
...@@ -633,7 +611,7 @@ index aba42819..d051696c 100644 ...@@ -633,7 +611,7 @@ index aba42819..d051696c 100644
if (hparams.n_rot != hparams.n_embd_head_k) { if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
} }
@@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.use_kq_norm = false; hparams.use_kq_norm = false;
} }
} break; } break;
...@@ -650,7 +628,7 @@ index aba42819..d051696c 100644 ...@@ -650,7 +628,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff(); const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_embd_gqa = n_embd_v_gqa;
...@@ -659,7 +637,7 @@ index aba42819..d051696c 100644 ...@@ -659,7 +637,7 @@ index aba42819..d051696c 100644
const int64_t n_token_types = vocab.n_token_types(); const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot; const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert; const int64_t n_expert = hparams.n_expert;
@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} }
} }
} break; } break;
...@@ -712,7 +690,7 @@ index aba42819..d051696c 100644 ...@@ -712,7 +690,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context { @@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
} }
}; };
...@@ -959,7 +937,7 @@ index aba42819..d051696c 100644 ...@@ -959,7 +937,7 @@ index aba42819..d051696c 100644
struct llm_build_deci : public llm_graph_context { struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph( @@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
llm = std::make_unique<llm_build_llama>(*this, params, gf); llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break; } break;
...@@ -970,7 +948,7 @@ index aba42819..d051696c 100644 ...@@ -970,7 +948,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
llm = std::make_unique<llm_build_deci>(*this, params, gf); llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values // use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4: case LLM_ARCH_LLAMA4:
...@@ -979,7 +957,7 @@ index aba42819..d051696c 100644 ...@@ -979,7 +957,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_BAICHUAN: case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER: case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index 5865d5e9..72bab5be 100644 index 856e6042..6be91282 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -11,6 +11,7 @@ @@ -11,6 +11,7 @@
...@@ -990,15 +968,15 @@ index 5865d5e9..72bab5be 100644 ...@@ -990,15 +968,15 @@ index 5865d5e9..72bab5be 100644
struct llama_cparams; struct llama_cparams;
struct llama_ubatch; struct llama_ubatch;
@@ -70,6 +71,7 @@ enum llm_type { @@ -73,6 +74,7 @@ enum llm_type {
LLM_TYPE_40B, LLM_TYPE_40B,
LLM_TYPE_65B, LLM_TYPE_65B,
LLM_TYPE_70B, LLM_TYPE_70B,
+ LLM_TYPE_90B, + LLM_TYPE_90B,
LLM_TYPE_236B, LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B, LLM_TYPE_314B,
LLM_TYPE_671B, @@ -314,6 +316,16 @@ struct llama_layer {
@@ -310,6 +312,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr; struct ggml_tensor * bskcn_tv = nullptr;
......
...@@ -18,10 +18,10 @@ adds the unpad operator to GGML ...@@ -18,10 +18,10 @@ adds the unpad operator to GGML
10 files changed, 223 insertions(+), 2 deletions(-) 10 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 8fcc16df..d19fc167 100644 index 1b8603e7..53ef31b2 100644
--- a/ggml/include/ggml.h --- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h +++ b/ggml/include/ggml.h
@@ -488,6 +488,7 @@ extern "C" { @@ -489,6 +489,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D, GGML_OP_PAD_REFLECT_1D,
...@@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644 ...@@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644
GGML_OP_ARANGE, GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
@@ -1757,6 +1758,15 @@ extern "C" { @@ -1777,6 +1778,15 @@ extern "C" {
int p0, int p0,
int p1); int p1);
...@@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644 ...@@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644
// timesteps: [N,] // timesteps: [N,]
// return: [N, dim] // return: [N, dim]
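The declaration this hunk adds is collapsed, so the signature below is an assumption: ggml_unpad mirroring ggml_pad, with per-dimension element counts to trim instead of to append. Under that assumption, a pad/unpad round trip restores the original shape:

#include "ggml.h"

// Hedged sketch, assuming a ggml_pad-style signature:
//   GGML_API struct ggml_tensor * ggml_unpad(
//       struct ggml_context * ctx, struct ggml_tensor * a, int p0, int p1, int p2, int p3);
static struct ggml_tensor * pad_roundtrip(struct ggml_context * ctx, struct ggml_tensor * x) {
    struct ggml_tensor * padded  = ggml_pad  (ctx, x,      3, 1, 0, 0); // grow ne0 by 3, ne1 by 1
    struct ggml_tensor * trimmed = ggml_unpad(ctx, padded, 3, 1, 0, 0); // trim the same amounts back off
    // trimmed has x's original shape; the padded elements are simply dropped
    return trimmed;
}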
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 50400328..432942bf 100644 index 64405449..34624cca 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm @@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_pad_reflect_1d(params, tensor); ggml_compute_forward_pad_reflect_1d(params, tensor);
} break; } break;
...@@ -60,7 +60,7 @@ index 50400328..432942bf 100644 ...@@ -60,7 +60,7 @@ index 50400328..432942bf 100644
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
{ {
ggml_compute_forward_arange(params, tensor); ggml_compute_forward_arange(params, tensor);
@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { @@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D: case GGML_OP_PAD_REFLECT_1D:
...@@ -69,10 +69,10 @@ index 50400328..432942bf 100644 ...@@ -69,10 +69,10 @@ index 50400328..432942bf 100644
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 6050147b..66b8da68 100644 index 7413192b..becdae07 100644
--- a/ggml/src/ggml-cpu/ops.cpp --- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d( @@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
} }
} }
...@@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644 ...@@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644
static void ggml_compute_forward_arange_f32( static void ggml_compute_forward_arange_f32(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 410a3720..3eca1cf8 100644 index dc081b9e..a7125555 100644
--- a/ggml/src/ggml-cpu/ops.h --- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h
@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params @@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
...@@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644 ...@@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 31750b6f..0fef9522 100644 index 04ce764e..491acccb 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg @@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD: case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst); ggml_cuda_op_pad(ctx, dst);
break; break;
...@@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644 ...@@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst); ggml_cuda_op_arange(ctx, dst);
break; break;
@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g @@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD: case GGML_OP_PAD:
...@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644 ...@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 12886cd3..b2e95a66 100644 index 425524d0..112abef6 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte @@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
...@@ -244,7 +244,7 @@ index 12886cd3..b2e95a66 100644 ...@@ -244,7 +244,7 @@ index 12886cd3..b2e95a66 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass @@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
...@@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644 ...@@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex @@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D: case GGML_OP_PAD_REFLECT_1D:
...@@ -260,7 +260,7 @@ index 12886cd3..b2e95a66 100644 ...@@ -260,7 +260,7 @@ index 12886cd3..b2e95a66 100644
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node( @@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
const int nth = MIN(1024, ne0); const int nth = MIN(1024, ne0);
...@@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644 ...@@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644
} break; } break;
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8d6e99e6..71f0f97f 100644 index 9f4147e9..6ceb3cef 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal --- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32( @@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
...@@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644 ...@@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644
device char * dst, device char * dst,
constant ggml_metal_kargs_arange & args, constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 950772c7..2276b631 100644 index 7654ae17..3c57aff8 100644
--- a/ggml/src/ggml.c --- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c +++ b/ggml/src/ggml.c
@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { @@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE", "UPSCALE",
"PAD", "PAD",
"PAD_REFLECT_1D", "PAD_REFLECT_1D",
...@@ -365,16 +365,16 @@ index 950772c7..2276b631 100644 ...@@ -365,16 +365,16 @@ index 950772c7..2276b631 100644
"ARANGE", "ARANGE",
"TIMESTEP_EMBEDDING", "TIMESTEP_EMBEDDING",
"ARGSORT", "ARGSORT",
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { @@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW", "OPT_STEP_ADAMW",
}; };
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { @@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)", "upscale(x)",
"pad(x)", "pad(x)",
"pad_reflect_1d(x)", "pad_reflect_1d(x)",
...@@ -382,16 +382,16 @@ index 950772c7..2276b631 100644 ...@@ -382,16 +382,16 @@ index 950772c7..2276b631 100644
"arange(start, stop, step)", "arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)", "timestep_embedding(timesteps, dim, max_period)",
"argsort(x)", "argsort(x)",
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { @@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)", "adamw(x)",
}; };
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
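The name and symbol tables and both static_asserts must move in lockstep whenever an op is added; the hunk just below (collapsed in this view) supplies the matching graph-construction function. A hedged reconstruction of its likely shape, modeled directly on ggml_pad:

// Hedged reconstruction of the collapsed ggml_unpad builder: allocate the
// smaller destination tensor, then record the op and its source.
struct ggml_tensor * ggml_unpad(
        struct ggml_context * ctx, struct ggml_tensor * a,
        int p0, int p1, int p2, int p3) {
    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
            a->ne[0] - p0, a->ne[1] - p1, a->ne[2] - p2, a->ne[3] - p3);

    result->op     = GGML_OP_UNPAD;  // the enum value added by this patch
    result->src[0] = a;

    return result;
}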
@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( @@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result; return result;
} }
......
...@@ -12,7 +12,7 @@ regex ...@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-) 2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a35b498c..032019c9 100644 index a9ee9f03..1306864e 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { @@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
......