OpenDAS / ollama · Commits · 20c5fd39
Unverified commit 20c5fd39, authored May 08, 2025 by Devon Rifkin, committed by GitHub on May 08, 2025.
Merge branch 'main' into drifkin/array-head-count-simple
Parents: d2ee599d, 6e9a7a25
Showing 20 changed files with 264 additions (+264) and 251 deletions (-251).
llama/llama.cpp/src/llama-chat.h (+3, -2)
llama/llama.cpp/src/llama-context.cpp (+4, -17)
llama/llama.cpp/src/llama-context.h (+1, -2)
llama/llama.cpp/src/llama-graph.cpp (+42, -16)
llama/llama.cpp/src/llama-graph.h (+5, -7)
llama/llama.cpp/src/llama-hparams.h (+1, -0)
llama/llama.cpp/src/llama-model.cpp (+59, -13)
llama/llama.cpp/src/llama-model.h (+6, -2)
llama/llama.cpp/src/llama-quant.cpp (+0, -4)
llama/llama.cpp/src/llama-sampling.cpp (+2, -1)
llama/llama.cpp/src/llama-vocab.cpp (+2, -1)
llama/llama.go (+1, -26)
llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch (+12, -12)
llama/patches/0002-pretokenizer.patch (+2, -2)
llama/patches/0003-embeddings.patch (+4, -4)
llama/patches/0004-clip-unicode.patch (+5, -5)
llama/patches/0005-solar-pro.patch (+21, -21)
llama/patches/0006-add-mllama-support.patch (+65, -87)
llama/patches/0007-add-unpad-operator.patch (+28, -28)
llama/patches/0008-fix-deepseek-deseret-regex.patch (+1, -1)
llama/llama.cpp/src/llama-chat.h
@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -41,6 +41,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
llama/llama.cpp/src/llama-context.cpp
@@ -114,7 +114,7 @@ llama_context::llama_context(
     }

     if (n_ctx_per_seq > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
               float   freq_base,
-              float   freq_scale,
-        ggml_backend_buffer * bbuf) const {
+              float   freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);

-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);

-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);

         ggml_build_forward_expand(gf, cur);
     }
@@ -1510,8 +1499,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     // set all ids as invalid (negative)
     std::fill(output_ids.begin(), output_ids.end(), -1);

     ggml_backend_buffer_clear(buf_output.get(), 0);

-    this->n_outputs     = 0;
-    this->n_outputs_max = n_outputs_max;
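Taken together, the llama-context.cpp hunks above drop the explicit ggml_backend_buffer argument from build_rope_shift and switch from ggml_rope_ext_inplace to ggml_rope_ext, so callers no longer pin the shifted tensor to the KV buffer's backend. A minimal sketch of the resulting call shape, using only names that already appear in the hunks (this is an illustration, not additional code from the commit):

// sketch: K-cache shift after the change above; no bbuf is passed and the
// graph scheduler decides which backend runs the RoPE.
ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors,
                                     freq_base_l, freq_scale_l);
ggml_build_forward_expand(gf, cur);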
llama/llama.cpp/src/llama-context.h
@@ -172,8 +172,7 @@ private:
             ggml_tensor * shift,
             ggml_tensor * factors,
                   float   freq_base,
-                  float   freq_scale,
-            ggml_backend_buffer * bbuf) const;
+                  float   freq_scale) const;

     llm_graph_result_ptr build_kv_self_shift(
             ggml_context * ctx0,
llama/llama.cpp/src/llama-graph.cpp
@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;

-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd == 4) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the 3 first dims are the same, and 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }
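For readers skimming the hunk above: the new branch only fires for M-RoPE models (n_pos_per_embd == 4) when the batch carries plain text tokens. A self-contained sketch of the same 1D-to-4D expansion, written outside of ggml so it compiles on its own (the llama_pos alias here is an assumption made for the sketch):

#include <cstdint>
#include <vector>

using llama_pos = int32_t; // assumption for this sketch; stands in for llama.h's position type

// Expand n_tokens 1D positions into the 4-section M-RoPE layout:
// the first three sections repeat the original position, the fourth is all zeros.
std::vector<llama_pos> expand_mrope_positions(const llama_pos * pos, int64_t n_tokens) {
    std::vector<llama_pos> out(n_tokens * 4);
    for (int64_t i = 0; i < n_tokens; ++i) {
        out[0 * n_tokens + i] = pos[i];
        out[1 * n_tokens + i] = pos[i];
        out[2 * n_tokens + i] = pos[i];
        out[3 * n_tokens + i] = 0; // 4th section is unused for text-only tokens
    }
    return out;
}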
@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
                 ) * f_attn_temp_scale + 1.0;
         }

-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }
@@ -598,7 +612,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res             (std::make_unique<llm_graph_result>()) {
 }

-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }
@@ -809,6 +823,10 @@ ggml_tensor * llm_graph_context::build_ffn(
     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }

     if (down_b) {
@@ -916,28 +934,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);

-    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
+    ggml_tensor * experts = nullptr;
+    if (gate_exps) {
+        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate", il);
+    } else {
+        cur = up;
+    }

     switch (type_op) {
         case LLM_FFN_SILU:
             {
-                gate = ggml_silu(ctx0, gate);
-                cb(gate, "ffn_moe_silu", il);
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
             {
-                gate = ggml_gelu(ctx0, gate);
-                cb(gate, "ffn_moe_gelu", il);
+                cur = ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_moe_gelu", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }

-    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
+    if (gate_exps) {
+        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }

-    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);

     if (!weight_before_ffn) {
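The build_moe_ffn hunk above makes the gating projection optional, so expert FFNs that ship no gate tensor (such as the nomic-bert-moe layers added later in this commit) reuse the same code path. A toy scalar sketch of the two paths, standing in for the per-expert ggml matmuls (illustrative only, not the library code; silu stands in for whichever activation type_op selects):

#include <cmath>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

// up_x and gate_x stand in for one expert's up/gate projections of the input.
// With a gate:    h = silu(gate_x) * up_x
// Without a gate: h = silu(up_x)
// The down projection is then applied to h in both cases.
float expert_ffn_hidden(float up_x, float gate_x, bool has_gate) {
    return has_gate ? silu(gate_x) * up_x : silu(up_x);
}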
@@ -1020,11 +1045,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());

     auto & cur = inp->pos;

-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);

     res->add_input(std::move(inp));
@@ -1033,11 +1058,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);

     auto & cur = inp->attn_scale;

-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this need to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);

     res->add_input(std::move(inp));
llama/llama.cpp/src/llama-graph.h
@@ -91,29 +91,27 @@ public:
 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
     virtual ~llm_graph_input_pos() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * pos = nullptr; // I32 [n_batch]

-    const int64_t n_pos_per_token = 1;
+    const int64_t n_pos_per_embd = 1;
 };

 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
     virtual ~llm_graph_input_attn_temp() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

-    const int64_t n_pos_per_token = 1;
-
     const uint32_t n_attn_temp_floor_scale;
     const float    f_attn_temp_scale;
 };
@@ -430,7 +428,7 @@ struct llm_graph_context {
     llm_graph_context(const llm_graph_params & params);

-    int64_t n_pos_per_token() const;
+    int64_t n_pos_per_embd() const;

     void cb(ggml_tensor * cur, const char * name, int il) const;
llama/llama.cpp/src/llama-hparams.h
@@ -72,6 +72,7 @@ struct llama_hparams {
     float    expert_weights_scale = 0.0;
     bool     expert_weights_norm  = false;
     uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+    uint32_t moe_every_n_layers   = 0;

     float f_norm_eps;
     float f_norm_rms_eps;
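The new moe_every_n_layers hyperparameter drives a per-layer choice later in this commit (see the llama-model.cpp hunks below): with a value of 2, layers 1, 3, 5, ... get expert tensors and the remaining layers stay dense. A small sketch of that predicate, extracted for clarity (the condition itself is taken verbatim from the hunks below; the helper name is ours):

#include <cstdint>

// True when layer il should use the MoE FFN path; 0 disables MoE everywhere.
bool layer_uses_moe(uint32_t moe_every_n_layers, int il) {
    return moe_every_n_layers > 0 && il % moe_every_n_layers == 1;
}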
llama/llama.cpp/src/llama-model.cpp
@@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_770M:          return "770M";
         case LLM_TYPE_780M:          return "780M";
         case LLM_TYPE_0_5B:          return "0.5B";
+        case LLM_TYPE_0_6B:          return "0.6B";
         case LLM_TYPE_1B:            return "1B";
         case LLM_TYPE_1_3B:          return "1.3B";
         case LLM_TYPE_1_4B:          return "1.4B";
         case LLM_TYPE_1_5B:          return "1.5B";
         case LLM_TYPE_1_6B:          return "1.6B";
+        case LLM_TYPE_1_7B:          return "1.7B";
         case LLM_TYPE_1_8B:          return "1.8B";
         case LLM_TYPE_2B:            return "2B";
         case LLM_TYPE_2_8B:          return "2.8B";
@@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_15B:           return "15B";
         case LLM_TYPE_16B:           return "16B";
         case LLM_TYPE_20B:           return "20B";
+        case LLM_TYPE_27B:           return "27B";
         case LLM_TYPE_30B:           return "30B";
         case LLM_TYPE_32B:           return "32B";
         case LLM_TYPE_34B:           return "34B";
@@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_65B:           return "65B";
         case LLM_TYPE_70B:           return "70B";
         case LLM_TYPE_236B:          return "236B";
+        case LLM_TYPE_290B:          return "290B";
         case LLM_TYPE_314B:          return "314B";
         case LLM_TYPE_671B:          return "671B";
         case LLM_TYPE_SMALL:         return "0.1B";
@@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16x3_8B:       return "16x3.8B";
         case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
         case LLM_TYPE_57B_A14B:      return "57B.A14B";
-        case LLM_TYPE_27B:           return "27B";
-        case LLM_TYPE_290B:          return "290B";
         case LLM_TYPE_17B_16E:       return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E:      return "17Bx128E (Maverick)";
+        case LLM_TYPE_30B_A3B:       return "30B.A3B";
+        case LLM_TYPE_235B_A22B:     return "235B.A22B";
         default:                     return "?B";
     }
 }
@@ -709,10 +713,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
                 ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);
+                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);

                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
                     type = LLM_TYPE_137M;
@@ -805,6 +811,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -814,6 +824,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -1425,7 +1437,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_MISTRAL3: break;
         default: throw std::runtime_error("unsupported model architecture");
     }
@@ -2133,6 +2144,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
                 type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
@@ -2166,20 +2178,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                     }

+                    if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+                    }
+
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                     layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                     layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);

-                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
-                    if (arch == LLM_ARCH_BERT) {
+                    if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
                         layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                     } else {
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+                        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                            layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                        } else {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        }
                     }

                     layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
@@ -6074,6 +6097,11 @@ struct llm_build_bert : public llm_graph_context {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

+                if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
+
                 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                 Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -6126,13 +6154,29 @@ struct llm_build_bert : public llm_graph_context {
         cb(ffn_inp, "ffn_inp", il);

         // feed-forward network
-        if (model.arch == LLM_ARCH_BERT) {
+        if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+            // MoE branch
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    nullptr,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    hparams.n_expert,
+                    hparams.n_expert_used,
+                    LLM_FFN_GELU,
+                    false, false,
+                    0.0f,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+            cb(cur, "ffn_moe_out", il);
+        } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                     NULL,                      NULL,                        NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
         } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up, NULL, NULL,
@@ -6140,6 +6184,7 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
         } else {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up, NULL, NULL,
@@ -6147,8 +6192,8 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
         }
-        cb(cur, "ffn_out", il);

         // attentions bypass the intermediate layer
         cur = ggml_add(ctx0, cur, ffn_inp);
@@ -13349,6 +13394,7 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 llm = std::make_unique<llm_build_bert>(*this, params, gf);
             } break;
@@ -13705,7 +13751,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_SOLAR:
         case LLM_ARCH_BAILINGMOE:
-        case LLM_ARCH_MISTRAL3:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
@@ -13714,6 +13759,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_STABLELM:
         case LLM_ARCH_BITNET:
         case LLM_ARCH_QWEN:
llama/llama.cpp/src/llama-model.h
@@ -40,11 +40,13 @@ enum llm_type {
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
@@ -64,6 +66,7 @@ enum llm_type {
     LLM_TYPE_16B,
     LLM_TYPE_20B,
     LLM_TYPE_22B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,
@@ -73,6 +76,7 @@ enum llm_type {
     LLM_TYPE_70B,
     LLM_TYPE_90B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
@@ -87,10 +91,10 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
-    LLM_TYPE_290B,
     LLM_TYPE_17B_16E,    // llama4 Scout
     LLM_TYPE_17B_128E,   // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };

 struct llama_layer_posnet {
llama/llama.cpp/src/llama-quant.cpp
@@ -744,10 +744,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

-        // don't quantize vision stuff
-        quantize &= name.find("v.") == std::string::npos;
-        quantize &= name.find("mm.") == std::string::npos;
-
         // quantize only 2D and 3D tensors (experts)
         quantize &= (ggml_n_dims(tensor) >= 2);
llama/llama.cpp/src/llama-sampling.cpp
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }

     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }

     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }

         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }
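The top-k change above turns k <= 0 into an early return instead of rewriting k to the candidate count; in both versions nothing is truncated, but the candidate array is now left completely untouched. A small sketch of the resulting clamp, assuming only that the candidate count is known (not the library code itself):

#include <algorithm>

// Effective k applied to a candidate list of size n_candidates.
int effective_top_k(int k, int n_candidates) {
    if (k <= 0) {
        return n_candidates; // keep everything, no sorting or truncation
    }
    return std::min(k, n_candidates);
}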
llama/llama.cpp/src/llama-vocab.cpp
@@ -1497,7 +1497,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama3"   ||
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe"||
-                tokenizer_pre == "falcon3") {
+                tokenizer_pre == "falcon3"  ||
+                tokenizer_pre == "pixtral") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                 ignore_merges = true;
                 add_bos = true;
llama/llama.go
@@ -2,6 +2,7 @@ package llama
 /*
 #cgo CFLAGS: -std=c11
+#cgo windows CFLAGS: -Wno-dll-attribute-on-redeclaration
 #cgo CXXFLAGS: -std=c++17
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common
@@ -198,7 +199,6 @@ type ModelParams struct {
 	NumGpuLayers int
 	MainGpu      int
 	UseMmap      bool
-	UseMlock     bool
 	TensorSplit  []float32
 	Progress     func(float32)
 	VocabOnly    bool
@@ -217,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	cparams.n_gpu_layers = C.int(params.NumGpuLayers)
 	cparams.main_gpu = C.int32_t(params.MainGpu)
 	cparams.use_mmap = C.bool(params.UseMmap)
-	cparams.use_mlock = C.bool(params.UseMlock)
 	cparams.vocab_only = C.bool(params.VocabOnly)

 	if len(params.TensorSplit) > 0 {
@@ -461,24 +460,6 @@ func (m *Model) NEmbd() int {
 	return int(C.llama_model_n_embd(m.c))
 }

-func Quantize(infile, outfile string, ftype uint32) error {
-	cinfile := C.CString(infile)
-	defer C.free(unsafe.Pointer(cinfile))
-
-	coutfile := C.CString(outfile)
-	defer C.free(unsafe.Pointer(coutfile))
-
-	params := C.llama_model_quantize_default_params()
-	params.nthread = -1
-	params.ftype = ftype
-
-	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
-		return fmt.Errorf("llama_model_quantize: %d", rc)
-	}
-
-	return nil
-}
-
 // vision processing
 type ClipContext struct {
 	c *C.struct_clip_ctx
@@ -606,9 +587,6 @@ type SamplingParams struct {
 	PenaltyRepeat  float32
 	PenaltyFreq    float32
 	PenaltyPresent float32
-	Mirostat       int
-	MirostatTau    float32
-	MirostatEta    float32
 	PenalizeNl     bool
 	Seed           uint32
 	Grammar        string
@@ -625,9 +603,6 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
 	cparams.penalty_repeat = C.float(params.PenaltyRepeat)
 	cparams.penalty_freq = C.float(params.PenaltyFreq)
 	cparams.penalty_present = C.float(params.PenaltyFreq)
-	cparams.mirostat = C.int32_t(params.Mirostat)
-	cparams.mirostat_tau = C.float(params.MirostatTau)
-	cparams.mirostat_eta = C.float(params.MirostatEta)
 	cparams.seed = C.uint32_t(params.Seed)

 	grammar := C.CString(params.Grammar)
llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
  /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index a7febef7..31750b6f 100644
+index 9fb2134f..04ce764e 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
  static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 266d8af4..12886cd3 100644
+index d92392ed..425524d0 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
  }
  free(ctx);
@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
  static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index a0667b7d..bd83adc5 100644
+index 140a775f..e33c4ba0 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
  GGML_ASSERT(status);
  delete ctx;
@@ -161,10 +161,10 @@ index a0667b7d..bd83adc5 100644
  static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 1de34c96..4600f61e 100644
+index 66b6f2cc..e3e6deae 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
  ggml_sycl_set_device(ctx->device);
  delete ctx;
@@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
  static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
  delete ctx;
@@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644
  }
  static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
  static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_sycl_host_free(buffer->context);
@@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644
  static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 39f3cd34..c569a8a5 100644
+index c0bdb9e1..03d03064 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
  ggml_vk_destroy_buffer(ctx->dev_buffer);
  delete ctx;
@@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644
  }
  static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
  static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
  ggml_vk_host_free(vk_instance.devices[0], buffer->context);
llama/patches/0002-pretokenizer.patch
@@ -10,7 +10,7 @@ logs instead of throwing an error
  1 file changed, 3 insertions(+), 11 deletions(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 48060517..a35b498c 100644
+index 50ded286..a9ee9f03 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -31,7 +31,7 @@ index 48060517..a35b498c 100644
              pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
          } else if (
                  tokenizer_pre == "llama3" ||
-@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
              clean_spaces = false;
          } else {
llama/patches/0003-embeddings.patch
@@ -11,10 +11,10 @@ instead of forcing one or the error
  1 file changed, 3 insertions(+), 3 deletions(-)

 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 983385f8..32f59819 100644
+index 5a2eef9b..9c1fe93f 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
      int64_t n_outputs_all = 0;

      // count outputs
@@ -23,7 +23,7 @@ index 983385f8..32f59819 100644
      for (uint32_t i = 0; i < n_tokens_all; ++i) {
          n_outputs_all += batch.logits[i] != 0;
      }
-@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
      // ggml_graph_dump_dot(gf, NULL, "llama.dot");
      //}
@@ -32,7 +32,7 @@ index 983385f8..32f59819 100644
      auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;

      if (t_embd && res->get_embd_pooled()) {
-@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
      const auto n_embd = hparams.n_embd;

      // TODO: use a per-batch flag for logits presence instead
llama/patches/0004-clip-unicode.patch
@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
  1 file changed, 39 insertions(+)

 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 75970615..d57b4bd6 100644
+index ad3e7df1..b3218c78 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -29,6 +29,19 @@
- #include <limits>
+@@ -30,6 +30,19 @@
  #include <array>
+ #include <numeric>
 +#if defined(_WIN32)
 +#define WIN32_LEAN_AND_MEAN
@@ -33,7 +33,7 @@ index 75970615..d57b4bd6 100644
  struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};

  //#define CLIP_DEBUG_FUNCTIONS
-@@ -1430,7 +1443,29 @@ struct clip_model_loader {
+@@ -1971,7 +1984,29 @@ struct clip_model_loader {
      {
          std::vector<uint8_t> read_buf;
@@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644
          if (!fin) {
              throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
          }
-@@ -1457,7 +1492,11 @@ struct clip_model_loader {
+@@ -1998,7 +2033,11 @@ struct clip_model_loader {
          ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
      }
  }
llama/patches/0005-solar-pro.patch
@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
  7 files changed, 248 insertions(+)

 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 62e1480b..f754bc8f 100644
+index f2bc8ca7..5ab3f572 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
-@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -69,6 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
      { LLM_ARCH_GRANITE,          "granite"          },
      { LLM_ARCH_GRANITE_MOE,      "granitemoe"       },
      { LLM_ARCH_CHAMELEON,        "chameleon"        },
@@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644
      { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
      { LLM_ARCH_PLM,              "plm"              },
      { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
-@@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
      { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
      { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
      { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
@@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644
      { LLM_KV_ATTENTION_KEY_LENGTH_MLA,         "%s.attention.key_length_mla"         },
      { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,       "%s.attention.value_length_mla"       },
-@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1502,6 +1504,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
          { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
      },
  },
@@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644
  {
      LLM_ARCH_WAVTOKENIZER_DEC,
      {
-@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1680,6 +1700,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
      {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
      // this tensor is loaded for T5, but never used
      {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644
      {LLM_TENSOR_POS_NET_NORM,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
      {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 98ca00a1..439aaeab 100644
+index 41a023da..525c1b7d 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
-@@ -72,6 +72,7 @@ enum llm_arch {
+@@ -73,6 +73,7 @@ enum llm_arch {
      LLM_ARCH_GRANITE,
      LLM_ARCH_GRANITE_MOE,
      LLM_ARCH_CHAMELEON,
@@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644
      LLM_ARCH_WAVTOKENIZER_DEC,
      LLM_ARCH_PLM,
      LLM_ARCH_BAILINGMOE,
-@@ -144,6 +145,7 @@ enum llm_kv {
+@@ -146,6 +147,7 @@ enum llm_kv {
      LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
      LLM_KV_ATTENTION_SLIDING_WINDOW,
      LLM_KV_ATTENTION_SCALE,
@@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644
      LLM_KV_ATTENTION_KEY_LENGTH_MLA,
      LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
-@@ -344,6 +346,7 @@ enum llm_tensor {
+@@ -346,6 +348,7 @@ enum llm_tensor {
      LLM_TENSOR_ENC_OUTPUT_NORM,
      LLM_TENSOR_CLS,
      LLM_TENSOR_CLS_OUT,
@@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644
      if (il < n_layer) {
          return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
 diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 80fcd65d..6e278945 100644
+index 7ee6a5b7..48dce407 100644
 --- a/src/llama-hparams.h
 +++ b/src/llama-hparams.h
 @@ -55,6 +55,8 @@ struct llama_hparams {
@@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644
      uint32_t n_layer_dense_lead = 0;
      uint32_t n_lora_q = 0;
      uint32_t n_lora_kv = 0;
-@@ -153,6 +155,9 @@ struct llama_hparams {
+@@ -154,6 +156,9 @@ struct llama_hparams {
      // dimension of the recurrent state embeddings
      uint32_t n_embd_v_s() const;
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
  llama_model_loader::llama_model_loader(
      const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 6b7bfecf..aba42819 100644
+index 822e2bb2..572378c9 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
          default: type = LLM_TYPE_UNKNOWN;
      }
  } break;
@@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644
  case LLM_ARCH_WAVTOKENIZER_DEC:
      {
          ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
      layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644
      layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
      layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
      layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
+@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
  }
  };
@@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644
  struct llm_build_wavtokenizer_dec : public llm_graph_context {
      llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
          ggml_tensor * cur;
-@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
+@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
      {
          llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
      } break;
@@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644
  case LLM_ARCH_WAVTOKENIZER_DEC:
      {
          llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
-@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
      case LLM_ARCH_GRANITE:
      case LLM_ARCH_GRANITE_MOE:
      case LLM_ARCH_CHAMELEON:
@@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644
      return LLAMA_ROPE_TYPE_NORM;
 diff --git a/src/llama-model.h b/src/llama-model.h
-index fd82d106..5865d5e9 100644
+index 95eca002..856e6042 100644
 --- a/src/llama-model.h
 +++ b/src/llama-model.h
-@@ -62,6 +62,7 @@ enum llm_type {
+@@ -64,6 +64,7 @@ enum llm_type {
      LLM_TYPE_15B,
      LLM_TYPE_16B,
      LLM_TYPE_20B,
 +    LLM_TYPE_22B,
      LLM_TYPE_27B,
      LLM_TYPE_30B,
      LLM_TYPE_32B,
      LLM_TYPE_34B,
-@@ -307,6 +308,8 @@ struct llama_layer {
+@@ -311,6 +312,8 @@ struct llama_layer {
      struct ggml_tensor * ffn_up_scale   = nullptr;
      struct ggml_tensor * ffn_down_scale = nullptr;
llama/patches/0006-add-mllama-support.patch
View file @
20c5fd39
...
...
@@ -5,7 +5,6 @@ Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture
---
examples/llava/gemma3-cli.cpp | 3 +-
examples/llava/llava.cpp | 5 +-
examples/llava/mtmd.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
...
...
@@ -25,34 +24,13 @@ adds support for the llama 3.2 vision architecture
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
20
files changed, 47
5
insertions(+), 2
2
deletions(-)
19
files changed, 47
3
insertions(+), 2
1
deletions(-)
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 3d566475..654d1358 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@
struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -118,6 +118,7 @@
struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index
03a22cbb..5eb40bcd
100644
index
c00d16ae..bab027b5
100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -45
6
,7 +45
6
,7 @@
struct llava_embd_batch {
@@ -45
7
,7 +45
7
,7 @@
struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
...
...
@@ -61,7 +39,7 @@ index 03a22cbb..5eb40bcd 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -46
8
,6 +46
8
,7 @@
struct llava_embd_batch {
@@ -46
9
,6 +46
9
,7 @@
struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
...
...
@@ -69,7 +47,7 @@ index 03a22cbb..5eb40bcd 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -49
1
,7 +49
2
,7 @@
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
@@ -49
2
,7 +49
3
,7 @@
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
...
...
@@ -79,19 +57,19 @@ index 03a22cbb..5eb40bcd 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index
3fd5bebc..f0cec596
100644
index
7081fd73..c14ac501
100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -
233
,7 +
233
,7 @@
struct decode_embd_batch {
@@ -
476
,7 +
476
,7 @@
struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens,
llama_pos pos_0, llama_seq_id seq_i
d) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
- decode_embd_batch(float * embd, int32_t n_tokens,
int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_emb
d) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id)
: n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd)
{
pos .resize(n_tokens
* n_pos_per_embd
);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -
245
,6 +
245
,7 @@
struct decode_embd_batch {
@@ -
487
,6 +
487
,7 @@
struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
...
...
@@ -99,16 +77,16 @@ index 3fd5bebc..f0cec596 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -
311
,7 +
312
,8 @@
int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t n_
tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get())
;
@@ -
610
,7 +
611
,8 @@
int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_
img_batches = GGML_PAD(n_tokens, n_batch) / n_batch
;
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_
img
(embd, n_tokens, n_p
ast, 0
);
- decode_embd_batch batch_
embd
(embd, n_tokens, n_p
os_per_embd, n_mmproj_embd
);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_
img
(embd, n_embd, n_tokens, n_past, 0);
int64_t t1 = ggml_time_ms();
ret = llama_decode(lctx, batch_img.batch
);
if (ret != 0) {
+ decode_embd_batch batch_
embd
(embd, n_embd, n_tokens, n_past, 0);
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get()
);
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp
...
...
@@ -127,10 +105,10 @@ index 405d8e31..82ae1b5b 100644
register_backend(ggml_backend_rpc_reg());
#endif
diff --git a/include/llama.h b/include/llama.h
index
5657fbf0..f91896e4
100644
index
06c56395..f1628e88
100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -25
5
,6 +25
5
,7 @@
extern "C" {
@@ -25
6
,6 +25
6
,7 @@
extern "C" {
llama_token * token;
float * embd;
...
...
@@ -138,7 +116,7 @@ index 5657fbf0..f91896e4 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -35
7
,6 +35
8
,7 @@
extern "C" {
@@ -35
8
,6 +35
9
,7 @@
extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
...
...
@@ -146,7 +124,7 @@ index 5657fbf0..f91896e4 100644
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@@ -45
8
,6 +46
0
,10 @@
extern "C" {
@@ -45
9
,6 +46
1
,10 @@
extern "C" {
struct llama_context_params params),
"use llama_init_from_model instead");
...
...
@@ -158,7 +136,7 @@ index 5657fbf0..f91896e4 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index
f754bc8f..0568565f
100644
index
5ab3f572..eb7b5325
100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
...
...
@@ -169,7 +147,7 @@ index f754bc8f..0568565f 100644
{ LLM_ARCH_LLAMA4, "llama4" },
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
@@ -14
2
,6 +14
3
,7 @@
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -14
4
,6 +14
5
,7 @@
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
...
...
@@ -177,7 +155,7 @@ index f754bc8f..0568565f 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -27
1
,6 +27
3
,40 @@
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -27
3
,6 +27
5
,40 @@
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
...
...
@@ -218,7 +196,7 @@ index f754bc8f..0568565f 100644
{
LLM_ARCH_DECI,
{
@@ -1
68
1,6 +17
1
7,14 @@
static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1
70
1,6 +17
3
7,14 @@
static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
...
...
@@ -234,7 +212,7 @@ index f754bc8f..0568565f 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index
439aaeab..6a989034
100644
index
525c1b7d..bc8a4f0b
100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -11,6 +11,7 @@
...
...
@@ -245,7 +223,7 @@ index 439aaeab..6a989034 100644
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
@@ -14
6
,6 +14
7
,7 @@
enum llm_kv {
@@ -14
8
,6 +14
9
,7 @@
enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
...
...
@@ -253,7 +231,7 @@ index 439aaeab..6a989034 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -34
7
,6 +3
49
,14 @@
enum llm_tensor {
@@ -34
9
,6 +3
51
,14 @@
enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
...
...
@@ -297,10 +275,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index
32f59819..0343ba8a
100644
index
9c1fe93f..cd06ad91
100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -8
62
,7 +8
62
,7 @@
float * llama_context::get_logits_ith(int32_t i) {
@@ -8
51
,7 +8
51
,7 @@
float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}
...
...
@@ -309,7 +287,7 @@ index 32f59819..0343ba8a 100644
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -9
83
,6 +9
83
,10 @@
void llama_context::set_warmup(bool value) {
@@ -9
72
,6 +9
72
,10 @@
void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}
...
...
@@ -320,7 +298,7 @@ index 32f59819..0343ba8a 100644
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -10
58
,7 +10
62
,7 @@
int llama_context::encode(llama_batch & inp_batch) {
@@ -10
47
,7 +10
51
,7 @@
int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd;
...
...
@@ -329,7 +307,7 @@ index 32f59819..0343ba8a 100644
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -11
9
8,10 +1
202
,9 @@
int llama_context::decode(llama_batch & inp_batch) {
@@ -118
7
,10 +1
191
,9 @@
int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch;
...
...
@@ -341,7 +319,7 @@ index 32f59819..0343ba8a 100644
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
@@ -12
49
,7 +12
52
,7 @@
int llama_context::decode(llama_batch & inp_batch) {
@@ -12
38
,7 +12
41
,7 @@
int llama_context::decode(llama_batch & inp_batch) {
const bool logits_all = n_outputs_all == n_tokens_all;
...
...
@@ -350,7 +328,7 @@ index 32f59819..0343ba8a 100644
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);
@@ -14
83
,12 +14
86
,11 @@
int llama_context::decode(llama_batch & inp_batch) {
@@ -14
72
,12 +14
75
,11 @@
int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
...
...
@@ -364,7 +342,7 @@ index 32f59819..0343ba8a 100644
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@@ -155
8
,7 +15
60
,7 @@
int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -15
4
5,7 +15
47
,7 @@
int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
...
...
@@ -373,7 +351,7 @@ index 32f59819..0343ba8a 100644
const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -20
6
5,7 +20
67
,7 @@
size_t llama_context::state_write_data(llama_io_write_i & io) {
@@ -205
2
,7 +20
54
,7 @@
size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
...
...
@@ -382,7 +360,7 @@ index 32f59819..0343ba8a 100644
io.write(&logits_size, sizeof(logits_size));
@@ -22
48
,6 +22
50
,7 @@
llama_context_params llama_context_default_params() {
@@ -22
35
,6 +22
37
,7 @@
llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
...
...
@@ -390,7 +368,7 @@ index 32f59819..0343ba8a 100644
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -23
75
,6 +23
78
,10 @@
void llama_set_warmup(llama_context * ctx, bool warmup) {
@@ -23
62
,6 +23
65
,10 @@
void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
}
...
...
@@ -402,7 +380,7 @@ index 32f59819..0343ba8a 100644
ctx->synchronize();
}
diff --git a/src/llama-context.h b/src/llama-context.h
index
04facb54..baa03276
100644
index
5457f077..a50c4afa
100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -65,6 +65,7 @@
struct llama_context {
...
...
@@ -426,10 +404,10 @@ index 30e550f0..85ad91b9 100644
enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index
a85e9728..d740c120
100644
index
fabb9ca2..b67216a4
100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -5
4
6,6 +5
4
6,12 @@
void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
@@ -56
0
,6 +56
0
,12 @@
void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}
...
...
@@ -442,7 +420,7 @@ index a85e9728..d740c120 100644
//
// llm_graph_context
//
@@ -15
06
,6 +15
12
,25 @@
llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
@@ -15
32
,6 +15
38
,25 @@
llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}
...
...
@@ -469,7 +447,7 @@ index a85e9728..d740c120 100644
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index d
192dc14..260a2af2
100644
index d
0c8d321..0fe18150
100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@
public:
...
...
@@ -480,7 +458,7 @@ index d192dc14..260a2af2 100644
};
class llm_graph_input_pos : public llm_graph_input_i {
@@ -28
5
,6 +28
6
,16 @@
public:
@@ -28
3
,6 +28
4
,16 @@
public:
const llama_cross * cross = nullptr;
};
...
...
@@ -497,7 +475,7 @@ index d192dc14..260a2af2 100644
//
// llm_graph_result
//
@@ -49
3
,6 +50
4
,7 @@
struct llm_graph_context {
@@ -49
1
,6 +50
2
,7 @@
struct llm_graph_context {
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const;
...
...
@@ -518,7 +496,7 @@ index 8a667960..6a02de03 100644
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index
6e278945..c8a34d52
100644
index
48dce407..b6fc7e6d
100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -2,6 +2,8 @@
...
...
@@ -546,7 +524,7 @@ index 6e278945..c8a34d52 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -15
8
,6 +16
2
,9 @@
struct llama_hparams {
@@ -15
9
,6 +16
3
,9 @@
struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
...
...
@@ -593,10 +571,10 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index
aba4281
9..d0
51696c
100644
index
572378c
9..
9
d0
99f11
100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -4
19
,6 +4
19
,7 @@
void llama_model::load_hparams(llama_model_loader & ml) {
@@ -4
23
,6 +4
23
,7 @@
void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
...
...
@@ -604,7 +582,7 @@ index aba42819..d051696c 100644
// everything past this point is not vocab-related
if (hparams.vocab_only) {
@@ -43
0
,6 +43
1
,7 @@
void llama_model::load_hparams(llama_model_loader & ml) {
@@ -43
4
,6 +43
5
,7 @@
void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
...
...
@@ -612,7 +590,7 @@ index aba42819..d051696c 100644
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
...
...
@@ -624,7 +602,7 @@ index aba42819..d051696c 100644
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
...
...
@@ -633,7 +611,7 @@ index aba42819..d051696c 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.use_kq_norm = false;
}
} break;
...
...
@@ -650,7 +628,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
...
...
@@ -659,7 +637,7 @@ index aba42819..d051696c 100644
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
...
...
@@ -712,7 +690,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context {
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
}
};
...
...
@@ -959,7 +937,7 @@ index aba42819..d051696c 100644
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
...
...
@@ -970,7 +948,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
...
...
@@ -979,7 +957,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index 5865d5e9..72bab5be 100644
index 856e6042..6be91282 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -11,6 +11,7 @@
...
...
@@ -990,15 +968,15 @@ index 5865d5e9..72bab5be 100644
struct llama_cparams;
struct llama_ubatch;
@@ -70,6 +71,7 @@ enum llm_type {
@@ -73,6 +74,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -310,6 +312,16 @@ struct llama_layer {
@@ -314,6 +316,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
...
...
llama/patches/0007-add-unpad-operator.patch
...
...
@@ -18,10 +18,10 @@ adds the unpad operator to GGML
10 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 8fcc16df..d19fc167 100644
index 1b8603e7..53ef31b2 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -488,6 +488,7 @@ extern "C" {
@@ -489,6 +489,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
...
...
@@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1757,6 +1758,15 @@ extern "C" {
@@ -1777,6 +1778,15 @@ extern "C" {
int p0,
int p1);
...
...
@@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 50400328..432942bf 100644
index 64405449..34624cca 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
...
...
@@ -60,7 +60,7 @@ index 50400328..432942bf 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
...
...
@@ -69,10 +69,10 @@ index 50400328..432942bf 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 6050147b..66b8da68 100644
index 7413192b..becdae07 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
}
}
...
...
@@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644
static void ggml_compute_forward_arange_f32(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 410a3720..3eca1cf8 100644
index dc081b9e..a7125555 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
...
...
@@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 31750b6f..0fef9522 100644
index 04ce764e..491acccb 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
...
...
@@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
...
...
@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 12886cd3..b2e95a66 100644
index 425524d0..112abef6 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
...
...
@@ -244,7 +244,7 @@ index 12886cd3..b2e95a66 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
...
...
@@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
...
...
@@ -260,7 +260,7 @@ index 12886cd3..b2e95a66 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU:
@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node(
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
...
...
@@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8d6e99e6..71f0f97f 100644
index 9f4147e9..6ceb3cef 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
...
...
@@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644
device char * dst,
constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 950772c7..2276b631 100644
index 7654ae17..3c57aff8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
...
...
@@ -365,16 +365,16 @@ index 950772c7..2276b631 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
...
...
@@ -382,16 +382,16 @@ index 950772c7..2276b631 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}
...
...
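Aside: the unpad operator this patch adds is the inverse of ggml_pad, trimming elements from the end of each tensor dimension. A minimal usage sketch in C follows; the ggml_unpad(ctx, a, p0, p1, p2, p3) signature is an assumption based on the declaration this patch inserts into ggml.h, mirroring ggml_pad.

#include "ggml.h"

// Sketch only: assumes ggml_unpad takes per-dimension trim counts p0..p3
// and returns a tensor with those trailing elements removed.
static struct ggml_tensor * unpad_example(struct ggml_context * ctx,
                                          struct ggml_tensor  * padded) {
    // e.g. undo a 4-element pad on dim 0: [36, 16] -> [32, 16]
    return ggml_unpad(ctx, padded, 4, 0, 0, 0);
}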
llama/patches/0008-fix-deepseek-deseret-regex.patch
...
...
@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a35b498c..032019c9 100644
index a9ee9f03..1306864e 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
...
...